Repository: oxidecomputer/propolis
Branch: master
Commit: 3b21bdc2aaf3
Files: 383
Total size: 3.3 MB

Directory structure:
gitextract_ra4peprf/

├── .cargo/
│   └── config.toml
├── .git-blame-ignore-revs
├── .github/
│   ├── buildomat/
│   │   ├── config.toml
│   │   ├── jobs/
│   │   │   ├── check-headers.sh
│   │   │   ├── falcon-build.sh
│   │   │   ├── image.sh
│   │   │   ├── phd-build.sh
│   │   │   ├── phd-run-migrate-from-base.sh
│   │   │   ├── phd-run.sh
│   │   │   └── test-gimlet.sh
│   │   └── phd-run-with-args.sh
│   └── workflows/
│       └── rust.yml
├── .gitignore
├── .licenserc.yaml
├── Cargo.toml
├── LICENSE
├── README.md
├── bin/
│   ├── dropshot-apis/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── main.rs
│   ├── mock-server/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       ├── lib/
│   │       │   ├── api_types.rs
│   │       │   └── lib.rs
│   │       └── main.rs
│   ├── propolis-cli/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   └── src/
│   │       └── main.rs
│   ├── propolis-server/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   └── src/
│   │       ├── lib/
│   │       │   ├── config.rs
│   │       │   ├── initializer.rs
│   │       │   ├── lib.rs
│   │       │   ├── migrate/
│   │       │   │   ├── codec.rs
│   │       │   │   ├── destination.rs
│   │       │   │   ├── memx.rs
│   │       │   │   ├── mod.rs
│   │       │   │   ├── preamble.rs
│   │       │   │   ├── protocol.rs
│   │       │   │   └── source.rs
│   │       │   ├── serial/
│   │       │   │   ├── history_buffer.rs
│   │       │   │   └── mod.rs
│   │       │   ├── server.rs
│   │       │   ├── spec/
│   │       │   │   ├── api_spec_v0.rs
│   │       │   │   ├── builder.rs
│   │       │   │   └── mod.rs
│   │       │   ├── stats/
│   │       │   │   ├── mod.rs
│   │       │   │   ├── network_interface.rs
│   │       │   │   ├── pvpanic.rs
│   │       │   │   ├── virtual_disk.rs
│   │       │   │   └── virtual_machine.rs
│   │       │   ├── vcpu_tasks.rs
│   │       │   ├── vm/
│   │       │   │   ├── active.rs
│   │       │   │   ├── ensure.rs
│   │       │   │   ├── guest_event.rs
│   │       │   │   ├── mod.rs
│   │       │   │   ├── objects.rs
│   │       │   │   ├── request_queue.rs
│   │       │   │   ├── services.rs
│   │       │   │   ├── state_driver.rs
│   │       │   │   └── state_publisher.rs
│   │       │   └── vnc.rs
│   │       ├── main.rs
│   │       └── proptest-regressions/
│   │           └── vm/
│   │               └── request_queue.txt
│   ├── propolis-standalone/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   └── src/
│   │       ├── cidata.rs
│   │       ├── config.rs
│   │       ├── main.rs
│   │       └── snapshot.rs
│   └── propolis-utils/
│       ├── Cargo.toml
│       ├── README.md
│       └── src/
│           └── bin/
│               ├── cpuid-gen.rs
│               └── rsrvrctl.rs
├── crates/
│   ├── bhyve-api/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── header-check/
│   │   │   ├── Cargo.toml
│   │   │   ├── README.md
│   │   │   ├── build.rs
│   │   │   └── test/
│   │   │       └── main.rs
│   │   ├── src/
│   │   │   └── lib.rs
│   │   └── sys/
│   │       ├── Cargo.toml
│   │       └── src/
│   │           ├── enums.rs
│   │           ├── ioctls.rs
│   │           ├── lib.rs
│   │           ├── structs.rs
│   │           └── vmm_data.rs
│   ├── cpuid-profile-config/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── lib.rs
│   ├── cpuid-utils/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       ├── bits.rs
│   │       ├── host.rs
│   │       ├── instance_spec.rs
│   │       └── lib.rs
│   ├── dladm/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       ├── lib.rs
│   │       └── sys.rs
│   ├── nvpair/
│   │   ├── Cargo.toml
│   │   ├── header-check/
│   │   │   ├── Cargo.toml
│   │   │   ├── build.rs
│   │   │   └── test/
│   │   │       └── main.rs
│   │   ├── src/
│   │   │   └── lib.rs
│   │   └── sys/
│   │       ├── Cargo.toml
│   │       └── src/
│   │           └── lib.rs
│   ├── pbind/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── lib.rs
│   ├── propolis-api-types/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       ├── disk.rs
│   │       ├── instance.rs
│   │       ├── instance_spec/
│   │       │   ├── components/
│   │       │   │   ├── backends.rs
│   │       │   │   ├── board.rs
│   │       │   │   ├── devices.rs
│   │       │   │   └── mod.rs
│   │       │   └── mod.rs
│   │       ├── lib.rs
│   │       ├── migration.rs
│   │       └── serial.rs
│   ├── propolis-api-types-versions/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       ├── add_vsock/
│   │       │   ├── api.rs
│   │       │   ├── components/
│   │       │   │   ├── devices.rs
│   │       │   │   └── mod.rs
│   │       │   ├── instance_spec.rs
│   │       │   └── mod.rs
│   │       ├── crucible_volume_info/
│   │       │   ├── disk.rs
│   │       │   └── mod.rs
│   │       ├── impls/
│   │       │   ├── instance.rs
│   │       │   ├── instance_spec.rs
│   │       │   └── mod.rs
│   │       ├── initial/
│   │       │   ├── components/
│   │       │   │   ├── backends.rs
│   │       │   │   ├── board.rs
│   │       │   │   ├── devices.rs
│   │       │   │   └── mod.rs
│   │       │   ├── disk.rs
│   │       │   ├── instance.rs
│   │       │   ├── instance_spec.rs
│   │       │   ├── migration.rs
│   │       │   ├── mod.rs
│   │       │   └── serial.rs
│   │       ├── latest.rs
│   │       ├── lib.rs
│   │       └── programmable_smbios/
│   │           ├── api.rs
│   │           ├── instance_spec.rs
│   │           └── mod.rs
│   ├── propolis-config-toml/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       ├── lib.rs
│   │       └── spec.rs
│   ├── propolis-server-api/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── lib.rs
│   ├── propolis-types/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── lib.rs
│   ├── rfb/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   ├── examples/
│   │   │   ├── shared.rs
│   │   │   ├── socket.rs
│   │   │   └── websock.rs
│   │   └── src/
│   │       ├── encodings.rs
│   │       ├── keysym.rs
│   │       ├── lib.rs
│   │       ├── proto.rs
│   │       ├── server.rs
│   │       └── tungstenite.rs
│   ├── rgb-frame/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── lib.rs
│   └── viona-api/
│       ├── Cargo.toml
│       ├── header-check/
│       │   ├── Cargo.toml
│       │   ├── build.rs
│       │   └── test/
│       │       └── main.rs
│       └── src/
│           ├── ffi.rs
│           └── lib.rs
├── docs/
│   ├── lifecycle.md
│   ├── migrate-with-crucible.md
│   ├── server-send-vcr.md
│   └── standalone-with-crucible.md
├── lib/
│   ├── propolis/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       ├── accessors.rs
│   │       ├── api_version.rs
│   │       ├── attestation/
│   │       │   ├── boot_digest/
│   │       │   │   ├── crucible.rs
│   │       │   │   └── mod.rs
│   │       │   ├── mod.rs
│   │       │   └── server.rs
│   │       ├── block/
│   │       │   ├── attachment.rs
│   │       │   ├── crucible.rs
│   │       │   ├── file.rs
│   │       │   ├── id.rs
│   │       │   ├── in_memory.rs
│   │       │   ├── mem_async.rs
│   │       │   ├── minder.rs
│   │       │   └── mod.rs
│   │       ├── chardev/
│   │       │   ├── file_out.rs
│   │       │   ├── mod.rs
│   │       │   ├── pollers.rs
│   │       │   └── sock.rs
│   │       ├── common.rs
│   │       ├── cpuid.rs
│   │       ├── enlightenment/
│   │       │   ├── bhyve.rs
│   │       │   ├── hyperv/
│   │       │   │   ├── bits.rs
│   │       │   │   ├── hypercall.rs
│   │       │   │   ├── mod.rs
│   │       │   │   ├── overlay.rs
│   │       │   │   └── tsc.rs
│   │       │   └── mod.rs
│   │       ├── exits.rs
│   │       ├── firmware/
│   │       │   ├── mod.rs
│   │       │   └── smbios/
│   │       │       ├── bits.rs
│   │       │       ├── mod.rs
│   │       │       └── table.rs
│   │       ├── hw/
│   │       │   ├── bhyve/
│   │       │   │   ├── atpic.rs
│   │       │   │   ├── atpit.rs
│   │       │   │   ├── hpet.rs
│   │       │   │   ├── ioapic.rs
│   │       │   │   ├── mod.rs
│   │       │   │   ├── pmtimer.rs
│   │       │   │   └── rtc.rs
│   │       │   ├── chipset/
│   │       │   │   ├── i440fx.rs
│   │       │   │   └── mod.rs
│   │       │   ├── ibmpc.rs
│   │       │   ├── ids.rs
│   │       │   ├── mod.rs
│   │       │   ├── nvme/
│   │       │   │   ├── admin.rs
│   │       │   │   ├── bits.rs
│   │       │   │   ├── cmds.rs
│   │       │   │   ├── mod.rs
│   │       │   │   ├── queue.rs
│   │       │   │   └── requests.rs
│   │       │   ├── pci/
│   │       │   │   ├── bar.rs
│   │       │   │   ├── bits.rs
│   │       │   │   ├── bridge.rs
│   │       │   │   ├── bus.rs
│   │       │   │   ├── cfgspace.rs
│   │       │   │   ├── device.rs
│   │       │   │   ├── mod.rs
│   │       │   │   ├── test.rs
│   │       │   │   └── topology.rs
│   │       │   ├── ps2/
│   │       │   │   ├── ctrl.rs
│   │       │   │   ├── keyboard/
│   │       │   │   │   ├── mod.rs
│   │       │   │   │   ├── scan_code_1.rs
│   │       │   │   │   └── scan_code_2.rs
│   │       │   │   └── mod.rs
│   │       │   ├── qemu/
│   │       │   │   ├── debug.rs
│   │       │   │   ├── fwcfg.rs
│   │       │   │   ├── mod.rs
│   │       │   │   ├── pvpanic.rs
│   │       │   │   └── ramfb.rs
│   │       │   ├── testdev.rs
│   │       │   ├── uart/
│   │       │   │   ├── lpc.rs
│   │       │   │   ├── mod.rs
│   │       │   │   └── uart16550.rs
│   │       │   └── virtio/
│   │       │       ├── bits.rs
│   │       │       ├── block.rs
│   │       │       ├── mod.rs
│   │       │       ├── p9fs.rs
│   │       │       ├── pci.rs
│   │       │       ├── queue.rs
│   │       │       ├── softnpu.rs
│   │       │       ├── testutil.rs
│   │       │       ├── viona.rs
│   │       │       └── vsock.rs
│   │       ├── intr_pins.rs
│   │       ├── lib.rs
│   │       ├── lifecycle.rs
│   │       ├── migrate.rs
│   │       ├── mmio.rs
│   │       ├── msr.rs
│   │       ├── pio.rs
│   │       ├── tasks.rs
│   │       ├── util/
│   │       │   ├── aspace.rs
│   │       │   ├── id.rs
│   │       │   ├── mod.rs
│   │       │   └── regmap.rs
│   │       ├── vcpu.rs
│   │       ├── vmm/
│   │       │   ├── hdl.rs
│   │       │   ├── machine.rs
│   │       │   ├── mem.rs
│   │       │   ├── mod.rs
│   │       │   └── time.rs
│   │       └── vsock/
│   │           ├── buffer.rs
│   │           ├── mod.rs
│   │           ├── packet.rs
│   │           ├── poller.rs
│   │           ├── poller_stub.rs
│   │           └── proxy.rs
│   └── propolis-client/
│       ├── Cargo.toml
│       └── src/
│           ├── lib.rs
│           └── support.rs
├── openapi/
│   └── propolis-server/
│       ├── propolis-server-1.0.0-833484.json.gitstub
│       ├── propolis-server-2.0.0-d68a9f.json.gitstub
│       ├── propolis-server-3.0.0-10da2b.json.gitstub
│       ├── propolis-server-4.0.0-5ce09a.json.gitstub
│       └── propolis-server-5.0.0-0c6dd9.json
├── packaging/
│   ├── package-manifest.toml
│   ├── propolis-package/
│   │   ├── Cargo.toml
│   │   ├── README.md
│   │   └── src/
│   │       └── main.rs
│   └── smf/
│       ├── method_script.sh
│       └── propolis-server/
│           └── manifest.xml
├── phd-tests/
│   ├── .gitignore
│   ├── README.md
│   ├── artifacts.toml
│   ├── framework/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       ├── artifacts/
│   │       │   ├── buildomat.rs
│   │       │   ├── manifest.rs
│   │       │   ├── mod.rs
│   │       │   └── store.rs
│   │       ├── disk/
│   │       │   ├── crucible.rs
│   │       │   ├── fat.rs
│   │       │   ├── file.rs
│   │       │   ├── in_memory.rs
│   │       │   └── mod.rs
│   │       ├── guest_os/
│   │       │   ├── alpine.rs
│   │       │   ├── debian11_nocloud.rs
│   │       │   ├── linux.rs
│   │       │   ├── mod.rs
│   │       │   ├── shell_commands.rs
│   │       │   ├── ubuntu22_04.rs
│   │       │   ├── windows.rs
│   │       │   ├── windows_server_2016.rs
│   │       │   ├── windows_server_2019.rs
│   │       │   └── windows_server_2022.rs
│   │       ├── host_api/
│   │       │   ├── kvm.rs
│   │       │   ├── mod.rs
│   │       │   └── stubs.rs
│   │       ├── lib.rs
│   │       ├── lifecycle.rs
│   │       ├── log_config.rs
│   │       ├── port_allocator.rs
│   │       ├── serial/
│   │       │   ├── mod.rs
│   │       │   ├── raw_buffer.rs
│   │       │   └── vt80x24.rs
│   │       ├── test_vm/
│   │       │   ├── config.rs
│   │       │   ├── environment.rs
│   │       │   ├── metrics.rs
│   │       │   ├── mod.rs
│   │       │   ├── server.rs
│   │       │   └── spec.rs
│   │       └── zfs.rs
│   ├── quickstart.sh
│   ├── runner/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   └── src/
│   │       ├── config.rs
│   │       ├── execute.rs
│   │       ├── fixtures.rs
│   │       └── main.rs
│   ├── testcase/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── lib.rs
│   ├── testcase_macro/
│   │   ├── Cargo.toml
│   │   └── src/
│   │       └── lib.rs
│   └── tests/
│       ├── Cargo.toml
│       ├── src/
│       │   ├── boot_order/
│       │   │   └── efi_utils.rs
│       │   ├── boot_order.rs
│       │   ├── cpuid.rs
│       │   ├── crucible/
│       │   │   ├── migrate.rs
│       │   │   ├── mod.rs
│       │   │   └── smoke.rs
│       │   ├── disk.rs
│       │   ├── framework.rs
│       │   ├── hw.rs
│       │   ├── hyperv.rs
│       │   ├── lib.rs
│       │   ├── migrate.rs
│       │   ├── server_state_machine.rs
│       │   ├── smoke.rs
│       │   ├── stats.rs
│       │   └── vsock.rs
│       └── testdata/
│           └── dirt.sh
├── rust-toolchain.toml
├── rustfmt.toml
├── scripts/
│   ├── README.md
│   ├── cpuid-queries.d
│   ├── live-migration-times.d
│   ├── nvme-trace.d
│   ├── time-adjustments.d
│   ├── viona.d
│   └── vm-exit-codes.d
├── tools/
│   ├── check_headers
│   └── install_builder_prerequisites.sh
└── xtask/
    ├── Cargo.toml
    └── src/
        ├── external.rs
        ├── main.rs
        ├── task_clippy.rs
        ├── task_fmt.rs
        ├── task_license.rs
        ├── task_phd.rs
        ├── task_prepush.rs
        ├── task_style.rs
        └── util.rs

================================================
FILE CONTENTS
================================================

================================================
FILE: .cargo/config.toml
================================================
[alias]
xtask = "run --package xtask --quiet --"

[env]
# Currently required by Falcon due to
# https://github.com/rust-lang/cargo/issues/3946#issuecomment-973132993
CARGO_WORKSPACE_DIR = { value = "", relative = true }

[build]
# Tokio's unstable features are required by `tokio-dtrace` probes, and for
# disabling the LIFO slot optimization.
#
# See here for details:
# https://github.com/oxidecomputer/oxide-tokio-rt/blob/main/README.md#enabling-tokio_unstable-features
rustflags = ["--cfg", "tokio_unstable"]


================================================
FILE: .git-blame-ignore-revs
================================================
# Whitespace-only changes
da65a727cceee1b386ded8609309015c366cca2a


================================================
FILE: .github/buildomat/config.toml
================================================
#
# This file, with this flag, must be present in the default branch in order for
# the buildomat integration to create check suites.
#
enable = true

#
# Require approval for pull requests made by users outside our organisation.
#
org_only = true

#
# Allow jobs on pull requests from automated services specified here.
#
allow_users = [
	"dependabot[bot]",
]


================================================
FILE: .github/buildomat/jobs/check-headers.sh
================================================
#!/bin/bash
#:
#: name = "header-check"
#: variety = "basic"
#: target = "helios-2.0"
#: rust_toolchain = true
#:

# Runs each of the `header-check` test suites found in Propolis' crates.
#
# The job targets illumos on purpose. The struct and function definitions under
# test could be parsed on any OS, but their contents may differ between target
# OSes, and what matters is that our FFI definitions agree with how illumos
# itself interprets these headers. Running anywhere else is only a convenience.

set -o errexit

gate_url="https://code.oxide.computer/illumos-gate"
gate_src="./gate_src"

# Ask the helper which illumos-gate ref the headers should be checked against.
gate_ref="$(./tools/check_headers gate_ref)"

# TODO: `--branch` is more restrictive than we would like, but it is the only
# option available for now. git 2.49 added a `--revision` flag to `git-clone`
# that can fetch an arbitrary revision, which fits better: the ref we track may
# be a commit carrying illumos-gate changes that have not been merged yet.
git clone --depth 1 --branch "$gate_ref" "$gate_url" "$gate_src"

./tools/check_headers run "$gate_src"


================================================
FILE: .github/buildomat/jobs/falcon-build.sh
================================================
#!/bin/bash
#:
#: name = "falcon"
#: variety = "basic"
#: target = "helios-2.0"
#: rust_toolchain = "stable"
#: output_rules = [
#:   "/work/release/*",
#: ]
#:
#: [[publish]]
#: series = "falcon"
#: name = "propolis-server"
#: from_output = "/work/release/propolis-server"
#:
#: [[publish]]
#: series = "falcon"
#: name = "propolis-server.sha256.txt"
#: from_output = "/work/release/propolis-server.sha256.txt"
#:
#: [[publish]]
#: series = "falcon"
#: name = "propolis-cli"
#: from_output = "/work/release/propolis-cli"
#:
#: [[publish]]
#: series = "falcon"
#: name = "propolis-cli.sha256.txt"
#: from_output = "/work/release/propolis-cli.sha256.txt"

# Checks, lints and builds propolis-server and propolis-cli with the `falcon`
# feature, then stages the release binaries and their SHA-256 digests in
# /work/release.

set -o errexit
set -o pipefail
set -o xtrace

cargo --version
rustc --version

banner prerequisites
ptime -m ./tools/install_builder_prerequisites.sh -y

banner check
ptime -m cargo check --features falcon
ptime -m cargo clippy --features falcon --all-targets

banner build
ptime -m cargo build --features falcon --release \
	-p propolis-server -p propolis-cli

release_dir=/work/release
mkdir -p "$release_dir"

# Stage each release binary in the output directory.
for bin in propolis-cli propolis-server; do
	cp "target/release/$bin" "$release_dir/$bin"
done

# Write a digest file next to each staged binary.
cd "$release_dir"
for bin in propolis-cli propolis-server; do
	digest -a sha256 "$bin" > "$bin.sha256.txt"
done


================================================
FILE: .github/buildomat/jobs/image.sh
================================================
#!/bin/bash
#:
#: name = "image"
#: variety = "basic"
#: target = "helios-2.0"
#: rust_toolchain = "stable"
#: output_rules = [
#:   "/out/*",
#: ]
#:
#: [[publish]]
#: series = "image"
#: name = "propolis-server.tar.gz"
#: from_output = "/out/propolis-server.tar.gz"
#:
#: [[publish]]
#: series = "image"
#: name = "propolis-server.sha256.txt"
#: from_output = "/out/propolis-server.sha256.txt"
#:

# Builds the release propolis-server, packages it with propolis-package, and
# places the resulting tarball and its SHA-256 digest in /out.

set -o errexit
set -o pipefail
set -o xtrace

tarball="propolis-server.tar.gz"

cargo --version
rustc --version

banner prerequisites
ptime -m ./tools/install_builder_prerequisites.sh -y

banner build

# The "omicron-build" feature marks this artifact as intended for production
# use on an appropriately configured Oxide machine.
#
# The 'release' profile aborts on panic, so an error produces an immediate
# coredump instead of unwinding.
ptime -m cargo build --release --verbose -p propolis-server --features omicron-build

banner image
ptime -m cargo run -p propolis-package

banner contents
tar tvfz "out/$tarball"

banner copy
pfexec mkdir -p /out
pfexec chown "$UID" /out
mv "out/$tarball" "/out/$tarball"
cd /out
digest -a sha256 "$tarball" > propolis-server.sha256.txt


================================================
FILE: .github/buildomat/jobs/phd-build.sh
================================================
#!/bin/bash
#:
#: name = "phd-build"
#: variety = "basic"
#: target = "helios-2.0"
#: rust_toolchain = "stable"
#: output_rules = [
#:   "/out/*",
#: ]
#:
#: [[publish]]
#: series = "phd_build"
#: name = "propolis-server.tar.gz"
#: from_output = "/out/propolis-server-debug.tar.gz"
#:
#: [[publish]]
#: series = "phd_build"
#: name = "propolis-server.sha256.txt"
#: from_output = "/out/propolis-server-debug.sha256.txt"
#:
#: [[publish]]
#: series = "propolis_tests"
#: name = "propolis-tests.tar.gz"
#: from_output = "/out/propolis-tests-debug.tar.gz"

# Builds a debug propolis-server, the Propolis test binaries and the PHD
# runner, then packages them as tarballs in /out for the jobs that depend on
# `phd-build`.

set -o errexit
set -o pipefail
set -o xtrace

outdir="/out"

cargo --version
rustc --version

banner prerequisites
ptime -m ./tools/install_builder_prerequisites.sh -y

# Build the Propolis server binary with 'dev' profile to enable assertions that
# should fire during tests.
banner build-propolis

# Several cargo builds follow. Keeping the feature set identical across them
# lets cargo reuse the build artifacts of the crates these features configure.
TEST_FEATURES="omicron-build,failure-injection"

# Compile propolis-server so that it allows development features to be used even
# though the `omicron-build` feature is enabled. Because the test build further
# down uses the same features, most of the work done here is shared with it.
export PHD_BUILD="true"
ptime -m cargo build --verbose -p propolis-server \
	--features "$TEST_FEATURES"

# Build Propolis test binaries, but they won't run here. The path to get here
# is unfortunate:
# * we don't have `git` on a Gimlet target.
# * or `pkg`.
# * `uname -m` is "oxide", which confuses rustup too.
# * some tests need a VMM, but this job is probably run inside a VM.
#
# Doing the build on a bare gimlet host is "possible", but it's much easier to
# just build the test binaries here and squirrel them off to *run* on a Gimlet.
# So do that.
ptime -m cargo build --tests --verbose --features "$TEST_FEATURES"

# The PHD runner requires unwind-on-panic to catch certain test failures, so
# build it with the 'dev' profile which is so configured.
banner build-phd
ptime -m cargo build --verbose -p phd-runner

banner contents
tar -czvf target/debug/propolis-server-debug.tar.gz \
	-C target/debug propolis-server

# The runner tarball also carries the artifact manifest from phd-tests/.
tar -czvf target/debug/phd-runner.tar.gz \
	-C target/debug phd-runner \
	-C phd-tests artifacts.toml

# Collect every executable regular file named `propolis-*` from the deps
# directory; these are the test binaries built above.
tar -czvf target/debug/propolis-tests-debug.tar.gz \
	$(find target/debug/deps/ -perm -111 -type f -name 'propolis-*')

banner copy
pfexec mkdir -p $outdir
pfexec chown "$UID" $outdir
# Ship the shared run script alongside the tarballs so that dependent jobs can
# copy it from this job's output.
cp .github/buildomat/phd-run-with-args.sh $outdir/phd-run-with-args.sh
mv target/debug/propolis-server-debug.tar.gz \
	$outdir/propolis-server-debug.tar.gz
mv target/debug/phd-runner.tar.gz $outdir/phd-runner.tar.gz
mv target/debug/propolis-tests-debug.tar.gz $outdir/propolis-tests-debug.tar.gz
cd $outdir
digest -a sha256 propolis-server-debug.tar.gz > \
	propolis-server-debug.sha256.txt
digest -a sha256 phd-runner.tar.gz > phd-runner.sha256.txt


================================================
FILE: .github/buildomat/jobs/phd-run-migrate-from-base.sh
================================================
#!/bin/bash
#:
#: name = "phd-run-migrate-from-base"
#: variety = "basic"
#: target = "lab-2.0-gimlet"
#: output_rules = [
#:	"/tmp/phd-runner.log",
#:	"/tmp/phd-tmp-files.tar.gz",
#: ]
#: skip_clone = true
#:
#: [dependencies.phd-build]
#: job = "phd-build"
#:

# Runs the PHD migrate-from-base tests, which exercise an upgrade from the
# current mainline propolis-server to the propolis-server under test.
#
# PHD always uses the propolis-client built from the commit under test, so a
# breaking change to the Propolis API makes these tests fail, as does a
# breaking change to the migration protocol. Such changes are sometimes
# expected, and this run will then fail. The "regular" phd-run job, however,
# should always be green before new PRs are merged.
#
# This job will be removed once API breaking changes are no longer allowed.

staged_script="/input/phd-build/out/phd-run-with-args.sh"
run_script="/tmp/phd-run-with-args.sh"

cp "$staged_script" "$run_script"
chmod a+x "$run_script"
exec "$run_script" \
    --include-filter "phd_tests::migrate::from_base" \
    --base-propolis-branch master


================================================
FILE: .github/buildomat/jobs/phd-run.sh
================================================
#!/bin/bash
#:
#: name = "phd-run"
#: variety = "basic"
#: target = "lab-2.0-gimlet"
#: output_rules = [
#:	"/tmp/phd-runner.log",
#:	"/tmp/phd-tmp-files.tar.gz",
#: ]
#: skip_clone = true
#:
#: [dependencies.phd-build]
#: job = "phd-build"
#:

# Runs every PHD test case except those that upgrade from an earlier version
# of Propolis.
#
# These tests are expected to pass even when the Propolis API or the live
# migration protocol has changed incompatibly.

staged_script="/input/phd-build/out/phd-run-with-args.sh"
run_script="/tmp/phd-run-with-args.sh"

cp "$staged_script" "$run_script"
chmod a+x "$run_script"
exec "$run_script" --exclude-filter "phd_tests::migrate::from_base"


================================================
FILE: .github/buildomat/jobs/test-gimlet.sh
================================================
#!/bin/bash
#:
#: name = "test-gimlet"
#: variety = "basic"
#: target = "lab-2.0-gimlet"
#: rust_toolchain = false
#: skip_clone = true
#:
#: [dependencies.phd-build]
#: job = "phd-build"
#
# The frontmatter above looks odd: this job runs Propolis tests without
# building them, and it insists on `lab-2.0-gimlet`. Both choices are
# deliberate.
#
# # The Target
#
# A handful of Propolis tests create and destroy real VMMs. The virtio-nic
# tests in particular set up a viona device, and that needs an actual VMM.
#
# Running *all* of the Propolis tests therefore carries effectively the same
# constraints as the `phd-run` jobs.
#
# # The Skips
#
# The Gimlet target has neither git nor pkg (which could install git).
# Installing a Rust toolchain by hand is doable but pretty funky. So the tests
# are *built* in the `phd-build` job and only *run* here. Most of the
# dependency tree is already built there, so this is not much of a detour.

set -o errexit
set -o pipefail
set -o xtrace

banner prepare

# Fetch and unpack the test binaries produced by the phd-build job.
tests_tarball="propolis-tests-debug.tar.gz"
cp "/input/phd-build/out/$tests_tarball" .
tar xvf "$tests_tarball"

banner test-propolis

# Create an etherstub for the virtio-nic tests to use for VNICs. One day the
# tests may want a real link so they can do actual networking, but that is not
# needed yet!
etherstub="prop_viona_test0"
pfexec dladm create-etherstub "$etherstub"

# Run every unpacked test binary, telling each which link to use.
for test_binary in ./target/debug/deps/propolis-*; do
	VIONA_TEST_NIC="$etherstub" pfexec ptime -m "$test_binary"
done


================================================
FILE: .github/buildomat/phd-run-with-args.sh
================================================
#!/bin/bash

# Unpacks and executes the PHD runner in the Buildomat environment, passing
# through to the runner any arguments that were passed to this script.
#
# Exit status:
#   0  the runner exited successfully and no VMM handles were left behind
#   1  the runner exited with a non-zero status
#   2  entries remain under /dev/vmm after the run (takes precedence over 1)

set -o errexit
set -o pipefail
set -o xtrace

indir="/input"
indir_suffix="phd-build/out/*.tar.gz"
phddir="$PWD/phd-test"

# Put artifacts on the runner's SSDs (the /work ramdisk is small by design, too
# small for images of any appreciable size).

# Find usable disks to make a zpool on. Note that this only works on Oxide
# compute sled runners such as `lab-2.0-gimlet`.
#
# The command substitution is deliberately unquoted so that each disk name
# becomes its own array element.
disks=( $(pilot local disk list -H -o type,disk | grep -v 'M.2' | cut -f 2) )
pfexec zpool create -f phd-artifacts "${disks[@]}"
artifactdir="/phd-artifacts"

banner 'Inputs'
find "$indir" -ls

rm -rf "$phddir"
mkdir "$phddir"

# $indir_suffix holds a glob pattern, so this word must stay unquoted for the
# shell to expand it into the list of input tarballs.
for p in $indir/$indir_suffix; do
	tar xzvf "$p" -C "$phddir"
	# Mark everything that was just unpacked as executable.
	for f in $(tar tf "$p"); do
		chmod +x "$phddir/$f"
	done
done

ls "$phddir"

banner 'Setup'
tmpdir="/tmp/propolis-phd"
mkdir -p "$tmpdir"

# We'll be using the reservoir, so set it to something higher than the default
# 0MiB. Most tests would only need 512MiB (the default PHD guest VM memory
# size), some tests want to run multiple VMs concurrently (particularly around
# migration). 4096MiB is an arbitrary number intended to support the above and
# that we might want to run those tests concurrently at some point.
#
# Currently the lab host these tests will run on is well-known and has much
# more memory than this. Hopefully we won't have Propolis CI running on a
# machine with ~4GiB of memory, but this number could be tuned down if the need
# arises.
pfexec /usr/lib/rsrvrctl -s 4096

banner 'Tests'

runner="$phddir/phd-runner"
artifacts="$phddir/artifacts.toml"
propolis="$phddir/propolis-server"

# Fail early (via errexit) if any expected input is missing.
ls "$runner"
ls "$artifacts"
ls "$propolis"

# Every element is quoted so that paths and, in particular, the caller's
# arguments ("$@") reach the runner exactly as given, without being re-split
# or glob-expanded by the shell.
args=(
    "$runner" '--emit-bunyan' 'run'
    '--propolis-server-cmd' "$propolis"
    '--crucible-downstairs-commit' 'auto'
    '--artifact-toml-path' "$artifacts"
    '--tmp-directory' "$tmpdir"
    '--artifact-directory' "$artifactdir"
    "$@"
)

# Disable errexit so that we still upload logs on failure. `pipefail` remains
# in effect, so a non-zero exit from the runner is not masked by `tee`.
set +e
(RUST_BACKTRACE=1 RUST_LOG="info,phd=debug" ptime -m pfexec "${args[@]}" | \
    tee /tmp/phd-runner.log)
failcount=$?
set -e

tar -czvf /tmp/phd-tmp-files.tar.gz \
	-C "$tmpdir" .

exitcode=0
if [ "$failcount" -eq 0 ]; then
	echo
	echo "ALL TESTS PASSED"
	echo
else
	echo
	echo "SOME TESTS FAILED"
	echo
	exitcode=1
fi

# `read` succeeds only if `find` printed at least one entry, i.e. something is
# still present directly under /dev/vmm.
if find /dev/vmm -mindepth 1 -maxdepth 1 | read ; then
	echo "VMM handles leaked:"
	find /dev/vmm -type c -exec basename {} \;
	exitcode=2
fi

exit $exitcode


================================================
FILE: .github/workflows/rust.yml
================================================
# CI checks run on pushes to `master` and on pull requests targeting it.
name: Rust

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

env:
  CARGO_TERM_COLOR: always

jobs:
  # Check rustfmt formatting plus the repository's own `xtask style` checks.
  check-style:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Report rustfmt version
      run: cargo fmt -- --version
    - name: Check style
      run: cargo fmt -- --check
    - name: Check misc. style
      run: cargo xtask style
  # Run clippy through the `xtask clippy` wrapper with `--strict`.
  check-clippy:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Report clippy version
      run: cargo clippy -- --version
    - name: Check clippy
      run: cargo xtask clippy --strict
  # Check license headers (configuration lives in `.licenserc.yaml`).
  check-license:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Check license headers
      uses: apache/skywalking-eyes/header@5c5b974209f0de5d905f37deb69369068ebfc15c # v0.7.0
  # Make sure the workspace documentation builds (dependencies excluded).
  build-docs:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Test build documentation
      run: cargo doc --workspace --no-deps
  # Build the workspace and the mock server, then run the library tests
  # followed by the full test suite.
  build-and-test:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
      with:
        # see https://github.com/oxidecomputer/omicron/issues/4461
        # (by default the action picks a merge commit with the target branch
        # rather than the actual PR tip)
        ref: ${{ github.event.pull_request.head.sha }}
        # `test_apis_up_to_date` needs a full history for the gitstubs
        fetch-depth: 0
    - name: Build
      run: cargo build --verbose
    - name: Build mock-only server
      run: cargo build -p propolis-mock-server --verbose
    - name: Test Libraries
      run: cargo test --lib --verbose
    - name: Test everything
      run: cargo test --locked
  # Build and test propolis-the-library on its own; `cargo test --lib` as used
  # above builds the entire workspace, meaning propolis-server default features
  # are used to build and run propolis-lib tests. Instead, this check uses
  # `cargo build -p propolis` to only operate with that one crate.
  build-and-test-propolis-lib:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
    - name: Build
      run: cargo build -p propolis --verbose
    - name: Test Propolis-lib
      run: cargo test -p propolis --verbose


================================================
FILE: .gitignore
================================================
/target
/crates/*/header-check/target
/crates/*/header-check/Cargo.lock
debug.out
core
out/


================================================
FILE: .licenserc.yaml
================================================
header:
  license:
    spdx-id: MPL-2.0

    content: |
      This Source Code Form is subject to the terms of the Mozilla Public
      License, v. 2.0. If a copy of the MPL was not distributed with this
      file, You can obtain one at https://mozilla.org/MPL/2.0/.
  paths:
    - '**/*.rs'
  paths-ignore:
    - 'target/**/*.rs'

  comment: on-failure


================================================
FILE: Cargo.toml
================================================
[workspace]
resolver = "2"

members = [
  "crates/*",
  "crates/*/sys",
  "bin/*",
  "lib/*",
  "packaging/propolis-package",
  "phd-tests/*",
  "xtask",
]

default-members = [
  "crates/*",
  "crates/*/sys",
  "lib/*",
  "bin/dropshot-apis",
  "bin/propolis-cli",
  "bin/propolis-server",
  "bin/propolis-standalone",
  "xtask",
]

# `header-check` crates are excluded because they require an external checkout
# of illumos-gate (perhaps even to a specific revision) to run. So, to support
# more Propolis-local development, exclude these and leave them for more
# specific tools (see `tools/check_headers`, which is used to run these in CI)
exclude = [
  "crates/bhyve-api/header-check",
  "crates/nvpair/header-check",
  "crates/viona-api/header-check",
]

# If one wants the 'dev' profile, but with "panic = abort" semantics, they
# should opt in with this profile.  Unwinding is required by PHD and
# `should_panic` cargo tests, and so remains the default for the 'dev' profile.
[profile.dev-abort]
inherits = "dev"
panic = "abort"

# Building for 'release' implies running on a real illumos system, where we
# certainly want (rust) panics to cause an immediate abort and coredump.
[profile.release]
panic = "abort"

[workspace.dependencies]
# Internal crates
bhyve_api = { path = "crates/bhyve-api" }
bhyve_api_sys = { path = "crates/bhyve-api/sys" }
cpuid_utils = { path = "crates/cpuid-utils" }
cpuid_profile_config = { path = "crates/cpuid-profile-config" }
dladm = { path = "crates/dladm" }
nvpair = { path = "crates/nvpair" }
nvpair_sys = { path = "crates/nvpair/sys" }
pbind = { path = "crates/pbind" }
propolis-config-toml = { path = "crates/propolis-config-toml" }
propolis_api_types = { path = "crates/propolis-api-types" }
propolis-api-types-versions = { path = "crates/propolis-api-types-versions" }
propolis-server-api = { path = "crates/propolis-server-api" }
propolis_types = { path = "crates/propolis-types" }
rfb = { path = "crates/rfb" }
rgb_frame = { path = "crates/rgb-frame" }
viona_api = { path = "crates/viona-api" }

# PHD testing framework
phd-framework = { path = "phd-tests/framework" }
phd-testcase = { path = "phd-tests/testcase" }
phd-testcase-macros = { path = "phd-tests/testcase_macro" }
phd-tests = { path = "phd-tests/tests" }

# Public library crates
propolis = { path = "lib/propolis", default-features = false }
propolis-client = { path = "lib/propolis-client" }

# Propolis cfg(feature = "falcon")
dlpi = { git = "https://github.com/oxidecomputer/dlpi-sys", branch = "main" }
ispf = { git = "https://github.com/oxidecomputer/ispf" }
libloading = "0.7"
p9ds = { git = "https://github.com/oxidecomputer/p9fs" }
softnpu = { git = "https://github.com/oxidecomputer/softnpu" }

# Omicron-related
internal-dns-resolver = { git = "https://github.com/oxidecomputer/omicron", branch = "main" }
internal-dns-types = { git = "https://github.com/oxidecomputer/omicron", branch = "main" }
nexus-client = { git = "https://github.com/oxidecomputer/omicron", branch = "main" }
omicron-common = { git = "https://github.com/oxidecomputer/omicron", branch = "main" }
omicron-zone-package = "0.12.2"
oximeter-instruments = { git = "https://github.com/oxidecomputer/omicron", branch = "main", default-features = false, features = ["kstat"] }
oximeter-producer = { git = "https://github.com/oxidecomputer/omicron", branch = "main" }
oximeter = { git = "https://github.com/oxidecomputer/omicron", branch = "main" }
sled-agent-client = { git = "https://github.com/oxidecomputer/omicron", branch = "main" }

# Crucible
crucible = { git = "https://github.com/oxidecomputer/crucible", rev = "3c1708d86e10f0370807388a1efe092edd99d431" }
crucible-client-types = { git = "https://github.com/oxidecomputer/crucible", rev = "3c1708d86e10f0370807388a1efe092edd99d431" }

# Attestation
dice-verifier = { git = "https://github.com/oxidecomputer/dice-util", rev = "1d3084b514389847e8e0f5d966d2be4f18d02d32", features = ["sled-agent"] }
vm-attest = { git = "https://github.com/oxidecomputer/vm-attest", rev = "2cdd17580a4fc6c871d24797016af8dbaac9421d", default-features = false }

# External dependencies
anyhow = "1.0"
async-trait = "0.1.88"
atty = "0.2.14"
backoff = "0.4.0"
backtrace = "0.3.66"
base64 = "0.21"
bit_field = "0.10.1"
bitflags = "2.4"
bitstruct = "0.1"
bitvec = "1.0"
byteorder = "1"
bytes = "1.7.1"
camino = "1.1.6"
cargo_metadata = "0.18.1"
cc = "1.0.73"
cfg-if = "1.0.0"
chrono = "0.4.19"
clap = "4.2"
const_format = "0.2"
crossbeam-channel = "0.5"
ctrlc = "3.2"
dropshot = "0.17.0"
dropshot-api-manager = "0.7.1"
dropshot-api-manager-types = "0.7.1"
erased-serde = "0.4"
errno = "0.2.8"
escargot = "0.5.8"
expectorate = "1.0.5"
fatfs = "0.3.6"
futures = "0.3"
futures-util = "0.3.21"
flate2 = "1.0.28"
heck = "0.5.0"
hex = "0.4.3"
http = "1.1.0"
hyper = "1.0"
linkme = "0.3.33"
iddqd = "0.3"
itertools = "0.13.0"
kstat-rs = "0.2.4"
lazy_static = "1.4"
libc = "0.2"
mockall = "0.12"
newtype_derive = "0.1.6"
newtype-uuid = { version = "1.0.1", features = [ "v4" ] }
# "feature" for `utsname`, "poll" for `PollFlags`
nix = { version = "0.31", features = [ "feature", "poll" ] }
owo-colors = "4"
oxide-tokio-rt = "0.1.2"
paste = "1.0.15"
pin-project-lite = "0.2.13"
proc-macro2 = "1.0"
proc-macro-error = "1"
progenitor = "0.14.0"
progenitor-client = "0.14.0"
proptest = "1.5.0"
quote = "1.0"
rand = "0.9.1"
reqwest = { version = "0.13", default-features = false }
ring = "0.17"
ron = "0.8"
schemars = "0.8.10"
semver = "1.0"
serde = "1.0"
serde_arrays = "0.1"
serde_derive = "1.0"
serde_json = "1.0"
serde_test = "1.0.138"
sha2 = "0.10.9"
slog = "2.7"
slog-async = "2.8"
slog-bunyan = "2.4.0"
slog-dtrace = "0.3"
slog-term = "2.8"
strum = "0.26"
syn = "1.0"
tar = "0.4"
tempfile = "3.2"
termwiz = "0.20"
thiserror = "1.0"
tokio = "1"
tokio-tungstenite = "0.21"
tokio-util = "0.7"
toml = "0.7.8"
tracing = "0.1.35"
tracing-appender = "0.2.2"
tracing-bunyan-formatter = "0.3.3"
tracing-subscriber = "0.3.14"
usdt = { version = "0.6", default-features = false }
uuid = "1.3.2"
zerocopy = "0.8.25"


#
# It's common during development to use a local copy of various complex
# dependencies.  If you want to use those, uncomment one of these blocks.
#
# [patch."https://github.com/oxidecomputer/omicron"]
# internal-dns = { path = "../omicron/internal-dns" }
# nexus-client = { path = "../omicron/clients/nexus-client" }
# omicron-common = { path = "../omicron/common" }
# oximeter-instruments = { path = "../omicron/oximeter/instruments" }
# oximeter-producer = { path = "../omicron/oximeter/producer" }
# oximeter = { path = "../omicron/oximeter/oximeter" }
# [patch."https://github.com/oxidecomputer/crucible"]
# crucible = { path = "../crucible/upstairs" }
# crucible-client-types = { path = "../crucible/crucible-client-types" }


================================================
FILE: LICENSE
================================================
Mozilla Public License Version 2.0
==================================

1. Definitions
--------------

1.1. "Contributor"
    means each individual or legal entity that creates, contributes to
    the creation of, or owns Covered Software.

1.2. "Contributor Version"
    means the combination of the Contributions of others (if any) used
    by a Contributor and that particular Contributor's Contribution.

1.3. "Contribution"
    means Covered Software of a particular Contributor.

1.4. "Covered Software"
    means Source Code Form to which the initial Contributor has attached
    the notice in Exhibit A, the Executable Form of such Source Code
    Form, and Modifications of such Source Code Form, in each case
    including portions thereof.

1.5. "Incompatible With Secondary Licenses"
    means

    (a) that the initial Contributor has attached the notice described
        in Exhibit B to the Covered Software; or

    (b) that the Covered Software was made available under the terms of
        version 1.1 or earlier of the License, but not also under the
        terms of a Secondary License.

1.6. "Executable Form"
    means any form of the work other than Source Code Form.

1.7. "Larger Work"
    means a work that combines Covered Software with other material, in 
    a separate file or files, that is not Covered Software.

1.8. "License"
    means this document.

1.9. "Licensable"
    means having the right to grant, to the maximum extent possible,
    whether at the time of the initial grant or subsequently, any and
    all of the rights conveyed by this License.

1.10. "Modifications"
    means any of the following:

    (a) any file in Source Code Form that results from an addition to,
        deletion from, or modification of the contents of Covered
        Software; or

    (b) any new file in Source Code Form that contains any Covered
        Software.

1.11. "Patent Claims" of a Contributor
    means any patent claim(s), including without limitation, method,
    process, and apparatus claims, in any patent Licensable by such
    Contributor that would be infringed, but for the grant of the
    License, by the making, using, selling, offering for sale, having
    made, import, or transfer of either its Contributions or its
    Contributor Version.

1.12. "Secondary License"
    means either the GNU General Public License, Version 2.0, the GNU
    Lesser General Public License, Version 2.1, the GNU Affero General
    Public License, Version 3.0, or any later versions of those
    licenses.

1.13. "Source Code Form"
    means the form of the work preferred for making modifications.

1.14. "You" (or "Your")
    means an individual or a legal entity exercising rights under this
    License. For legal entities, "You" includes any entity that
    controls, is controlled by, or is under common control with You. For
    purposes of this definition, "control" means (a) the power, direct
    or indirect, to cause the direction or management of such entity,
    whether by contract or otherwise, or (b) ownership of more than
    fifty percent (50%) of the outstanding shares or beneficial
    ownership of such entity.

2. License Grants and Conditions
--------------------------------

2.1. Grants

Each Contributor hereby grants You a world-wide, royalty-free,
non-exclusive license:

(a) under intellectual property rights (other than patent or trademark)
    Licensable by such Contributor to use, reproduce, make available,
    modify, display, perform, distribute, and otherwise exploit its
    Contributions, either on an unmodified basis, with Modifications, or
    as part of a Larger Work; and

(b) under Patent Claims of such Contributor to make, use, sell, offer
    for sale, have made, import, and otherwise transfer either its
    Contributions or its Contributor Version.

2.2. Effective Date

The licenses granted in Section 2.1 with respect to any Contribution
become effective for each Contribution on the date the Contributor first
distributes such Contribution.

2.3. Limitations on Grant Scope

The licenses granted in this Section 2 are the only rights granted under
this License. No additional rights or licenses will be implied from the
distribution or licensing of Covered Software under this License.
Notwithstanding Section 2.1(b) above, no patent license is granted by a
Contributor:

(a) for any code that a Contributor has removed from Covered Software;
    or

(b) for infringements caused by: (i) Your and any other third party's
    modifications of Covered Software, or (ii) the combination of its
    Contributions with other software (except as part of its Contributor
    Version); or

(c) under Patent Claims infringed by Covered Software in the absence of
    its Contributions.

This License does not grant any rights in the trademarks, service marks,
or logos of any Contributor (except as may be necessary to comply with
the notice requirements in Section 3.4).

2.4. Subsequent Licenses

No Contributor makes additional grants as a result of Your choice to
distribute the Covered Software under a subsequent version of this
License (see Section 10.2) or under the terms of a Secondary License (if
permitted under the terms of Section 3.3).

2.5. Representation

Each Contributor represents that the Contributor believes its
Contributions are its original creation(s) or it has sufficient rights
to grant the rights to its Contributions conveyed by this License.

2.6. Fair Use

This License is not intended to limit any rights You have under
applicable copyright doctrines of fair use, fair dealing, or other
equivalents.

2.7. Conditions

Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
in Section 2.1.

3. Responsibilities
-------------------

3.1. Distribution of Source Form

All distribution of Covered Software in Source Code Form, including any
Modifications that You create or to which You contribute, must be under
the terms of this License. You must inform recipients that the Source
Code Form of the Covered Software is governed by the terms of this
License, and how they can obtain a copy of this License. You may not
attempt to alter or restrict the recipients' rights in the Source Code
Form.

3.2. Distribution of Executable Form

If You distribute Covered Software in Executable Form then:

(a) such Covered Software must also be made available in Source Code
    Form, as described in Section 3.1, and You must inform recipients of
    the Executable Form how they can obtain a copy of such Source Code
    Form by reasonable means in a timely manner, at a charge no more
    than the cost of distribution to the recipient; and

(b) You may distribute such Executable Form under the terms of this
    License, or sublicense it under different terms, provided that the
    license for the Executable Form does not attempt to limit or alter
    the recipients' rights in the Source Code Form under this License.

3.3. Distribution of a Larger Work

You may create and distribute a Larger Work under terms of Your choice,
provided that You also comply with the requirements of this License for
the Covered Software. If the Larger Work is a combination of Covered
Software with a work governed by one or more Secondary Licenses, and the
Covered Software is not Incompatible With Secondary Licenses, this
License permits You to additionally distribute such Covered Software
under the terms of such Secondary License(s), so that the recipient of
the Larger Work may, at their option, further distribute the Covered
Software under the terms of either this License or such Secondary
License(s).

3.4. Notices

You may not remove or alter the substance of any license notices
(including copyright notices, patent notices, disclaimers of warranty,
or limitations of liability) contained within the Source Code Form of
the Covered Software, except that You may alter any license notices to
the extent required to remedy known factual inaccuracies.

3.5. Application of Additional Terms

You may choose to offer, and to charge a fee for, warranty, support,
indemnity or liability obligations to one or more recipients of Covered
Software. However, You may do so only on Your own behalf, and not on
behalf of any Contributor. You must make it absolutely clear that any
such warranty, support, indemnity, or liability obligation is offered by
You alone, and You hereby agree to indemnify every Contributor for any
liability incurred by such Contributor as a result of warranty, support,
indemnity or liability terms You offer. You may include additional
disclaimers of warranty and limitations of liability specific to any
jurisdiction.

4. Inability to Comply Due to Statute or Regulation
---------------------------------------------------

If it is impossible for You to comply with any of the terms of this
License with respect to some or all of the Covered Software due to
statute, judicial order, or regulation then You must: (a) comply with
the terms of this License to the maximum extent possible; and (b)
describe the limitations and the code they affect. Such description must
be placed in a text file included with all distributions of the Covered
Software under this License. Except to the extent prohibited by statute
or regulation, such description must be sufficiently detailed for a
recipient of ordinary skill to be able to understand it.

5. Termination
--------------

5.1. The rights granted under this License will terminate automatically
if You fail to comply with any of its terms. However, if You become
compliant, then the rights granted under this License from a particular
Contributor are reinstated (a) provisionally, unless and until such
Contributor explicitly and finally terminates Your grants, and (b) on an
ongoing basis, if such Contributor fails to notify You of the
non-compliance by some reasonable means prior to 60 days after You have
come back into compliance. Moreover, Your grants from a particular
Contributor are reinstated on an ongoing basis if such Contributor
notifies You of the non-compliance by some reasonable means, this is the
first time You have received notice of non-compliance with this License
from such Contributor, and You become compliant prior to 30 days after
Your receipt of the notice.

5.2. If You initiate litigation against any entity by asserting a patent
infringement claim (excluding declaratory judgment actions,
counter-claims, and cross-claims) alleging that a Contributor Version
directly or indirectly infringes any patent, then the rights granted to
You by any and all Contributors for the Covered Software under Section
2.1 of this License shall terminate.

5.3. In the event of termination under Sections 5.1 or 5.2 above, all
end user license agreements (excluding distributors and resellers) which
have been validly granted by You or Your distributors under this License
prior to termination shall survive termination.

************************************************************************
*                                                                      *
*  6. Disclaimer of Warranty                                           *
*  -------------------------                                           *
*                                                                      *
*  Covered Software is provided under this License on an "as is"       *
*  basis, without warranty of any kind, either expressed, implied, or  *
*  statutory, including, without limitation, warranties that the       *
*  Covered Software is free of defects, merchantable, fit for a        *
*  particular purpose or non-infringing. The entire risk as to the     *
*  quality and performance of the Covered Software is with You.        *
*  Should any Covered Software prove defective in any respect, You     *
*  (not any Contributor) assume the cost of any necessary servicing,   *
*  repair, or correction. This disclaimer of warranty constitutes an   *
*  essential part of this License. No use of any Covered Software is   *
*  authorized under this License except under this disclaimer.         *
*                                                                      *
************************************************************************

************************************************************************
*                                                                      *
*  7. Limitation of Liability                                          *
*  --------------------------                                          *
*                                                                      *
*  Under no circumstances and under no legal theory, whether tort      *
*  (including negligence), contract, or otherwise, shall any           *
*  Contributor, or anyone who distributes Covered Software as          *
*  permitted above, be liable to You for any direct, indirect,         *
*  special, incidental, or consequential damages of any character      *
*  including, without limitation, damages for lost profits, loss of    *
*  goodwill, work stoppage, computer failure or malfunction, or any    *
*  and all other commercial damages or losses, even if such party      *
*  shall have been informed of the possibility of such damages. This   *
*  limitation of liability shall not apply to liability for death or   *
*  personal injury resulting from such party's negligence to the       *
*  extent applicable law prohibits such limitation. Some               *
*  jurisdictions do not allow the exclusion or limitation of           *
*  incidental or consequential damages, so this exclusion and          *
*  limitation may not apply to You.                                    *
*                                                                      *
************************************************************************

8. Litigation
-------------

Any litigation relating to this License may be brought only in the
courts of a jurisdiction where the defendant maintains its principal
place of business and such litigation shall be governed by laws of that
jurisdiction, without reference to its conflict-of-law provisions.
Nothing in this Section shall prevent a party's ability to bring
cross-claims or counter-claims.

9. Miscellaneous
----------------

This License represents the complete agreement concerning the subject
matter hereof. If any provision of this License is held to be
unenforceable, such provision shall be reformed only to the extent
necessary to make it enforceable. Any law or regulation which provides
that the language of a contract shall be construed against the drafter
shall not be used to construe this License against a Contributor.

10. Versions of the License
---------------------------

10.1. New Versions

Mozilla Foundation is the license steward. Except as provided in Section
10.3, no one other than the license steward has the right to modify or
publish new versions of this License. Each version will be given a
distinguishing version number.

10.2. Effect of New Versions

You may distribute the Covered Software under the terms of the version
of the License under which You originally received the Covered Software,
or under the terms of any subsequent version published by the license
steward.

10.3. Modified Versions

If you create software not governed by this License, and you want to
create a new license for such software, you may create and use a
modified version of this License if you rename the license and remove
any references to the name of the license steward (except to note that
such modified license differs from this License).

10.4. Distributing Source Code Form that is Incompatible With Secondary
Licenses

If You choose to distribute Source Code Form that is Incompatible With
Secondary Licenses under the terms of this version of the License, the
notice described in Exhibit B of this License must be attached.

Exhibit A - Source Code Form License Notice
-------------------------------------------

  This Source Code Form is subject to the terms of the Mozilla Public
  License, v. 2.0. If a copy of the MPL was not distributed with this
  file, You can obtain one at http://mozilla.org/MPL/2.0/.

If it is not possible or desirable to put the notice in a particular
file, then You may include the notice in a location (such as a LICENSE
file in a relevant directory) where a recipient would be likely to look
for such a notice.

You may add additional accurate notices of copyright ownership.

Exhibit B - "Incompatible With Secondary Licenses" Notice
---------------------------------------------------------

  This Source Code Form is "Incompatible With Secondary Licenses", as
  defined by the Mozilla Public License, v. 2.0.


================================================
FILE: README.md
================================================
# Propolis

Propolis VMM userspace for use with illumos bhyve.

## Prerequisites

Given the current tight coupling of the `bhyve-api` component to the ioctl
interface presented by the bhyve kernel component, running on recent illumos
bits is required.

Propolis works best (and its CI tests run) on AMD hosts, but it can also be used
to run VMs on Intel hosts. Live migration is primarily supported on AMD hosts
but may work on Intel hosts as well.

## Components

Programs:
- [propolis-server](bin/propolis-server): Run a Propolis VM instance, operated
  via REST API calls (typically by
  [omicron](https://github.com/oxidecomputer/omicron))
- [propolis-cli](bin/propolis-cli): CLI wrapper interface for `propolis-server`
  API calls
- [propolis-standalone](bin/propolis-standalone): Simple standalone program to
  run a Propolis VM instance, operated via a local config file

Libraries:
- [propolis-client](lib/propolis-client): Rust crate for `propolis-server` API
- [propolis](lib/propolis): Represents the bulk of the emulation logic required
  to implement a userspace VMM.  Both `propolis-server` and
  `propolis-standalone` are built around this.

## Internal Crates

These are not meant as committed public interfaces, but rather internal
implementation details, consumed by Propolis components.

- bhyve-api: API (ioctls & structs) for the illumos bhyve kernel VMM
- dladm: Some thin wrappers around `dladm` queries
- propolis-server-config: Type definitions for `propolis-server` config file
- propolis-types: Publicly exposed (via `propolis-server`) types, integral
  to the `propolis` library
- viona-api: API (ioctls & structs) for the illumos viona driver

## xtasks

Propolis uses the `cargo xtask` pattern in order to conveniently expose certain
tasks to developers.

- `clippy`: Run suite of clippy checks.  This performs more than a simple
  `cargo clippy`, since there are several combinations of feature flags which
  must be checked.
- `fmt`: Check style according to `rustfmt`
- `license`:  Check (crudely) that files bear appropriate license headers
- `phd`: Run the PHD test suite
- `style`: Perform miscellaneous style checks
- `prepush`: Perform pre-push checks (`clippy`, `fmt`, `license`, `style`) in a
  manner which resembles (but does not exactly match) how they are run in CI.
  Running tests (unit, integration, or phd) is not included and is left to
  the user.

It is recommended that developers run the `prepush` test before pushing a
branch which will be subsequently checked by CI.  Doing so currently requires
an x86\_64 UNIX/Linux machine.

## License

Unless otherwise noted, all components are licensed under the [Mozilla Public
License Version 2.0](LICENSE).


================================================
FILE: bin/dropshot-apis/Cargo.toml
================================================
[package]
name = "propolis-dropshot-apis"
version = "0.1.0"
edition = "2024"
license = "MPL-2.0"

[dependencies]
anyhow.workspace = true
camino.workspace = true
clap.workspace = true
dropshot-api-manager-types.workspace = true
dropshot-api-manager.workspace = true
propolis-server-api.workspace = true
semver.workspace = true


================================================
FILE: bin/dropshot-apis/src/main.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::process::ExitCode;

use anyhow::Context;
use camino::Utf8PathBuf;
use clap::Parser;
use dropshot_api_manager::{Environment, ManagedApiConfig, ManagedApis};
use dropshot_api_manager_types::{ManagedApiMetadata, Versions};
use propolis_server_api::*;

/// Builds the [`Environment`] describing this workspace to the OpenAPI
/// manager: the command users run, the workspace root, and the directory
/// (relative to that root) holding the OpenAPI documents.
///
/// # Errors
///
/// Propagates any error returned by `Environment::new`.
///
/// # Panics
///
/// Panics if the crate's manifest directory does not have at least two
/// ancestors.
pub fn environment() -> anyhow::Result<Environment> {
    // This crate sits two directories below the workspace root, so walk up
    // one level at a time from the manifest directory.
    let manifest_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let parent_dir = manifest_dir.parent().unwrap();
    let workspace_root = parent_dir.parent().unwrap().to_path_buf();

    let base = Environment::new(
        // Command used to invoke the OpenAPI manager.
        "cargo xtask openapi",
        workspace_root,
        // Directory, relative to the workspace root, where the OpenAPI
        // documents are stored.
        "openapi",
    )?;

    Ok(base.with_default_git_branch("origin/master".to_owned()))
}

/// The list of APIs managed by the OpenAPI manager.
/// Returns the set of APIs managed by the OpenAPI manager.
///
/// Currently this is a single versioned API, `propolis-server`, registered
/// with git stub storage enabled.
///
/// # Errors
///
/// Fails with the context "error creating ManagedApis" if `ManagedApis::new`
/// rejects the configuration.
pub fn all_apis() -> anyhow::Result<ManagedApis> {
    let versions = Versions::Versioned {
        supported_versions: propolis_server_api::supported_versions(),
    };

    let metadata = ManagedApiMetadata {
        description: Some(
            "API for interacting with the Propolis hypervisor frontend.",
        ),
        contact_url: Some("https://oxide.computer"),
        contact_email: Some("api@oxide.computer"),
        ..Default::default()
    };

    let propolis_server = ManagedApiConfig {
        ident: "propolis-server",
        versions,
        title: "Oxide Propolis Server API",
        metadata,
        api_description: propolis_server_api_mod::stub_api_description,
    };

    let managed = ManagedApis::new(vec![propolis_server])
        .context("error creating ManagedApis")?;
    Ok(managed.with_git_stub_storage())
}

/// Entry point: parses the OpenAPI manager command line, then runs it against
/// this workspace's environment and managed APIs, returning the resulting
/// exit code.
fn main() -> anyhow::Result<ExitCode> {
    // Parse arguments first, before any environment or API setup is done.
    let cli = dropshot_api_manager::App::parse();
    let (api_env, managed_apis) = (environment()?, all_apis()?);

    let exit_code = cli.exec(&api_env, &managed_apis);
    Ok(exit_code)
}

#[cfg(test)]
mod test {
    use super::*;

    use dropshot_api_manager::test_util::check_apis_up_to_date;

    /// Verifies that the OpenAPI documents match the managed APIs, using the
    /// `check_apis_up_to_date` helper provided by the OpenAPI manager. The
    /// helper's result is reported as the test's exit code.
    #[test]
    fn test_apis_up_to_date() -> anyhow::Result<ExitCode> {
        let (api_env, managed_apis) = (environment()?, all_apis()?);

        Ok(check_apis_up_to_date(&api_env, &managed_apis)?.to_exit_code())
    }
}


================================================
FILE: bin/mock-server/Cargo.toml
================================================
[package]
name = "propolis-mock-server"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
name = "propolis_mock_server"
path = "src/lib/lib.rs"
doc = false
doctest = false
test = false

[[bin]]
name = "propolis-mock-server"
path = "src/main.rs"
doc = false
doctest = false
test = false

[dependencies]
atty.workspace = true
anyhow.workspace = true
clap = { workspace = true, features = ["derive"] }
base64.workspace = true
dropshot = { workspace = true }
futures.workspace = true
hyper.workspace = true
serde.workspace = true
propolis_api_types.workspace = true
propolis-api-types-versions.workspace = true
propolis_types.workspace = true
semver.workspace = true
serde_json.workspace = true
slog.workspace = true
slog-async.workspace = true
slog-dtrace.workspace = true
slog-term.workspace = true
slog-bunyan.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["full"] }
tokio-tungstenite.workspace = true

# Progenitor is used to instantiate copies of the API types without exposing
# internal dependencies (like crucible-client-types).  Although we are not
# using the client itself, its deps (reqwest, etc) are required.
progenitor.workspace = true
reqwest.workspace = true
schemars.workspace = true
rand.workspace = true
uuid.workspace = true


================================================
FILE: bin/mock-server/src/lib/api_types.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

// Generate this crate's own copy of the Propolis server API types from the
// checked-in OpenAPI document. The generated items are referenced elsewhere
// in this crate through the `types` module (e.g. `types::PciPath`).
progenitor::generate_api!(
    spec = "../../openapi/propolis-server/propolis-server-latest.json",
    // Derive applied to the generated types in addition to the defaults.
    derives = [schemars::JsonSchema],
    // Use the existing `SpecKey` type rather than generating a new one.
    replace = {
        SpecKey = propolis_api_types_versions::latest::instance_spec::SpecKey,
    },
    // Extra derives for individual generated types. The mock server compares
    // and clones `InstanceProperties`, which needs `Clone`/`PartialEq`.
    patch = {
        InstanceMetadata = { derives = [Clone, Eq, PartialEq] },
        InstanceProperties = { derives = [ Clone, Eq, PartialEq ] },
        Slot = { derives = [Copy] },
    },
);

/// Converts the generated API `PciPath` into the `propolis_types` one,
/// reporting a rejected bus/device/function triple as the error's string form.
impl TryFrom<types::PciPath> for propolis_types::PciPath {
    type Error = String;

    fn try_from(value: types::PciPath) -> Result<Self, Self::Error> {
        let converted = propolis_types::PciPath::new(
            value.bus,
            value.device,
            value.function,
        );
        match converted {
            Ok(path) => Ok(path),
            Err(err) => Err(err.to_string()),
        }
    }
}

// Duplicate the parameter types for the endpoints related to the serial console

// Query parameters accepted by the `/instance/serial` websocket channel.
// Both fields are optional; they select where history replay starts.
// (Plain `//` comment on purpose: `///` text on a `JsonSchema` type would be
// picked up as a schema description.)
#[derive(JsonSchema, Serialize, Deserialize)]
pub struct InstanceSerialParams {
    /// Character index in the serial buffer from which to read, counting the bytes output since
    /// instance start. If this is provided, `most_recent` must *not* be provided.
    pub from_start: Option<u64>,
    /// Character index in the serial buffer from which to read, counting *backward* from the most
    /// recently buffered data retrieved from the instance. (See note on `from_start` about mutual
    /// exclusivity)
    pub most_recent: Option<u64>,
}

// Query parameters accepted by the `GET /instance/serial/history` endpoint.
// Exactly one of `from_start` / `most_recent` selects the starting point, and
// `max_bytes` optionally caps the amount of data returned.
// (Plain `//` comment on purpose: `///` text on a `JsonSchema` type would be
// picked up as a schema description.)
#[derive(JsonSchema, Serialize, Deserialize)]
pub struct InstanceSerialHistoryParams {
    /// Character index in the serial buffer from which to read, counting the bytes output since
    /// instance start. If this is not provided, `most_recent` must be provided, and if this *is*
    /// provided, `most_recent` must *not* be provided.
    pub from_start: Option<u64>,
    /// Character index in the serial buffer from which to read, counting *backward* from the most
    /// recently buffered data retrieved from the instance. (See note on `from_start` about mutual
    /// exclusivity)
    pub most_recent: Option<u64>,
    /// Maximum number of bytes of buffered serial console contents to return. If the requested
    /// range runs to the end of the available buffer, the data returned will be shorter than
    /// `max_bytes`.
    pub max_bytes: Option<u64>,
}

// How the mock server advances through queued instance states. This is the
// body type of the `/mock/mode` endpoints (read with GET, changed with PUT).
#[derive(
    Copy, Clone, Debug, PartialEq, Eq, JsonSchema, Serialize, Deserialize,
)]
pub enum MockMode {
    /// The mock server should run freely, advancing the state every time the
    /// instance_state_monitor endpoint is requested while new state
    /// transitions are queued.
    Run,
    /// The mock server should only advance the current state when the
    /// /mock/step endpoint is requested.
    SingleStep,
}


================================================
FILE: bin/mock-server/src/lib/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Implementation of a mock Propolis server

use std::sync::Arc;

use dropshot::{
    channel, endpoint, ApiDescription, HttpError, HttpResponseCreated,
    HttpResponseOk, HttpResponseUpdatedNoContent, Query, RequestContext,
    TypedBody, WebsocketConnection,
};
use futures::SinkExt;
use slog::{error, o, Logger};
use std::collections::BTreeMap;
use thiserror::Error;
use tokio::sync::{watch, Mutex};
use tokio_tungstenite::tungstenite::protocol::{Role, WebSocketConfig};
use tokio_tungstenite::tungstenite::Message;
use tokio_tungstenite::WebSocketStream;

mod api_types;
use api_types::types::{self as api, InstanceEnsureRequest};
pub use api_types::MockMode;

/// Errors produced when a state change is requested of the mock instance.
#[derive(Debug, Eq, PartialEq, Error)]
pub enum Error {
    /// A simulated state update could not be delivered through the channel.
    // NOTE(review): no code visible in this file constructs this variant.
    #[error("Failed to send simulated state change update through channel")]
    TransitionSendFail,
    /// The instance is already `Stopped`, `Destroyed` or `Failed`, so no
    /// further state may be requested.
    #[error("Cannot request any new mock instance state once it is stopped/destroyed/failed")]
    TerminalState,
    /// The requested state cannot be reached from the current one (for
    /// example, a reboot of an instance that is not running).
    #[error("Cannot transition to {requested:?} from {current:?}")]
    InvalidTransition {
        /// State the mock instance was in when the request arrived.
        current: api::InstanceState,
        /// State the caller asked for.
        requested: api::InstanceStateRequested,
    },
}

/// State of the single simulated instance: its properties, its fake serial
/// console, and the queue of state transitions shared through a watch channel.
pub struct InstanceContext {
    /// The instance's current generation last observed by the
    /// `instance-state-monitor` endpoint.
    curr_gen: u64,
    /// Properties supplied when the instance was ensured.
    pub properties: api::InstanceProperties,
    /// Canned serial console contents for this instance.
    serial: Arc<serial::Serial>,
    /// Handle to the background task that holds serial websocket clients.
    serial_task: serial::SerialTask,
    /// Receiving side of the mock-state channel; cloned by state-monitor
    /// requests so they can wait for changes.
    state_watcher_rx: watch::Receiver<MockState>,
    /// Sending side of the mock-state channel, used to queue states and to
    /// change the stepping mode.
    state_watcher_tx: watch::Sender<MockState>,
}

/// The value carried by the instance's watch channel: every queued state
/// plus the bookkeeping needed for single-step mode.
struct MockState {
    /// Queued state-monitor responses, keyed by their generation number.
    queue: BTreeMap<u64, api::InstanceStateMonitorResponse>,
    /// The next generation to use when inserting new state(s) into the queue.
    next_queue_gen: u64,
    /// Current generation when single-stepping.
    ///
    /// This is set when setting the single-step mock mode, and unset if not in
    /// that mode.
    single_step_gen: Option<u64>,
}

impl InstanceContext {
    /// Creates the mock instance in the `Creating` state at generation 0,
    /// builds its canned serial output from the instance name, and spawns
    /// the serial task. The logger argument is currently unused.
    pub fn new(properties: api::InstanceProperties, _log: &Logger) -> Self {
        // Generation 0 is always present so `current_state` has something
        // to report before any transition is requested.
        let initial = api::InstanceStateMonitorResponse {
            gen_: 0,
            state: api::InstanceState::Creating,
            migration: api::InstanceMigrateStatusResponse {
                migration_in: None,
                migration_out: None,
            },
        };
        let (state_watcher_tx, state_watcher_rx) = watch::channel(MockState {
            queue: BTreeMap::from([(0, initial)]),
            single_step_gen: None,
            next_queue_gen: 1,
        });

        let serial = serial::Serial::new(&properties.name);
        let serial_task = serial::SerialTask::spawn();

        Self {
            curr_gen: 0,
            properties,
            serial,
            serial_task,
            state_watcher_rx,
            state_watcher_tx,
        }
    }

    /// Updates the state of the mock instance.
    ///
    /// Returns an error if the state transition is invalid.
    pub async fn set_target_state(
        &mut self,
        log: &Logger,
        target: api::InstanceStateRequested,
    ) -> Result<(), Error> {
        match (self.current_state(), target) {
            (
                api::InstanceState::Stopped
                | api::InstanceState::Destroyed
                | api::InstanceState::Failed,
                _,
            ) => {
                // Cannot request any state once the target is halt/destroy
                Err(Error::TerminalState)
            }
            (
                api::InstanceState::Rebooting,
                api::InstanceStateRequested::Run,
            ) => {
                // Requesting a run when already on the road to reboot is an
                // immediate success.
                Ok(())
            }
            (api::InstanceState::Running, api::InstanceStateRequested::Run) => {
                Ok(())
            }
            (
                api::InstanceState::Running,
                api::InstanceStateRequested::Reboot,
            ) => {
                self.queue_states(
                    log,
                    &[
                        api::InstanceState::Rebooting,
                        api::InstanceState::Running,
                    ],
                )
                .await;
                Ok(())
            }
            (current, api::InstanceStateRequested::Reboot) => {
                Err(Error::InvalidTransition {
                    current,
                    requested: api::InstanceStateRequested::Reboot,
                })
            }
            (_, api::InstanceStateRequested::Run) => {
                self.queue_states(log, &[api::InstanceState::Running]).await;
                Ok(())
            }
            (
                api::InstanceState::Stopping,
                api::InstanceStateRequested::Stop,
            ) => Ok(()),
            (_, api::InstanceStateRequested::Stop) => {
                self.queue_states(
                    log,
                    &[
                        api::InstanceState::Stopping,
                        api::InstanceState::Stopped,
                    ],
                )
                .await;
                self.serial_task.shutdown().await;
                Ok(())
            }
        }
    }

    /// Returns the state queued under the current generation.
    ///
    /// # Panics
    ///
    /// Panics if the queue has no entry for `curr_gen`.
    fn current_state(&self) -> api::InstanceState {
        let mock_state = self.state_watcher_rx.borrow();
        let current = mock_state
            .queue
            .get(&self.curr_gen)
            .expect("current generation must be in the queue, this is weird 'n' bad");
        current.state
    }

    /// Appends `states` to the queue, giving each one the next free
    /// generation number, and notifies watchers of the channel once for the
    /// whole batch. Each queued state is logged.
    async fn queue_states(
        &mut self,
        log: &Logger,
        states: &[api::InstanceState],
    ) {
        self.state_watcher_tx.send_modify(|mock_state| {
            states.iter().copied().for_each(|state| {
                // Claim the next generation for this state.
                let generation = mock_state.next_queue_gen;
                mock_state.next_queue_gen = generation + 1;

                let response = api::InstanceStateMonitorResponse {
                    gen_: generation,
                    migration: api::InstanceMigrateStatusResponse {
                        migration_in: None,
                        migration_out: None,
                    },
                    state,
                };
                mock_state.queue.insert(generation, response);

                slog::info!(
                    log,
                    "queued instance state transition";
                    "state" => ?state,
                    "gen" => ?generation,
                );
            });
        })
    }
}

/// Shared state handed to every mock HTTP handler.
pub struct Context {
    /// The mock instance; `None` until `instance_ensure` creates it.
    instance: Mutex<Option<InstanceContext>>,
    /// Logger passed to the instance when it is created.
    log: Logger,
}

impl Context {
    /// Creates a context that has no instance yet.
    pub fn new(log: Logger) -> Self {
        let instance = Mutex::new(None);
        Self { instance, log }
    }
}

// Handler for `PUT /instance`: creates the mock instance on first use.
// Repeating the request with identical properties succeeds again; different
// properties are rejected with an internal error. Only the `properties`
// field of the request is used.
#[endpoint {
    method = PUT,
    path = "/instance",
}]
async fn instance_ensure(
    rqctx: RequestContext<Arc<Context>>,
    request: TypedBody<api::InstanceEnsureRequest>,
) -> Result<HttpResponseCreated<api::InstanceEnsureResponse>, HttpError> {
    let server_context = rqctx.context();
    let InstanceEnsureRequest { properties, .. } = request.into_inner();

    let mut slot = server_context.instance.lock().await;

    // An instance already exists: accept only a request that matches it.
    if let Some(existing) = slot.as_ref() {
        return if existing.properties != properties {
            Err(HttpError::for_internal_error(
                "Cannot update running server".to_string(),
            ))
        } else {
            Ok(HttpResponseCreated(api::InstanceEnsureResponse {
                migrate: None,
            }))
        };
    }

    // First request: build the instance and store it.
    let created = InstanceContext::new(properties, &server_context.log);
    *slot = Some(created);
    Ok(HttpResponseCreated(api::InstanceEnsureResponse { migrate: None }))
}

// Handler for `GET /instance`: reports the instance's properties together
// with its current state, or an internal error if no instance exists yet.
#[endpoint {
    method = GET,
    path = "/instance",
}]
async fn instance_get(
    rqctx: RequestContext<Arc<Context>>,
) -> Result<HttpResponseOk<api::InstanceGetResponse>, HttpError> {
    let guard = rqctx.context().instance.lock().await;
    match guard.as_ref() {
        None => Err(HttpError::for_internal_error(
            "Server not initialized (no instance)".to_string(),
        )),
        Some(ctx) => {
            let instance = api::Instance {
                properties: ctx.properties.clone(),
                state: ctx.current_state(),
            };
            Ok(HttpResponseOk(api::InstanceGetResponse { instance }))
        }
    }
}

// Handler for `GET /instance/state-monitor`.
//
// Waits until the queue holds a state whose generation equals the request's
// `gen_` (and, in single-step mode, until stepping has reached that
// generation). It then records that generation as the instance's current one
// and returns the queued state. Fails with an internal error if no instance
// exists yet.
#[endpoint {
    method = GET,
    path = "/instance/state-monitor",
}]
async fn instance_state_monitor(
    rqctx: RequestContext<Arc<Context>>,
    request: TypedBody<api::InstanceStateMonitorRequest>,
) -> Result<HttpResponseOk<api::InstanceStateMonitorResponse>, HttpError> {
    // Take the instance lock only long enough to clone the watch receiver,
    // so the wait loop below does not hold it.
    let (mut state_watcher, gen) = {
        let instance = rqctx.context().instance.lock().await;
        let instance = instance.as_ref().ok_or_else(|| {
            HttpError::for_internal_error(
                "Server not initialized (no instance)".to_string(),
            )
        })?;
        let gen = request.into_inner().gen_;
        let state_watcher = instance.state_watcher_rx.clone();
        (state_watcher, gen)
    };

    slog::debug!(
        rqctx.log,
        "instance state monitor request";
        "request_gen" => gen,
    );
    loop {
        // Look up the requested generation. The borrow of the watched value
        // is confined to this block so it is released before any `.await`.
        let state = {
            let mock_state = state_watcher.borrow_and_update();
            match mock_state.single_step_gen {
                // We are single-stepping, and have not yet reached the
                // requested generation. Keep waiting until single-stepped to
                // where we need to be.
                Some(g) if gen > g => {
                    slog::info!(
                        rqctx.log,
                        "instance state monitor: wait for single step...";
                        "request_gen" => gen,
                        "current_gen" => g,
                    );
                    None
                }
                // Otherwise, if we have stepped to the requested generation, or
                // if we are not in single-step mode, just return the current
                // thing.
                _ => mock_state.queue.get(&gen).cloned(),
            }
        };

        if let Some(state) = state {
            slog::info!(
                rqctx.log,
                "instance state monitor";
                "request_gen" => gen,
                "state" => ?state.state,
            );
            // Advance to the state with the generation we showed to the
            // watcher, for use in `instance_get` and when determining what
            // state transitions are valid.
            rqctx
                .context()
                .instance
                .lock()
                .await
                .as_mut()
                .expect("if we didn't have an instance, we shouldn't have gotten here")
                .curr_gen = gen;
            return Ok(HttpResponseOk(state));
        }

        // Nothing to report yet: wait until the watched mock state is
        // modified (a state is queued, the mode changes, or a step is taken)
        // and check again. The `unwrap` panics if `changed()` returns an
        // error.
        state_watcher.changed().await.unwrap();
    }
}

// Handler for `PUT /instance/state`: asks the mock instance to move to the
// requested state. A missing instance or a rejected transition is reported
// as an internal error.
#[endpoint {
    method = PUT,
    path = "/instance/state",
}]
async fn instance_state_put(
    rqctx: RequestContext<Arc<Context>>,
    request: TypedBody<api::InstanceStateRequested>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
    let mut guard = rqctx.context().instance.lock().await;
    let instance = match guard.as_mut() {
        Some(instance) => instance,
        None => {
            return Err(HttpError::for_internal_error(
                "Server not initialized (no instance)".to_string(),
            ));
        }
    };

    let requested_state = request.into_inner();
    match instance.set_target_state(&rqctx.log, requested_state).await {
        Ok(()) => Ok(HttpResponseUpdatedNoContent {}),
        Err(err) => Err(HttpError::for_internal_error(format!(
            "Failed to transition: {err}"
        ))),
    }
}

// TODO: mock the "Serial" struct itself instead?
// Handler for the `/instance/serial` websocket channel.
//
// The connection is closed with an error unless an instance exists and is
// `Running`. Otherwise, if the query selects a starting point, buffered
// history is replayed to the client in chunks, and the socket is then handed
// to the serial task.
#[channel {
    protocol = WEBSOCKETS,
    path = "/instance/serial",
}]
async fn instance_serial(
    rqctx: RequestContext<Arc<Context>>,
    query: Query<api_types::InstanceSerialParams>,
    websock: WebsocketConnection,
) -> dropshot::WebsocketChannelResult {
    // Wrap the upgraded connection as a server-side websocket stream.
    let config = WebSocketConfig::default();
    let mut ws_stream = WebSocketStream::from_raw_socket(
        websock.into_inner(),
        Role::Server,
        Some(config),
    )
    .await;

    // The lock guard is a temporary of the match scrutinee, so the instance
    // lock stays held for the whole match, including the sends below.
    match rqctx.context().instance.lock().await.as_ref() {
        None => {
            ws_stream.send(Message::Close(None)).await?;
            Err("Instance not yet created!".into())
        }
        Some(instance_ctx)
            if instance_ctx.current_state() != api::InstanceState::Running =>
        {
            ws_stream.send(Message::Close(None)).await?;
            Err(format!(
                "Instance isn't Running! ({:?})",
                instance_ctx.current_state()
            )
            .into())
        }
        Some(instance_ctx) => {
            let serial = instance_ctx.serial.clone();

            // `from_query` yields `None` unless exactly one of the two
            // parameters was given; in that case no history is replayed.
            let query_params = query.into_inner();
            let history_query = serial::HistoryQuery::from_query(
                query_params.from_start,
                query_params.most_recent,
            );
            if let Some(mut hq) = history_query {
                // Send history one chunk at a time, resuming each request
                // from the offset the previous one returned, until a chunk
                // comes back empty.
                loop {
                    let (data, offset) = serial.history_vec(hq, None).await?;
                    if data.is_empty() {
                        break;
                    }
                    ws_stream.send(Message::Binary(data)).await?;
                    hq = serial::HistoryQuery::FromStart(offset);
                }
            }
            // From here on the serial task owns the connection.
            instance_ctx.serial_task.new_conn(ws_stream).await;
            Ok(())
        }
    }
}

// Handler for `GET /instance/serial/history`: returns a slice of the mock
// serial history together with the offset reported by `history_vec`.
//
// Errors:
// * 400 if not exactly one of `from_start` / `most_recent` is given, or if
//   the history lookup itself rejects the request;
// * 500 if no mock instance exists yet.
#[endpoint {
    method = GET,
    path = "/instance/serial/history",
}]
async fn instance_serial_history_get(
    rqctx: RequestContext<Arc<Context>>,
    query: Query<api_types::InstanceSerialHistoryParams>,
) -> Result<HttpResponseOk<api::InstanceSerialConsoleHistoryResponse>, HttpError>
{
    let query_params = query.into_inner();

    let history_query = serial::HistoryQuery::from_query(
        query_params.from_start,
        query_params.most_recent,
    )
    .ok_or_else(|| {
        HttpError::for_bad_request(
            None,
            "Exactly one of 'from_start' or 'most_recent' must be specified."
                .to_string(),
        )
    })?;
    // `max_bytes` is only an upper bound, so a value too large for `usize`
    // saturates instead of being truncated by an `as` cast.
    let max_bytes = query_params
        .max_bytes
        .map(|limit| usize::try_from(limit).unwrap_or(usize::MAX));

    let guard = rqctx.context().instance.lock().await;
    // Build the error lazily, as the other handlers do, so the common
    // success path does not allocate it.
    let instance = guard.as_ref().ok_or_else(|| {
        HttpError::for_internal_error(
            "No mock instance instantiated".to_string(),
        )
    })?;
    let (data, end) = instance
        .serial
        .history_vec(history_query, max_bytes)
        .await
        .map_err(|e| HttpError::for_bad_request(None, e.to_string()))?;

    Ok(HttpResponseOk(api::InstanceSerialConsoleHistoryResponse {
        data,
        last_byte_offset: end as u64,
    }))
}

// Handler for `GET /mock/mode`: reports `SingleStep` when a single-step
// generation is set and `Run` otherwise. Fails with an internal error if no
// instance exists yet.
#[endpoint {
    method = GET,
    path = "/mock/mode"
}]
async fn mock_mode_get(
    rqctx: RequestContext<Arc<Context>>,
) -> Result<HttpResponseOk<MockMode>, HttpError> {
    let guard = rqctx.context().instance.lock().await;
    let instance = match guard.as_ref() {
        Some(instance) => instance,
        None => {
            return Err(HttpError::for_internal_error(
                "Server not initialized (no instance)".to_string(),
            ));
        }
    };

    let mode = match instance.state_watcher_rx.borrow().single_step_gen {
        Some(_) => MockMode::SingleStep,
        None => MockMode::Run,
    };
    Ok(HttpResponseOk(mode))
}

// Handler for `PUT /mock/mode`: switches between free-running and
// single-step mode. Watchers of the mock state are only notified when the
// stored mode information was actually written.
#[endpoint {
    method = PUT,
    path = "/mock/mode"
}]
async fn mock_mode_set(
    rqctx: RequestContext<Arc<Context>>,
    request: TypedBody<MockMode>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
    let guard = rqctx.context().instance.lock().await;
    let instance = guard.as_ref().ok_or_else(|| {
        HttpError::for_internal_error(
            "Server not initialized (no instance)".to_string(),
        )
    })?;
    let mode = request.into_inner();
    let curr_gen = instance.curr_gen;

    instance.state_watcher_tx.send_if_modified(|mock_state| {
        match (mode, mock_state.single_step_gen) {
            // Leaving (or staying out of) single-step mode always clears the
            // stepping generation and notifies.
            (MockMode::Run, _) => {
                mock_state.single_step_gen = None;
                true
            }
            // Already single-stepping: keep the existing stepping generation
            // and do not notify.
            (MockMode::SingleStep, Some(_)) => false,
            // Entering single-step mode: start from the current generation.
            (MockMode::SingleStep, None) => {
                mock_state.single_step_gen = Some(curr_gen);
                true
            }
        }
    });
    Ok(HttpResponseUpdatedNoContent())
}

// Handler for `PUT /mock/step`: advances the single-step generation by one
// and notifies watchers. Rejected with a 400 when the mock is not in
// single-step mode, and with an internal error when no instance exists.
#[endpoint {
    method = PUT,
    path = "/mock/step"
}]
async fn mock_step(
    rqctx: RequestContext<Arc<Context>>,
) -> Result<HttpResponseUpdatedNoContent, HttpError> {
    let guard = rqctx.context().instance.lock().await;
    let instance = match guard.as_ref() {
        Some(instance) => instance,
        None => {
            return Err(HttpError::for_internal_error(
                "Server not initialized (no instance)".to_string(),
            ));
        }
    };

    let single_stepping =
        instance.state_watcher_rx.borrow().single_step_gen.is_some();
    if !single_stepping {
        return Err(HttpError::for_bad_request(
            None,
            "not in single-step mode".to_string(),
        ));
    }

    // The instance lock is still held, so the mode cannot have changed since
    // the check above.
    instance.state_watcher_tx.send_modify(|state| {
        let g = state
            .single_step_gen
            .as_mut()
            .expect("we just checked that it's set");
        *g += 1;
        slog::info!(
            rqctx.log,
            "instance state stepped to generation {g}";
            "gen" => *g,
        );
    });
    Ok(HttpResponseUpdatedNoContent())
}

mod serial {
    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::Arc;

    use dropshot::WebsocketConnectionRaw;
    use futures::StreamExt;
    use tokio::sync::{mpsc, Notify};
    use tokio_tungstenite::tungstenite::protocol::{
        frame::coding::CloseCode, CloseFrame,
    };
    use tokio_tungstenite::WebSocketStream;

    type WsConn = WebSocketStream<WebsocketConnectionRaw>;

    const DEFAULT_MAX_LEN: usize = 1024;

    /// Where a read of the serial history starts.
    pub(crate) enum HistoryQuery {
        /// Start at this absolute byte offset from the beginning.
        FromStart(usize),
        /// Start this many bytes back from the end of the history.
        MostRecent(usize),
    }
    impl HistoryQuery {
        /// Builds a query from the two mutually exclusive request
        /// parameters; returns `None` when both or neither are supplied.
        pub(crate) const fn from_query(
            from_start: Option<u64>,
            most_recent: Option<u64>,
        ) -> Option<Self> {
            match (from_start, most_recent) {
                (Some(offset), None) => Some(Self::FromStart(offset as usize)),
                (None, Some(count)) => Some(Self::MostRecent(count as usize)),
                (Some(_), Some(_)) | (None, None) => None,
            }
        }
    }

    /// Handle to the fake serial task spawned by [`SerialTask::spawn`].
    pub(crate) struct SerialTask {
        /// Sends the shutdown signal to the background task.
        chan_ctrl: mpsc::Sender<()>,
        /// Hands new websocket client connections to the background task.
        chan_ws: mpsc::Sender<WsConn>,
        /// Set once shutdown has been requested, so the signal is sent at
        /// most once.
        is_shutdown: AtomicBool,
    }
    impl SerialTask {
        /// Spawns the background serial task on the tokio runtime and
        /// returns the handle used to feed it connections and to shut it
        /// down. Both channels have a capacity of one.
        pub fn spawn() -> Self {
            let (chan_ctrl, ctrl_rx) = mpsc::channel(1);
            let (chan_ws, ws_rx) = mpsc::channel::<WsConn>(1);

            tokio::spawn(Self::serial_task_work(ctrl_rx, ws_rx));

            Self { chan_ctrl, chan_ws, is_shutdown: AtomicBool::new(false) }
        }

        /// Drive client connections to the UART websocket
        ///
        /// At this time, there is no real data being emitted from the mock
        /// instance besides what's made up in the [`Serial`] below.  Because of
        /// that, the serial task has little to do besides holding the websocket
        /// connections open until the mock instance enters shutdown.
        ///
        /// The task returns once shutdown has been signalled on `chan_ctrl`
        /// and every held connection has been closed. Connections arriving
        /// on `chan_ws` after shutdown are closed immediately.
        async fn serial_task_work(
            mut chan_ctrl: mpsc::Receiver<()>,
            mut chan_ws: mpsc::Receiver<WsConn>,
        ) {
            let bail = Notify::new();
            let mut connections = futures::stream::FuturesUnordered::new();
            let mut is_shutdown = false;
            // Set once `chan_ws` has reported that it is closed and empty.
            // A closed channel resolves `recv()` to `None` immediately, so
            // polling it again would only spin the loop.
            let mut ws_closed = false;

            /// Send appropriate shutdown notice
            async fn close_for_shutdown(mut conn: WsConn) {
                let _ = conn
                    .close(Some(CloseFrame {
                        code: CloseCode::Away,
                        reason: "VM stopped".into(),
                    }))
                    .await;
            }

            /// Wait for a client connection to close (while discarding any
            /// input from it), or a signal that the VM is shutting down.
            ///
            /// Returns `None` when the client went away by itself and
            /// `Some(conn)` when the connection must still be closed
            /// because of shutdown.
            async fn wait_for_close(
                mut conn: WsConn,
                bail: &Notify,
            ) -> Option<WsConn> {
                let mut pconn = std::pin::Pin::new(&mut conn);

                loop {
                    tokio::select! {
                        msg = pconn.next() => {
                            // Discard input (if any) and keep truckin'
                            msg.as_ref()?;
                        },
                        _ = bail.notified() => {
                            return Some(conn);
                        }
                    }
                }
            }

            // Invariant at the top of every iteration: once `is_shutdown` is
            // set, `connections` is non-empty (otherwise we have returned),
            // so at least one `select!` branch is always enabled.
            loop {
                tokio::select! {
                    // Only listen for the shutdown signal once; after that a
                    // closed control channel must not wake us repeatedly.
                    _vm_shutdown = chan_ctrl.recv(), if !is_shutdown => {
                        // We've been signaled that the VM is shutdown
                        bail.notify_waiters();
                        chan_ws.close();
                        is_shutdown = true;
                        if connections.is_empty() {
                            return;
                        }
                    }
                    conn = chan_ws.recv(), if !ws_closed => {
                        // A new client connection has been passed to us
                        let conn = match conn {
                            Some(conn) => conn,
                            None => {
                                // Channel closed and drained: stop polling it.
                                ws_closed = true;
                                continue;
                            }
                        };
                        if is_shutdown {
                            close_for_shutdown(conn).await;
                            continue;
                        }
                        connections
                            .push(async { wait_for_close(conn, &bail).await });
                    }
                    disconnect = connections.next(), if !connections.is_empty() => {
                        if let Some(Some(conn)) = disconnect {
                            // client needs disconnect due to shutdown
                            close_for_shutdown(conn).await;
                        }
                        // Otherwise the client disconnected itself.

                        // The guard above means this branch never observes
                        // an empty set, so the "last client is gone" check
                        // has to happen here, after a connection finished.
                        if is_shutdown && connections.is_empty() {
                            return;
                        }
                    }
                }
            }
        }

        pub async fn new_conn(&self, ws: WsConn) {
            if let Err(mut ws) = self.chan_ws.send(ws).await.map_err(|e| e.0) {
                let _ = ws
                    .close(Some(CloseFrame {
                        code: CloseCode::Away,
                        reason: "VM stopped".into(),
                    }))
                    .await;
            }
        }

        /// Signals the serial task to shut down. Only the first call sends
        /// the signal; later calls return without doing anything.
        ///
        /// # Panics
        ///
        /// Panics if the signal cannot be sent to the task.
        pub async fn shutdown(&self) {
            let already_requested =
                self.is_shutdown.swap(true, Ordering::Relaxed);
            if already_requested {
                return;
            }
            self.chan_ctrl.send(()).await.unwrap();
        }
    }

    /// Mock source of UART data from the guest, including history
    pub(crate) struct Serial {
        /// The fabricated console output, generated once at construction
        /// and served by `history_vec`.
        mock_data: Vec<u8>,
    }
    impl Serial {
        /// Creates the shared mock serial source, generating its console
        /// contents from `name`.
        pub(super) fn new(name: &str) -> Arc<Self> {
            Arc::new(Self { mock_data: Self::mock_data(name) })
        }

        /// Fabricates console output for an instance called `name`.
        ///
        /// The output is a banner line, a series of "<gerund> <noun>..."
        /// status lines picked by consuming a hash of `name`, and finally a
        /// screen clear followed by a login prompt.
        fn mock_data(name: &str) -> Vec<u8> {
            use std::collections::hash_map::DefaultHasher;
            use std::hash::{Hash, Hasher};

            #[rustfmt::skip]
            const GERUNDS: [&str; 16] = [
                "Loading", "Reloading", "Advancing", "Reticulating",
                "Defeating", "Spoiling", "Cooking", "Destroying", "Resenting",
                "Introducing", "Reiterating", "Blasting", "Tolling",
                "Delivering", "Engendering", "Establishing",
            ];
            #[rustfmt::skip]
            const NOUNS: [&str; 16] = [
                "canon", "browsers", "meta", "splines", "villains", "plot",
                "books", "evidence", "decisions", "chaos", "points",
                "processors", "bells", "value", "gender", "shots",
            ];

            // The hash of the name drives which words get picked.
            let mut seed = {
                let mut hasher = DefaultHasher::new();
                name.hash(&mut hasher);
                hasher.finish()
            };

            let mut out = Vec::with_capacity(1024);
            let banner = format!(
                "This is simulated serial console output for {name}.\r\n"
            );
            out.extend_from_slice(banner.as_bytes());

            // Each line consumes two base-16 digits of the seed: one for the
            // gerund and one for the noun.
            while seed != 0 {
                let gerund = GERUNDS[seed as usize % GERUNDS.len()];
                seed /= GERUNDS.len() as u64;
                let noun = NOUNS[seed as usize % NOUNS.len()];
                seed /= NOUNS.len() as u64;

                // Pad so the status marker lines up in a fixed column.
                let padding = " ".repeat(40 - gerund.len() - noun.len());
                let line = format!(
                    "{} {}... {}[\x1b[92m 0K \x1b[m]\r\n",
                    gerund, noun, padding
                );
                out.extend_from_slice(line.as_bytes());
            }

            let prompt = format!(
                "\x1b[2J\x1b[HOS/478 ({name}) (ttyl)\r\n\r\n{name} login: "
            );
            out.extend_from_slice(prompt.as_bytes());
            out
        }

        /// Returns a window of the mock console history.
        ///
        /// `query` selects where the window starts; at most `max_bytes`
        /// bytes (`DEFAULT_MAX_LEN` when `None`) are returned.
        ///
        /// On success the result is `(data, end)`, where `end` is the
        /// absolute offset just past the last byte in `data`. Passing `end`
        /// back as `HistoryQuery::FromStart` therefore continues exactly
        /// where this call stopped, and an empty `data` means the end of
        /// the history has been reached.
        ///
        /// # Errors
        ///
        /// Fails when a `FromStart` offset lies beyond the end of the
        /// history.
        pub async fn history_vec(
            &self,
            query: HistoryQuery,
            max_bytes: Option<usize>,
        ) -> Result<(Vec<u8>, usize), &'static str> {
            let total = self.mock_data.len();
            let byte_limit = max_bytes.unwrap_or(DEFAULT_MAX_LEN);

            let start = match query {
                HistoryQuery::FromStart(n) if n > total => {
                    return Err("requesting data beyond history");
                }
                HistoryQuery::FromStart(n) => n,
                // Asking for more than exists just starts at the beginning.
                HistoryQuery::MostRecent(n) => total - n.min(total),
            };
            // Report where the returned data actually ends rather than the
            // end of the whole buffer; otherwise a caller resuming from the
            // returned offset would skip everything that was truncated.
            // `saturating_add` guards against a huge `max_bytes`.
            let end = start.saturating_add(byte_limit).min(total);

            Ok((self.mock_data[start..end].to_vec(), end))
        }
    }
}

/// Builds the Dropshot [`ApiDescription`] containing every endpoint of the
/// mock Propolis server.
///
/// Prefer `start()`: using this function directly means the consumer and
/// Propolis must update their Dropshot dependencies in lockstep, because
/// Dropshot types are shared across the boundary.
///
/// # Panics
///
/// Panics if any endpoint fails to register.
pub fn api() -> ApiDescription<Arc<Context>> {
    let mut description = ApiDescription::new();

    // Instance lifecycle and state.
    description.register(instance_ensure).unwrap();
    description.register(instance_get).unwrap();
    description.register(instance_state_monitor).unwrap();
    description.register(instance_state_put).unwrap();

    // Serial console.
    description.register(instance_serial).unwrap();
    description.register(instance_serial_history_get).unwrap();

    // Controls specific to the mock.
    description.register(mock_mode_get).unwrap();
    description.register(mock_mode_set).unwrap();
    description.register(mock_step).unwrap();

    description
}

// These types need to be exposed so that consumers have names for them without
// having to maintain a dropshot dependency in lockstep with their dependency on
// this crate.

/// Configuration for the Dropshot server; an alias for
/// `dropshot::ConfigDropshot`.
pub type Config = dropshot::ConfigDropshot;
/// The Dropshot HTTP server itself, with [`Context`] as its shared state.
pub type Server = dropshot::HttpServer<Arc<Context>>;
/// Errors returned from attempting to start a Dropshot server.
// Dropshot should expose this, but it's going to be removed anyway.
pub type ServerStartError = Box<dyn std::error::Error + Send + Sync>;

/// Launches a mock Propolis server using the supplied Dropshot configuration.
///
/// Two child loggers are derived from `log`: one tagged with the component
/// `propolis-server-mock`, which is given to the server's [`Context`], and
/// one tagged `dropshot`, which is given to Dropshot.
///
/// # Errors
///
/// Returns the error reported by `dropshot::HttpServerStarter::new` if the
/// server could not be set up.
pub fn start(config: Config, log: Logger) -> Result<Server, ServerStartError> {
    let mock_log = log.new(o!("component" => "propolis-server-mock"));
    let http_log = log.new(o!("component" => "dropshot"));
    let context = Arc::new(Context::new(mock_log));

    Ok(dropshot::HttpServerStarter::new(&config, api(), context, &http_log)?
        .start())
}


================================================
FILE: bin/mock-server/src/main.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;

use anyhow::anyhow;
use clap::Parser;
use dropshot::{
    CompressionConfig, ConfigDropshot, HandlerTaskMode, HttpServerStarter,
};
use slog::{info, Drain};

// Command-line interface of the mock server binary, parsed with clap's derive
// API. The `///` comments on the items below are picked up by clap as help
// text, so reviewer notes in this type are written as plain `//` comments.
#[derive(Debug, Parser)]
#[clap(about, version)]
/// An HTTP server providing access to Propolis
enum Args {
    /// Generates the OpenAPI specification.
    OpenApi,
    /// Runs the Propolis server.
    Run {
        // Accepted on the command line, but `main` binds it to `_cfg` and
        // never reads it.
        #[clap(action)]
        cfg: PathBuf,

        // Address the server listens on; `main` uses it as the Dropshot
        // `bind_address`.
        #[clap(name = "PROPOLIS_IP:PORT", action)]
        propolis_addr: SocketAddr,

        /// IP:Port for the Oximeter register address
        // Forwarded to `run_server`, which takes it as `_metrics_addr` and
        // does not use it.
        #[clap(long, action)]
        metric_addr: Option<SocketAddr>,
    },
}

fn build_logger() -> slog::Logger {
    let main_drain = if atty::is(atty::Stream::Stdout) {
        let decorator = slog_term::TermDecorator::new().build();
        let drain = slog_term::FullFormat::new(decorator).build().fuse();
        slog_async::Async::new(drain)
            .overflow_strategy(slog_async::OverflowStrategy::Block)
            .build_no_guard()
    } else {
        let drain =
            slog_bunyan::with_name("propolis-server", std::io::stdout())
                .build()
                .fuse();
        slog_async::Async::new(drain)
            .overflow_strategy(slog_async::OverflowStrategy::Block)
            .build_no_guard()
    };

    let (dtrace_drain, probe_reg) = slog_dtrace::Dtrace::new();

    let filtered_main = slog::LevelFilter::new(main_drain, slog::Level::Info);

    let log = slog::Logger::root(
        slog::Duplicate::new(filtered_main.fuse(), dtrace_drain.fuse()).fuse(),
        slog::o!(),
    );

    if let slog_dtrace::ProbeRegistration::Failed(err) = probe_reg {
        slog::error!(&log, "Error registering slog-dtrace probes: {:?}", err);
    }

    log
}

/// Writes the mock server's OpenAPI document to stdout.
///
/// # Errors
///
/// Returns the error, rendered as a string, if writing the document fails.
pub fn run_openapi() -> Result<(), String> {
    let api = propolis_mock_server::api();
    let mut stdout = std::io::stdout();

    let written = api
        .openapi("Oxide Propolis Server API", semver::Version::new(0, 0, 1))
        .description(
            "API for interacting with the Propolis hypervisor frontend.",
        )
        .contact_url("https://oxide.computer")
        .contact_email("api@oxide.computer")
        .write(&mut stdout);

    match written {
        Ok(()) => Ok(()),
        Err(e) => Err(e.to_string()),
    }
}

async fn run_server(
    config_dropshot: dropshot::ConfigDropshot,
    _metrics_addr: Option<SocketAddr>,
    log: slog::Logger,
) -> anyhow::Result<()> {
    let context = propolis_mock_server::Context::new(log.new(slog::o!()));

    info!(log, "Starting server...");

    let server = HttpServerStarter::new(
        &config_dropshot,
        propolis_mock_server::api(),
        Arc::new(context),
        &log,
    )
    .map_err(|error| anyhow!("Failed to start server: {error}"))?
    .start();

    let server_res = server.await;
    server_res.map_err(|e| anyhow!("Server exited with an error: {e}"))
}

/// Entry point: parses the command line and either prints the OpenAPI
/// document or runs the mock server until it exits.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    match Args::parse() {
        // The config path is accepted for command-line compatibility but is
        // not consulted by the mock server.
        Args::Run { cfg: _, propolis_addr, metric_addr } => {
            let config_dropshot = ConfigDropshot {
                bind_address: propolis_addr,
                // 1M for ISO bytes
                default_request_body_max_bytes: 1024 * 1024,
                default_handler_task_mode: HandlerTaskMode::Detached,
                log_headers: vec![],
                compression: CompressionConfig::None,
            };

            run_server(config_dropshot, metric_addr, build_logger()).await
        }
        Args::OpenApi => match run_openapi() {
            Ok(()) => Ok(()),
            Err(e) => Err(anyhow!("Cannot generate OpenAPI spec: {e}")),
        },
    }
}


================================================
FILE: bin/propolis-cli/Cargo.toml
================================================
[package]
name = "propolis-cli"
version = "0.1.0"
license = "MPL-2.0"
edition = "2021"

[dependencies]
anyhow.workspace = true
clap = { workspace = true, features = ["derive"] }
crucible-client-types.workspace = true
futures.workspace = true
libc.workspace = true
newtype-uuid.workspace = true
propolis-client.workspace = true
propolis-config-toml.workspace = true
slog.workspace = true
slog-async.workspace = true
slog-term.workspace = true
tokio = { workspace = true, features = ["full"] }
tokio-tungstenite.workspace = true
uuid.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json.workspace = true
base64.workspace = true


================================================
FILE: bin/propolis-cli/README.md
================================================
# Propolis CLI

The `propolis-cli` utility provides a user-friendly frontend to the
[`propolis-server`](../propolis-server) REST API.

## Getting started

The easiest way to launch a VM via the CLI is to write a TOML file describing
the VM's configuration. An example of such a file might be the following:

```toml
[block_dev.alpine_iso]
type = "file"
path = "/path/to/alpine-extended-3.12.0-x86_64.iso"

[dev.block0]
driver = "pci-virtio-block"
block_dev = "alpine_iso"
pci-path = "0.4.0"

[dev.net0]
driver = "pci-virtio-viona"
vnic = "vnic_name"
pci-path = "0.5.0"
```

To create and run a Propolis VM using this configuration:

```
# propolis-cli -s <server ip> -p <port> new --config-toml <path> <VM name>
# propolis-cli -s <server ip> -p <port> state run
```

To connect to the VM's serial console:

```
# propolis-cli -s <server ip> -p <port> serial
```

Run `propolis-cli --help` to see the full list of supported commands and their
arguments.


================================================
FILE: bin/propolis-cli/src/main.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::HashMap;
use std::fs::File;
use std::io::BufReader;
use std::path::{Path, PathBuf};
use std::{
    net::{IpAddr, SocketAddr, ToSocketAddrs},
    os::unix::prelude::AsRawFd,
    time::Duration,
};

use anyhow::{anyhow, Context};
use clap::{Args, Parser, Subcommand};
use futures::{future, SinkExt};
use newtype_uuid::{GenericUuid, TypedUuid, TypedUuidKind, TypedUuidTag};
use propolis_client::instance_spec::{
    BlobStorageBackend, Board, Chipset, Component, CrucibleStorageBackend,
    GuestHypervisorInterface, HyperVFeatureFlag, I440Fx, InstanceMetadata,
    InstanceProperties, InstanceSpec, InstanceSpecGetResponse, NvmeDisk,
    PciPath, QemuPvpanic, ReplacementComponent, SerialPort, SerialPortNumber,
    SpecKey, VirtioDisk,
};
use propolis_client::support::nvme_serial_from_str;
use propolis_client::types::{
    InstanceEnsureRequest, InstanceInitializationMethod,
};
use propolis_config_toml::spec::toml_cpuid_to_spec_cpuid;
use propolis_config_toml::spec::SpecConfig;
use serde::{Deserialize, Serialize};
use slog::{o, Drain, Level, Logger};
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio_tungstenite::tungstenite::{
    protocol::{frame::coding::CloseCode, CloseFrame},
    Message,
};
use uuid::Uuid;

use propolis_client::{
    support::{InstanceSerialConsoleHelper, WSClientOffset},
    types::{InstanceStateRequested, InstanceVcrReplace, MigrationState},
    Client,
};

// Top-level command-line options for propolis-cli, parsed with clap's derive
// API. The `///` comments below are picked up by clap as help text, so
// reviewer notes in this type are written as plain `//` comments.
#[derive(Debug, Parser)]
#[clap(about, version)]
/// A simple CLI tool to manipulate propolis-server
struct Opt {
    /// propolis-server address
    // The argument is passed through `resolve_host`, so a host name is
    // accepted and resolved to an IP address while parsing.
    #[clap(short, long, value_parser = resolve_host)]
    server: IpAddr,

    /// propolis-server port
    #[clap(short, long, default_value = "12400", action)]
    port: u16,

    /// Enable debugging
    // `create_logger` uses this flag to choose between the `Debug` and
    // `Info` log levels.
    #[clap(short, long, action)]
    debug: bool,

    #[clap(subcommand)]
    cmd: Command,
}

// `New`, via `VmConfig`, is large enough to trip this lint. This enum is
// created exactly once, so we don't need to be picky about the layout.
//
// The `///` comments below are picked up by clap as help text; reviewer notes
// in this type are written as plain `//` comments.
#[allow(clippy::large_enum_variant)]
#[derive(Debug, Subcommand)]
enum Command {
    /// Create a new propolis instance
    New {
        /// Instance name
        #[clap(action)]
        name: String,

        /// Instance uuid (if specified)
        #[clap(short = 'u', action)]
        uuid: Option<Uuid>,

        // Describes how the instance spec is obtained; see
        // `VmConfig::instance_spec`.
        #[clap(flatten)]
        config: VmConfig,

        /// A UUID to use for the instance's silo, attached to instance metrics.
        #[clap(long)]
        silo_id: Option<TypedUuid<SiloKind>>,

        /// A UUID to use for the instance's project, attached to instance metrics.
        #[clap(long)]
        project_id: Option<TypedUuid<ProjectKind>>,

        /// A UUID to use for the instance's hosting sled, attached to instance
        /// metrics.
        #[clap(long)]
        sled_id: Option<TypedUuid<SledKind>>,

        /// A model number to use for the instance's hosting sled, attached to
        /// instance metrics.
        #[clap(long, default_value_t = String::from("fake-gimlet"))]
        sled_model: String,

        /// A revision number to use for the instance's hosting sled, attached to
        /// instance metrics.
        #[clap(long, default_value_t = 1)]
        sled_revision: u32,

        /// A serial number to use for the instance's hosting sled, attached to
        /// instance metrics.
        #[clap(long, default_value_t = String::from("fake-serial"))]
        sled_serial: String,
    },

    /// Get the properties of a propolis instance
    Get,

    /// Transition the instance to a new state
    State {
        /// The requested state
        // `parse_state` accepts "run", "stop" and "reboot", ignoring case.
        #[clap(value_parser = parse_state)]
        state: InstanceStateRequested,
    },

    /// Drop to a Serial console connected to the instance
    Serial {
        /// The offset since boot (or if negative, the current end of the
        /// buffered data) from which to retrieve output history.
        /// Defaults to the most recent 16 KiB of console output (-16384).
        // Translated into a `WSClientOffset` by `serial_connect`.
        #[clap(long, short)]
        byte_offset: Option<i64>,
    },

    /// Migrate instance to new propolis-server
    Migrate {
        /// Destination propolis-server address
        // Resolved from a host string by `resolve_host`, like `Opt::server`.
        #[clap(value_parser = resolve_host)]
        dst_server: IpAddr,

        /// Destination propolis-server port
        #[clap(short = 'p', default_value = "12400", action)]
        dst_port: u16,

        /// Uuid for the destination instance
        #[clap(short = 'u', action)]
        dst_uuid: Option<Uuid>,

        /// File with a JSON array of DiskRequest structs
        #[clap(long, action)]
        crucible_disks: Option<PathBuf>,
    },

    /// Monitor an instance's state in real time
    Monitor,

    /// Inject an NMI into the instance
    InjectNmi,

    /// Call the VolumeConstructionRequest replace endpoint
    Vcr {
        /// Uuid for the disk
        #[clap(short = 'd', action)]
        disk_id: String,

        /// File with a JSON InstanceVcrReplace struct
        #[clap(long, action)]
        vcr_replace: PathBuf,
    },
}

// Options of the `new` subcommand that describe the VM to create. The `///`
// comments below are picked up by clap as help text, which is why every
// option needs one; reviewer notes are written as plain `//` comments.
#[derive(Args, Clone, Debug)]
struct VmConfig {
    /// A path to a file containing a JSON-formatted instance spec
    // When set, `instance_spec` returns the parsed file as-is and ignores the
    // remaining options.
    #[clap(short = 's', long, action, group = "spec_group")]
    spec: Option<PathBuf>,

    /// Number of vCPUs allocated to instance
    #[clap(short = 'c', default_value = "4", action, requires = "config_toml")]
    vcpus: u8,

    /// Memory allocated to instance (MiB)
    #[clap(short, default_value = "1024", action, requires = "config_toml")]
    memory: u64,

    /// CPUID profile to use.
    ///
    /// The named profile must be defined in `config_toml`.
    #[clap(long, requires = "config_toml")]
    cpuid_profile: Option<String>,

    /// A path to a file containing a config TOML
    #[clap(short = 't', long, action, group = "config_group", requires_all = ["vcpus", "memory"])]
    config_toml: Option<PathBuf>,

    /// File with a JSON array of DiskRequest structs
    #[clap(long, action, conflicts_with = "spec")]
    crucible_disks: Option<PathBuf>,

    /// A path to a cloud-init ISO file
    // The file's bytes are attached to the instance as a read-only virtio
    // disk by `instance_spec`.
    #[clap(long, action, conflicts_with = "spec")]
    cloud_init: Option<PathBuf>,

    /// enable Hyper-V compatible enlightenments for this VM
    #[clap(long, action)]
    hyperv: bool,
}

/// Inserts `component` into `spec` under the key `id`.
///
/// # Errors
///
/// Returns an error, leaving `spec` untouched, if a component with the same
/// ID is already present.
fn add_component_to_spec(
    spec: &mut InstanceSpec,
    id: SpecKey,
    component: Component,
) -> anyhow::Result<()> {
    use std::collections::btree_map::Entry;

    let existing = match spec.components.entry(id) {
        Entry::Occupied(existing) => existing,
        Entry::Vacant(slot) => {
            slot.insert(component);
            return Ok(());
        }
    };

    Err(anyhow::anyhow!("duplicate component ID {:?}", existing.key()))
}

/// A legacy Propolis API disk request, preserved here for compatibility with
/// the `--crucible-disks` option.
///
/// Values of this type are deserialized from the JSON file given to that
/// option and converted into spec components by [`DiskRequest::parse`].
#[derive(Clone, Debug, Deserialize, Serialize)]
struct DiskRequest {
    /// Disk name. Used as the device's component ID; the backend's ID is this
    /// name with `-backend` appended.
    name: String,
    /// Legacy slot number, which must be in `0..=7`. `parse` adds `0x10` to it
    /// to obtain the PCI device number.
    slot: u8,
    /// Whether the Crucible backend is opened read-only.
    read_only: bool,
    /// Device model to expose the disk as: `"virtio"` or `"nvme"`.
    device: String,
    /// Crucible volume description; `parse` re-serializes it to JSON for the
    /// backend spec.
    volume_construction_request: propolis_client::VolumeConstructionRequest,
}

/// The instance-spec components derived from a single [`DiskRequest`]: one
/// disk device and the Crucible storage backend it refers to.
#[derive(Clone, Debug)]
struct ParsedDiskRequest {
    /// Component ID of the disk device (the request's `name`).
    device_id: SpecKey,
    /// The disk device component, either a virtio disk or an NVMe disk.
    device_spec: Component,
    /// Component ID of the backend (the request's `name` plus `-backend`).
    backend_id: SpecKey,
    /// The Crucible backend the device is attached to.
    backend_spec: CrucibleStorageBackend,
}

impl DiskRequest {
    /// Converts this legacy disk request into a device component and the
    /// Crucible backend component that the device refers to.
    ///
    /// # Errors
    ///
    /// Fails if the slot is outside `0..=7`, if the resulting PCI path is
    /// rejected, if `device` is neither `"virtio"` nor `"nvme"`, or if the
    /// volume construction request cannot be serialized to JSON.
    fn parse(&self) -> anyhow::Result<ParsedDiskRequest> {
        // The old Propolis API numbered disk slots 0 through 7 and placed
        // them at PCI devices 0x10 through 0x17; keep that mapping.
        if self.slot >= 8 {
            anyhow::bail!("disk request slots must be in [0..7]");
        }
        let pci_device = self.slot + 0x10;

        let backend_id = SpecKey::Name(format!("{}-backend", self.name));
        let pci_path = PciPath::new(0, pci_device, 0).with_context(|| {
            format!("processing disk request {:?}", self.name)
        })?;

        let device_spec = match self.device.as_str() {
            "nvme" => Component::NvmeDisk(NvmeDisk {
                backend_id: backend_id.clone(),
                pci_path,
                serial_number: nvme_serial_from_str(&self.name, b' '),
            }),
            "virtio" => Component::VirtioDisk(VirtioDisk {
                backend_id: backend_id.clone(),
                pci_path,
            }),
            unknown => anyhow::bail!(
                "invalid device type in disk request: {:?}",
                unknown
            ),
        };

        let request_json =
            serde_json::to_string(&self.volume_construction_request)?;

        Ok(ParsedDiskRequest {
            device_id: SpecKey::Name(self.name.clone()),
            device_spec,
            backend_id,
            backend_spec: CrucibleStorageBackend {
                readonly: self.read_only,
                request_json,
            },
        })
    }
}

impl VmConfig {
    /// Builds the [`InstanceSpec`] described by this configuration.
    ///
    /// If `spec` is set, the spec is read verbatim from that JSON file and
    /// every other option is ignored. Otherwise a spec is assembled from, in
    /// this order: the board settings (`vcpus`, `memory`, `hyperv`, the
    /// optional CPUID profile, and the PCIe setting from the config TOML), the
    /// components defined in `config_toml`, the Crucible disks listed in
    /// `crucible_disks`, an optional cloud-init disk, serial ports COM1-COM3
    /// (plus COM4 when no SoftNPU port component is present), and a pvpanic
    /// device.
    ///
    /// # Errors
    ///
    /// Fails if an input file cannot be read or parsed, if the requested CPUID
    /// profile is not defined in the config TOML or cannot be converted, or if
    /// two components end up with the same ID.
    fn instance_spec(&self) -> anyhow::Result<InstanceSpec> {
        // If the configuration specifies an instance spec path, just read the
        // spec from that path and return it. Otherwise, construct a spec from
        // this configuration's component parts.
        if let Some(path) = &self.spec {
            return parse_json_file(path);
        }

        // Raw parsed TOML; `None` when no config TOML was given.
        let parsed_toml = self
            .config_toml
            .as_ref()
            .map(propolis_config_toml::parse)
            .transpose()?;

        // Spec-level view of the TOML (components and the PCIe setting).
        let from_toml =
            parsed_toml.as_ref().map(SpecConfig::try_from).transpose()?;

        // PCIe is off unless the config TOML turns it on.
        let enable_pcie =
            from_toml.as_ref().map(|cfg| cfg.enable_pcie).unwrap_or(false);

        // The CPUID profile is only looked up when a config TOML was parsed;
        // without one, a requested profile name is not consulted here.
        let cpuid_profile = parsed_toml
            .as_ref()
            .and_then(|cfg| {
                self.cpuid_profile.as_ref().map(|profile| {
                    let profile =
                        cfg.cpuid_profiles.get(profile).ok_or_else(|| {
                            // This closure only runs when `parsed_toml` is
                            // `Some`, which requires `config_toml` to be
                            // `Some`, so the `unwrap` cannot fail.
                            anyhow!(
                                "CPUID profile not defined in {}: {profile}",
                                self.config_toml.as_ref().unwrap().display()
                            )
                        })?;

                    toml_cpuid_to_spec_cpuid(profile)
                        .map_err(Into::<anyhow::Error>::into)
                })
            })
            .transpose()?;

        let mut spec = InstanceSpec {
            board: Board {
                chipset: Chipset::I440Fx(I440Fx { enable_pcie }),
                cpuid: cpuid_profile,
                cpus: self.vcpus,
                memory_mb: self.memory,
                // `--hyperv` enables the Hyper-V interface with the reference
                // TSC feature; otherwise the default interface is used.
                guest_hv_interface: if self.hyperv {
                    GuestHypervisorInterface::HyperV {
                        features: [HyperVFeatureFlag::ReferenceTsc]
                            .into_iter()
                            .collect(),
                    }
                } else {
                    Default::default()
                },
            },
            components: Default::default(),
            smbios: None,
        };

        // Components declared in the config TOML go in first, so a later
        // component that reuses one of their IDs is the one reported as a
        // duplicate.
        if let Some(from_toml) = from_toml {
            for (id, component) in from_toml.components.iter() {
                add_component_to_spec(
                    &mut spec,
                    id.clone(),
                    component.clone(),
                )?;
            }
        }

        // Each legacy disk request contributes a device and its Crucible
        // backend. With no `--crucible-disks` file this loop runs zero times.
        for disk_request in self
            .crucible_disks
            .as_ref()
            .map(|path| parse_json_file::<Vec<DiskRequest>>(path))
            .transpose()?
            .iter()
            .flatten()
        {
            let ParsedDiskRequest {
                device_id,
                device_spec,
                backend_id,
                backend_spec,
            } = disk_request.parse()?;
            add_component_to_spec(&mut spec, device_id, device_spec)?;
            add_component_to_spec(
                &mut spec,
                backend_id,
                Component::CrucibleStorageBackend(backend_spec),
            )?;
        }

        // The cloud-init file is read whole, base64-encoded, and exposed as a
        // read-only virtio disk backed by an in-spec blob.
        if let Some(cloud_init) = self.cloud_init.as_ref() {
            let bytes = base64::Engine::encode(
                &base64::engine::general_purpose::STANDARD,
                std::fs::read(cloud_init)?,
            );

            const CLOUD_INIT_NAME: &str = "cloud-init";
            const CLOUD_INIT_BACKEND_NAME: &str = "cloud-init-backend";

            add_component_to_spec(
                &mut spec,
                SpecKey::Name(CLOUD_INIT_NAME.to_owned()),
                Component::VirtioDisk(VirtioDisk {
                    backend_id: SpecKey::Name(
                        CLOUD_INIT_BACKEND_NAME.to_owned(),
                    ),
                    // NOTE(review): the `unwrap` assumes the fixed path
                    // 0.0x18.0 is always accepted by `PciPath::new` -
                    // confirm.
                    pci_path: PciPath::new(0, 0x18, 0).unwrap(),
                }),
            )?;

            add_component_to_spec(
                &mut spec,
                SpecKey::Name(CLOUD_INIT_BACKEND_NAME.to_owned()),
                Component::BlobStorageBackend(BlobStorageBackend {
                    base64: bytes,
                    readonly: true,
                }),
            )?;
        }

        // COM1 through COM3 are always present.
        for (name, port) in [
            ("com1", SerialPortNumber::Com1),
            ("com2", SerialPortNumber::Com2),
            ("com3", SerialPortNumber::Com3),
        ] {
            add_component_to_spec(
                &mut spec,
                SpecKey::Name(name.to_owned()),
                Component::SerialPort(SerialPort { num: port }),
            )?;
        }

        // If there are no SoftNPU devices, also enable COM4.
        if !spec
            .components
            .iter()
            .any(|(_, c)| matches!(c, Component::SoftNpuPort(_)))
        {
            add_component_to_spec(
                &mut spec,
                SpecKey::Name("com4".to_owned()),
                Component::SerialPort(SerialPort {
                    num: SerialPortNumber::Com4,
                }),
            )?;
        }

        // A pvpanic device (with its ISA interface enabled) is always added.
        add_component_to_spec(
            &mut spec,
            SpecKey::Name("pvpanic".to_owned()),
            Component::QemuPvpanic(QemuPvpanic { enable_isa: true }),
        )?;

        Ok(spec)
    }
}

/// Parses the `state` command-line argument into an
/// [`InstanceStateRequested`].
///
/// Matching is case-insensitive; the accepted values are `run`, `stop` and
/// `reboot`.
///
/// # Errors
///
/// Returns an error listing the accepted values for any other input.
fn parse_state(state: &str) -> anyhow::Result<InstanceStateRequested> {
    match state.to_lowercase().as_str() {
        "run" => Ok(InstanceStateRequested::Run),
        "stop" => Ok(InstanceStateRequested::Stop),
        "reboot" => Ok(InstanceStateRequested::Reboot),
        _ => Err(anyhow!(
            "invalid requested state, must be one of: 'run', 'stop', 'reboot'"
        )),
    }
}

/// Opens the file at `path` and deserializes its contents from JSON into a
/// value of type `T`.
///
/// # Errors
///
/// Returns an error if the file cannot be opened or if its contents cannot be
/// deserialized as a `T`.
fn parse_json_file<T: serde::de::DeserializeOwned>(
    path: &Path,
) -> anyhow::Result<T> {
    let reader = BufReader::new(File::open(path)?);
    let parsed = serde_json::from_reader(reader)?;
    Ok(parsed)
}

/// Resolves a string naming a host to a single IP address.
///
/// The first address produced by the lookup is returned; any further
/// addresses are ignored.
///
/// # Errors
///
/// Returns an error if the lookup fails or yields no addresses.
fn resolve_host(server: &str) -> anyhow::Result<IpAddr> {
    // `to_socket_addrs` needs a port; it is discarded again below.
    let mut candidates = (server, 0).to_socket_addrs()?;

    match candidates.next() {
        Some(sock_addr) => Ok(sock_addr.ip()),
        None => Err(anyhow!("failed to resolve server argument '{server}'")),
    }
}

/// Builds the root logger, which writes terminal-formatted records to stderr.
///
/// Records below `Debug` level are dropped when `opt.debug` is set; otherwise
/// records below `Info` level are dropped.
fn create_logger(opt: &Opt) -> Logger {
    let min_level = if opt.debug { Level::Debug } else { Level::Info };

    let term = slog_term::FullFormat::new(
        slog_term::TermDecorator::new().stderr().build(),
    )
    .build()
    .fuse();
    let filtered = slog::LevelFilter(term, min_level).fuse();
    let root = slog_async::Async::new(filtered).build().fuse();

    Logger::root(root, o!())
}

// Implement typed UUID wrappers for the project / silo / sled IDs, to avoid
// conflating them.

/// Marker for project IDs. The enum has no variants, so it is only ever used
/// as the type parameter of `TypedUuid` (see `Command::New::project_id`).
enum ProjectKind {}

impl TypedUuidKind for ProjectKind {
    /// Returns the tag `"project"` that identifies this kind of UUID.
    fn tag() -> TypedUuidTag {
        // The tag is built in a `const`, so `TypedUuidTag::new` is evaluated
        // at compile time.
        const TAG: TypedUuidTag = TypedUuidTag::new("project");
        TAG
    }
}

/// Marker for silo IDs. The enum has no variants, so it is only ever used as
/// the type parameter of `TypedUuid` (see `Command::New::silo_id`).
enum SiloKind {}

impl TypedUuidKind for SiloKind {
    /// Returns the tag `"silo"` that identifies this kind of UUID.
    fn tag() -> TypedUuidTag {
        // The tag is built in a `const`, so `TypedUuidTag::new` is evaluated
        // at compile time.
        const TAG: TypedUuidTag = TypedUuidTag::new("silo");
        TAG
    }
}

/// Marker for sled IDs. The enum has no variants, so it is only ever used as
/// the type parameter of `TypedUuid` (see `Command::New::sled_id`).
enum SledKind {}

impl TypedUuidKind for SledKind {
    /// Returns the tag `"sled"` that identifies this kind of UUID.
    fn tag() -> TypedUuidTag {
        // The tag is built in a `const`, so `TypedUuidTag::new` is evaluated
        // at compile time.
        const TAG: TypedUuidTag = TypedUuidTag::new("sled");
        TAG
    }
}

/// Asks the server behind `client` to create an instance from `spec`.
///
/// The instance is created with the given `name`, `id` and `metadata`, and a
/// fixed description identifying it as generated by propolis-cli.
///
/// # Errors
///
/// Returns an error if the instance-ensure request fails.
async fn new_instance(
    client: &Client,
    name: String,
    id: Uuid,
    spec: InstanceSpec,
    metadata: InstanceMetadata,
) -> anyhow::Result<()> {
    let properties = InstanceProperties {
        id,
        name,
        description: "propolis-cli generated instance".to_string(),
        metadata,
    };

    let request = InstanceEnsureRequest {
        properties,
        init: InstanceInitializationMethod::Spec { spec },
    };

    // Try to create the instance
    client
        .instance_ensure()
        .body(request)
        .send()
        .await
        .context("failed to create instance")?;

    Ok(())
}

/// Sends a Crucible volume-construction-request replacement for the disk
/// identified by `id` to the server behind `client`.
///
/// # Errors
///
/// Returns an error if the request fails.
async fn replace_vcr(
    client: &Client,
    id: String,
    vcr_replace: InstanceVcrReplace,
) -> anyhow::Result<()> {
    let request = client
        .instance_issue_crucible_vcr_request()
        .id(id)
        .body(vcr_replace);

    // The response body is not needed; only success or failure matters.
    request.send().await.context("failed to issue vcr request")?;

    Ok(())
}

/// Fetches the instance from the server behind `client` and pretty-prints it
/// to stdout in `Debug` format.
///
/// # Errors
///
/// Returns an error if the request fails.
async fn get_instance(client: &Client) -> anyhow::Result<()> {
    let response = client
        .instance_get()
        .send()
        .await
        .context("failed to get instance properties")?;

    let instance = &response.instance;
    println!("{instance:#?}");

    Ok(())
}

/// Requests that the instance behind `client` transition to `state`.
///
/// # Errors
///
/// Returns an error if the request fails.
async fn put_instance(
    client: &Client,
    state: InstanceStateRequested,
) -> anyhow::Result<()> {
    let request = client.instance_state_put().body(state);

    // The response body is not needed; only success or failure matters.
    request.send().await.context("failed to set instance state")?;

    Ok(())
}

/// Forwards console input from `stdinrx` to `wstx`, interpreting the local
/// escape keys on the way:
///
/// - Ctrl-A (`0x01`) is not forwarded; it marks the next byte as "raw".
/// - Ctrl-A Ctrl-A forwards a single Ctrl-A.
/// - Ctrl-A Ctrl-C forwards a Ctrl-C (`0x03`).
/// - A Ctrl-C that is not preceded by Ctrl-A ends the task. Bytes received
///   before it in the same buffer are still forwarded; bytes after it are
///   dropped.
///
/// The task also ends when either channel is closed: when `stdinrx` reports
/// that all of its senders are gone, or when sending on `wstx` fails because
/// the receiver is gone. Returning drops `wstx`, which is how the receiving
/// side learns that no more input will arrive.
async fn stdin_to_websockets_task(
    mut stdinrx: tokio::sync::mpsc::Receiver<Vec<u8>>,
    wstx: tokio::sync::mpsc::Sender<Vec<u8>>,
) {
    // next_raw must live outside loop, because Ctrl-A should work across
    // multiple inbuf reads.
    let mut next_raw = false;

    // `recv` returns `None` once the channel is closed and drained. From then
    // on it would return `None` immediately on every call, so stop instead of
    // retrying in a busy loop.
    while let Some(inbuf) = stdinrx.recv().await {
        // Put bytes from inbuf to outbuf, but don't send Ctrl-A unless
        // next_raw is true.
        let mut outbuf = Vec::with_capacity(inbuf.len());

        let mut exit = false;
        for c in inbuf {
            match c {
                // Ctrl-A means send next one raw
                b'\x01' => {
                    if next_raw {
                        // Ctrl-A Ctrl-A should be sent as Ctrl-A
                        outbuf.push(c);
                        next_raw = false;
                    } else {
                        next_raw = true;
                    }
                }
                b'\x03' => {
                    if !next_raw {
                        // Exit on non-raw Ctrl-C
                        exit = true;
                        break;
                    } else {
                        // Otherwise send Ctrl-C
                        outbuf.push(c);
                        next_raw = false;
                    }
                }
                _ => {
                    outbuf.push(c);
                    next_raw = false;
                }
            }
        }

        // Send what we have, even if there's a Ctrl-C at the end. A failed
        // send means the receiver has been dropped, so nobody is left to
        // forward input to; stop quietly rather than panicking.
        if !outbuf.is_empty() && wstx.send(outbuf).await.is_err() {
            break;
        }

        if exit {
            break;
        }
    }
}

/// Runs an interactive serial console session against the server at `addr`.
///
/// The terminal is switched to raw mode for the duration of the session (the
/// guard restores it when dropped). Two background tasks are spawned: one
/// reads stdin and one filters that input through
/// `stdin_to_websockets_task`. This function then shuttles filtered input to
/// the websocket and console output to stdout until the input side shuts
/// down or the connection is closed or lost.
///
/// `byte_offset` selects how much console history to replay on connect; see
/// `serial_connect`.
///
/// # Errors
///
/// Fails if the connection cannot be established, if raw mode cannot be
/// enabled, if writing to the websocket or stdout fails, or if the server
/// closes the connection with a close code that indicates an error.
async fn serial(
    addr: SocketAddr,
    byte_offset: Option<i64>,
    log: Logger,
) -> anyhow::Result<()> {
    let mut ws_console = serial_connect(addr, byte_offset, log).await?;

    let _raw_guard = RawTermiosGuard::stdio_guard()
        .with_context(|| anyhow!("failed to set raw mode"))?;

    let mut stdout = tokio::io::stdout();

    // https://docs.rs/tokio/latest/tokio/io/trait.AsyncReadExt.html#method.read_exact
    // is not cancel safe! Meaning reads from tokio::io::stdin are not cancel
    // safe. Spawn a separate task to read and put bytes onto this channel.
    let (stdintx, stdinrx) = tokio::sync::mpsc::channel(16);
    let (wstx, mut wsrx) = tokio::sync::mpsc::channel(16);

    tokio::spawn(async move {
        let mut stdin = tokio::io::stdin();
        let mut inbuf = [0u8; 1024];

        loop {
            let n = match stdin.read(&mut inbuf).await {
                Err(_) | Ok(0) => break,
                Ok(n) => n,
            };

            // The receiver is dropped once the forwarding task has exited
            // (for example after Ctrl-C). Input read after that point has
            // nowhere to go, so stop reading instead of panicking.
            if stdintx.send(inbuf[0..n].to_vec()).await.is_err() {
                break;
            }
        }
    });

    tokio::spawn(async move { stdin_to_websockets_task(stdinrx, wstx).await });

    loop {
        tokio::select! {
            // Filtered keyboard input, to be sent to the guest.
            c = wsrx.recv() => {
                match c {
                    None => {
                        // channel is closed
                        break;
                    }
                    Some(c) => {
                        ws_console.send(Message::Binary(c)).await?;
                    },
                }
            }
            // Messages arriving from the server.
            msg = ws_console.recv() => {
                match msg {
                    Some(Ok(msg)) => {
                        match msg.process().await {
                            Ok(Message::Binary(input)) => {
                                stdout.write_all(&input).await?;
                                stdout.flush().await?;
                            }
                            Ok(Message::Close(Some(CloseFrame {code, reason}))) => {
                                // "\r\n" because the terminal is in raw mode.
                                eprint!("\r\nConnection closed: {code:?}\r\n");
                                match code {
                                    CloseCode::Abnormal
                                    | CloseCode::Error
                                    | CloseCode::Extension
                                    | CloseCode::Invalid
                                    | CloseCode::Policy
                                    | CloseCode::Protocol
                                    | CloseCode::Size
                                    | CloseCode::Unsupported => {
                                        anyhow::bail!("{reason}");
                                    }
                                    _ => break,
                                }
                            }
                            Ok(Message::Close(None)) => {
                                eprint!("\r\nConnection closed.\r\n");
                                break;
                            }
                            // note: migration events via Message::Text are
                            // already handled within ws_console.recv(), but
                            // would still be available to match here if we want
                            // to indicate that it happened to the user
                            _ => continue,
                        }
                    }
                    None => {
                        eprint!("\r\nConnection lost.\r\n");
                        break;
                    }
                    // `Some(Err(_))`: the error is ignored and the loop keeps
                    // polling.
                    _ => continue,
                }
            }
        }
    }

    Ok(())
}

/// Opens a serial console connection to the server at `addr`.
///
/// `byte_offset` selects the console history to start from:
/// - a non-negative value starts that many bytes from the start of the
///   output;
/// - a negative value starts that many bytes back from the most recent
///   output;
/// - `None` starts 16384 bytes (16 KiB) back from the most recent output.
///
/// # Errors
///
/// Returns an error if the connection cannot be established.
async fn serial_connect(
    addr: SocketAddr,
    byte_offset: Option<i64>,
    log: Logger,
) -> anyhow::Result<InstanceSerialConsoleHelper> {
    let offset = match byte_offset {
        None => WSClientOffset::MostRecent(16384),
        // `unsigned_abs` rather than `-x as u64`: negating `i64::MIN`
        // overflows, which panics in builds with overflow checks enabled.
        Some(x) if x < 0 => WSClientOffset::MostRecent(x.unsigned_abs()),
        Some(x) => WSClientOffset::FromStart(x.unsigned_abs()),
    };

    Ok(InstanceSerialConsoleHelper::new(addr, offset, Some(log)).await?)
}

async fn migrate_instance(
    src_client: Client,
    dst_client: Client,
    src_addr: SocketAddr,
    dst_uuid: Uuid,
    disks: Vec<DiskRequest>,
) -> anyhow::Result<()> {
    // Grab the instance details
    let InstanceSpecGetResponse { mut properties, .. } = src_client
        .instance_spec_get()
        .send()
        .await
        .with_context(|| anyhow!("failed to get src instance properties"))?
        .into_inner();
    let src_uuid = properties.id;
    properties.id = dst_uuid;

    let mut replace_components = HashMap::new();
    for disk in disks {
        let ParsedDiskRequest { backend_id, backend_spec, .. } =
            disk.parse()?;

        let old = replace_components.insert(
            backend_id.to_string(),
            ReplacementComponent::CrucibleStorageBackend(backend_spec),
        );

        if old.is_some() {
            anyhow::bail!(
                "duplicate backend name {backend_id} in replacement disk \
                list"
            );
        }
    }

    let request = InstanceEnsureRequest {
        properties,
        init: InstanceInitializationMethod::MigrationTarget {
            migration_id: Uuid::new_v4(),
            src_addr: src_addr.to_string(),
            replace_components,
        },
    };

    // Initiate the migration via the destination instance
    let migration_res =
        dst_client.instance_ensure().body(request).send().await?;
    let migration_id = migration_res
        .migrate
        .as_ref()
        .ok_or_else(|| anyhow!("no migrate id on response"))?
        .migration_id;

    // Wait for the migration to complete by polling both source and destination
    // TODO: replace with into_iter method call after edition upgrade
    let handles = IntoIterator::into_iter([
        ("src", src_client, src_uuid),
        ("dst", dst_client, dst_uuid),
    ])
    .map(|(role, client, id)| {
        tokio::spawn(async move {
            loop {
                let state =
                    client.instance_migrate_status().send().await?.into_inner();

                let migration = if role == "src" {
                    state.migration_out
                } else {
                    state.migration_in
                };

                // The destination should start reporting migration status as
                // soon as the ensure request completes. The source may not
                // have a migration status yet because the request from the
                // destination needs to arrive first.
                let Some(migration) = migration else {
                    if role == "dst" {
                        anyhow::bail!("dst instance's migration ID wasn't set");
                    } else {
                        println!("src hasn't received migration request yet");
                        tokio::time::sleep(Duration::from_secs(1)).await;
                        continue;
                    }
                };

                if migration.id != migration_id {
                    anyhow::bail!(
                        "{role} instance's migration ID is wrong: \
                                  got {}, expected {migration_id}",
                        migration.id
                    );
                }

                let state = migration.state;
                println!("{role}({id}) migration state={state:?}");
                if state == MigrationState::Finish {
                    return Ok::<_, anyhow::Error>(());
                } else if state == MigrationState::Error {
                    return Err(anyhow::anyhow!(
                        "{role} instance ran into error during migration"
                    ));
                }
                tokio::time::sleep(Duration::from_secs(1)).await;
            }
        })
    });

    future::join_all(handles)
        .await
        // Hoist out any JoinErrors
        .into_iter()
        .collect::<Result<Vec<_>, _>>()?
        // Then any errors from polling the source/destination
        .into_iter()
        .collect::<anyhow::Result<()>>()?;

    Ok(())
}

/// Prints the instance's state every time it changes, returning once the
/// instance reports `Destroyed`.
///
/// # Errors
///
/// Returns an error if the HTTP client cannot be constructed or if a state
/// monitor request fails.
async fn monitor(addr: SocketAddr) -> anyhow::Result<()> {
    // We use a custom client builder here because the default progenitor
    // one has a timeout of 15s but we want to be able to wait indefinitely.
    //
    // Building the client is fallible, so report a failure through this
    // function's `Result` rather than panicking.
    let http_client = reqwest::ClientBuilder::new()
        .build()
        .context("failed to build HTTP client")?;
    let client = propolis_client::Client::new_with_client(
        &format!("http://{addr}"),
        http_client,
    );

    // Named `generation` rather than `gen`, which is a reserved keyword as of
    // the 2024 edition.
    let mut generation = 0;
    loop {
        // State monitoring always returns the most recent state/gen pair
        // known to Propolis.
        let response = client
            .instance_state_monitor()
            .body(propolis_client::types::InstanceStateMonitorRequest {
                gen_: generation,
            })
            .send()
            .await
            .with_context(|| anyhow!("failed to get new instance state"))?;

        println!("InstanceState: {:?}", response.state);

        if response.state == propolis_client::types::InstanceState::Destroyed {
            return Ok(());
        }

        // Update the generation number we're asking for, to ensure the
        // Propolis will only return more recent values.
        generation = response.gen_ + 1;
    }
}

/// Sends an "issue NMI" request through `client`.
///
/// # Errors
///
/// Returns an error with the context "failed to inject NMI" if the request
/// fails.
async fn inject_nmi(client: &Client) -> anyhow::Result<()> {
    let outcome = client.instance_issue_nmi().send().await;

    // Only success or failure matters here; the response body is discarded.
    outcome.with_context(|| anyhow!("failed to inject NMI")).map(|_| ())
}

/// CLI entry point: parses the command line, builds a client for the target
/// server, and dispatches to the handler for the selected subcommand.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let opt = Opt::parse();
    let log = create_logger(&opt);

    let addr = SocketAddr::new(opt.server, opt.port);
    let base_url = format!("http://{addr}");
    let client = Client::new(&base_url);

    match opt.cmd {
        Command::New {
            name,
            uuid,
            config,
            silo_id,
            project_id,
            sled_id,
            sled_model,
            sled_revision,
            sled_serial,
        } => {
            // Any ID the user did not supply is replaced by a random one.
            let project_id = project_id
                .unwrap_or_else(TypedUuid::new_v4)
                .into_untyped_uuid();
            let silo_id =
                silo_id.unwrap_or_else(TypedUuid::new_v4).into_untyped_uuid();
            let sled_id =
                sled_id.unwrap_or_else(TypedUuid::new_v4).into_untyped_uuid();
            let metadata = InstanceMetadata {
                project_id,
                silo_id,
                sled_id,
                sled_model,
                sled_revision,
                sled_serial,
            };

            let instance_id = uuid.unwrap_or_else(Uuid::new_v4);
            let spec = config.instance_spec()?;
            new_instance(
                &client,
                name.to_string(),
                instance_id,
                spec,
                metadata,
            )
            .await?
        }
        Command::Get => get_instance(&client).await?,
        Command::State { state } => put_instance(&client, state).await?,
        Command::Serial { byte_offset } => {
            serial(addr, byte_offset, log).await?
        }
        Command::Migrate { dst_server, dst_port, dst_uuid, crucible_disks } => {
            let dst_addr = SocketAddr::new(dst_server, dst_port);
            let dst_client = Client::new(&format!("http://{dst_addr}"));
            let dst_uuid = dst_uuid.unwrap_or_else(Uuid::new_v4);
            // Without a disk file there are no backends to replace.
            let disks = match crucible_disks {
                Some(path) => parse_json_file(&path)?,
                None => Vec::new(),
            };
            migrate_instance(client, dst_client, addr, dst_uuid, disks).await?
        }
        Command::Monitor => monitor(addr).await?,
        Command::InjectNmi => inject_nmi(&client).await?,
        Command::Vcr { disk_id, vcr_replace } => {
            let replace: InstanceVcrReplace = parse_json_file(&vcr_replace)?;
            replace_vcr(&client, disk_id, replace).await?
        }
    }

    Ok(())
}

/// Guard object that will set the terminal to raw mode and restore it
/// to its previous state when it's dropped.
///
/// Field `0` is the file descriptor whose terminal attributes are managed;
/// field `1` holds the `termios` settings that the `Drop` impl writes back.
struct RawTermiosGuard(libc::c_int, libc::termios);

impl RawTermiosGuard {
    /// Switches the terminal behind stdout's file descriptor to raw mode.
    ///
    /// The attributes in effect beforehand are saved in the returned guard,
    /// which restores them when dropped.
    ///
    /// # Errors
    ///
    /// Returns the OS error if `tcgetattr` or `tcsetattr` fails.
    fn stdio_guard() -> Result<RawTermiosGuard, std::io::Error> {
        let fd = std::io::stdout().as_raw_fd();

        // Capture the current attributes so they can be restored later.
        let mut saved: libc::termios = unsafe { std::mem::zeroed() };
        if unsafe { libc::tcgetattr(fd, &mut saved) } == -1 {
            return Err(std::io::Error::last_os_error());
        }

        // Create the guard before changing anything, so the saved attributes
        // are written back even if switching to raw mode fails below.
        let guard = RawTermiosGuard(fd, saved);

        let mut raw = saved;
        unsafe { libc::cfmakeraw(&mut raw) };
        if unsafe { libc::tcsetattr(fd, libc::TCSAFLUSH, &raw) } == -1 {
            return Err(std::io::Error::last_os_error());
        }

        Ok(guard)
    }
}
impl Drop for RawTermiosGuard {
    /// Writes the saved terminal attributes back to the guarded descriptor.
    ///
    /// # Panics
    ///
    /// Panics if `tcsetattr` fails, unless the thread is already panicking.
    /// In that case the failure is printed to stderr instead, because a
    /// second panic during unwinding would abort the process.
    fn drop(&mut self) {
        // SAFETY: `self.1` is an initialized `termios` value that
        // `tcsetattr` only reads through the pointer.
        let r = unsafe { libc::tcsetattr(self.0, libc::TCSADRAIN, &self.1) };
        if r == -1 {
            let err = std::io::Error::last_os_error();
            if std::thread::panicking() {
                eprintln!("failed to restore terminal attributes: {err:?}");
            } else {
                panic!("{:?}", err);
            }
        }
    }
}

#[cfg(test)]
mod test {
    use super::stdin_to_websockets_task;

    /// Feeds byte sequences into `stdin_to_websockets_task` and checks what
    /// comes out the other side, covering the ctrl-a (0x01) and ctrl-c (0x03)
    /// cases spelled out in the comments below.
    #[tokio::test]
    async fn test_stdin_to_websockets_task() {
        use tokio::sync::mpsc::error::TryRecvError;

        let (stdintx, stdinrx) = tokio::sync::mpsc::channel(16);
        let (wstx, mut wsrx) = tokio::sync::mpsc::channel(16);

        tokio::spawn(
            async move { stdin_to_websockets_task(stdinrx, wstx).await },
        );

        // send characters, receive characters
        stdintx.send("test post please ignore".bytes().collect()).await.unwrap();
        let forwarded = wsrx.recv().await.unwrap();
        assert_eq!(
            String::from_utf8(forwarded).unwrap(),
            "test post please ignore"
        );

        // don't send ctrl-a
        stdintx.send("\x01".bytes().collect()).await.unwrap();
        assert_eq!(wsrx.try_recv(), Err(TryRecvError::Empty));

        // the "t" here is sent "raw" because of last ctrl-a but that doesn't
        // change anything
        stdintx.send("test".bytes().collect()).await.unwrap();
        let forwarded = wsrx.recv().await.unwrap();
        assert_eq!(String::from_utf8(forwarded).unwrap(), "test");

        // ctrl-a ctrl-c = only ctrl-c sent
        stdintx.send("\x01\x03".bytes().collect()).await.unwrap();
        let forwarded = wsrx.recv().await.unwrap();
        assert_eq!(String::from_utf8(forwarded).unwrap(), "\x03");

        // same as above, across two messages
        stdintx.send("\x01".bytes().collect()).await.unwrap();
        stdintx.send("\x03".bytes().collect()).await.unwrap();
        assert_eq!(wsrx.try_recv(), Err(TryRecvError::Empty));
        let forwarded = wsrx.recv().await.unwrap();
        assert_eq!(String::from_utf8(forwarded).unwrap(), "\x03");

        // ctrl-a ctrl-a = only ctrl-a sent
        stdintx.send("\x01\x01".bytes().collect()).await.unwrap();
        let forwarded = wsrx.recv().await.unwrap();
        assert_eq!(String::from_utf8(forwarded).unwrap(), "\x01");

        // ctrl-c on its own means exit
        stdintx.send("\x03".bytes().collect()).await.unwrap();
        assert_eq!(wsrx.try_recv(), Err(TryRecvError::Empty));

        // channel is closed
        assert!(wsrx.recv().await.is_none());
    }
}


================================================
FILE: bin/propolis-server/Cargo.toml
================================================
[package]
name = "propolis-server"
version = "0.1.0"
license = "MPL-2.0"
edition = "2021"

[lib]
name = "propolis_server"
path = "src/lib/lib.rs"
doctest = false

[[bin]]
name = "propolis-server"
path = "src/main.rs"
doc = false
doctest = false
test = false

[dependencies]
atty.workspace = true
anyhow.workspace = true
async-trait.workspace = true
bit_field.workspace = true
bitvec.workspace = true
bytes.workspace = true
chrono = { workspace = true, features = [ "serde" ] }
clap = { workspace = true, features = ["derive"] }
const_format.workspace = true
cpuid_utils = { workspace = true, features = ["instance-spec"] }
crucible-client-types.workspace = true
dropshot = { workspace = true, features = ["usdt-probes"] }
erased-serde.workspace = true
futures.workspace = true
hyper.workspace = true
internal-dns-resolver.workspace = true
internal-dns-types.workspace = true
itertools.workspace = true
kstat-rs.workspace = true
lazy_static.workspace = true
nexus-client.workspace = true
omicron-common.workspace = true
oxide-tokio-rt.workspace = true
oximeter-instruments.workspace = true
oximeter-producer.workspace = true
oximeter.workspace = true
pbind.workspace = true
ron.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["full"] }
tokio-tungstenite.workspace = true
tokio-util = { workspace = true, features = ["codec"] }
toml.workspace = true
semver.workspace = true
serde.workspace = true
serde_derive.workspace = true
serde_json.workspace = true
slog.workspace = true
slog-async.workspace = true
slog-bunyan.workspace = true
slog-dtrace.workspace = true
slog-term.workspace = true
strum = { workspace = true, features = ["derive"] }
propolis = { workspace = true, features = ["crucible-full", "oximeter"] }
propolis_api_types = { workspace = true }
propolis-api-types-versions.workspace = true
propolis-server-api.workspace = true
propolis_types.workspace = true
rgb_frame.workspace = true
rfb = { workspace = true, features = ["tungstenite"] }
uuid.workspace = true
usdt.workspace = true
vm-attest.workspace = true
base64.workspace = true
schemars = { workspace = true, features = ["chrono", "uuid1"] }

[dev-dependencies]
hex.workspace = true
reqwest = { workspace = true, features = ["rustls"] }
ring.workspace = true
slog = { workspace = true, features = [ "max_level_trace", "release_max_level_debug" ] }
expectorate.workspace = true
mockall.workspace = true
proptest.workspace = true

[features]
default = []

# When building to be packaged for inclusion in the production ramdisk
# (nominally an Omicron package), certain code is compiled in or out.
omicron-build = ["propolis/omicron-build"]

# Falcon builds require corresponding bits turned on in the dependency libs
falcon = ["propolis/falcon"]
# Testing necessitates injecting failures which should hopefully be rare or even
# never occur on real otherwise-unperturbed systems. We conditionally compile
# code supporting failure injection to avoid the risk of somehow injecting
# failures into a real system not under test.
failure-injection = []


================================================
FILE: bin/propolis-server/README.md
================================================
# Propolis Server

This binary provides a REST API to create and manage a Propolis VM. It typically
runs in the context of a complete Oxide control plane deployment, but it can
also be run as a freestanding binary for ad hoc testing of Propolis VMs.

## Running

The server requires a path to a [guest bootrom
image](../propolis-standalone#guest-bootrom) on the local filesystem. It also
must be run with privileges sufficient to create bhyve virtual machines. The
`pfexec(1)` utility can help enable these privileges.

To build and run the server:

```bash
cargo build --bin propolis-server
pfexec target/debug/propolis-server <path_to_bootrom> <ip:port> <vnc_ip:port>
```

The API will be served on `ip:port`. The easiest way to interact with the server
is to use [`propolis-cli`](../propolis-cli), but you can also use tools like
cURL to interact with the API directly. The server's OpenAPI specification is
[checked into the repo](../../openapi/propolis-server).


================================================
FILE: bin/propolis-server/src/lib/config.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Describes a server config which may be parsed from a TOML file.

/// Decides whether guest memory should come from the memory reservoir.
///
/// Returns `true` when the reservoir's allocated plus free size, in MiB, is
/// strictly greater than an arbitrary 512 MiB threshold. Returns `false`
/// (after logging the error) if the reservoir cannot be queried.
#[cfg(not(feature = "omicron-build"))]
pub fn reservoir_decide(log: &slog::Logger) -> bool {
    // Automatically enable use of the memory reservoir (rather than transient
    // allocations) for guest memory if it meets some arbitrary size threshold.
    const RESERVOIR_THRESH_MB: usize = 512;
    const BYTES_PER_MIB: usize = 1024 * 1024;

    let size = match propolis::vmm::query_reservoir() {
        Ok(size) => size,
        Err(e) => {
            slog::error!(log, "could not query reservoir {:?}", e);
            return false;
        }
    };

    let size_in_play = (size.vrq_alloc_sz + size.vrq_free_sz) / BYTES_PER_MIB;
    let use_reservoir = size_in_play > RESERVOIR_THRESH_MB;
    if use_reservoir {
        slog::info!(
            log,
            "allocating from reservoir ({}MiB) for guest memory",
            size_in_play
        );
    } else {
        slog::info!(
            log,
            "reservoir too small ({}MiB) to use for guest memory",
            size_in_play
        );
    }
    use_reservoir
}

/// Variant of `reservoir_decide` compiled when the `omicron-build` feature is
/// enabled: it never queries the reservoir and unconditionally returns `true`.
/// The logger argument is unused.
#[cfg(feature = "omicron-build")]
pub fn reservoir_decide(_log: &slog::Logger) -> bool {
    // Always use the reservoir in production
    true
}


================================================
FILE: bin/propolis-server/src/lib/initializer.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::convert::TryInto;
use std::fs::File;
use std::num::{NonZeroU8, NonZeroUsize};
use std::os::unix::fs::FileTypeExt;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};

use crate::serial::Serial;
use crate::spec::{self, Spec, StorageBackend, StorageDevice};
use crate::stats::{
    track_network_interface_kstats, track_vcpu_kstats, BlockMetrics,
    VirtualDisk, VirtualMachine,
};
use crate::vm::{
    BlockBackendMap, CrucibleBackendMap, DeviceMap, NetworkInterfaceIds,
};
use anyhow::Context;
use cpuid_utils::CpuidValues;
use crucible_client_types::VolumeConstructionRequest;
pub use nexus_client::Client as NexusClient;
use oximeter::types::ProducerRegistry;
use oximeter_instruments::kstat::KstatSampler;
use propolis::attestation;
use propolis::attestation::server::AttestationServerConfig;
use propolis::attestation::server::AttestationSock;
use propolis::block;
use propolis::chardev::{self, BlockingSource, Source};
use propolis::common::{Lifecycle, GB, MB, PAGE_SIZE};
use propolis::cpuid::TopoKind;
use propolis::enlightenment::Enlightenment;
use propolis::firmware::smbios;
use propolis::hw::bhyve::BhyveHpet;
use propolis::hw::chipset::{i440fx, Chipset};
use propolis::hw::ibmpc;
use propolis::hw::pci;
use propolis::hw::pci::topology::PciTopologyError;
use propolis::hw::ps2::ctrl::PS2Ctrl;
use propolis::hw::qemu::pvpanic::QemuPvpanic;
use propolis::hw::qemu::{
    debug::QemuDebugPort,
    fwcfg::{self, Entry},
    ramfb,
};
use propolis::hw::uart::LpcUart;
use propolis::hw::{nvme, virtio};
use propolis::intr_pins;
use propolis::vmm::{self, Builder, Machine};
use propolis::vsock::GuestCid;
use propolis_api_types::instance::InstanceProperties;
use propolis_api_types::instance_spec::components::devices::SerialPortNumber;
use propolis_api_types::instance_spec::{self, SpecKey};
use propolis_types::{CpuidIdent, CpuidVendor};
use slog::info;
use strum::IntoEnumIterator;
use thiserror::Error;

// XXX: completely arbitrary for now
// NOTE(review): presumably the upper bound on, and the default for, a storage
// backend's worker count -- confirm at the use sites.
const MAX_FILE_WORKERS: usize = 32;
const DEFAULT_WORKER_COUNT: usize = 8;

/// An error that can arise while initializing a new machine.
#[derive(Debug, Error)]
pub enum MachineInitError {
    /// Catch-all for `anyhow` errors.
    ///
    /// The machine initializer calls many bhyve functions that return a
    /// [`std::io::Error`]. Instead of forcing each such call site to define its
    /// own error type, this type allows callers to attach an
    /// [`anyhow::Context`] and convert it to this error variant without losing
    /// information about the interior I/O error.
    #[error(transparent)]
    GenericError(#[from] anyhow::Error),

    /// The bootrom file's length is not a multiple of the required alignment.
    /// Both values are rendered in hexadecimal in the message.
    #[error("bootrom {path:?} length {length:x} not aligned to {align:x}")]
    BootromNotAligned { path: String, length: u64, align: u64 },

    /// The number of bytes read from the bootrom (`nread`) differs from the
    /// expected ROM length (`rom_len`).
    #[error(
        "bootrom read truncated: expected {rom_len} bytes, read {nread} bytes"
    )]
    BootromReadTruncated { rom_len: usize, nread: usize },

    /// A [`PciTopologyError`], reported transparently.
    #[error(transparent)]
    PciTopologyError(#[from] PciTopologyError),

    /// A volume construction request could not be deserialized; wraps the
    /// underlying `serde_json` error.
    #[error("failed to deserialize volume construction request")]
    VcrDeserializationFailed(#[from] serde_json::Error),

    /// The contents of an in-memory storage backend could not be decoded;
    /// wraps the underlying base64 decode error.
    #[error("failed to decode in-memory storage backend contents")]
    InMemoryBackendDecodeFailed(#[from] base64::DecodeError),

    /// More than one Crucible disk uses the contained backend ID.
    #[error("multiple Crucible disks with backend ID {0}")]
    DuplicateCrucibleBackendId(SpecKey),

    /// The contained boot order entry does not refer to an attached disk.
    #[error("boot order entry {0:?} does not refer to an attached disk")]
    BootOrderEntryWithoutDevice(SpecKey),

    /// Disk device `device_id` names a block backend (`backend_id`) that does
    /// not exist.
    #[error(
        "disk device {device_id:?} refers to a \
         non-existent block backend {backend_id:?}"
    )]
    DeviceWithoutBlockBackend { device_id: SpecKey, backend_id: SpecKey },

    /// The boot entry (first field) refers to a device on a PCI bus other
    /// than bus 0; the second field is that bus number.
    #[error("boot entry {0:?} refers to a device on non-zero PCI bus {1}")]
    BootDeviceOnDownstreamPciBus(SpecKey, u8),

    /// Inserting the named fwcfg entry failed; the second field is the
    /// underlying insert error.
    #[error("failed to insert {0} fwcfg entry")]
    FwcfgInsertFailed(&'static str, #[source] fwcfg::InsertError),

    /// CPUID specialization failed for the vCPU identified by the first
    /// field; the second field is the underlying error.
    #[error("failed to specialize CPUID for vcpu {0}")]
    CpuidSpecializationFailed(i32, #[source] propolis::cpuid::SpecializeError),

    /// The attestation server could not be started; wraps the I/O error.
    #[error("failed to start attestation server")]
    AttestationServer(#[source] std::io::Error),

    /// The softnpu p9 device is missing. Only present when the `falcon`
    /// feature is enabled.
    #[cfg(feature = "falcon")]
    #[error("softnpu p9 device missing")]
    SoftNpuP9Missing,
}

/// Arbitrary ROM limit for now: `0x20_0000` bytes (2 MiB).
///
/// `build_instance` uses this as the size of the "bootrom" region, which it
/// places so that it ends at guest-physical address `0x1_0000_0000`.
const MAX_ROM_SIZE: usize = 0x20_0000;

/// Splits the guest memory size requested by `spec` into a
/// `(lowmem, highmem)` pair of byte counts.
///
/// The first `3 * GB` bytes of guest memory count as low memory; anything
/// beyond that is high memory (zero if the guest has no more than `3 * GB`).
fn get_spec_guest_ram_limits(spec: &Spec) -> (usize, usize) {
    let lowmem_cap = 3 * GB;
    let total_bytes = spec.board.memory_mb as usize * MB;

    if total_bytes > lowmem_cap {
        (lowmem_cap, total_bytes - lowmem_cap)
    } else {
        (total_bytes, 0)
    }
}

/// Creates the kernel VMM for an instance named `name` and lays out its
/// guest-physical address space according to `spec`.
///
/// Regions are added in this order:
/// - "lowmem": guest RAM starting at address 0;
/// - "bootrom": `MAX_ROM_SIZE` bytes ending at `0x1_0000_0000`;
/// - "dev32": MMIO, `0x2000_0000` bytes at `0xc000_0000`;
/// - "pcicfg": MMIO, `0x1000_0000` bytes at `0xe000_0000`;
/// - "highmem": guest RAM at `0x1_0000_0000`, only if the spec has any;
/// - "dev64": MMIO from the end of high memory up to `vmm::MAX_PHYSMEM`.
///
/// # Errors
///
/// Any failure from the builder is returned as a
/// [`MachineInitError::GenericError`] carrying a description of the step that
/// failed.
pub fn build_instance(
    name: &str,
    spec: &Spec,
    use_reservoir: bool,
    guest_hv_interface: Arc<dyn Enlightenment>,
    _log: slog::Logger,
) -> Result<Machine, MachineInitError> {
    // Address at which the bootrom region ends and high memory begins.
    const HIGHMEM_START: usize = 0x1_0000_0000;

    let (lowmem, highmem) = get_spec_guest_ram_limits(spec);

    let builder = Builder::new(
        name,
        propolis::vmm::CreateOpts {
            force: true,
            use_reservoir,
            track_dirty: true,
        },
    )
    .context("failed to create kernel vmm builder")?;
    let builder = builder
        .max_cpus(spec.board.cpus)
        .context("failed to set max cpus")?
        .guest_hypervisor_interface(guest_hv_interface);

    // Everything that lives below 4 GiB.
    let builder = builder
        .add_mem_region(0, lowmem, "lowmem")
        .context("failed to add low memory region")?;
    let builder = builder
        .add_rom_region(HIGHMEM_START - MAX_ROM_SIZE, MAX_ROM_SIZE, "bootrom")
        .context("failed to add bootrom region")?;
    let builder = builder
        .add_mmio_region(0xc000_0000_usize, 0x2000_0000_usize, "dev32")
        .context("failed to add low device MMIO region")?;
    let mut builder = builder
        .add_mmio_region(0xe000_0000_usize, 0x1000_0000_usize, "pcicfg")
        .context("failed to add PCI config region")?;

    // High memory exists only when the guest has more RAM than fits in the
    // low region.
    if highmem > 0 {
        builder = builder
            .add_mem_region(HIGHMEM_START, highmem, "highmem")
            .context("failed to add high memory region")?;
    }

    // The rest of the physical address space is 64-bit device MMIO.
    let dev64_start = HIGHMEM_START + highmem;
    let dev64_len = vmm::MAX_PHYSMEM - dev64_start;
    let builder = builder
        .add_mmio_region(dev64_start, dev64_len, "dev64")
        .context("failed to add high device MMIO region")?;

    let machine =
        builder.finalize().context("failed to finalize kernel vmm")?;
    Ok(machine)
}

/// A chipset paired with the PIIX3 LPC bridge that supplies its interrupt
/// routing and IRQ pins.
pub struct RegisteredChipset {
    chipset: Arc<dyn Chipset>,
    isa: Arc<i440fx::Piix3Lpc>,
}

impl RegisteredChipset {
    /// Attaches the PCI endpoint `dev` to the chipset at `bdf`, passing along
    /// the interrupt route that the LPC bridge returns for that location.
    pub fn pci_attach(&self, bdf: pci::Bdf, dev: Arc<dyn pci::Endpoint>) {
        let lintr = self.isa.route_lintr(bdf);
        self.chipset.pci_attach(bdf, dev, lintr);
    }

    /// Returns the LPC bridge's pin for `irq`, or `None` if it has none.
    pub fn irq_pin(&self, irq: u8) -> Option<Box<dyn intr_pins::IntrPin>> {
        let Self { isa, .. } = self;
        isa.irq_pin(irq)
    }

    /// Returns the chipset's reset pin.
    fn reset_pin(&self) -> Arc<dyn intr_pins::IntrPin> {
        let Self { chipset, .. } = self;
        chipset.reset_pin()
    }
}

/// A block backend, optionally paired with a Crucible-typed handle.
///
/// NOTE(review): presumably `crucible` is `Some` only when `be` is itself a
/// Crucible backend -- confirm where this struct is constructed.
struct StorageBackendInstance {
    /// The backend as a generic block backend trait object.
    be: Arc<dyn block::Backend>,
    /// The backend as a concrete Crucible backend, when there is one.
    crucible: Option<Arc<block::CrucibleBackend>>,
}

/// Values recorded by machine initialization steps.
#[derive(Default)]
pub struct MachineInitializerState {
    /// Length in bytes of the bootrom; `None` by default and set by
    /// `MachineInitializer::initialize_rom` once the ROM has been loaded.
    rom_size_bytes: Option<usize>,
}

/// Bundles the machine, its spec and properties, and the device/backend maps
/// operated on by the `initialize_*` methods of this type.
pub struct MachineInitializer<'a> {
    pub(crate) log: slog::Logger,
    /// The machine being initialized.
    pub(crate) machine: &'a Machine,
    pub(crate) devices: DeviceMap,
    pub(crate) block_backends: BlockBackendMap,
    pub(crate) crucible_backends: CrucibleBackendMap,
    /// The instance spec that initialization steps read from.
    pub(crate) spec: &'a Spec,
    pub(crate) properties: &'a InstanceProperties,
    pub(crate) producer_registry: Option<ProducerRegistry>,
    /// Values recorded by initialization steps (e.g. the bootrom size).
    pub(crate) state: MachineInitializerState,
    pub(crate) kstat_sampler: Option<KstatSampler>,
    pub(crate) stats_vm: crate::stats::VirtualMachine,
}

impl MachineInitializer<'_> {
    /// Loads the bootrom image at `path` into the guest's "bootrom" memory
    /// region and records its size in the initializer state.
    ///
    /// The image is copied to the tail of the region, so that the last byte
    /// of the image occupies the last byte of the region.
    ///
    /// # Errors
    ///
    /// Fails if the image cannot be opened or inspected, if its length is not
    /// a multiple of `PAGE_SIZE`, if the "bootrom" region cannot be mapped or
    /// is smaller than the image, or if the image cannot be read in full.
    pub fn initialize_rom(
        &mut self,
        path: &std::path::Path,
    ) -> Result<(), MachineInitError> {
        /// Opens the bootrom image and returns it together with its length in
        /// bytes, rejecting images that are not a whole number of pages.
        fn open_bootrom(
            path: &std::path::Path,
        ) -> Result<(File, usize), MachineInitError> {
            let fp = File::open(path)
                .with_context(|| format!("failed to open bootrom {path:?}"))?;
            let len = fp
                .metadata()
                .with_context(|| {
                    format!("failed to query metadata for bootrom {path:?}")
                })?
                .len();
            if len % (PAGE_SIZE as u64) != 0 {
                return Err(MachineInitError::BootromNotAligned {
                    path: path.to_string_lossy().to_string(),
                    length: len,
                    align: PAGE_SIZE as u64,
                });
            }
            Ok((fp, len as usize))
        }

        // A missing or malformed bootrom is a configuration problem, not a
        // broken invariant: report it to the caller instead of panicking.
        let (romfp, rom_len) = open_bootrom(path)?;

        let mem = self.machine.acc_mem.access().unwrap();
        let mapping = mem
            .direct_writable_region_by_name("bootrom")
            .context("failed to map guest bootrom region")?;

        // The image is placed at the end of the region. Guard the
        // subtraction so an oversized image yields an error rather than an
        // arithmetic underflow.
        let region_len = mapping.len();
        let offset = region_len.checked_sub(rom_len).with_context(|| {
            format!(
                "bootrom {path:?} is {rom_len} bytes, which exceeds the \
                 {region_len}-byte guest bootrom region"
            )
        })?;
        let submapping = mapping
            .subregion(offset, rom_len)
            .expect("bootrom subregion lies within the bootrom mapping");

        let nread =
            submapping.pread(&romfp, rom_len, 0).with_context(|| {
                format!(
                    "failed to read bootrom {path:?} into guest memory mapping"
                )
            })?;
        if nread != rom_len {
            return Err(MachineInitError::BootromReadTruncated {
                rom_len,
                nread,
            });
        }

        self.state.rom_size_bytes = Some(rom_len);
        Ok(())
    }

    /// Seeds the chipset's real-time clock: writes the guest's low and high
    /// memory sizes (taken from the spec) to its NVRAM and sets its time to
    /// the host's current time.
    ///
    /// # Panics
    ///
    /// Panics if the host's system time is earlier than the UNIX epoch.
    pub fn initialize_rtc(
        &self,
        chipset: &RegisteredChipset,
    ) -> Result<(), MachineInitError> {
        let (lowmem, highmem) = get_spec_guest_ram_limits(self.spec);
        let rtc = chipset.isa.rtc.as_ref();

        rtc.memsize_to_nvram(lowmem as u32, highmem as u64)
            .context("failed to write guest memory size to RTC NVRAM")?;

        let since_epoch = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("system time precedes UNIX epoch");
        rtc.set_time(since_epoch)
            .context("failed to set guest real-time clock")?;

        Ok(())
    }

    /// Creates the bhyve HPET device and records it in the device map under
    /// its type name.
    pub fn initialize_hpet(&mut self) {
        let hpet = BhyveHpet::create(self.machine.hdl.clone());
        let key = SpecKey::Name(hpet.type_name().into());
        self.devices.insert(key, hpet);
    }

    /// Builds the PCI topology described by the spec, then creates the
    /// chipset devices (host bridge, LPC bridge, and power management
    /// device), attaches them, and records them in the device map.
    ///
    /// The chipset's power and reset pins are wired to `event_handler`
    /// through weak references, so a pin firing after the handler has been
    /// dropped is a no-op.
    ///
    /// # Errors
    ///
    /// Fails if a bridge cannot be added to the topology or the topology
    /// cannot be finished.
    pub fn initialize_chipset(
        &mut self,
        event_handler: &Arc<dyn super::vm::guest_event::ChipsetEventHandler>,
    ) -> Result<RegisteredChipset, MachineInitError> {
        // Describe every PCI-PCI bridge in the spec to the topology builder.
        let mut topology_builder = pci::topology::Builder::new();
        for bridge in self.spec.pci_pci_bridges.values() {
            topology_builder.add_bridge(
                pci::topology::BridgeDescription::new(
                    pci::topology::LogicalBusId(bridge.downstream_bus),
                    bridge.pci_path.into(),
                ),
            )?;
        }
        let pci::topology::FinishedTopology { topology: pci_topology, bridges } =
            topology_builder.finish(self.machine)?;

        match self.spec.board.chipset {
            instance_spec::components::board::Chipset::I440Fx(chipset_spec) => {
                // Rising edges on the power and reset pins are forwarded to
                // the event handler, if it still exists.
                let halt_handler = Arc::downgrade(event_handler);
                let reset_handler = Arc::downgrade(event_handler);
                let power_pin = Arc::new(propolis::intr_pins::FuncPin::new(
                    Box::new(move |rising: bool| {
                        if !rising {
                            return;
                        }
                        if let Some(handler) = halt_handler.upgrade() {
                            handler.chipset_halt();
                        }
                    }),
                ));
                let reset_pin = Arc::new(propolis::intr_pins::FuncPin::new(
                    Box::new(move |rising: bool| {
                        if !rising {
                            return;
                        }
                        if let Some(handler) = reset_handler.upgrade() {
                            handler.chipset_reset();
                        }
                    }),
                ));

                let hb_opts = i440fx::Opts {
                    power_pin: Some(power_pin),
                    reset_pin: Some(reset_pin),
                    enable_pcie: chipset_spec.enable_pcie,
                };
                let chipset_hb =
                    i440fx::I440FxHostBridge::create(pci_topology, hb_opts);
                let chipset_lpc =
                    i440fx::Piix3Lpc::create(self.machine.hdl.clone());

                let pm_log = self.log.new(slog::o!("device" => "piix3pm"));
                let chipset_pm = i440fx::Piix3PM::create(
                    self.machine.hdl.clone(),
                    chipset_hb.power_pin(),
                    pm_log,
                );

                // Attach an endpoint through the host bridge, with legacy
                // interrupts routed by the LPC bridge.
                let attach_endpoint = |bdf, dev: Arc<dyn pci::Endpoint>| {
                    let lintr = chipset_lpc.route_lintr(bdf);
                    chipset_hb.pci_attach(bdf, dev, lintr);
                };

                // Attach chipset devices to PCI and buses
                attach_endpoint(i440fx::DEFAULT_HB_BDF, chipset_hb.clone());
                chipset_hb.attach(self.machine);

                attach_endpoint(i440fx::DEFAULT_LPC_BDF, chipset_lpc.clone());
                chipset_lpc.attach(&self.machine.bus_pio);

                attach_endpoint(i440fx::DEFAULT_PM_BDF, chipset_pm.clone());
                chipset_pm.attach(&self.machine.bus_pio);

                let hb_key = SpecKey::Name(chipset_hb.type_name().into());
                self.devices.insert(hb_key, chipset_hb.clone());

                let lpc_key = SpecKey::Name(chipset_lpc.type_name().into());
                self.devices.insert(lpc_key, chipset_lpc.clone());

                let pm_key = SpecKey::Name(chipset_pm.type_name().into());
                self.devices.insert(pm_key, chipset_pm);

                // Record attachment for any bridges in PCI topology too
                for (bdf, bridge) in bridges {
                    let (bridge_id, _) = self
                        .spec
                        .pci_pci_bridges
                        .iter()
                        .find(|(_, spec_bridge)| {
                            bdf == spec_bridge.pci_path.into()
                        })
                        .expect("all PCI bridges are in the topology");

                    self.devices.insert(bridge_id.clone(), bridge);
                }

                Ok(RegisteredChipset { chipset: chipset_hb, isa: chipset_lpc })
            }
        }
    }

    /// Creates an LPC UART for every UART-type serial port in the spec,
    /// attaches each to the port I/O bus, and records it in the device map.
    ///
    /// Returns a `Serial` wrapping the COM1 UART.
    ///
    /// # Panics
    ///
    /// Panics if the spec has no COM1 UART or more than one, or if the
    /// chipset has no pin for one of the COM IRQs.
    pub fn initialize_uart(
        &mut self,
        chipset: &RegisteredChipset,
    ) -> Serial<LpcUart> {
        let mut com1 = None;

        let uart_ports = self
            .spec
            .serial
            .iter()
            .filter(|(_, desc)| desc.device == spec::SerialPortDevice::Uart);

        for (name, desc) in uart_ports {
            let (irq, port) = match desc.num {
                SerialPortNumber::Com1 => (ibmpc::IRQ_COM1, ibmpc::PORT_COM1),
                SerialPortNumber::Com2 => (ibmpc::IRQ_COM2, ibmpc::PORT_COM2),
                SerialPortNumber::Com3 => (ibmpc::IRQ_COM3, ibmpc::PORT_COM3),
                SerialPortNumber::Com4 => (ibmpc::IRQ_COM4, ibmpc::PORT_COM4),
            };

            let uart = LpcUart::new(chipset.irq_pin(irq).unwrap());
            uart.set_autodiscard(true);
            LpcUart::attach(&uart, &self.machine.bus_pio, port);
            self.devices.insert(name.to_owned(), uart.clone());

            if desc.num == SerialPortNumber::Com1 {
                assert!(com1.is_none());
                com1 = Some(uart);
            }
        }

        Serial::new(
            com1.unwrap(),
            NonZeroUsize::new(64).unwrap(),
            NonZeroUsize::new(1024).unwrap(),
        )
    }

    /// Creates the PS/2 controller, attaches it to the port I/O bus with its
    /// two IRQ pins and the chipset's reset pin, and records it in the device
    /// map under its type name.
    ///
    /// # Panics
    ///
    /// Panics if the chipset has no pin for either PS/2 IRQ.
    pub fn initialize_ps2(
        &mut self,
        chipset: &RegisteredChipset,
    ) -> Arc<PS2Ctrl> {
        let ps2_ctrl = PS2Ctrl::create();

        let pri_pin = chipset.irq_pin(ibmpc::IRQ_PS2_PRI).unwrap();
        let aux_pin = chipset.irq_pin(ibmpc::IRQ_PS2_AUX).unwrap();
        ps2_ctrl.attach(
            &self.machine.bus_pio,
            pri_pin,
            aux_pin,
            chipset.reset_pin(),
        );

        let key = SpecKey::Name(ps2_ctrl.type_name().into());
        self.devices.insert(key, ps2_ctrl.clone());

        ps2_ctrl
    }

    /// Creates the QEMU debug port on the port I/O bus and directs its output
    /// to a freshly created `debug.out` file (relative to the process's
    /// working directory), then records the port in the device map.
    ///
    /// # Errors
    ///
    /// Fails if `debug.out` cannot be created.
    pub fn initialize_qemu_debug_port(
        &mut self,
    ) -> Result<(), MachineInitError> {
        let dbg = QemuDebugPort::create(&self.machine.bus_pio);

        let logfile = std::fs::File::create("debug.out")
            .context("failed to create firmware debug port logfile")?;
        let poller = chardev::BlockingFileOutput::new(logfile);

        let source: Arc<dyn BlockingSource> = dbg.clone();
        poller.attach(source);

        let key = SpecKey::Name(dbg.type_name().into());
        self.devices.insert(key, dbg);

        Ok(())
    }

    /// Creates the QEMU pvpanic device if the spec asks for one with its ISA
    /// interface enabled; otherwise does nothing.
    ///
    /// When a producer registry is configured, a pvpanic metrics producer for
    /// `virtual_machine` is registered as well.
    ///
    /// # Errors
    ///
    /// Fails if the metrics producer cannot be registered.
    pub fn initialize_qemu_pvpanic(
        &mut self,
        virtual_machine: VirtualMachine,
    ) -> Result<(), MachineInitError> {
        let Some(pvpanic) = &self.spec.pvpanic else {
            return Ok(());
        };
        if !pvpanic.spec.enable_isa {
            return Ok(());
        }

        let device_log = self.log.new(slog::o!("dev" => "qemu-pvpanic"));
        let device = QemuPvpanic::create(device_log);
        device.attach_pio(&self.machine.bus_pio);
        self.devices.insert(pvpanic.id.clone(), device.clone());

        if let Some(ref registry) = self.producer_registry {
            let producer =
                crate::stats::PvpanicProducer::new(virtual_machine, device);
            registry
                .register_producer(producer)
                .context("failed to register PVPANIC Oximeter producer")?;
        }

        Ok(())
    }

    /// Creates the virtio socket device described by the spec, if any, and
    /// attaches it to the chipset.
    ///
    /// When a vsock device exists and `attest_cfg` is provided, an
    /// `AttestationSock` is also created and returned. In every other
    /// successful case the result is `Ok(None)`.
    ///
    /// # Errors
    ///
    /// Fails if the guest CID in the spec is invalid or the attestation
    /// socket cannot be created.
    pub async fn initialize_vsock(
        &mut self,
        chipset: &RegisteredChipset,
        attest_cfg: Option<AttestationServerConfig>,
    ) -> Result<Option<AttestationSock>, MachineInitError> {
        use propolis::vsock::proxy::VsockPortMapping;

        let Some(vsock) = &self.spec.vsock else {
            info!(self.log, "no vsock device in instance spec");
            return Ok(None);
        };

        let bdf: pci::Bdf = vsock.spec.pci_path.into();

        let mappings = vec![VsockPortMapping::new(
            attestation::ATTESTATION_PORT.into(),
            attestation::ATTESTATION_ADDR,
        )];

        let guest_cid = GuestCid::try_from(vsock.spec.guest_cid)
            .context("could not parse guest cid")?;

        // While the spec does not recommend how large the virtio descriptor
        // table should be, we sized this appropriately in testing, so
        // that the guest is able to move vsock packets at a reasonable
        // throughput without the need to be much larger.
        let descriptor_table_size = 256;

        let device_log = self.log.new(slog::o!("dev" => "virtio-socket"));
        let device = virtio::PciVirtioSock::new(
            descriptor_table_size,
            guest_cid,
            device_log,
            mappings,
        );

        self.devices.insert(vsock.id.clone(), device.clone());
        chipset.pci_attach(bdf, device);

        // Without an attestation config there is nothing more to set up.
        let Some(cfg) = attest_cfg else {
            return Ok(None);
        };

        // Spawn attestation server that will go over the vsock device
        let attest_log =
            self.log.new(slog::o!("component" => "attestation-server"));
        let attest = AttestationSock::new(attest_log, cfg.sled_agent_addr)
            .await
            .map_err(MachineInitError::AttestationServer)?;

        Ok(Some(attest))
    }

    async fn create_storage_backend_from_spec(
        &mut self,
        backend_spec: &StorageBackend,
        backend_id: &SpecKey,
        nexus_client: &Option<NexusClient>,
        wanted_heap: &mut usize,
    ) -> Result<StorageBackendInstance, MachineInitError> {
        match backend_spec {
            StorageBackend::Crucible(spec) => {
                info!(self.log, "Creating Crucible disk";
                      "backend_id" => %backend_id);

                let vcr: VolumeConstructionRequest =
                    serde_json::from_str(&spec.request_json)
                        .map_err(MachineInitError::VcrDeserializationFailed)?;

                let cru_id = match vcr {
                    VolumeConstructionRequest::Volume { id, .. } => {
                        id.to_string()
                    }
                    VolumeConstructionRequest::File { id, .. } => {
                        id.to_string()
                    }
                    VolumeConstructionRequest::Url { id, .. } => id.to_string(),
                    VolumeConstructionRequest::Region { .. } => {
                        "Region".to_string()
                    }
                };

                // Wild guess: we might collect up to 1MB (assuming we're
                // limited by NVMe MDTS) of data in each Crucible worker. That
                // is accumulated into a BytesMut, which is backed by a
                // Vec::with_capacity. With a power of two capacity it's
                // *probably* not rounded up further.
                const PER_WORKER_HEAP: usize = MB;
                // And Crucible workers are not currently tunable, so this is
                // how many there are
                // (see propolis::block::crucible::Crucible::WORKER_COUNT)
                *wanted_heap += 8 * PER_WORKER_HEAP;

                let be = propolis::block::CrucibleBackend::create(
                    vcr,
                    propolis::block::BackendOpts {
                        read_only: Some(spec.readonly),
                        ..Default::default()
                    },
                    self.producer_registry.clone(),
                    nexus_client.clone(),
                    self.log.new(
                        slog::o!("component" => format!("crucible-{cru_id}")),
                    ),
                )
                .await
                .context("failed to create Crucible backend")?;

                let crucible = Some(be.clone());
                Ok(StorageBackendInstance { be, crucible })
            }
            StorageBackend::File(spec) => {
                info!(self.log, "Creating file disk backend";
                      "path" => &spec.path);

                // Check if raw device is being used and gripe if it isn't
                let meta =
                    std::fs::metadata(&spec.path).with_context(|| {
                        format!(
                            "failed to read file backend metadata for {:?}",
                            spec.path
                        )
                    })?;

                if meta.file_type().is_block_device() {
                    slog::warn!(
                        self.log,
                        "Block backend using standard device rather than raw";
                        "path" => &spec.path
                    );
                }

                let nworkers: NonZeroUsize = match spec.workers {
                    Some(workers) => {
                        if workers.get() <= MAX_FILE_WORKERS {
                            workers
                        } else {
                            slog::warn!(
                                self.log,
                                "workers must be between 1 and {} \
                                    Using default value of {}.",
                                MAX_FILE_WORKERS,
                                DEFAULT_WORKER_COUNT
                            );
                            NonZeroUsize::new(DEFAULT_WORKER_COUNT).unwrap()
                        }
                    }
                    None => NonZeroUsize::new(DEFAULT_WORKER_COUNT).unwrap(),
                };

                // Similar to Crucible backends above: we might collect up to
                // 1MB (assuming we're limited by NVMe MDTS) of data in each
                // worker. This is a hack in its own right, see Propolis#985.
                const PER_WORKER_HEAP: usize = MB;
                *wanted_heap += nworkers.get() * PER_WORKER_HEAP;

                let be = propolis::block::FileBackend::create(
                    &spec.path,
                    propolis::block::BackendOpts {
                        read_only: Some(spec.readonly),
                        block_size: Some(spec.block_size),
                        ..Default::default()
                    },
                    nworkers,
                    self.log.clone(),
                )
                .with_context(|| {
                    format!(
                        "failed to create file backend for file {:?}",
                        spec.path
                    )
                })?;

                Ok(StorageBackendInstance { be, crucible: None })
            }
            StorageBackend::Blob(spec) => {
                let bytes = base64::Engine::decode(
                    &base64::engine::general_purpose::STANDARD,
                    &spec.base64,
                )?;

                info!(self.log, "Creating in-memory disk backend";
                      "len" => bytes.len());

                let nworkers = NonZeroUsize::new(8).unwrap();
                let be = propolis::block::InMemoryBackend::create(
                    bytes,
                    propolis::block::BackendOpts {
                        block_size: Some(512),
                        read_only: Some(spec.readonly),
                        ..Default::default()
                    },
                    nworkers,
                )
                .context("failed to create in-memory storage backend")?;

                // In-memory backends need to be registered for lifecycle
                // notifications so that they can export/import changes to the
                // backing disk across migrations.
                self.devices.insert(backend_id.clone(), be.clone());
                Ok(StorageBackendInstance { be, crucible: None })
            }
        }
    }

    /// Collects the information the VM RoT needs out of the VM under
    /// construction and hands it to `vm_rot` via `prepare_instance_conf`: the
    /// instance's ID and, when one can be determined, the boot disk backend
    /// to measure.
    ///
    /// A boot disk backend is provided only when the spec names a boot disk
    /// whose backend is a read-only Crucible volume.
    ///
    /// # Errors
    ///
    /// Fails if the boot order refers to a device that is not among the
    /// spec's disks, or if that disk's backend has not been created.
    ///
    /// # Panics
    ///
    /// Panics if the boot order lists more than one entry.
    pub fn prepare_rot_initializer(
        &self,
        vm_rot: &mut AttestationSock,
    ) -> Result<(), MachineInitError> {
        let uuid = self.properties.id;

        // The first boot entry is a key into `self.spec.disks`, which is how
        // we'll get to a Crucible volume backing this boot option.
        let boot_disk_entry = match self.spec.boot_settings.as_ref() {
            None => None,
            Some(settings) => {
                if settings.order.len() >= 2 {
                    // In a rack we only configure propolis-server with zero or
                    // one boot disks.  It's possible to provide a fuller list,
                    // and in the future the product may actually expose such a
                    // capability. At that time, we'll need to have a reckoning
                    // for what "boot disk measurement" from the RoT actually
                    // means; it probably "should" be "the measurement of the
                    // disk that EDK2 decided to boot into", but that
                    // communication to and from the guest is a little more
                    // complicated than we want or need to build out today.
                    //
                    // Since as the system exists we either have no specific
                    // boot disk (and don't know where the guest is expected to
                    // end up), or one boot disk (and can determine which disk
                    // to collect a measurement of before even running guest
                    // firmware), we encode this expectation up front. If the
                    // product has changed such that this assert is reached,
                    // "that's exciting!" and "sorry for crashing your
                    // Propolis".
                    panic!(
                        "Unsupported VM RoT configuration: \
                            more than one boot disk"
                    );
                }

                settings.order.first()
            }
        };

        let boot_backend = match boot_disk_entry {
            None => None,
            Some(entry) => {
                // Boot entry -> disk device -> backend ID -> block backend.
                let Some(disk_dev) = self.spec.disks.get(&entry.device_id)
                else {
                    return Err(MachineInitError::BootOrderEntryWithoutDevice(
                        entry.device_id.clone(),
                    ));
                };

                let backend_id = match &disk_dev.device_spec {
                    spec::StorageDevice::Virtio(disk) => &disk.backend_id,
                    spec::StorageDevice::Nvme(disk) => &disk.backend_id,
                };

                let block_backend = self
                    .block_backends
                    .get(backend_id)
                    .ok_or_else(|| {
                        MachineInitError::DeviceWithoutBlockBackend {
                            device_id: entry.device_id.to_owned(),
                            backend_id: backend_id.to_owned(),
                        }
                    })?;

                let as_crucible = block_backend
                    .as_any()
                    .downcast_ref::<block::CrucibleBackend>();
                match as_crucible {
                    Some(backend) if backend.is_read_only() => {
                        Some(attestation::boot_digest::Backend::Crucible(
                            backend.clone_volume(),
                        ))
                    }
                    Some(_) => {
                        // Disk must be read-only to be used for attestation.
                        slog::info!(
                            self.log,
                            "boot disk is not read-only (and will not be used for attestations)",
                        );
                        None
                    }
                    None => {
                        // Probably fine, just not handled right now.
                        slog::warn!(
                            self.log,
                            "VM RoT ignoring boot disk: not a Crucible volume"
                        );
                        None
                    }
                }
            }
        };

        vm_rot.prepare_instance_conf(uuid, boot_backend);

        Ok(())
    }

    /// Initializes the storage devices and backends listed in this
    /// initializer's instance spec.
    ///
    /// Each disk's backend is created and recorded in `block_backends` (and,
    /// for Crucible, in `crucible_backends`); its device is created, attached
    /// to the backend and the chipset, and recorded in the device map. When a
    /// producer registry is configured, virtual disk metrics are set up for
    /// Crucible-backed disks on a best-effort basis.
    ///
    /// On success, returns the accumulated estimate, in bytes, of heap that
    /// the created devices and backends may need.
    ///
    /// # Errors
    ///
    /// Fails if a backend cannot be created or if two disks name the same
    /// Crucible backend ID.
    pub async fn initialize_storage_devices(
        &mut self,
        chipset: &RegisteredChipset,
        nexus_client: Option<NexusClient>,
    ) -> Result<usize, MachineInitError> {
        let mut wanted_heap = 0usize;

        for (device_id, disk) in &self.spec.disks {
            info!(
                self.log,
                "Creating storage device";
                "device_id" => %device_id,
                "spec" => ?disk.device_spec
            );

            let (backend_id, pci_path) = match &disk.device_spec {
                spec::StorageDevice::Virtio(virtio_spec) => {
                    (&virtio_spec.backend_id, virtio_spec.pci_path)
                }
                spec::StorageDevice::Nvme(nvme_spec) => {
                    (&nvme_spec.backend_id, nvme_spec.pci_path)
                }
            };

            // For all storage devices we'll have a QueueMinder connecting
            // each emulated device queue to storage backends. The minder and
            // structures in its supporting logic don't have much state, but may
            // do some dynamic allocation. Assume they won't need more than 1KiB
            // of state (`in_flight` has at most nworkers entries currently and
            // will need to grow only once or twice to a small capacity. The
            // number of outstanding boxed requests and responses is at most
            // nworkers. Might be more, but not much).
            //
            // 64 * 1K is a wild over-estimate while we support 1-15 queues
            // across virtio-block and nvme.
            wanted_heap += 64 * 1024;

            let bdf: pci::Bdf = pci_path.into();

            let created = self
                .create_storage_backend_from_spec(
                    &disk.backend_spec,
                    backend_id,
                    &nexus_client,
                    &mut wanted_heap,
                )
                .await?;
            let StorageBackendInstance { be: backend, crucible } = created;
            info!(
                self.log,
                "raised balloon size";
                "ballon_size" => wanted_heap
            );

            self.block_backends.insert(backend_id.clone(), backend.clone());

            // Create the guest-visible device, hook it up to its backend, and
            // attach it to the chipset.
            let block_dev: Arc<dyn block::Device> = match &disk.device_spec {
                spec::StorageDevice::Virtio(_) => {
                    let vioblk = virtio::PciVirtioBlock::new(0x100);

                    self.devices.insert(device_id.clone(), vioblk.clone());
                    block::attach(&vioblk.block_attach, backend.attachment())
                        .unwrap();
                    chipset.pci_attach(bdf, vioblk.clone());
                    vioblk
                }
                spec::StorageDevice::Nvme(nvme_spec) => {
                    // Limit data transfers to 1MiB (2^8 * 4k) in size
                    let mdts = Some(8);
                    let component = format!("nvme-{device_id}");
                    let nvme = nvme::PciNvme::create(
                        &nvme_spec.serial_number,
                        mdts,
                        self.log.new(slog::o!("component" => component)),
                    );

                    self.devices.insert(device_id.clone(), nvme.clone());
                    block::attach(&nvme.block_attach, backend.attachment())
                        .unwrap();
                    chipset.pci_attach(bdf, nvme.clone());
                    nvme
                }
            };

            // Everything below only concerns Crucible-backed disks.
            let Some(crucible) = crucible else {
                continue;
            };

            let crucible =
                match self.crucible_backends.entry(backend_id.clone()) {
                    std::collections::btree_map::Entry::Vacant(slot) => {
                        slot.insert(crucible)
                    }
                    std::collections::btree_map::Entry::Occupied(_) => {
                        return Err(
                            MachineInitError::DuplicateCrucibleBackendId(
                                backend_id.clone(),
                            ),
                        );
                    }
                };

            let Some(block_size) = crucible.block_size().await else {
                slog::error!(
                    self.log,
                    "Could not get Crucible backend block size, \
                    virtual disk metrics can't be reported for it";
                    "disk_id" => %backend_id,
                );
                continue;
            };

            let Ok(volume_id) = crucible.get_uuid().await else {
                slog::error!(
                    self.log,
                    "Could not get Crucible volume ID, \
                    virtual disk metrics can't be reported for it";
                    "disk_id" => %backend_id,
                );
                continue;
            };

            // Metrics are only produced when there is a registry to
            // register them with.
            let Some(registry) = &self.producer_registry else {
                continue;
            };

            let virtual_disk = VirtualDisk {
                attached_instance_id: self.properties.id,
                block_size,
                disk_id: volume_id,
                project_id: self.properties.metadata.project_id,
                silo_id: self.properties.metadata.silo_id,
            };
            let block_metrics = BlockMetrics::new(
                virtual_disk,
                block_dev.attachment().max_queues(),
            );

            if let Err(e) =
                registry.register_producer(block_metrics.producer())
            {
                slog::error!(
                    self.log,
                    "Could not register virtual disk producer, \
                    metrics will not be produced";
                    "disk_id" => %backend_id,
                    "volume_id" => %volume_id,
                    "error" => ?e,
                );
                continue;
            }

            block_dev.attachment().set_metric_consumer(block_metrics);
        }

        Ok(wanted_heap)
    }

    /// Initialize network devices, add them to the device map, and attach them
    /// to the chipset.
    ///
    /// If a KstatSampler is provided, this function will also track network
    /// interface statistics.
    pub async fn initialize_network_devices(
        &mut self,
        chipset: &RegisteredChipset,
    ) -> Result<(), MachineInitError> {
        // Only create the vector if the kstat_sampler exists.
        let mut interface_ids: Option<NetworkInterfaceIds> =
            self.kstat_sampler.as_ref().map(|_| Vec::new());

        for (device_name, nic) in &self.spec.nics {
            info!(self.log, "Creating vNIC {}", device_name);
            let bdf: pci::Bdf = nic.device_spec.pci_path.into();

            // Set viona device parameters. The parameters here (copy_data and
            // header_pad) require `viona::ApiVersion::V3`, below Propolis'
            // minimum of V6, so we can always set them.
            //
            // The values chosen here are tuned to maximize performance when
            // Propolis is used with OPTE in a full Oxide rack deployment,
            // although they should not negatively impact use outside those
            // conditions.  These parameters and their effects (save for
            // performance delta) are not guest-visible.
            let params = Some(virtio::viona::DeviceParams {
                // Loan guest packet data, rather than allocating fresh
                // buffers and copying it.
                copy_data: false,
                // Leave room for underlay encapsulation:
                // - ethernet: 14
                // - IPv6: 40
                // - UDP: 8
                // - Geneve: 8–16 (due to options)
                // - (and then round up to nearest 8)
                header_pad: 80,
            });

            let viona = virtio::PciVirtioViona::new(
                &nic.backend_spec.vnic_name,
                &self.machine.hdl,
                params,
            )
            .with_context(|| {
                format!("failed to create viona device {device_name:?}")
            })?;

            self.devices.insert(device_name.clone(), viona.clone());

            // Only push to interface_ids if kstat_sampler exists
            if let Some(ref mut ids) = interface_ids {
                ids.push((
                    nic.device_spec.interface_id,
                    viona.instance_id().with_context(|| {
                        format!(
                            "failed to get viona instance ID for network \
                                device {device_name:?}"
                        )
                    })?,
                ));
            }

            chipset.pci_attach(bdf, viona);
        }

        if let Some(sampler) = self.kstat_sampler.as_ref() {
            track_network_interface_kstats(
                &self.log,
                sampler,
                &self.stats_vm,
                interface_ids.unwrap(),
            )
            .await
        }

        Ok(())
    }

    /// Creates the migration failure injection device when the instance spec
    /// includes one, and records it in the device map under the spec's ID.
    ///
    /// Does nothing if the spec has no `migration_failure` entry.
    #[cfg(feature = "failure-injection")]
    pub fn initialize_test_devices(&mut self) {
        use propolis::hw::testdev::{
            MigrationFailureDevice, MigrationFailures,
        };

        let Some(mig) = &self.spec.migration_failure else {
            return;
        };

        // Zero counts for both exports and imports are only logged; the device
        // is still created below.
        let both_counts_zero =
            mig.spec.fail_exports == 0 && mig.spec.fail_imports == 0;
        if both_counts_zero {
            info!(
                self.log,
                "migration failure device's failure counts are both 0";
                "device_spec" => ?mig.spec
            );
        }

        let failures = MigrationFailures {
            exports: mig.spec.fail_exports as usize,
            imports: mig.spec.fail_imports as usize,
        };
        let device = MigrationFailureDevice::create(&self.log, failures);

        self.devices.insert(mig.id.clone(), device);
    }

    /// Sets up the SoftNpu device and its supporting devices: a COM4 UART for
    /// management traffic, a virtio-9p device for loading P4 programs, the
    /// SoftNpu device itself, and its PCI port.
    ///
    /// Returns `Ok(())` without creating anything when the spec has no SoftNpu
    /// PCI port or no SoftNpu data ports.
    ///
    /// # Errors
    ///
    /// Returns [`MachineInitError::SoftNpuP9Missing`] if the spec has no
    /// SoftNpu p9 device, and an error if the SoftNpu device itself cannot be
    /// created.
    #[cfg(feature = "falcon")]
    pub fn initialize_softnpu_ports(
        &mut self,
        chipset: &RegisteredChipset,
    ) -> Result<(), MachineInitError> {
        let npu_spec = &self.spec.softnpu;

        // Nothing to do unless there is both a PCI port and at least one
        // regular SoftNpu port.
        let Some(pci_port) = &npu_spec.pci_port else {
            return Ok(());
        };
        if npu_spec.ports.is_empty() {
            return Ok(());
        }

        // falcon names SoftNpu ports <topology>_<node>_vnic<N>, where <N>
        // indicates the intended order, so sorting by port name yields the
        // data links in that order.
        let mut sorted_ports: Vec<_> = npu_spec.ports.iter().collect();
        sorted_ports.sort_by_key(|port| port.0);
        let data_links = sorted_ports
            .iter()
            .map(|port| port.1.backend_spec.vnic_name.clone())
            .collect();

        // ASIC management comms from the guest go over an LPC uart.
        //
        // NOTE: SoftNpu squats on com4.
        let com4_irq = chipset.irq_pin(ibmpc::IRQ_COM4).unwrap();
        let uart = LpcUart::new(com4_irq);
        uart.set_autodiscard(true);
        LpcUart::attach(&uart, &self.machine.bus_pio, ibmpc::PORT_COM4);
        self.devices
            .insert(SpecKey::Name(String::from("softnpu-uart")), uart.clone());

        // No pipeline exists initially; the guest must load the first P4
        // program.
        let pipeline = Arc::new(std::sync::Mutex::new(None));

        // Guest programs load P4 programs through this p9fs device.
        let p9_handler = Arc::new(virtio::softnpu::SoftNpuP9Handler::new(
            "/dev/softnpufs".to_owned(),
            "/dev/softnpufs".to_owned(),
            sorted_ports.len() as u16,
            pipeline.clone(),
            self.log.clone(),
        ));
        let vio9p = virtio::p9fs::PciVirtio9pfs::new(0x40, p9_handler);
        self.devices
            .insert(SpecKey::Name(String::from("softnpu-p9fs")), vio9p.clone());
        let p9_device = npu_spec
            .p9_device
            .as_ref()
            .ok_or(MachineInitError::SoftNpuP9Missing)?;
        chipset.pci_attach(p9_device.pci_path.into(), vio9p.clone());

        // Create the SoftNpu device.
        let queue_size = 0x8000;
        let npu = virtio::softnpu::SoftNpu::new(
            data_links,
            queue_size,
            uart,
            vio9p,
            pipeline,
            self.log.clone(),
        )
        .context("failed to register softnpu")?;

        self.devices
            .insert(SpecKey::Name(String::from("softnpu-main")), npu.clone());

        // Create the SoftNpu PCI port.
        self.devices.insert(
            SpecKey::Name(String::from("softnpu-pciport")),
            npu.pci_port.clone(),
        );
        chipset.pci_attach(pci_port.pci_path.into(), npu.pci_port.clone());

        Ok(())
    }

    /// Creates a host-filesystem-backed virtio-9p device from the spec's
    /// `softnpu.p9fs` entry, records it in the device map as "falcon-p9fs",
    /// and attaches it to the chipset at the configured PCI path.
    ///
    /// Does nothing if the spec has no p9fs entry.
    #[cfg(feature = "falcon")]
    pub fn initialize_9pfs(&mut self, chipset: &RegisteredChipset) {
        if let Some(p9fs) = &self.spec.softnpu.p9fs {
            let host_fs = Arc::new(virtio::p9fs::HostFSHandler::new(
                p9fs.source.to_owned(),
                p9fs.target.to_owned(),
                p9fs.chunk_size,
                self.log.clone(),
            ));
            let vio9p = virtio::p9fs::PciVirtio9pfs::new(0x40, host_fs);

            let key = SpecKey::Name(String::from("falcon-p9fs"));
            self.devices.insert(key, vio9p.clone());
            chipset.pci_attach(p9fs.pci_path.into(), vio9p);
        }
    }

    /// Builds the SMBIOS entry point and structure table for this VM.
    ///
    /// The table contains structures of type 0 (BIOS), 1 (system), 4
    /// (processor), 16 (physical memory array), 17 (memory device), and 32
    /// (system boot). Processor details are derived from the CPUID leaves in
    /// the instance spec where those leaves are present.
    ///
    /// `bootrom_version` supplies the type 0 BIOS version string; "v0.8" is
    /// used when it is `None`.
    ///
    /// # Panics
    ///
    /// Panics if `self.state.rom_size_bytes` is `None` (i.e. the ROM has not
    /// been populated yet) or if the BIOS version string cannot be converted
    /// into an SMBIOS string.
    fn generate_smbios(
        &self,
        bootrom_version: &Option<String>,
    ) -> smbios::TableBytes {
        use smbios::table::{type0, type1, type16, type4};

        let rom_size =
            self.state.rom_size_bytes.expect("ROM is already populated");

        // SMBIOS encodes the ROM size as `n`, where the ROM occupies
        // `64 KiB * (n + 1)` bytes, in a single byte. Clamp to the range of
        // that byte instead of letting the subtraction underflow (ROM smaller
        // than 64 KiB) or the narrowing conversion wrap (ROM larger than
        // 16 MiB).
        let bios_rom_size =
            u8::try_from((rom_size / (64 * 1024)).saturating_sub(1))
                .unwrap_or(u8::MAX);

        let bios_version = bootrom_version
            .as_deref()
            .unwrap_or("v0.8")
            .try_into()
            .expect("bootrom version string doesn't contain NUL bytes");
        let smb_type0 = smbios::table::Type0 {
            vendor: "Oxide".try_into().unwrap(),
            bios_version,
            bios_release_date: "The Aftermath 30, 3185 YOLD"
                .try_into()
                .unwrap(),
            bios_rom_size,
            bios_characteristics: type0::BiosCharacteristics::UNSUPPORTED,
            bios_ext_characteristics: type0::BiosExtCharacteristics::ACPI
                | type0::BiosExtCharacteristics::UEFI
                | type0::BiosExtCharacteristics::IS_VM,
            ..Default::default()
        };

        // If `spec` contains smbios_type1_input then use it. Otherwise use
        // defaults. Strings that cannot be converted fall back to the default
        // (empty) SMBIOS string.
        let mut smb_type1 = smbios::table::Type1::default();
        if let Some(input) = self.spec.smbios_type1_input.clone() {
            smb_type1.manufacturer =
                input.manufacturer.try_into().unwrap_or_default();
            smb_type1.product_name =
                input.product_name.try_into().unwrap_or_default();
            smb_type1.serial_number =
                input.serial_number.try_into().unwrap_or_default();
            smb_type1.version =
                input.version.to_string().try_into().unwrap_or_default();
        } else {
            smb_type1.manufacturer = "Oxide".try_into().unwrap();
            smb_type1.product_name = "OxVM".try_into().unwrap();
            smb_type1.serial_number =
                self.properties.id.to_string().try_into().unwrap_or_default();
        }
        smb_type1.uuid = self.properties.id.to_bytes_le();
        smb_type1.wake_up_type = type1::WakeUpType::PowerSwitch;

        // The processor vendor, family/model/stepping, and brand string should
        // correspond to the values the guest will see if it queries CPUID.
        //
        // Note that all these values are `Option`s, because the spec may
        // contain CPUID values that don't contain all of the input leaves.
        let cpuid_vendor = self.spec.cpuid.get(CpuidIdent::leaf(0)).copied();
        let cpuid_ident = self.spec.cpuid.get(CpuidIdent::leaf(1)).copied();

        // The brand string spans three leaves; it is only usable if all three
        // are present in the spec.
        let cpuid_procname: Option<[CpuidValues; 3]> = match (
            self.spec.cpuid.get(CpuidIdent::leaf(0x8000_0002)).copied(),
            self.spec.cpuid.get(CpuidIdent::leaf(0x8000_0003)).copied(),
            self.spec.cpuid.get(CpuidIdent::leaf(0x8000_0004)).copied(),
        ) {
            (Some(first), Some(second), Some(third)) => {
                Some([first, second, third])
            }
            _ => None,
        };

        let family = cpuid_ident
            .map(|ident| {
                match ident.eax & 0xf00 {
                    // If family ID is 0xf, extended family is added to it
                    0xf00 => ((ident.eax >> 20) & 0xff) + 0xf,
                    // ... otherwise base family ID is used
                    base => base >> 8,
                }
            })
            .unwrap_or(0);

        let vendor = cpuid_vendor.map(CpuidVendor::try_from);
        let proc_manufacturer = match vendor {
            Some(Ok(CpuidVendor::Intel)) => "Intel",
            Some(Ok(CpuidVendor::Amd)) => "Advanced Micro Devices, Inc.",
            _ => "",
        }
        .try_into()
        .unwrap();

        let proc_family = match (vendor, family) {
            // Explicitly match for Zen-based CPUs
            //
            // Although this family identifier is not valid in SMBIOS 2.7,
            // having been defined in 3.x, we pass it through anyways.
            (Some(Ok(CpuidVendor::Amd)), family) if family >= 0x17 => 0x6b,

            // Emit Unknown for everything else
            _ => 0x2,
        };

        // Processor ID: leaf 1 EAX in the low 32 bits, EDX in the high 32.
        let proc_id = cpuid_ident
            .map(|id| u64::from(id.eax) | (u64::from(id.edx) << 32))
            .unwrap_or(0);

        let proc_version = cpuid_procname
            .and_then(|vals| propolis::cpuid::parse_brand_string(vals).ok())
            .unwrap_or_default();

        let smb_type4 = smbios::table::Type4 {
            proc_type: type4::ProcType::Central,
            proc_family,
            proc_manufacturer,
            proc_id,
            proc_version: proc_version.try_into().unwrap_or_default(),
            status: type4::ProcStatus::Enabled,
            // unknown
            proc_upgrade: 0x2,
            // make core and thread counts equal for now
            core_count: self.spec.board.cpus,
            core_enabled: self.spec.board.cpus,
            thread_count: self.spec.board.cpus,
            proc_characteristics: type4::Characteristics::IS_64_BIT
                | type4::Characteristics::MULTI_CORE,
            ..Default::default()
        };

        let memsize_bytes = (self.spec.board.memory_mb as usize) * MB;
        let mut smb_type16 = smbios::table::Type16 {
            location: type16::Location::SystemBoard,
            array_use: type16::ArrayUse::System,
            error_correction: type16::ErrorCorrection::Unknown,
            num_mem_devices: 1,
            ..Default::default()
        };
        smb_type16.set_max_capacity(memsize_bytes);
        let phys_mem_array_handle = 0x1600.into();

        let mut smb_type17 = smbios::table::Type17 {
            phys_mem_array_handle,
            // Unknown
            form_factor: 0x2,
            // Unknown
            memory_type: 0x2,
            ..Default::default()
        };
        smb_type17.set_size(Some(memsize_bytes));

        let smb_type32 = smbios::table::Type32::default();

        // With "only" types 0, 1, 4, 16, 17, and 32, we are technically missing
        // some (types 3, 7, 9, 19) of the data required by the 2.7 spec.  The
        // data provided here were what we determined was a reasonable
        // collection to start with.  Should further requirements arise, we may
        // expand on it.
        let mut smb_tables = smbios::Tables::new(0x7f00.into());
        smb_tables.add(0x0000.into(), &smb_type0).unwrap();
        smb_tables.add(0x0100.into(), &smb_type1).unwrap();
        smb_tables.add(0x0300.into(), &smb_type4).unwrap();
        smb_tables.add(phys_mem_array_handle, &smb_type16).unwrap();
        smb_tables.add(0x1700.into(), &smb_type17).unwrap();
        smb_tables.add(0x3200.into(), &smb_type32).unwrap();

        smb_tables.commit()
    }

    /// Builds the E820 memory map `fw_cfg` entry from the machine's physical
    /// memory mappings.
    ///
    /// DRAM mappings are reported as usable memory; every other mapping kind
    /// is reported as reserved. The current implementation always returns
    /// `Ok`.
    fn generate_e820(&self) -> Result<Entry, MachineInitError> {
        info!(self.log, "Generating E820 map for guest address space");

        let mut table = fwcfg::formats::E820Table::new();

        for (base, size, kind) in self.machine.map_physmem.mappings() {
            let base = base.try_into().expect("usize should fit into u64");
            let size = size.try_into().expect("usize should fit into u64");
            if matches!(kind, propolis::vmm::MapType::Dram) {
                table.add_mem(base, size);
            } else {
                table.add_reserved(base, size);
            }
        }

        Ok(table.finish())
    }

    /// Builds the `bootorder` `fw_cfg` entry from the spec's boot settings.
    ///
    /// Returns `Ok(None)` when the spec has no boot settings.
    ///
    /// # Errors
    ///
    /// - [`MachineInitError::BootOrderEntryWithoutDevice`] if a boot entry
    ///   names a device that is not one of the spec's disks.
    /// - [`MachineInitError::BootDeviceOnDownstreamPciBus`] if a boot disk is
    ///   not on PCI bus 0.
    fn generate_bootorder(&self) -> Result<Option<Entry>, MachineInitError> {
        let boot_settings = self.spec.boot_settings.as_ref();
        info!(
            self.log,
            "Generating bootorder with order: {:?}",
            boot_settings
        );
        let Some(boot_names) = boot_settings else {
            return Ok(None);
        };

        let mut order = fwcfg::formats::BootOrder::new();

        for boot_entry in boot_names.order.iter() {
            let device_id = &boot_entry.device_id;

            // Theoretically we could support booting from network devices by
            // matching them here and adding their PCI paths, but exactly what
            // would happen is ill-understood. So, only check disks here.
            let Some(disk) = self.spec.disks.get(device_id) else {
                // This should be unreachable - we check that the boot disk is
                // valid when constructing the spec we're initializing from.
                return Err(MachineInitError::BootOrderEntryWithoutDevice(
                    device_id.clone(),
                ));
            };

            let bdf: pci::Bdf = match &disk.device_spec {
                StorageDevice::Virtio(dev) => dev.pci_path.into(),
                StorageDevice::Nvme(dev) => dev.pci_path.into(),
            };
            if bdf.bus.get() != 0 {
                return Err(MachineInitError::BootDeviceOnDownstreamPciBus(
                    device_id.clone(),
                    bdf.bus.get(),
                ));
            }

            match &disk.device_spec {
                StorageDevice::Virtio(_) => {
                    order.add_disk(bdf.location);
                }
                StorageDevice::Nvme(_) => {
                    // TODO: separately, propolis-standalone passes an eui64
                    // of 0, so do that here too. is that.. ok?
                    order.add_nvme(bdf.location, 0);
                }
            }
        }

        Ok(Some(order.finish()))
    }

    /// Creates the qemu `fw_cfg` device and fills it with the vCPU count, the
    /// SMBIOS tables and anchor, the boot order (when the spec has boot
    /// settings), the E820 map, and a RAM framebuffer entry. Both the `fw_cfg`
    /// device and the framebuffer are recorded in the device map.
    ///
    /// Returns the RAM framebuffer device.
    ///
    /// Should not be called before [`Self::initialize_rom()`].
    ///
    /// # Errors
    ///
    /// Returns [`MachineInitError::FwcfgInsertFailed`] if any entry cannot be
    /// inserted, and propagates errors from generating the boot order or E820
    /// entries.
    pub fn initialize_fwcfg(
        &mut self,
        cpus: u8,
        bootrom_version: &Option<String>,
    ) -> Result<Arc<ramfb::RamFb>, MachineInitError> {
        let fw_cfg = fwcfg::FwCfg::new();

        let cpu_count = fwcfg::Entry::fixed_u32(u32::from(cpus));
        fw_cfg
            .insert_legacy(fwcfg::LegacyId::SmpCpuCount, cpu_count)
            .map_err(|e| MachineInitError::FwcfgInsertFailed("cpu count", e))?;

        let tables = self.generate_smbios(bootrom_version);
        fw_cfg
            .insert_named(
                "etc/smbios/smbios-tables",
                fwcfg::Entry::Bytes(tables.structure_table),
            )
            .map_err(|e| {
                MachineInitError::FwcfgInsertFailed("smbios tables", e)
            })?;
        fw_cfg
            .insert_named(
                "etc/smbios/smbios-anchor",
                fwcfg::Entry::Bytes(tables.entry_point),
            )
            .map_err(|e| {
                MachineInitError::FwcfgInsertFailed("smbios anchor", e)
            })?;

        // The boot order entry is only present when the spec asks for one.
        if let Some(boot_order) = self.generate_bootorder()? {
            fw_cfg.insert_named("bootorder", boot_order).map_err(|e| {
                MachineInitError::FwcfgInsertFailed("bootorder", e)
            })?;
        }

        let e820_entry = self.generate_e820()?;
        fw_cfg
            .insert_named("etc/e820", e820_entry)
            .map_err(|e| MachineInitError::FwcfgInsertFailed("e820", e))?;

        let ramfb_log = self.log.new(slog::o!("component" => "ramfb"));
        let ramfb = ramfb::RamFb::create(ramfb_log);
        ramfb.attach(&self.machine.acc_mem);
        fw_cfg
            .insert_named(ramfb::RamFb::FWCFG_ENTRY_NAME, fwcfg::Entry::RamFb)
            .map_err(|e| MachineInitError::FwcfgInsertFailed("ramfb", e))?;
        fw_cfg.attach_ramfb(Some(ramfb.clone()));

        fw_cfg.attach(&self.machine.bus_pio, &self.machine.acc_mem);

        let fw_cfg_key = SpecKey::Name(fw_cfg.type_name().into());
        self.devices.insert(fw_cfg_key, fw_cfg);
        let ramfb_key = SpecKey::Name(ramfb.type_name().into());
        self.devices.insert(ramfb_key, ramfb.clone());

        Ok(ramfb)
    }

    /// Configures every vCPU: computes and applies its CPUID, sets its default
    /// capabilities, and records it in the device map as `vcpu-<id>`. If a
    /// kstat sampler is present, vCPU kstat tracking is started afterwards.
    ///
    /// # Errors
    ///
    /// Returns [`MachineInitError::CpuidSpecializationFailed`] if CPUID
    /// specialization fails for a vCPU, and an error if the CPUID or the
    /// default capabilities cannot be applied.
    pub async fn initialize_cpus(&mut self) -> Result<(), MachineInitError> {
        let hv_interface = self.machine.guest_hv_interface.as_ref();

        for vcpu in self.machine.vcpus.iter() {
            let vcpu_id = vcpu.id;

            // Report that the guest is running on bhyve.
            //
            // Spec generation rejects CPUID sets containing leaves in the
            // hypervisor leaf region, so adding the hypervisor's leaves to a
            // copy of the spec's set is not expected to collide.
            let mut cpuid_set = self.spec.cpuid.clone();
            hv_interface.add_cpuid(&mut cpuid_set).expect(
                "propolis_server::spec construction should deny direct \
                requests to set hypervisor leaves",
            );

            // Instead of `TopoKind::supported`, we use an intentionally-reduced
            // list of Intel-only leaves for the moment. This is because if we
            // specialize leaves used by AMD (or just both vendors), we'll
            // change the topology a guest sees.
            //
            // The initial CPU platform defined in Nexus (Omicron#8728) hews to
            // the pre-specialization topology, which won't have leaf B at all.
            // Before that is sent, though, we'll see the present-but-zero
            // leaves from bhyve, which we would happily specialize into
            // something reflecting the guest if requested here. Once
            // Omicron#8728 lands and propolis-server receives explicit CPUID
            // profiles, we can add AMD leaves here too.
            let cpu_topo_leaves = [TopoKind::Std4];

            let vcpu_count = NonZeroU8::new(self.spec.board.cpus).unwrap();
            let specialized = propolis::cpuid::Specializer::new()
                .with_vcpu_count(vcpu_count, true)
                .with_vcpuid(vcpu_id)
                .with_cache_topo()
                .clear_cpu_topo(TopoKind::iter())
                .with_cpu_topo(cpu_topo_leaves.into_iter())
                .execute(cpuid_set)
                .map_err(|e| {
                    MachineInitError::CpuidSpecializationFailed(vcpu_id, e)
                })?;

            info!(
                self.log,
                "setting CPUID for vCPU";
                "vcpu" => vcpu_id,
                "cpuid" => ?specialized
            );

            vcpu.set_cpuid(specialized).with_context(|| {
                format!("setting CPUID for vcpu {}", vcpu_id)
            })?;

            vcpu.set_default_capabs()
                .context("failed to set vcpu capabilities")?;

            // vCPUs behave like devices, so they live in the device map too.
            let key = SpecKey::Name(format!("vcpu-{}", vcpu_id));
            self.devices.insert(key, vcpu.clone());
        }

        if let Some(sampler) = self.kstat_sampler.as_ref() {
            track_vcpu_kstats(&self.log, sampler, &self.stats_vm).await;
        }

        Ok(())
    }

    /// Records the guest hypervisor interface in the device map under the
    /// name "guest-hv-interface".
    pub fn register_guest_hv_interface(
        &mut self,
        guest_hv_interface: Arc<dyn Lifecycle>,
    ) {
        let key = SpecKey::Name(String::from("guest-hv-interface"));
        self.devices.insert(key, guest_hv_interface);
    }
}


================================================
FILE: bin/propolis-server/src/lib/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

pub mod config;
mod initializer;
mod migrate;
mod serial;
pub mod server;
mod spec;
mod stats;
mod vcpu_tasks;
mod vm;
pub mod vnc;


================================================
FILE: bin/propolis-server/src/lib/migrate/codec.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Support for encoding messages in the propolis/bhyve live
//! migration protocol. Messages are serialized to binary and
//! wrapped in Binary websocket frames with a trailing byte
//! indicating the message type.
//!
//! As defined in RFD0071, most messages are either serialized
//! structures or blobs, while the structures involved in the
//! memory transfer phases of the protocols are directly serialized
//! binary structures.  We represent each of these structures in a
//! dedicated message type; similarly with 4KiB "page" data, etc.
//! Serialized structures are assumed to be text.
//!
//! Several messages involved in memory transfer include bitmaps
//! that are nominally bounded by associated [start, end) address
//! ranges.  However, the framing layer makes no effort to validate
//! the implied invariants: higher level software is responsible
//! for that.

use super::MigrateError;

use bytes::{Buf, BufMut, Bytes};
use slog::error;
use strum::FromRepr;
use thiserror::Error;
use tokio_tungstenite::tungstenite;

/// Migration protocol errors.
#[derive(Debug, Error)]
pub enum ProtocolError {
    /// We received an unexpected message type
    #[error("couldn't decode message type ({0})")]
    InvalidMessageType(u8),

    /// The message received on the wire wasn't the expected length
    #[error("unexpected message length {1} for type {0:?}")]
    UnexpectedMessageLen(u8, usize),

    /// Encountered an I/O error on the transport
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Failed to serialize or deserialize a message
    #[error("serialization error: {0}")]
    Ron(#[from] ron::Error),

    /// Received non-UTF8 string
    #[error("non-UTF8 string: {0}")]
    Utf8(#[from] std::str::Utf8Error),

    /// Nothing, not even a tag byte
    #[error("received empty message with no discriminant")]
    EmptyMessage,

    /// An error occurred in the underlying websocket transport
    #[error("error occurred in websocket layer: {0}")]
    WebsocketError(Box<tokio_tungstenite::tungstenite::Error>),

    /// All our codec's messages should be tungstenite::Message::Binary
    #[error("received empty message with no discriminant")]
    UnexpectedWebsocketMessage(tungstenite::Message),
}
impl From<ron::de::SpannedError> for ProtocolError {
    fn from(value: ron::de::SpannedError) -> Self {
        Self::Ron(value.code)
    }
}

/// Message represents the different frame types for messages
/// exchanged in the live migration protocol.  Most structured
/// data is serialized into a string, while blobs are uninterpreted
/// vectors of bytes and 4KiB pages (e.g. of RAM) are uninterpreted
/// fixed-sized arrays.  The memory-related messages are nominally
/// structured, but given the overall volume of memory data exchanged,
/// we serialize and deserialize them directly.
#[derive(Debug)]
pub(crate) enum Message {
    /// Carries no payload; encoded as the tag byte alone.
    Okay,
    /// A `MigrateError`, serialized as RON text on the wire.
    Error(MigrateError),
    /// A string, sent as its bytes. The decoder rejects non-UTF-8 payloads.
    Serialized(String),
    /// Uninterpreted bytes, sent verbatim.
    Blob(Vec<u8>),
    /// Page contents, sent verbatim. The decoder only accepts payloads of
    /// exactly 4096 bytes.
    Page(Vec<u8>),
    /// `(start, end)`, each encoded as a little-endian `u64`.
    MemQuery(u64, u64),
    /// `(start, end, bitmap)`: two little-endian `u64`s followed by the
    /// bitmap bytes.
    MemOffer(u64, u64, Vec<u8>),
    /// `(start, end)`, each encoded as a little-endian `u64`.
    MemEnd(u64, u64),
    /// `(start, end, bitmap)`: two little-endian `u64`s followed by the
    /// bitmap bytes.
    MemFetch(u64, u64, Vec<u8>),
    /// `(start, end, bitmap)`: two little-endian `u64`s followed by the
    /// bitmap bytes.
    MemXfer(u64, u64, Vec<u8>),
    /// Carries no payload; encoded as the tag byte alone.
    MemDone,
}

/// Wire tags identifying each frame type in the migration protocol.
///
/// These are an implementation detail of the wire format and are not used
/// elsewhere, but they must be kept in bijection with [`Message`]. The
/// discriminant of each variant is the tag byte appended to the encoded
/// frame.
#[derive(Debug, PartialEq, FromRepr)]
#[repr(u8)]
enum MessageType {
    Okay = 0,
    Error = 1,
    Serialized = 2,
    Blob = 3,
    Page = 4,
    MemQuery = 5,
    MemOffer = 6,
    MemEnd = 7,
    MemFetch = 8,
    MemXfer = 9,
    MemDone = 10,
}

/// By implementing `From<&Message>` on MessageType, we can translate
/// each message into its tag type, ensuring full coverage.
impl From<&Message> for MessageType {
    fn from(m: &Message) -> MessageType {
        match m {
            Message::Okay => MessageType::Okay,
            Message::Error(_) => MessageType::Error,
            Message::Serialized(_) => MessageType::Serialized,
            Message::Blob(_) => MessageType::Blob,
            Message::Page(_) => MessageType::Page,
            Message::MemQuery(_, _) => MessageType::MemQuery,
            Message::MemOffer(_, _, _) => MessageType::MemOffer,
            Message::MemEnd(_, _) => MessageType::MemEnd,
            Message::MemFetch(_, _, _) => MessageType::MemFetch,
            Message::MemXfer(_, _, _) => MessageType::MemXfer,
            Message::MemDone => MessageType::MemDone,
        }
    }
}

impl std::convert::TryInto<tungstenite::Message> for Message {
    type Error = ProtocolError;
    fn try_into(self) -> Result<tungstenite::Message, ProtocolError> {
        let mut dst = Vec::new();
        let tag = MessageType::from(&self) as u8;
        match self {
            Message::Okay | Message::MemDone => {}
            Message::Error(e) => {
                let serialized = ron::ser::to_string(&e)?;
                dst.extend(serialized.as_bytes());
            }
            Message::Serialized(s) => dst.put_slice(s.as_bytes()),
            Message::Blob(bytes) | Message::Page(bytes) => {
                dst.put_slice(&bytes);
            }
            Message::MemQuery(start, end) | Message::MemEnd(start, end) => {
                dst.put_u64_le(start);
                dst.put_u64_le(end);
            }
            Message::MemOffer(start, end, bitmap)
            | Message::MemFetch(start, end, bitmap)
            | Message::MemXfer(start, end, bitmap) => {
                dst.put_u64_le(start);
                dst.put_u64_le(end);
                dst.put_slice(&bitmap);
            }
        }
        // tag at the end so we can pop it later (& so u64's align nicely)
        dst.push(tag);
        Ok(tungstenite::Message::Binary(dst))
    }
}

/// Reads a (`start`, `end`) pair of little-endian `u64`s from the front of
/// `src`, advancing it past them.
///
/// Returns `ProtocolError::UnexpectedMessageLen`, carrying `tag` and the
/// number of bytes available, if `src` holds fewer than 16 bytes.
fn get_start_end(
    tag: MessageType,
    src: &mut Bytes,
) -> Result<(u64, u64), ProtocolError> {
    const RANGE_LEN: usize = 2 * std::mem::size_of::<u64>();

    let available = src.len();
    if available >= RANGE_LEN {
        let start = src.get_u64_le();
        let end = src.get_u64_le();
        Ok((start, end))
    } else {
        Err(ProtocolError::UnexpectedMessageLen(tag as u8, available))
    }
}

impl std::convert::TryInto<Message> for tungstenite::Message {
    type Error = ProtocolError;
    fn try_into(self) -> Result<Message, ProtocolError> {
        match self {
            tungstenite::Message::Binary(mut v) => {
                // If the tag byte is absent or invalid, don't bother looking at the message.
                let tag_byte = v.pop().ok_or(ProtocolError::EmptyMessage)?;
                let tag = MessageType::from_repr(tag_byte)
                    .ok_or(ProtocolError::InvalidMessageType(tag_byte))?;
                let mut src = Bytes::from(v);
                // At this point, we have a valid message of a known type, and
                // the remaining bytes are the message contents.
                // Attempt decode and return the received message.
                let m = match tag {
                    MessageType::Okay => {
                        if !src.is_empty() {
                            return Err(ProtocolError::UnexpectedMessageLen(
                                tag as u8,
                                src.len(),
                            ));
                        }
                        Message::Okay
                    }
                    MessageType::Error => {
                        let e = ron::de::from_str(std::str::from_utf8(&src)?)?;
                        Message::Error(e)
                    }
                    MessageType::Serialized => {
                        let s = std::str::from_utf8(&src)?.to_string();
                        Message::Serialized(s)
                    }
                    MessageType::Blob => Message::Blob(src.to_vec()),
                    MessageType::Page => {
                        if src.len() != 4096 {
                            return Err(ProtocolError::UnexpectedMessageLen(
                                tag as u8,
                                src.len(),
                            ));
                        }
                        Message::Page(src.to_vec())
                    }
                    MessageType::MemQuery => {
                        let (start, end) = get_start_end(tag, &mut src)?;
                        Message::MemQuery(start, end)
                    }
                    MessageType::MemOffer => {
                        let (start, end) = get_start_end(tag, &mut src)?;
                        let bitmap = src.to_vec();
                        Message::MemOffer(start, end, bitmap)
                    }
                    MessageType::MemEnd => {
                        let (start, end) = get_start_end(tag, &mut src)?;
                        Message::MemEnd(start, end)
                    }
                    MessageType::MemFetch => {
                        let (start, end) = get_start_end(tag, &mut src)?;
                        let bitmap = src.to_vec();
                        Message::MemFetch(start, end, bitmap)
                    }
                    MessageType::MemXfer => {
                        let (start, end) = get_start_end(tag, &mut src)?;
                        let bitmap = src.to_vec();
                        Message::MemXfer(start, end, bitmap)
                    }
                    MessageType::MemDone => {
                        if !src.is_empty() {
                            return Err(ProtocolError::UnexpectedMessageLen(
                                tag as u8,
                                src.len(),
                            ));
                        }
                        Message::MemDone
                    }
                };
                Ok(m)
            }
            x => Err(ProtocolError::UnexpectedWebsocketMessage(x)),
        }
    }
}

#[cfg(test)]
mod encoder_tests {
    use super::*;
    use std::convert::TryInto;
    use tokio_tungstenite::tungstenite;

    /// Encodes `m`, panicking unless the result is a binary frame.
    fn encode(m: Message) -> Vec<u8> {
        match m.try_into().unwrap() {
            tungstenite::Message::Binary(bytes) => bytes,
            _ => panic!(),
        }
    }

    /// Splits an encoded frame into its trailing tag byte and the payload
    /// that precedes it.
    fn split_tag(frame: &[u8]) -> (u8, &[u8]) {
        let (tag, payload) = frame.split_last().unwrap();
        (*tag, payload)
    }

    /// Splits a payload into its 8 `start` bytes, its 8 `end` bytes, and
    /// whatever follows them.
    fn split_range(payload: &[u8]) -> (&[u8], &[u8], &[u8]) {
        let (start, rest) = payload.split_at(8);
        let (end, tail) = rest.split_at(8);
        (start, end, tail)
    }

    #[test]
    fn encode_okay() {
        assert_eq!(encode(Message::Okay), vec![MessageType::Okay as u8]);
    }

    #[test]
    fn encode_error() {
        let frame = encode(Message::Error(MigrateError::Initiate));
        let (tag, payload) = split_tag(&frame);
        assert_eq!(tag, MessageType::Error as u8);
        assert_eq!(payload, br#"Initiate"#);
    }

    #[test]
    fn encode_serialized() {
        let frame =
            encode(Message::Serialized(String::from("this is an object")));
        let (tag, payload) = split_tag(&frame);
        assert_eq!(tag, MessageType::Serialized as u8);
        assert_eq!(payload, b"this is an object");
    }

    #[test]
    fn encode_empty_blob() {
        let frame = encode(Message::Blob(Vec::new()));
        assert_eq!(frame, vec![MessageType::Blob as u8]);
    }

    #[test]
    fn encode_blob() {
        let frame = encode(Message::Blob(vec![1, 2, 3, 4]));
        assert_eq!(frame, vec![1, 2, 3, 4, MessageType::Blob as u8]);
    }

    #[test]
    fn encode_page() {
        let page = vec![0u8; 4096];
        let frame = encode(Message::Page(page.clone()));
        let (tag, payload) = split_tag(&frame);
        assert_eq!(tag, MessageType::Page as u8);
        assert_eq!(payload, &page[..]);
    }

    #[test]
    fn encode_mem_query() {
        let frame = encode(Message::MemQuery(1, 2));
        let (tag, payload) = split_tag(&frame);
        assert_eq!(tag, MessageType::MemQuery as u8);
        let (start, end, tail) = split_range(payload);
        assert_eq!(start, &[1, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(end, &[2, 0, 0, 0, 0, 0, 0, 0]);
        assert!(tail.is_empty());
    }

    #[test]
    fn encode_mem_offer() {
        let frame = encode(Message::MemOffer(0, 0x8000, vec![0b1010_0101]));
        let (tag, payload) = split_tag(&frame);
        assert_eq!(tag, MessageType::MemOffer as u8);
        let (start, end, bitmap) = split_range(payload);
        assert_eq!(start, &[0, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(end, &[0, 0b1000_0000, 0, 0, 0, 0, 0, 0]);
        assert_eq!(bitmap, &[0b1010_0101]);
    }

    #[test]
    fn encode_mem_end() {
        let frame = encode(Message::MemEnd(0, 8 * 4096));
        let (tag, payload) = split_tag(&frame);
        assert_eq!(tag, MessageType::MemEnd as u8);
        let (start, end, tail) = split_range(payload);
        assert_eq!(start, &[0, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(end, &[0, 0b1000_0000, 0, 0, 0, 0, 0, 0]);
        assert!(tail.is_empty());
    }

    #[test]
    fn encode_mem_fetch() {
        let frame = encode(Message::MemFetch(0, 0x4000, vec![0b0000_0101]));
        let (tag, payload) = split_tag(&frame);
        assert_eq!(tag, MessageType::MemFetch as u8);
        let (start, end, bitmap) = split_range(payload);
        assert_eq!(start, &[0, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(end, &[0, 0x40, 0, 0, 0, 0, 0, 0]);
        assert_eq!(bitmap, &[0b0000_0101]);
    }

    #[test]
    fn encode_mem_xfer() {
        let frame = encode(Message::MemXfer(0, 0x8000, vec![0b1010_0101]));
        let (tag, payload) = split_tag(&frame);
        assert_eq!(tag, MessageType::MemXfer as u8);
        let (start, end, bitmap) = split_range(payload);
        assert_eq!(start, &[0, 0, 0, 0, 0, 0, 0, 0]);
        assert_eq!(end, &[0, 0x80, 0, 0, 0, 0, 0, 0]);
        assert_eq!(bitmap, &[0b1010_0101]);
    }

    #[test]
    fn encode_mem_done() {
        assert_eq!(encode(Message::MemDone), vec![MessageType::MemDone as u8]);
    }
}

#[cfg(test)]
mod live_migration_decoder_tests {
    use super::*;

    /// A buffer holding two complete little-endian `u64`s decodes to that
    /// pair.
    #[test]
    fn get_start_end_ok() {
        let mut buf = bytes::Bytes::from_static(&[
            1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
        ]);
        let range = super::get_start_end(MessageType::MemFetch, &mut buf);
        assert!(matches!(range, Ok((1, 2))));
    }

    /// A buffer that ends partway through the second `u64` is rejected.
    #[test]
    fn get_start_end_err() {
        let mut buf =
            bytes::Bytes::from_static(&[1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0]);
        let range = super::get_start_end(MessageType::MemFetch, &mut buf);
        assert!(matches!(range, Err(_)));
    }
}

/// Tests for decoding binary websocket frames into protocol messages.
///
/// Every frame built here is a payload followed by a one-byte message type
/// tag, mirroring what the encoder emits.
#[cfg(test)]
mod decoder_tests {
    use super::*;
    use std::convert::TryInto;
    use tokio_tungstenite::tungstenite;

    #[test]
    fn decode_bad_tag_fails() {
        let bytes = vec![222];
        let res: Result<Message, _> =
            tungstenite::Message::Binary(bytes).try_into();
        assert!(res.is_err());
    }

    #[test]
    fn decode_nonbinary_fails() {
        let res: Result<Message, _> =
            tungstenite::Message::Text(String::new()).try_into();
        assert!(res.is_err());
    }

    #[test]
    fn decode_tagless_fails() {
        let res: Result<Message, _> =
            tungstenite::Message::Binary(vec![]).try_into();
        assert!(res.is_err());
    }

    #[test]
    fn decode_okay() {
        let bytes = vec![MessageType::Okay as u8];
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::Okay));
    }

    // An `Okay` message must not carry any payload.
    #[test]
    fn decode_okay_with_payload_fails() {
        let bytes = vec![0, MessageType::Okay as u8];
        let res: Result<Message, _> =
            tungstenite::Message::Binary(bytes).try_into();
        assert!(res.is_err());
    }

    #[test]
    fn decode_error() {
        let mut bytes = br#"Websocket("foo")"#.to_vec();
        bytes.push(MessageType::Error as u8);
        let expected = MigrateError::Websocket("foo".into());
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::Error(e) if e == expected));
    }

    #[test]
    fn decode_serialized() {
        let mut bytes = b"this is an object".to_vec();
        bytes.push(MessageType::Serialized as u8);
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::Serialized(s)
            if s == "this is an object"));
    }

    // A `Serialized` payload is decoded as UTF-8, so invalid UTF-8 must be
    // rejected rather than passed through.
    #[test]
    fn decode_serialized_invalid_utf8_fails() {
        let bytes = vec![0xff, MessageType::Serialized as u8];
        let res: Result<Message, _> =
            tungstenite::Message::Binary(bytes).try_into();
        assert!(res.is_err());
    }

    #[test]
    fn decode_blob() {
        let mut bytes = b"asdf".to_vec();
        bytes.push(MessageType::Blob as u8);
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::Blob(b) if b == b"asdf".to_vec()));
    }

    #[test]
    fn decode_page() {
        let mut bytes = vec![0u8; 4096];
        bytes.push(MessageType::Page as u8);
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::Page(p)
            if p.iter().all(|&b| b == 0)));
    }

    // A `Page` payload must be exactly 4096 bytes long.
    #[test]
    fn decode_page_wrong_len_fails() {
        let mut bytes = vec![0u8; 4095];
        bytes.push(MessageType::Page as u8);
        let res: Result<Message, _> =
            tungstenite::Message::Binary(bytes).try_into();
        assert!(res.is_err());
    }

    #[test]
    fn decode_mem_query() {
        let mut bytes = vec![1, 0, 0, 0, 0, 0, 0, 0];
        bytes.extend(&[2, 0, 0, 0, 0, 0, 0, 0]);
        bytes.push(MessageType::MemQuery as u8);
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::MemQuery(start, end)
            if start == 1 && end == 2));
    }

    // A range needs two full u64s; 15 payload bytes is one byte short.
    #[test]
    fn decode_mem_query_short_fails() {
        let mut bytes = vec![0u8; 15];
        bytes.push(MessageType::MemQuery as u8);
        let res: Result<Message, _> =
            tungstenite::Message::Binary(bytes).try_into();
        assert!(res.is_err());
    }

    #[test]
    fn decode_mem_offer() {
        let mut bytes = vec![0, 0, 0, 0, 0, 0, 0, 0];
        bytes.extend(&[0, 0x80, 0, 0, 0, 0, 0, 0]);
        bytes.push(0b0000_1111);
        bytes.push(MessageType::MemOffer as u8);
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::MemOffer(start, end, v)
            if start == 0 && end == 0x8000 && v == vec![0b0000_1111]));
    }

    #[test]
    fn decode_mem_offer_long_bitmap() {
        let mut bytes = vec![0, 0, 0, 0, 0, 0, 0, 0];
        bytes.extend(&[0, 0x80, 0, 0, 0, 0, 0, 0]);
        bytes.push(0b0000_1111);
        bytes.push(0b0000_1010);
        bytes.push(MessageType::MemOffer as u8);
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::MemOffer(start, end, v)
            if start == 0 &&
                end == 0x8000 &&
                v == vec![0b0000_1111, 0b0000_1010]));
    }

    #[test]
    fn decode_mem_end() {
        let mut bytes = vec![0, 0x40, 0, 0, 0, 0, 0, 0];
        bytes.extend(&[0, 0x40 + 0x80, 0, 0, 0, 0, 0, 0]);
        bytes.push(MessageType::MemEnd as u8);
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::MemEnd(start, end)
            if start == 0x4000 && end == 0xC000));
    }

    #[test]
    fn decode_mem_fetch() {
        let mut bytes = vec![0, 0, 0, 0, 0, 0, 0, 0];
        bytes.extend(&[0, 0x80, 0, 0, 0, 0, 0, 0]);
        bytes.push(0b0000_1111);
        bytes.push(MessageType::MemFetch as u8);
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::MemFetch(start, end, v)
            if start == 0 && end == 0x8000 && v == vec![0b0000_1111]));
    }

    #[test]
    fn decode_mem_xfer() {
        let mut bytes = vec![0, 0, 0, 0, 0, 0, 0, 0];
        bytes.extend(&[0, 0x80, 0, 0, 0, 0, 0, 0]);
        bytes.push(0b0000_1111);
        bytes.push(MessageType::MemXfer as u8);
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::MemXfer(start, end, v)
            if start == 0 && end == 0x8000 && v == vec![0b0000_1111]));
    }

    #[test]
    fn decode_mem_done() {
        let bytes = vec![MessageType::MemDone as u8];
        let decoded = tungstenite::Message::Binary(bytes).try_into().unwrap();
        assert!(matches!(decoded, Message::MemDone));
    }

    // A `MemDone` message must not carry any payload.
    #[test]
    fn decode_mem_done_with_payload_fails() {
        let bytes = vec![0, MessageType::MemDone as u8];
        let res: Result<Message, _> =
            tungstenite::Message::Binary(bytes).try_into();
        assert!(res.is_err());
    }
}


================================================
FILE: bin/propolis-server/src/lib/migrate/destination.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use bitvec::prelude as bv;
use futures::{SinkExt, StreamExt};
use hyper::header::HeaderValue;
use propolis::common::{GuestAddr, Lifecycle, PAGE_SIZE};
use propolis::migrate::{
    MigrateCtx, MigrateStateError, Migrator, PayloadOffer, PayloadOffers,
};
use propolis::vmm;
use propolis_api_types::instance::ReplacementComponent;
use propolis_api_types::instance_spec::SpecKey;
use slog::{error, info, trace, warn};
use std::collections::BTreeMap;
use std::convert::TryInto;
use std::io;
use std::net::SocketAddr;
use std::sync::Arc;
use tokio::net::TcpStream;
use tokio_tungstenite::tungstenite::client::IntoClientRequest;
use tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode;
use tokio_tungstenite::tungstenite::protocol::CloseFrame;
use tokio_tungstenite::{tungstenite, MaybeTlsStream, WebSocketStream};
use uuid::Uuid;

use crate::migrate::codec;
use crate::migrate::memx;
use crate::migrate::preamble::Preamble;
use crate::migrate::probes;
use crate::migrate::{
    Device, MigrateError, MigratePhase, MigrateRole, MigrationState, PageIter,
};
use crate::spec::Spec;
use crate::vm::ensure::{VmEnsureActive, VmEnsureNotStarted};
use crate::vm::state_publisher::{
    ExternalStateUpdate, MigrationStateUpdate, StatePublisher,
};

use super::protocol::Protocol;
use super::MigrateConn;

/// The parameters of a request to migrate into this server, as consumed by
/// the destination side of the migration protocol.
pub(crate) struct MigrationTargetInfo {
    /// The ID of this migration. It is embedded in the migration-start URL
    /// sent to the source and attached to the destination's log messages.
    pub migration_id: Uuid,
    /// The address of the source server to connect to.
    pub src_addr: SocketAddr,
    /// Replacement components, keyed by spec key. NOTE(review): presumably
    /// these are what the sync phase hands to `Preamble::amend_spec` to
    /// adjust the source's spec for the target; confirm against
    /// `migration_info()`.
    pub replace_components: BTreeMap<SpecKey, ReplacementComponent>,
}

/// The interface to an arbitrary version of the target half of the live
/// migration protocol.
///
/// Implementations are obtained from [`initiate`], which negotiates the
/// protocol version with the source.
//
// Use `async_trait` here to help generate a `Send` bound on the futures
// returned by the functions in this trait.
#[async_trait::async_trait]
pub(crate) trait DestinationProtocol {
    /// Runs live migration as a target, attempting to create a set of VM
    /// objects in the process. On success, returns an "active VM" placeholder
    /// that the caller can use to set up and start a state driver loop.
    ///
    /// The protocol runner is consumed by this call. On failure, returns the
    /// [`MigrateError`] that caused the migration to fail.
    async fn run<'ensure>(
        mut self,
        ensure: VmEnsureNotStarted<'ensure>,
    ) -> Result<VmEnsureActive<'ensure>, MigrateError>;
}

/// Opens a websocket to the migration source named in `migrate_info` and
/// negotiates a protocol version with it.
///
/// The destination offers every protocol version it supports and the source
/// replies with the one it picked. If the reply is missing or malformed, does
/// not parse, or names a protocol that was not offered, an error is returned.
/// Otherwise the result is a [`DestinationProtocol`] implementation for the
/// negotiated version that the caller can use to run the migration.
pub(crate) async fn initiate(
    log: &slog::Logger,
    migrate_info: &MigrationTargetInfo,
    local_addr: SocketAddr,
) -> Result<impl DestinationProtocol, MigrateError> {
    let migration_id = migrate_info.migration_id;
    let src_addr = migrate_info.src_addr;

    let log = log.new(slog::o!(
        "migration_id" => migration_id.to_string(),
        "migrate_role" => "destination",
        "migrate_src_addr" => src_addr
    ));

    info!(log, "negotiating migration as destination");

    // Build upgrade request to the source instance
    let mut conn = migration_start_connect(&log, src_addr, migration_id).await?;

    // Send the source the list of protocols this target supports and let it
    // choose its favorite.
    let offer = super::protocol::make_protocol_offer();
    conn.send(tungstenite::Message::Text(offer)).await?;

    let src_selected = match conn.next().await {
        Some(Ok(tungstenite::Message::Text(selected))) => selected,
        unexpected => {
            error!(
                log,
                "source instance failed to negotiate protocol version: {:?}",
                unexpected
            );

            // Tell the source about its mistake. This is best-effort.
            let close = tungstenite::Message::Close(Some(CloseFrame {
                code: CloseCode::Protocol,
                reason: "did not respond to version handshake.".into(),
            }));
            if let Err(send_err) = conn.send(close).await {
                warn!(log, "failed to send handshake failure to source";
                      "error" => ?send_err);
            }

            return Err(MigrateError::Initiate);
        }
    };

    // The source's choice must parse correctly...
    let parsed =
        match super::protocol::select_protocol_from_offer(&src_selected) {
            Ok(parsed) => parsed,
            Err(e) => {
                error!(log, "source selected protocol failed to parse";
                       "selected" => &src_selected);

                return Err(MigrateError::ProtocolParse(
                    src_selected,
                    e.to_string(),
                ));
            }
        };

    // ...and must be one of the protocols this target supports.
    let Some(selected) = parsed else {
        let offered = super::protocol::make_protocol_offer();
        error!(log, "source selected protocol not on offer";
               "offered" => &offered,
               "selected" => &src_selected);

        return Err(MigrateError::NoMatchingProtocol(src_selected, offered));
    };

    // The source's choice is valid, so use the protocol it picked.
    let runner = match selected {
        Protocol::RonV0 => RonV0::new(log, migration_id, conn, local_addr),
    };
    Ok(runner)
}

/// Opens the websocket connection to the source's migration-start endpoint
/// for `migration_id` and returns the connected stream.
async fn migration_start_connect(
    log: &slog::Logger,
    src_addr: SocketAddr,
    migration_id: Uuid,
) -> Result<WebSocketStream<MaybeTlsStream<TcpStream>>, MigrateError> {
    // The request is assembled by hand so that propolis-server does not need
    // to depend on propolis-client.
    // TODO(#165): https (wss)
    // TODO: We need to make sure the src_addr is a valid target
    let src_migrate_url =
        format!("ws://{}/instance/migrate/{}/start", src_addr, migration_id);
    info!(log, "Begin migration"; "src_migrate_url" => &src_migrate_url);
    let mut request = src_migrate_url.into_client_request()?;

    // Add the api-version header. This assumes the instance_migrate_start API
    // hasn't changed. See the note in crates/propolis-server-api/src/lib.rs.
    let api_version = HeaderValue::from_str(
        &propolis_server_api::VERSION_INITIAL.to_string(),
    )
    .expect("VERSION_INITIAL is \"1.0.0\" which is a valid header value");
    request
        .headers_mut()
        .insert(omicron_common::api::VERSION_HEADER, api_version);

    // The handshake response is not needed once the stream is established.
    let (stream, _response) =
        tokio_tungstenite::connect_async(request).await?;
    Ok(stream)
}

/// The runner for version 0 of the LM protocol, using RON encoding.
struct RonV0<T: MigrateConn> {
    /// The ID for this migration.
    migration_id: Uuid,

    /// The logger for messages from this protocol.
    log: slog::Logger,

    /// The websocket connection to the migration source, over which all
    /// protocol messages are exchanged.
    conn: WebSocketStream<T>,

    /// Local propolis-server address
    /// (to inform the source-side where to redirect its clients)
    local_addr: SocketAddr,
}

#[async_trait::async_trait]
impl<T: MigrateConn + Sync> DestinationProtocol for RonV0<T> {
    /// Runs the version-0 protocol as the migration destination.
    ///
    /// The sync phase runs first; only if it succeeds are the VM's objects
    /// created from the resulting spec, after which the import phases load
    /// state into them. If the sync or import phases fail, the migration
    /// state is published as `Error` and the in-progress ensure operation is
    /// failed. On any failure a best-effort error message is sent to the
    /// source before the error is returned.
    async fn run<'ensure>(
        mut self,
        mut ensure: VmEnsureNotStarted<'ensure>,
    ) -> Result<VmEnsureActive<'ensure>, MigrateError> {
        info!(self.log(), "entering destination migration task");

        let result = async {
            // Run the sync phase to ensure that the source's instance spec is
            // compatible with the spec supplied in the ensure parameters.
            let spec = match self.run_sync_phases(&mut ensure).await {
                Ok(spec) => spec,
                Err(e) => {
                    self.update_state(
                        ensure.state_publisher(),
                        MigrationState::Error,
                    );
                    // Failing the ensure hands back a type-erased error, so
                    // recover the original `MigrateError` from it.
                    let e = ensure.fail(e.into()).await;
                    return Err(e
                        .downcast::<MigrateError>()
                        .expect("original error was a MigrateError"));
                }
            };

            // The sync phase succeeded, so it's OK to go ahead with creating
            // the objects in the target's instance spec.
            let mut objects_created =
                ensure.create_objects_from_spec(spec).await.map_err(|e| {
                    MigrateError::TargetInstanceInitializationFailed(
                        e.to_string(),
                    )
                })?;
            objects_created.prepare_for_migration().await;
            let mut ensure = objects_created.ensure_active().await;

            // Now that the VM's objects exist, run the rest of the protocol to
            // import state into them.
            if let Err(e) = self.run_import_phases(&mut ensure).await {
                self.update_state(
                    ensure.state_publisher(),
                    MigrationState::Error,
                );
                ensure.fail().await;
                return Err(e);
            }

            Ok(ensure)
        }
        .await;

        match result {
            Ok(vm) => {
                info!(self.log(), "migration in succeeded");
                Ok(vm)
            }
            Err(err) => {
                error!(self.log(), "migration in failed"; "error" => ?err);

                // We encountered an error, so try to inform the remote before
                // bailing. Note that we don't use `?` here: this is a best
                // effort, and we don't want an error encountered during this
                // send to shadow the run error returned to the caller.
                if let Ok(e) = codec::Message::Error(err.clone()).try_into() {
                    let _ = self.conn.send(e).await;
                }
                Err(err)
            }
        }
    }
}

impl<T: MigrateConn> RonV0<T> {
    fn new(
        log: slog::Logger,
        migration_id: Uuid,
        conn: WebSocketStream<T>,
        local_addr: SocketAddr,
    ) -> Self {
        Self { log, migration_id, conn, local_addr }
    }

    /// Returns the logger used for this migration's messages.
    fn log(&self) -> &slog::Logger {
        &self.log
    }

    /// Publishes `state` through `publisher` as the current state of this
    /// migration, reported in the destination role.
    fn update_state(
        &self,
        publisher: &mut StatePublisher,
        state: MigrationState,
    ) {
        let update = MigrationStateUpdate {
            state,
            id: self.migration_id,
            role: MigrateRole::Destination,
        };
        publisher.update(ExternalStateUpdate::Migration(update));
    }

    /// Runs the sync phase, firing the phase begin/end probes around it, and
    /// returns the spec the sync phase produced.
    async fn run_sync_phases(
        &mut self,
        ensure_ctx: &mut VmEnsureNotStarted<'_>,
    ) -> Result<Spec, MigrateError> {
        let phase = MigratePhase::MigrateSync;

        probes::migrate_phase_begin!(|| { phase.to_string() });
        let outcome = self.sync(ensure_ctx).await;
        // The end probe fires whether or not the phase succeeded.
        probes::migrate_phase_end!(|| { phase.to_string() });

        outcome
    }

    /// Runs every import phase in protocol order, stopping at the first one
    /// that fails.
    async fn run_import_phases(
        &mut self,
        ensure_ctx: &mut VmEnsureActive<'_>,
    ) -> Result<(), MigrateError> {
        let phases = [
            // The RAM transfer phase runs twice, once before the source
            // pauses and once after. There is no explicit pause phase on the
            // destination, though, so that step does not appear here even
            // though there are pre- and post-pause steps.
            MigratePhase::RamPushPrePause,
            MigratePhase::RamPushPostPause,
            // Import of the time data *must* be done before we import device
            // state: the proper functioning of device timers depends on an
            // adjusted boot_hrtime.
            MigratePhase::TimeData,
            MigratePhase::DeviceState,
            MigratePhase::RamPull,
            MigratePhase::ServerState,
            MigratePhase::Finish,
        ];

        for phase in phases {
            self.run_import_phase(phase, ensure_ctx).await?;
        }

        Ok(())
    }

    /// Runs the single import phase `step`, firing the phase begin/end probes
    /// around it.
    ///
    /// Panics if `step` is the sync or pause phase, neither of which is an
    /// import phase on the destination.
    async fn run_import_phase(
        &mut self,
        step: MigratePhase,
        ensure_ctx: &mut VmEnsureActive<'_>,
    ) -> Result<(), MigrateError> {
        probes::migrate_phase_begin!(|| { step.to_string() });

        let outcome = match step {
            MigratePhase::RamPushPrePause | MigratePhase::RamPushPostPause => {
                self.ram_push(&step, ensure_ctx).await
            }
            MigratePhase::TimeData => self.time_data(ensure_ctx).await,
            MigratePhase::DeviceState => self.device_state(ensure_ctx).await,
            MigratePhase::RamPull => self.ram_pull(ensure_ctx).await,
            MigratePhase::ServerState => self.server_state(ensure_ctx).await,
            MigratePhase::Finish => self.finish(ensure_ctx).await,

            // The remaining phases are never dispatched through here.
            MigratePhase::MigrateSync => {
                unreachable!("sync phase runs before import")
            }
            // no pause step on the dest side
            MigratePhase::Pause => {
                unreachable!("no explicit pause phase on dest")
            }
        };

        probes::migrate_phase_end!(|| { step.to_string() });

        outcome
    }

    /// Runs the sync phase: reads the source's preamble, amends its spec with
    /// the replacement components from the migration request, and
    /// acknowledges with `Okay`.
    ///
    /// Returns the amended spec, or an error if the source sent something
    /// other than a serialized preamble, the preamble does not parse, or the
    /// spec cannot be amended.
    async fn sync(
        &mut self,
        ensure_ctx: &mut VmEnsureNotStarted<'_>,
    ) -> Result<Spec, MigrateError> {
        self.update_state(ensure_ctx.state_publisher(), MigrationState::Sync);

        let preamble: Preamble = match self.read_msg().await? {
            codec::Message::Serialized(s) => {
                ron::de::from_str(&s).map_err(codec::ProtocolError::from)?
            }
            msg => {
                error!(
                    self.log(),
                    "expected serialized preamble but received: {msg:?}"
                );
                return Err(MigrateError::UnexpectedMessage);
            }
        };
        info!(self.log(), "Destination read Preamble: {:?}", preamble);

        let amended = preamble.amend_spec(
            &ensure_ctx
                .migration_info()
                .expect("migration in was requested")
                .replace_components,
        );
        let spec = amended.map_err(|e| {
            error!(
                self.log(),
                "source and destination instance specs incompatible";
                "error" => #%e
            );
            MigrateError::InstanceSpecsIncompatible(e.to_string())
        })?;

        self.send_msg(codec::Message::Okay).await?;
        Ok(spec)
    }

    /// Runs one RAM push phase (pre- or post-pause).
    ///
    /// Queries the source for the pages it is offering, then walks the
    /// resulting bitmap in fixed-size chunks. For each chunk with at least
    /// one bit set, it sends a `MemFetch`, expects a `MemXfer` in reply, and
    /// reads the pages named by the reply's bitmap into guest memory. Once
    /// every chunk has been handled, it sends `MemDone`.
    ///
    /// Returns `MigrateError::Phase` if the source sends an invalid bitmap and
    /// `MigrateError::UnexpectedMessage` if it replies with anything other
    /// than `MemXfer`.
    ///
    /// Panics if `phase` is not one of the two RAM push phases.
    async fn ram_push(
        &mut self,
        phase: &MigratePhase,
        ensure_ctx: &mut VmEnsureActive<'_>,
    ) -> Result<(), MigrateError> {
        // Number of bitmap bytes covered by a single `MemFetch` request.
        const BITMAP_CHUNK_BYTES: usize = 4096;

        let state = match phase {
            MigratePhase::RamPushPrePause => MigrationState::RamPush,
            MigratePhase::RamPushPostPause => MigrationState::RamPushDirty,
            _ => unreachable!("should only push RAM in a RAM push phase"),
        };

        self.update_state(ensure_ctx.state_publisher(), state);
        let (dirty, highest) = self.query_ram().await?;
        for (k, region) in
            dirty.as_raw_slice().chunks(BITMAP_CHUNK_BYTES).enumerate()
        {
            // Nothing was offered in this chunk, so there is nothing to fetch.
            if region.iter().all(|&b| b == 0) {
                continue;
            }

            // This is an iteration over chunks of 4,096 bitmap bytes, so
            // (k * 4096) is the offset (into the overall bitmap) of the first
            // byte in the chunk. Multiply this by 8 bits/byte to get a number
            // of bits, then multiply by PAGE_SIZE to get a physical address.
            let start = (k * BITMAP_CHUNK_BYTES * 8 * PAGE_SIZE) as u64;
            let end = start + (region.len() * 8 * PAGE_SIZE) as u64;
            // Never request past the highest address the source offered.
            let end = highest.min(end);
            self.send_msg(memx::make_mem_fetch(start, end, region)).await?;
            let m = self.read_msg().await?;
            // The arguments follow the order of the placeholders: the phase
            // goes inside the parentheses, then the received message.
            trace!(
                self.log(),
                "ram_push ({:?}): source xfer phase recvd {:?}",
                phase,
                m
            );
            match m {
                codec::Message::MemXfer(start, end, bits) => {
                    if !memx::validate_bitmap(start, end, &bits) {
                        error!(
                            self.log(),
                            "ram_push ({:?}): MemXfer received bad bitmap",
                            phase
                        );
                        return Err(MigrateError::Phase);
                    }
                    // XXX: We should do stricter validation on the fetch
                    // request here.  For instance, we shouldn't "push" MMIO
                    // space or non-existent RAM regions.  While we de facto
                    // do not because of the way access is implemented, we
                    // should probably disallow it at the protocol level.
                    self.xfer_ram(ensure_ctx, start, end, &bits).await?;
                }
                _ => return Err(MigrateError::UnexpectedMessage),
            };
        }
        self.send_msg(codec::Message::MemDone).await?;
        self.update_state(ensure_ctx.state_publisher(), MigrationState::Pause);
        Ok(())
    }

    /// Asks the source which pages it is offering and collects the answer.
    ///
    /// Sends a `MemQuery` covering the whole address range, then accumulates
    /// the bitmaps from each `MemOffer` until the source sends the matching
    /// `MemEnd`. Returns the accumulated bitmap together with the highest
    /// `end` address seen in any offer.
    async fn query_ram(
        &mut self,
    ) -> Result<(bv::BitVec<u8, bv::Lsb0>, u64), MigrateError> {
        self.send_msg(codec::Message::MemQuery(0, !0)).await?;

        let mut offered = bv::BitVec::<u8, bv::Lsb0>::new();
        let mut highest: u64 = 0;
        loop {
            let msg = self.read_msg().await?;
            trace!(self.log(), "ram_push: source xfer phase recvd {:?}", msg);
            match msg {
                // The terminator must echo the range that was queried.
                codec::Message::MemEnd(0, u64::MAX) => {
                    return Ok((offered, highest));
                }
                codec::Message::MemEnd(..) => {
                    error!(self.log(), "ram_push: received bad MemEnd");
                    return Err(MigrateError::Phase);
                }
                codec::Message::MemOffer(start, end, bits) => {
                    if !memx::validate_bitmap(start, end, &bits) {
                        error!(
                            self.log(),
                            "ram_push: MemOffer received bad bitmap"
                        );
                        return Err(MigrateError::Phase);
                    }
                    highest = highest.max(end);

                    // Pad the bitmap with clear bits up to the first page of
                    // this offer, then append the offer's own bits.
                    let first_bit = start as usize / PAGE_SIZE;
                    if first_bit > offered.len() {
                        offered.resize(first_bit, false);
                    }
                    offered.extend_from_raw_slice(&bits);
                }
                _ => return Err(MigrateError::UnexpectedMessage),
            }
        }
    }

    /// Receives one page from the source for every address that `PageIter`
    /// yields for `start`, `end` and `bits`, writing each page into guest
    /// memory at that address.
    async fn xfer_ram(
        &mut self,
        ensure_ctx: &VmEnsureActive<'_>,
        start: u64,
        end: u64,
        bits: &[u8],
    ) -> Result<(), MigrateError> {
        info!(self.log(), "ram_push: xfer RAM between {} and {}", start, end);

        let pages = PageIter::new(start, end, bits);
        for gpa in pages {
            let contents = self.read_page().await?;
            self.write_guest_ram(ensure_ctx, GuestAddr(gpa), &contents)
                .await?;
        }
        Ok(())
    }

    /// Runs the device state phase: reads the serialized list of device
    /// states from the source, imports each one into the matching local
    /// device, and acknowledges with `Okay`.
    ///
    /// Fails if the source sends something other than a serialized message,
    /// if the list does not parse, if a named device does not exist locally,
    /// or if importing a device's state fails.
    async fn device_state(
        &mut self,
        ensure_ctx: &mut VmEnsureActive<'_>,
    ) -> Result<(), MigrateError> {
        self.update_state(ensure_ctx.state_publisher(), MigrationState::Device);

        let devices: Vec<Device> = match self.read_msg().await? {
            codec::Message::Serialized(encoded) => {
                ron::de::from_reader(encoded.as_bytes())
                    .map_err(codec::ProtocolError::from)?
            }
            msg => {
                error!(self.log(), "device_state: unexpected message: {msg:?}");
                return Err(MigrateError::UnexpectedMessage);
            }
        };
        // The source follows the device list with an `Okay`.
        self.read_ok().await?;

        info!(self.log(), "Devices: {devices:#?}");

        // Keep the VM objects lock scoped to the import itself so that it is
        // released before replying to the source.
        {
            let objects = ensure_ctx.vm_objects().lock_shared().await;
            let ctx = MigrateCtx { mem: &objects.access_mem().unwrap() };
            for device in devices {
                let key = SpecKey::from(device.instance_name.clone());
                info!(self.log(), "Applying state to device {key}");

                let Some(target) = objects.device_by_id(&key) else {
                    return Err(MigrateError::UnknownDevice(key.to_string()));
                };
                self.import_device(&target, &device, &ctx)?;
            }
        }

        self.send_msg(codec::Message::Okay).await
    }

    /// Get the guest time data from the source, make updates to it based on
    /// the new host, and write the data out to bhyve.
    ///
    /// The source is expected to send a single `Serialized` message holding
    /// a RON-encoded `vmm::time::VmTimeData`. The received values are
    /// adjusted against a snapshot of this host's high-resolution and wall
    /// clock time, imported through the VMM handle, and then acknowledged
    /// with `Okay`.
    ///
    /// # Errors
    ///
    /// - `MigrateError::UnexpectedMessage` if the message is not `Serialized`.
    /// - `MigrateError::TimeData` if deserialization, the host time snapshot,
    ///   the adjustment, or the import fails.
    async fn time_data(
        &mut self,
        ensure_ctx: &VmEnsureActive<'_>,
    ) -> Result<(), MigrateError> {
        // Read time data sent by the source and deserialize
        let raw: String = match self.read_msg().await? {
            codec::Message::Serialized(encoded) => encoded,
            msg => {
                error!(self.log(), "time data: unexpected message: {msg:?}");
                return Err(MigrateError::UnexpectedMessage);
            }
        };
        info!(self.log(), "VMM Time Data: {:?}", raw);
        let time_data_src: vmm::time::VmTimeData = ron::from_str(&raw)
            .map_err(|e| {
                MigrateError::TimeData(format!(
                    "VMM Time Data deserialization error: {e}"
                ))
            })?;
        // Report the values exactly as received, before any adjustment.
        probes::migrate_time_data_before!(|| {
            (
                time_data_src.guest_freq,
                time_data_src.guest_tsc,
                time_data_src.boot_hrtime,
            )
        });

        // Take a snapshot of the host hrtime/wall clock time, then adjust
        // time data appropriately.
        //
        // The handle is cloned out of the locked objects so it can be used
        // for both the snapshot and the later import.
        let vmm_hdl =
            &ensure_ctx.vm_objects().lock_shared().await.vmm_hdl().clone();

        let (dst_hrt, dst_wc) = vmm::time::host_time_snapshot(vmm_hdl)
            .map_err(|e| {
                MigrateError::TimeData(format!("could not read host time: {e}"))
            })?;
        let (time_data_dst, adjust) =
            vmm::time::adjust_time_data(time_data_src, dst_hrt, dst_wc)
                .map_err(|e| {
                    MigrateError::TimeData(format!(
                        "could not adjust VMM Time Data: {e}"
                    ))
                })?;

        // In case import fails, log adjustments made to time data and fire
        // dtrace probe first
        if adjust.migrate_delta_negative {
            warn!(
                self.log(),
                "Found negative wall clock delta between target import \
                and source export:\n\
                - source wall clock time: {:?}\n\
                - target wall clock time: {:?}\n",
                time_data_src.wall_clock(),
                dst_wc
            );
        }
        info!(
            self.log(),
            "Time data adjustments:\n\
            - guest TSC freq: {} Hz = {} GHz\n\
            - guest uptime ns: {:?}\n\
            - migration time delta: {:?}\n\
            - guest_tsc adjustment = {} + {} = {}\n\
            - boot_hrtime adjustment = {} ---> {} - {} = {}\n\
            - dest highres clock time: {}\n\
            - dest wall clock time: {:?}",
            time_data_dst.guest_freq,
            time_data_dst.guest_freq as f64 / vmm::time::NS_PER_SEC as f64,
            adjust.guest_uptime_ns,
            adjust.migrate_delta,
            time_data_src.guest_tsc,
            adjust.guest_tsc_delta,
            time_data_dst.guest_tsc,
            time_data_src.boot_hrtime,
            dst_hrt,
            adjust.boot_hrtime_delta,
            time_data_dst.boot_hrtime,
            dst_hrt,
            dst_wc
        );
        // Report the adjusted values together with the deltas that produced
        // them.
        probes::migrate_time_data_after!(|| {
            (
                time_data_dst.guest_freq,
                time_data_dst.guest_tsc,
                time_data_dst.boot_hrtime,
                adjust.guest_uptime_ns,
                adjust.migrate_delta.as_nanos() as u64,
                adjust.migrate_delta_negative,
            )
        });

        // Import the adjusted time data
        vmm::time::import_time_data(vmm_hdl, time_data_dst).map_err(|e| {
            MigrateError::TimeData(format!("VMM Time Data import error: {e}"))
        })?;

        self.send_msg(codec::Message::Okay).await
    }

    /// Applies the received state in `device` to the local device `target`.
    ///
    /// What happens depends on the migrator that `target` advertises:
    ///
    /// - `NonMigratable`: the import is rejected with an error.
    /// - `Empty`: the target expects no state; the received state is ignored
    ///   after logging a warning.
    /// - `Single`: exactly one payload is required and handed to the
    ///   migrator.
    /// - `Multi`: all payloads are offered to the migrator, and any payload
    ///   it leaves unconsumed is treated as an error.
    ///
    /// # Errors
    ///
    /// Returns `MigrateError::DeviceState` for a wrong payload count or for
    /// unconsumed payloads, a codec error if a payload's RON data cannot be
    /// opened for deserialization, and otherwise whatever error the device's
    /// own import reports (converted into `MigrateError`).
    fn import_device(
        &self,
        target: &Arc<dyn Lifecycle>,
        device: &Device,
        migrate_ctx: &MigrateCtx,
    ) -> Result<(), MigrateError> {
        match target.migrate() {
            Migrator::NonMigratable => {
                error!(
                    self.log(),
                    "Can't migrate instance with non-migratable \
                               device ({})",
                    device.instance_name
                );
                return Err(MigrateStateError::NonMigratable.into());
            }
            Migrator::Empty => {
                // The source shouldn't be sending devices with empty payloads
                warn!(
                    self.log(),
                    "received unexpected device state for device {}",
                    device.instance_name
                );
            }
            Migrator::Single(mech) => {
                // A single-payload migrator accepts exactly one payload.
                if device.payload.len() != 1 {
                    return Err(MigrateError::DeviceState(format!(
                        "Unexpected payload count {}",
                        device.payload.len()
                    )));
                }

                let payload = &device.payload[0];
                let ron_data = &mut ron::Deserializer::from_str(&payload.data)
                    .map_err(codec::ProtocolError::from)?;
                // Type-erase the RON deserializer so it can be carried in a
                // PayloadOffer.
                let clean =
                    Box::new(<dyn erased_serde::Deserializer>::erase(ron_data));
                let offer = PayloadOffer {
                    kind: &payload.kind,
                    version: payload.version,
                    payload: clean,
                };

                mech.import(offer, migrate_ctx)?;
            }
            Migrator::Multi(mech) => {
                // Assembling the collection of PayloadOffers looks a bit more
                // verbose than ideal, but gathering the borrows (those split
                // from Device, and the mutable Deserializer) all at once
                // requires a delicate dance.
                let mut payload_desers: Vec<ron::Deserializer> =
                    Vec::with_capacity(device.payload.len());
                let mut metadata: Vec<(&str, u32)> =
                    Vec::with_capacity(device.payload.len());
                for payload in device.payload.iter() {
                    payload_desers.push(
                        ron::Deserializer::from_str(&payload.data)
                            .map_err(codec::ProtocolError::from)?,
                    );
                    metadata.push((&payload.kind, payload.version));
                }
                // Pair each (kind, version) entry with a mutable borrow of
                // its deserializer; both vectors were filled in lockstep
                // above.
                let offer_iter = metadata
                    .iter()
                    .zip(payload_desers.iter_mut())
                    .map(|(meta, deser)| PayloadOffer {
                        kind: meta.0,
                        version: meta.1,
                        payload: Box::new(
                            <dyn erased_serde::Deserializer>::erase(deser),
                        ),
                    });

                let mut offer = PayloadOffers::new(offer_iter);
                mech.import(&mut offer, migrate_ctx)?;

                // Anything still in the offer after the import was not
                // consumed by the device: log each leftover, then fail.
                let mut count = 0;
                for offer in offer.remaining() {
                    error!(
                        self.log(),
                        "Unexpected payload - device:{} kind:{} version:{}",
                        &device.instance_name,
                        offer.kind,
                        offer.version,
                    );
                    count += 1;
                }
                if count != 0 {
                    return Err(MigrateError::DeviceState(format!(
                        "Found {} unconsumed payload(s) for device {}",
                        count, &device.instance_name,
                    )));
                }
            }
        }
        Ok(())
    }

    /// Runs the RAM-pull phase of the protocol.
    ///
    /// Sends a `MemQuery` covering the whole address range, reads a single
    /// reply from the source (which is only logged), and then sends
    /// `MemDone`.
    ///
    // NOTE(review): the reply is logged as "end" but its variant is not
    // checked here — confirm whether it should be validated.
    async fn ram_pull(
        &mut self,
        ensure_ctx: &mut VmEnsureActive<'_>,
    ) -> Result<(), MigrateError> {
        self.update_state(
            ensure_ctx.state_publisher(),
            MigrationState::RamPull,
        );

        let query = codec::Message::MemQuery(0, !0);
        self.send_msg(query).await?;

        let reply = self.read_msg().await?;
        info!(self.log(), "ram_pull: got end {:?}", reply);

        self.send_msg(codec::Message::MemDone).await
    }

    /// Runs the server-state phase of the protocol.
    ///
    /// Sends this server's `local_addr` (RON-encoded) to the source, receives
    /// a `Serialized` message in return, imports its contents into the COM1
    /// device, and acknowledges with `Okay`.
    ///
    /// # Errors
    ///
    /// - A codec error if `local_addr` cannot be serialized.
    /// - `MigrateError::UnexpectedMessage` if the reply is not `Serialized`.
    /// - `MigrateError::Codec` if the COM1 import fails.
    async fn server_state(
        &mut self,
        ensure_ctx: &mut VmEnsureActive<'_>,
    ) -> Result<(), MigrateError> {
        self.update_state(ensure_ctx.state_publisher(), MigrationState::Server);

        let encoded_addr = ron::to_string(&self.local_addr)
            .map_err(codec::ProtocolError::from)?;
        self.send_msg(codec::Message::Serialized(encoded_addr)).await?;

        let com1_history = match self.read_msg().await? {
            codec::Message::Serialized(encoded) => encoded,
            msg => {
                error!(self.log(), "server_state: unexpected message: {msg:?}");
                return Err(MigrateError::UnexpectedMessage);
            }
        };

        // Hold the shared lock only for the duration of the import so it is
        // released before the acknowledgement goes out.
        {
            let vm_objects = ensure_ctx.vm_objects().lock_shared().await;
            vm_objects
                .com1()
                .import(&com1_history)
                .await
                .map_err(|e| MigrateError::Codec(e.to_string()))?;
        }

        self.send_msg(codec::Message::Okay).await
    }

    /// Completes the migration handshake with the source.
    ///
    /// Sends `Okay`, waits for the source's `Okay` in return, and only then
    /// publishes `MigrationState::Finish`. Any failure to send or to receive
    /// the acknowledgement is returned without publishing that state.
    async fn finish(
        &mut self,
        ensure_ctx: &mut VmEnsureActive<'_>,
    ) -> Result<(), MigrateError> {
        // Tell the source this destination is ready to run the VM.
        self.send_msg(codec::Message::Okay).await?;

        // Wait for the source to acknowledge that it's handing control to this
        // destination. If this acknowledgement doesn't arrive, there's no way
        // to be sure the source hasn't decided the migration has failed and
        // that it should resume the VM.
        self.read_ok().await?;

        // The source has acknowledged the migration is complete, so it's safe
        // to declare victory publicly.
        self.update_state(ensure_ctx.state_publisher(), MigrationState::Finish);
        Ok(())
    }

    /// Reads and decodes the next migration message from the connection.
    ///
    /// # Errors
    ///
    /// - A codec I/O error (`BrokenPipe`) if the connection yields no further
    ///   items.
    /// - The connection's own error if receiving fails.
    /// - A decode error if the received frame is not a valid message.
    /// - `MigrateError::RemoteError` (attributed to the source) if the
    ///   decoded message is itself an `Error` message.
    async fn read_msg(&mut self) -> Result<codec::Message, MigrateError> {
        // The stream ending is reported as a broken pipe.
        let received = self.conn.next().await.ok_or_else(|| {
            codec::ProtocolError::Io(io::Error::from(
                io::ErrorKind::BrokenPipe,
            ))
        })?;

        let decoded: codec::Message = received?.try_into()?;
        match decoded {
            // If this is an error message, lift that out
            codec::Message::Error(err) => {
                error!(
                    self.log(),
                    "migration failed due to error from source: {err}"
                );
                Err(MigrateError::RemoteError(
                    MigrateRole::Source,
                    err.to_string(),
                ))
            }
            msg => Ok(msg),
        }
    }

    /// Reads the next message and requires it to be `Okay`.
    ///
    /// Any other message is logged and reported as
    /// `MigrateError::UnexpectedMessage`; read failures are passed through.
    async fn read_ok(&mut self) -> Result<(), MigrateError> {
        let msg = self.read_msg().await?;
        if matches!(msg, codec::Message::Okay) {
            return Ok(());
        }

        error!(self.log(), "expected `Okay` but received: {msg:?}");
        Err(MigrateError::UnexpectedMessage)
    }

    /// Reads the next message and returns its bytes if it is a `Page`.
    ///
    /// Any other message yields `MigrateError::UnexpectedMessage`; read
    /// failures are passed through.
    async fn read_page(&mut self) -> Result<Vec<u8>, MigrateError> {
        let codec::Message::Page(bytes) = self.read_msg().await? else {
            return Err(MigrateError::UnexpectedMessage);
        };
        Ok(bytes)
    }

    /// Encodes `m` and sends it over the migration connection.
    ///
    /// Fails if the message cannot be converted into the connection's wire
    /// type or if the send itself fails.
    async fn send_msg(
        &mut self,
        m: codec::Message,
    ) -> Result<(), MigrateError> {
        self.conn.send(m.try_into()?).await?;
        Ok(())
    }

    /// Writes all of `buf` into guest memory starting at `addr`.
    ///
    /// Takes the shared lock on the VM objects for the duration of the write.
    ///
    /// # Panics
    ///
    /// Panics if the memory context is unavailable (`access_mem()` returns
    /// nothing to unwrap).
    async fn write_guest_ram(
        &mut self,
        ensure_ctx: &VmEnsureActive<'_>,
        addr: GuestAddr,
        buf: &[u8],
    ) -> Result<(), MigrateError> {
        let vm_objects = ensure_ctx.vm_objects().lock_shared().await;
        let mem = vm_objects.access_mem().unwrap();
        // NOTE(review): whatever `write_from` returns is not inspected here —
        // confirm whether a short or failed write needs to be reported.
        mem.write_from(addr, buf, buf.len());
        Ok(())
    }
}


================================================
FILE: bin/propolis-server/src/lib/migrate/memx.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use crate::migrate::codec;

// The bitmap data structure uses a single bit to represent
// each 4KiB page frame in the [start, end) GPA range, but we
// use [start, end) pages instead of a start address and an
// upper bound determined by the size of the bitmap, so we need
// to verify that the size of the bitmap corresponds to the
// given range.
//
// Why not use a simpler base-and-bound representation?
// There is a problem: because the narrowest data type
// useful for the bitmap is an eight-bit byte, each entry
// represents up to eight pages.  Since there is no
// constraint that the range associated with any given
// bitmap be eight-page aligned, we must be able to handle
// the case where the last byte of the bitmap contains bits
// corresponding to up to seven pages beyond the range.
//
// Fortunately, we can declare that any attempt to create an
// invalid bitmap by software is a programming error, and
// assert against it.  Similarly, any invalid received
// bitmap sent from a remote Propolis is an error, and we
// can abort.  This code enforces these rules as invariants.

/// Validates the parameters from a fetch, offer, or xfer.
///
/// Returns `true` only when all of the following hold:
///
/// - `start` and `end` are both multiples of 4096 and `start < end`;
/// - `bits` holds exactly as many bytes as are needed for one bit per page
///   in `[start, end)` (no missing bytes, no entirely unused bytes);
/// - any bits in the final byte that lie beyond the range (its
///   most-significant bits) are zero.
pub(crate) fn validate_bitmap(start: u64, end: u64, bits: &[u8]) -> bool {
    const PAGE: u64 = 4096;

    let aligned = start.is_multiple_of(PAGE) && end.is_multiple_of(PAGE);
    if !aligned || start >= end {
        return false;
    }

    let pages = ((end - start) / PAGE) as usize;
    let capacity = bits.len() * 8;
    if capacity < pages {
        // Not enough bits to describe every page.
        return false;
    }

    match capacity - pages {
        // The range fills the bitmap exactly.
        0 => true,
        // The last byte is partially used: its unused high bits must be
        // clear.
        spare @ 1..=7 => {
            let unused = !0u8 << (8 - spare);
            bits[bits.len() - 1] & unused == 0
        }
        // A whole byte or more of the bitmap is unused.
        _ => false,
    }
}

/// Creates an offer message for a range of physical
/// addresses and a bitmap.
///
/// The bitmap is copied into the returned `MemOffer` message.
///
/// # Panics
///
/// Panics if `validate_bitmap` rejects the range/bitmap combination.
pub(crate) fn make_mem_offer(
    start_gpa: u64,
    end_gpa: u64,
    bitmap: &[u8],
) -> codec::Message {
    assert!(validate_bitmap(start_gpa, end_gpa, bitmap));
    codec::Message::MemOffer(start_gpa, end_gpa, bitmap.into())
}

/// Creates a fetch message for a range of physical
/// addresses and a bitmap.
///
/// The bitmap is copied into the returned `MemFetch` message.
///
/// # Panics
///
/// Panics if `validate_bitmap` rejects the range/bitmap combination.
pub(crate) fn make_mem_fetch(
    start_gpa: u64,
    end_gpa: u64,
    bitmap: &[u8],
) -> codec::Message {
    assert!(validate_bitmap(start_gpa, end_gpa, bitmap));
    codec::Message::MemFetch(start_gpa, end_gpa, bitmap.into())
}

/// Creates a transfer message for a range of physical
/// addresses and a bitmap.
///
/// The bitmap is copied into the returned `MemXfer` message.
///
/// # Panics
///
/// Panics if `validate_bitmap` rejects the range/bitmap combination.
pub(crate) fn make_mem_xfer(
    start_gpa: u64,
    end_gpa: u64,
    bitmap: &[u8],
) -> codec::Message {
    assert!(validate_bitmap(start_gpa, end_gpa, bitmap));
    codec::Message::MemXfer(start_gpa, end_gpa, bitmap.into())
}

#[cfg(test)]
mod memx_test {
    use super::*;
    use crate::migrate::codec;

    // A full byte describing exactly eight pages is accepted and carried
    // through to the message unchanged.
    #[test]
    fn make_mem_offer_simple() {
        let bitmap = &[0b1010_1001u8];
        let msg = make_mem_offer(0, 8 * 4096, bitmap);
        let expected = vec![0b1010_1001u8];
        assert!(matches!(msg, codec::Message::MemOffer(0, s, v)
            if s == 8 * 4096 && v == expected));
    }

    // A six-page range fits in one byte as long as the two high bits, which
    // fall outside the range, are clear.
    #[test]
    fn make_mem_xfer_short() {
        let bitmap = &[0b0011_1111u8];
        let msg = make_mem_xfer(0, 6 * 4096, bitmap);
        let expected = vec![0b0011_1111u8];
        assert!(matches!(msg, codec::Message::MemXfer(0, s, v)
            if s == 6 * 4096 && v == expected));
    }

    // For a seven-page range the top bit lies beyond the range; setting it
    // makes the bitmap invalid, so the constructor's assertion fires.
    #[test]
    #[should_panic]
    fn make_mem_fetch_too_long_fails() {
        let bitmap = &[0b1100_0000u8];
        let _msg = make_mem_fetch(0, 7 * 4096, bitmap);
    }
}


================================================
FILE: bin/propolis-server/src/lib/migrate/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use bit_field::BitField;
use dropshot::HttpError;
use propolis::migrate::MigrateStateError;
use propolis_api_types::migration::MigrationState;
use serde::{Deserialize, Serialize};
use slog::error;
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite};

mod codec;
pub mod destination;
mod memx;
mod preamble;
pub mod protocol;
pub mod source;

/// Trait bounds for connection objects used in live migrations.
///
/// This trait declares no methods of its own; it bundles the
/// `AsyncRead + AsyncWrite + Unpin + Send` bounds under a single name.
pub(crate) trait MigrateConn:
    AsyncRead + AsyncWrite + Unpin + Send
{
}

// The connection types that are marked as usable for migration.
impl MigrateConn for tokio_tungstenite::MaybeTlsStream<tokio::net::TcpStream> {}
impl MigrateConn for dropshot::WebsocketConnectionRaw {}

/// Identifies one side of a migration.
///
/// Carried by `MigrateError::RemoteError` to say which side reported an
/// error.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
pub enum MigrateRole {
    /// The instance being migrated from.
    Source,
    /// The instance being migrated to.
    Destination,
}

// N.B. Keep in sync with scripts/live-migration-times.d.
/// The phases a migration passes through.
///
/// The `Display` form of each phase is its variant name, except
/// `MigrateSync`, which is rendered as `Sync`.
#[derive(Debug, PartialEq, Eq)]
enum MigratePhase {
    MigrateSync,
    Pause,
    RamPushPrePause,
    RamPushPostPause,
    TimeData,
    DeviceState,
    RamPull,
    ServerState,
    Finish,
}

impl std::fmt::Display for MigratePhase {
    /// Writes the phase's textual name to `f`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self {
            Self::MigrateSync => "Sync",
            Self::Pause => "Pause",
            Self::RamPushPrePause => "RamPushPrePause",
            Self::RamPushPostPause => "RamPushPostPause",
            Self::TimeData => "TimeData",
            Self::DeviceState => "DeviceState",
            Self::RamPull => "RamPull",
            Self::ServerState => "ServerState",
            Self::Finish => "Finish",
        })
    }
}

/// Errors which may occur during the course of a migration
///
/// Variants that wrap an underlying error carry it as a `String` (see the
/// `From` conversions into this type, which store the source error's
/// `to_string()` output).
#[derive(Clone, Debug, Error, Deserialize, PartialEq, Serialize)]
pub enum MigrateError {
    /// An error as a result of some Websocket operation (i.e. establishing
    /// or maintaining the connection between the source and destination)
    #[error("Websocket error: {0}")]
    Websocket(String),

    /// Failed to initiate the migration protocol
    #[error("couldn't establish migration connection to source instance")]
    Initiate,

    /// Failed to parse an offered protocol list. The first field is the
    /// offered list and the second describes the parse failure.
    #[error("failed to parse the offered protocol list ({0}): {1}")]
    ProtocolParse(String, String),

    /// The source and destination instances are not compatible
    #[error("the source ({0}) and destination ({1}) instances have no common protocol")]
    NoMatchingProtocol(String, String),

    /// Incomplete WebSocket upgrade request
    #[error("expected connection upgrade")]
    UpgradeExpected,

    /// Error parsing the contents of the preamble
    #[error("failed to parse preamble: {0}")]
    PreambleParse(String),

    /// Source and target decided their configurations are incompatible
    #[error("instance specs incompatible: {0}")]
    InstanceSpecsIncompatible(String),

    /// Initializing the target VM failed; the field describes the failure
    #[error("failed to initialize the target VM: {0}")]
    TargetInstanceInitializationFailed(String),

    /// The given UUID does not match the existing instance/migration UUID
    #[error("unexpected Uuid")]
    UuidMismatch,

    /// A different migration already in progress
    #[error("a migration from the current instance is already in progress")]
    MigrationAlreadyInProgress,

    /// Migration state was requested with no migration in process
    #[error("no migration is currently in progress")]
    NoMigrationInProgress,

    /// A VM controller function returned an error
    #[error("VM state machine error: {0}")]
    StateMachine(String),

    /// Encountered an error as part of encoding/decoding migration messages
    #[error("codec error: {0}")]
    Codec(String),

    /// The instance is in an invalid state for the current operation
    #[error("encountered invalid instance state")]
    InvalidInstanceState,

    /// Received a message out of order
    #[error("received unexpected migration message")]
    UnexpectedMessage,

    /// Failed to pause the source instance's devices or tasks
    #[error("failed to pause source instance")]
    SourcePause,

    /// Phase error
    #[error("received out-of-phase message")]
    Phase,

    /// Failed to export/import time data state
    #[error("failed to migrate VMM time data: {0}")]
    TimeData(String),

    /// Failed to export/import device state for migration
    #[error("failed to migrate device state: {0}")]
    DeviceState(String),

    /// The destination instance doesn't recognize the received device
    #[error("received device state for unknown device ({0})")]
    UnknownDevice(String),

    /// The other end of the migration ran into an error
    #[error("{0:?} migration instance encountered error: {1}")]
    RemoteError(MigrateRole, String),
}

impl From<tokio_tungstenite::tungstenite::Error> for MigrateError {
    fn from(err: tokio_tungstenite::tungstenite::Error) -> MigrateError {
        MigrateError::Websocket(err.to_string())
    }
}

impl From<codec::ProtocolError> for MigrateError {
    fn from(err: codec::ProtocolError) -> Self {
        MigrateError::Codec(err.to_string())
    }
}

impl From<MigrateStateError> for MigrateError {
    fn from(value: MigrateStateError) -> Self {
        Self::DeviceState(value.to_string())
    }
}

/// Maps a migration error onto an HTTP error response.
///
/// The response message is always `migration failed: <error>`. Each variant
/// is listed explicitly (there is no wildcard arm), so adding a variant to
/// `MigrateError` forces a decision here about which kind of response it
/// produces.
impl From<MigrateError> for HttpError {
    fn from(err: MigrateError) -> Self {
        let msg = format!("migration failed: {err}");
        match &err {
            // Reported as internal errors.
            MigrateError::Websocket(_)
            | MigrateError::Initiate
            | MigrateError::ProtocolParse(_, _)
            | MigrateError::NoMatchingProtocol(_, _)
            | MigrateError::TargetInstanceInitializationFailed(_)
            | MigrateError::PreambleParse(_)
            | MigrateError::InstanceSpecsIncompatible(_)
            | MigrateError::InvalidInstanceState
            | MigrateError::Codec(_)
            | MigrateError::UnexpectedMessage
            | MigrateError::SourcePause
            | MigrateError::Phase
            | MigrateError::TimeData(_)
            | MigrateError::DeviceState(_)
            | MigrateError::RemoteError(_, _)
            | MigrateError::StateMachine(_) => {
                HttpError::for_internal_error(msg)
            }
            // Reported as bad requests (with no error code attached).
            MigrateError::MigrationAlreadyInProgress
            | MigrateError::NoMigrationInProgress
            | MigrateError::UuidMismatch
            | MigrateError::UpgradeExpected
            | MigrateError::UnknownDevice(_) => {
                HttpError::for_bad_request(None, msg)
            }
        }
    }
}

/// Serialized device state sent during migration.
///
/// One `Device` describes a single device and carries zero or more
/// payloads for it.
#[derive(Debug, Deserialize, Serialize)]
struct Device {
    /// The unique name identifying the device in the instance inventory.
    pub instance_name: String,

    /// Device state data
    pub payload: Vec<DevicePayload>,
}
/// A single piece of serialized state belonging to a [`Device`], tagged with
/// the schema kind and version needed to interpret it.
#[derive(Debug, Deserialize, Serialize)]
struct DevicePayload {
    /// Payload schema type
    pub kind: String,

    /// Payload schema version
    pub version: u32,

    /// Serialized device state.
    pub data: String,
}

// We should probably turn this into some kind of ValidatedBitmap
// data structure, so that we're only parsing it once.
/// Iterator over the addresses of the pages whose bits are set in a page
/// bitmap.
///
/// Bit `n` of the bitmap (counting from bit 0 of the first byte) stands for
/// the 4096-byte page at `start + n * 4096`. Only pages below `end` are
/// examined.
struct PageIter<'a> {
    /// Address represented by the first bit of `bits`.
    start: u64,
    /// Address of the next page to examine.
    current: u64,
    /// Exclusive upper bound on the addresses examined.
    end: u64,
    /// The bitmap itself, one bit per page.
    bits: &'a [u8],
}

impl<'a> PageIter<'a> {
    /// Creates an iterator over `[start, end)` positioned at `start`.
    pub fn new(start: u64, end: u64, bits: &'a [u8]) -> PageIter<'a> {
        PageIter { start, current: start, end, bits }
    }
}

impl Iterator for PageIter<'_> {
    type Item = u64;

    /// Advances to the next page whose bit is set and returns its address,
    /// or `None` once `end` has been reached.
    ///
    /// # Panics
    ///
    /// Panics if `bits` is too short to cover a page inside the range.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if self.current >= self.end {
                return None;
            }

            let candidate = self.current;
            self.current += 4096;

            let index = ((candidate - self.start) / 4096) as usize;
            let (byte, bit) = (index / 8, index % 8);
            if self.bits[byte].get_bit(bit) {
                return Some(candidate);
            }
        }
    }
}

// USDT probe definitions for migration, registered under the `propolis`
// provider. The function bodies are intentionally empty: only the names and
// argument lists matter.
#[usdt::provider(provider = "propolis")]
mod probes {
    // Bracket a migration phase; `step_desc` names the phase.
    fn migrate_phase_begin(step_desc: &str) {}
    fn migrate_phase_end(step_desc: &str) {}
    // RAM transfer probes, for a region and for an individual page.
    fn migrate_xfer_ram_region(pages: u64, size: u64, paused: u8) {}
    fn migrate_xfer_ram_page(addr: u64, size: u64) {}
    // Guest time data as received from the source, before adjustment.
    fn migrate_time_data_before(
        src_guest_freq: u64,
        src_guest_tsc: u64,
        src_boot_hrtime: i64,
    ) {
    }
    // Guest time data after adjustment for the destination host, together
    // with the guest uptime and migration delta used for the adjustment.
    fn migrate_time_data_after(
        dst_guest_freq: u64,
        dst_guest_tsc: u64,
        dst_boot_hrtime: i64,
        guest_uptime: u64,
        migrate_delta_ns: u64,
        migrate_delta_negative: bool,
    ) {
    }
}


================================================
FILE: bin/propolis-server/src/lib/migrate/preamble.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::BTreeMap;

use propolis_api_types::instance::ReplacementComponent;
use propolis_api_types_versions::v1;
use serde::{Deserialize, Serialize};

use crate::spec::{api_spec_v0::ApiSpecError, Spec};

use super::MigrateError;

/// Migration preamble: the instance spec being migrated plus a list of
/// opaque blobs.
#[derive(Deserialize, Serialize, Debug)]
pub(crate) struct Preamble {
    /// The versioned instance spec carried by this preamble.
    pub instance_spec: v1::instance_spec::VersionedInstanceSpec,
    /// Opaque binary blobs. `Preamble::new` leaves this empty, and
    /// `amend_spec` does not yet examine it.
    pub blobs: Vec<Vec<u8>>,
}

impl Preamble {
    /// Creates a preamble carrying `instance_spec` and no blobs.
    pub fn new(
        instance_spec: v1::instance_spec::VersionedInstanceSpec,
    ) -> Preamble {
        Preamble { instance_spec, blobs: Vec::new() }
    }

    /// Consume the spec in this Preamble and produce an instance spec suitable
    /// for initializing the target VM.
    ///
    /// Each entry in `replacements` names a component in the source spec and
    /// supplies a new value for it. Crucible storage backends and virtio
    /// network backends can be replaced, as can migration failure injectors
    /// when the `failure-injection` feature is enabled. The replacement
    /// overwrites the corresponding entry in the source spec, which is then
    /// converted into a `Spec`.
    ///
    /// # Errors
    ///
    /// Returns `MigrateError::InstanceSpecsIncompatible` if:
    ///
    /// - a replacement's key is not present in the source spec;
    /// - the source component with that key is of a different kind than the
    ///   replacement; or
    /// - a migration failure injector replacement is supplied but the
    ///   `failure-injection` feature is compiled out.
    ///
    /// Returns `MigrateError::PreambleParse` if the amended spec cannot be
    /// converted into a `Spec`.
    pub fn amend_spec(
        self,
        replacements: &BTreeMap<
            v1::instance_spec::SpecKey,
            ReplacementComponent,
        >,
    ) -> Result<Spec, MigrateError> {
        // Builds the error reported when the source component named by `id`
        // is not of the expected `kind`.
        fn wrong_type_error(
            id: &v1::instance_spec::SpecKey,
            kind: &str,
        ) -> MigrateError {
            let msg =
                format!("component {id} is not a {kind} in the source spec");
            MigrateError::InstanceSpecsIncompatible(msg)
        }

        let v1::instance_spec::VersionedInstanceSpec::V0(mut source_spec) =
            self.instance_spec;
        for (id, comp) in replacements {
            // Every replacement must correspond to an existing component.
            let Some(to_amend) = source_spec.components.get_mut(id) else {
                return Err(MigrateError::InstanceSpecsIncompatible(format!(
                    "replacement component {id} not in source spec",
                )));
            };

            match comp {
                #[cfg(not(feature = "failure-injection"))]
                ReplacementComponent::MigrationFailureInjector(_) => {
                    return Err(MigrateError::InstanceSpecsIncompatible(
                        format!(
                            "replacing migration failure injector {id} is \
                            impossible because the feature is compiled out"
                        ),
                    ));
                }

                #[cfg(feature = "failure-injection")]
                ReplacementComponent::MigrationFailureInjector(comp) => {
                    let v1::instance_spec::Component::MigrationFailureInjector(
                        src,
                    ) = to_amend
                    else {
                        return Err(wrong_type_error(
                            id,
                            "migration failure injector",
                        ));
                    };

                    *src = comp.clone();
                }
                ReplacementComponent::CrucibleStorageBackend(comp) => {
                    let v1::instance_spec::Component::CrucibleStorageBackend(
                        src,
                    ) = to_amend
                    else {
                        return Err(wrong_type_error(id, "crucible backend"));
                    };

                    *src = comp.clone();
                }
                ReplacementComponent::VirtioNetworkBackend(comp) => {
                    let v1::instance_spec::Component::VirtioNetworkBackend(src) =
                        to_amend
                    else {
                        return Err(wrong_type_error(id, "viona backend"));
                    };

                    *src = comp.clone();
                }
            }
        }

        let amended_spec =
            source_spec.try_into().map_err(|e: ApiSpecError| {
                MigrateError::PreambleParse(e.to_string())
            })?;

        // TODO: Compare opaque blobs.

        Ok(amended_spec)
    }
}


================================================
FILE: bin/propolis-server/src/lib/migrate/protocol.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Functions for dealing with protocol negotiation.
//!
//! Protocols are identified by strings of the form
//! "propolis-migrate-encoding/version". During protocol negotiation, the
//! destination sends a list of protocol encodings and versions it supports. The
//! source selects a mutually-supported protocol from this list and sends it
//! back to the destination. Thereafter they communicate using the message
//! sequence specified by the version number and encoded using the encoding
//! in the string.

use std::{fmt::Display, iter::Peekable, num::ParseIntError, str::FromStr};

use lazy_static::lazy_static;
use serde::{Deserialize, Serialize};
use strum::{EnumIter, IntoEnumIterator};
use thiserror::Error;

/// The complete set of protocols supported by this version of the migration
/// library.
///
/// Deriving `EnumIter` makes it possible to enumerate every supported
/// protocol.
#[derive(Debug, Clone, Copy, EnumIter)]
pub enum Protocol {
    /// RON encoding, protocol version 0.
    RonV0,
}

impl Protocol {
    /// Yields the offer string for this protocol variant. This can be sent to
    /// a migration counterpart to offer this protocol version.
    pub fn offer_string(&self) -> String {
        ProtocolParts::from(*self).offer_string()
    }
}

impl TryFrom<ProtocolParts> for Protocol {
    type Error = anyhow::Error;

    /// Maps an encoding/version pair back to the `Protocol` variant that it
    /// describes, failing if this library defines no such protocol.
    fn try_from(value: ProtocolParts) -> Result<Self, Self::Error> {
        match value {
            ProtocolParts { encoding: Encoding::Ron, version: 0 } => {
                Ok(Self::RonV0)
            }
            _ => Err(anyhow::anyhow!(
                "no protocol matching definition: {value:?}"
            )),
        }
    }
}

// Migration offers are of the form "propolis-migrate-encoding/version", e.g.
// "propolis-migrate-ron/0". Offer strings with multiple versions are
// comma-delimited.

/// The prefix to strip from a protocol offer to get the encoding and version.
const PREFIX: &str = "propolis-migrate-";

/// The separator in a protocol offer that separates the encoding from the
/// version.
const ENCODING_VERSION_SEPARATOR: char = '/';

/// The delimiter separating individual offers when several are packed into a
/// single string.
const DELIMITER: char = ',';

/// Errors that can arise while parsing a protocol offer string.
#[derive(Clone, Debug, Error, PartialEq, Serialize, Deserialize)]
pub enum ProtocolParseError {
    /// The offer did not start with the expected prefix. Carries the
    /// offending offer string.
    #[error("protocol string did not begin with propolis-migrate: {0}")]
    InvalidPrefix(String),

    /// The offer had no separator between encoding and version. Carries the
    /// offending offer string.
    #[error("protocol string did not have a '/' separator: {0}")]
    NoEncodingVersionSeparator(String),

    /// The encoding portion of the offer was not a recognized encoding.
    /// Carries the unrecognized encoding text.
    #[error("invalid encoding: {0}")]
    InvalidEncoding(String),

    /// The version portion of the offer was not a valid integer. Carries the
    /// version text followed by the integer parser's error message.
    #[error("failed to parse protocol version number {0}: {1}")]
    InvalidVersionNumber(String, String),

    /// The same protocol appeared more than once in a single offer. Carries
    /// the offer string of the duplicated protocol.
    #[error("offered protocol set contained duplicate protocol {0}")]
    DuplicateProtocolInOffer(String),
}

/// The set of permissible encodings.
///
/// The ordering traits are derived because `ProtocolParts` derives them and
/// contains an `Encoding`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum Encoding {
    /// Encode using Rust Object Notation.
    Ron,
}

impl Display for Encoding {
    /// Writes the encoding's name as it appears in protocol offer strings.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Encoding::Ron => "ron",
        };
        f.write_str(name)
    }
}

impl FromStr for Encoding {
    type Err = ProtocolParseError;

    /// Parses the encoding portion of an offer string. Any name other than a
    /// known encoding yields `ProtocolParseError::InvalidEncoding`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "ron" => Ok(Self::Ron),
            unknown => {
                Err(ProtocolParseError::InvalidEncoding(unknown.to_owned()))
            }
        }
    }
}

/// A protocol selection, broken into its version number and encoding.
//
// N.B. The ordering of fields in this struct matters! It ensures that the
//      derived impls of PartialOrd and Ord compare versions before encodings.
//      This ensures that the negotiation process always selects the latest
//      version irrespective of whether it has the most preferable encoding.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct ProtocolParts {
    /// The protocol version number (the part after the separator in an offer
    /// string).
    version: u32,
    /// The wire encoding (the part between the prefix and the separator in an
    /// offer string).
    encoding: Encoding,
}

impl ProtocolParts {
    /// Formats these parts as a single offer of the form
    /// "propolis-migrate-encoding/version".
    fn offer_string(&self) -> String {
        let Self { version, encoding } = self;
        format!("{PREFIX}{encoding}{ENCODING_VERSION_SEPARATOR}{version}")
    }
}

impl From<Protocol> for ProtocolParts {
    /// Breaks a supported protocol into its version number and encoding.
    fn from(value: Protocol) -> Self {
        let (version, encoding) = match value {
            Protocol::RonV0 => (0, Encoding::Ron),
        };

        Self { version, encoding }
    }
}

impl FromStr for ProtocolParts {
    type Err = ProtocolParseError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let (encoding, version) = s
            .strip_prefix(PREFIX)
            .ok_or_else(|| ProtocolParseError::InvalidPrefix(s.to_owned()))?
            .split_once(ENCODING_VERSION_SEPARATOR)
            .ok_or_else(|| {
                ProtocolParseError::NoEncodingVersionSeparator(s.to_owned())
            })?;

        let encoding = Encoding::from_str(encoding)?;
        let version = version.parse().map_err(|e: ParseIntError| {
            ProtocolParseError::InvalidVersionNumber(
                version.to_owned(),
                e.to_string(),
            )
        })?;

        Ok(ProtocolParts { encoding, version })
    }
}

// The encoding/version pairs for every `Protocol` variant, built lazily from
// `Protocol::iter()`. This is the "supported" set that incoming offers are
// matched against in `select_protocol_from_offer`.
lazy_static! {
    static ref PROTOCOL_PARTS: Vec<ProtocolParts> =
        Protocol::iter().map(ProtocolParts::from).collect();
}

/// Builds a protocol offer string from a peekable iterator of protocols by
/// joining each protocol's offer string with the offer delimiter. An empty
/// iterator produces an empty string.
fn make_protocol_offers_from_parts<
    T: std::iter::Iterator<Item = ProtocolParts>,
>(
    iter: Peekable<T>,
) -> String {
    let mut offers = String::new();
    for (index, parts) in iter.enumerate() {
        // Delimiters go between entries only, never before the first or
        // after the last.
        if index > 0 {
            offers.push(DELIMITER);
        }
        offers.push_str(&parts.offer_string());
    }

    offers
}

/// Builds the offer string that advertises every protocol this library
/// supports.
pub(super) fn make_protocol_offer() -> String {
    let supported = Protocol::iter().map(ProtocolParts::from);
    make_protocol_offers_from_parts(supported.peekable())
}

/// Parses an incoming, delimiter-separated protocol offer string into a list
/// of protocol descriptors sorted in ascending order.
///
/// Fails if any individual offer cannot be parsed or if the same protocol
/// appears more than once.
fn parse_protocol_offer(
    offer: &str,
) -> Result<Vec<ProtocolParts>, ProtocolParseError> {
    let mut protocols: Vec<ProtocolParts> = Vec::new();
    for candidate in offer.split(DELIMITER) {
        let parts = candidate.parse::<ProtocolParts>()?;
        if protocols.iter().any(|seen| *seen == parts) {
            return Err(ProtocolParseError::DuplicateProtocolInOffer(
                parts.offer_string(),
            ));
        }

        protocols.push(parts);
    }

    // Callers rely on ascending order to pick the newest protocol.
    protocols.sort_unstable();
    Ok(protocols)
}

/// Selects the greatest (i.e. newest) protocol from `offered` that also
/// appears in `supported`, or `None` if the two sets have nothing in common.
///
/// # Panics
///
/// Panics if `offered` is not sorted in ascending order; the caller must
/// ensure this. `supported` may be in any order.
fn select_compatible_protocol(
    offered: &[ProtocolParts],
    supported: &[ProtocolParts],
) -> Option<ProtocolParts> {
    assert!(offered.windows(2).all(|pair| pair[0] <= pair[1]));

    // `offered` is ascending, so walking it backwards visits the most
    // preferable protocols first.
    offered
        .iter()
        .rev()
        .find(|&candidate| supported.contains(candidate))
        .copied()
}

/// Given an incoming protocol offer string, selects a compatible protocol to
/// use.
///
/// # Return value
///
/// `Ok(Some(selection))` if a protocol was negotiated. `Ok(None)` if the offer
/// was parsed but no mutually agreeable protocol was found therein. `Err` if
/// the offered protocol string was not parseable.
pub(super) fn select_protocol_from_offer(
    offer: &str,
) -> Result<Option<Protocol>, ProtocolParseError> {
    let offered = parse_protocol_offer(offer)?;
    Ok(select_compatible_protocol(&offered, &PROTOCOL_PARTS).map(|parts| {
        parts.try_into().expect(
            "compatible protocol strings should have a Protocol variant",
        )
    }))
}

#[cfg(test)]
mod test {
    use super::*;

    // N.B. The test protocol lists are sorted by version to meet the
    //      requirements of `select_compatible_protocol`.
    const PROTOCOLS_V1: [ProtocolParts; 3] = [
        ProtocolParts { version: 0, encoding: Encoding::Ron },
        ProtocolParts { version: 1, encoding: Encoding::Ron },
        ProtocolParts { version: 2, encoding: Encoding::Ron },
    ];

    const PROTOCOLS_V2: [ProtocolParts; 5] = [
        ProtocolParts { version: 0, encoding: Encoding::Ron },
        ProtocolParts { version: 1, encoding: Encoding::Ron },
        ProtocolParts { version: 2, encoding: Encoding::Ron },
        ProtocolParts { version: 3, encoding: Encoding::Ron },
        ProtocolParts { version: 4, encoding: Encoding::Ron },
    ];

    /// Negotiation must settle on the newest mutually supported version no
    /// matter which side is the newer one.
    #[test]
    fn negotiation_selects_newest_version() {
        // Older peer offers, newer peer selects.
        let selected =
            select_compatible_protocol(&PROTOCOLS_V1, &PROTOCOLS_V2).unwrap();
        assert_eq!(selected.version, 2);
        assert_eq!(selected.encoding, Encoding::Ron);

        // Newer peer offers, older peer selects.
        let selected =
            select_compatible_protocol(&PROTOCOLS_V2, &PROTOCOLS_V1).unwrap();
        assert_eq!(selected.version, 2);
        assert_eq!(selected.encoding, Encoding::Ron);
    }

    /// Offers may arrive in any order; parsing must return them sorted.
    #[test]
    fn parse_sorts_offered_protocols() {
        let parts_str = "propolis-migrate-ron/42,\
            propolis-migrate-ron/65,\
            propolis-migrate-ron/0,\
            propolis-migrate-ron/27";

        let offered = parse_protocol_offer(parts_str).unwrap();
        let expected_versions = [0, 27, 42, 65];
        assert_eq!(
            offered.iter().map(|parts| parts.version).collect::<Vec<u32>>(),
            expected_versions
        );
    }

    /// Serializing a protocol set and parsing it back must be lossless.
    #[test]
    fn offer_string_round_trip() {
        let offer = make_protocol_offers_from_parts(
            PROTOCOLS_V2.iter().copied().peekable(),
        );
        let set = parse_protocol_offer(&offer).unwrap();
        assert_eq!(set, PROTOCOLS_V2);
    }

    /// Malformed single offers and duplicate entries must be rejected.
    #[test]
    fn parse_failures() {
        assert!("not-a-prefix".parse::<ProtocolParts>().is_err());
        assert!("propolis-migrate-ron-1".parse::<ProtocolParts>().is_err());
        assert!("propolis-migrate-json/2".parse::<ProtocolParts>().is_err());
        assert!("propolis-migrate-ron/version3"
            .parse::<ProtocolParts>()
            .is_err());

        assert!(parse_protocol_offer(
            "propolis-migrate-ron/1,propolis-migrate-ron/1"
        )
        .is_err());
    }
}


================================================
FILE: bin/propolis-server/src/lib/migrate/source.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use bitvec::prelude::{BitSlice, Lsb0};
use futures::{SinkExt, StreamExt};
use propolis::common::{GuestAddr, GuestData, PAGE_SIZE};
use propolis::migrate::{
    MigrateCtx, MigrateStateError, Migrator, PayloadOutputs,
};
use propolis::vmm;
use propolis_api_types_versions::v1;
use slog::{debug, error, info, trace, warn};
use std::collections::HashMap;
use std::convert::TryInto;
use std::io;
use std::ops::{Range, RangeInclusive};
use tokio_tungstenite::tungstenite::protocol::frame::coding::CloseCode;
use tokio_tungstenite::tungstenite::protocol::CloseFrame;
use tokio_tungstenite::{tungstenite, WebSocketStream};
use uuid::Uuid;

use crate::migrate::codec::Message;
use crate::migrate::memx;
use crate::migrate::preamble::Preamble;
use crate::migrate::probes;
use crate::migrate::protocol::Protocol;
use crate::migrate::{codec, protocol};
use crate::migrate::{
    Device, DevicePayload, MigrateError, MigratePhase, MigrateRole,
    MigrationState, PageIter,
};

use crate::vm::objects::VmObjects;
use crate::vm::state_publisher::{
    ExternalStateUpdate, MigrationStateUpdate, StatePublisher,
};

use super::MigrateConn;

/// Specifies which pages should be offered during a RAM transfer phase.
///
/// On Dirty Pages and the Discipline Thereof
/// -----------------------------------------
///
/// In an ideal world, a migration would only ever transfer pages of the guest's
/// address space which have actually been touched by the guest; we don't want
/// to waste time sending a bunch of zero pages on the wire. RAM is offered to
/// the migration destination in two phases: first, we transfer the majority of
/// the guest's RAM prior to pausing the guest, and second, after pausing the
/// guest, we transfer any pages which have been touched since when we performed
/// the first transfer. This way, we perform most of the RAM transfer while the
/// guest is still running, and only re-transfer, once the guest is paused, the
/// pages which have been dirtied again since the first transfer.
///
/// Transferring only dirty pages is made possible by bhyve's
/// `VM_TRACK_DIRTY_PAGES` and `VM_NPT_OPERATION` ioctls. The
/// `VM_TRACK_DIRTY_PAGES` ioctl allows us to generate a bitmap of which pages
/// have their dirty flags set, allowing us to offer only those pages when
/// performing a RAM transfer. Because `VM_TRACK_DIRTY_PAGES` also clears the
/// dirty bits, if we call the ioctl when offering the initial pre-pause RAM
/// transfer, and then again when performing the post-pause RAM transfer, the
/// second ioctl call will see only the dirty bits that were set *since* when
/// the initial transfer was performed, allowing us to offer only the pages
/// which have been touched since we transferred most of the memory.
///
/// Sounds simple enough, right? Well, here's where things get interesting.
/// Should a migration *fail* after transferring RAM, all the dirty bits on the
/// guest's pages will have been cleared by the `VM_TRACK_DIRTY_PAGES` ioctl.
/// This means that, for a naive implementation which just always transfers only
/// the pages marked as dirty by the ioctl, a second or third migration attempt
/// will not transfer any pages whose dirty bits were cleared by the first
/// migration attempt and haven't been touched again since then. This is bad
/// news! The guest has written to those pages, and, just because it hasn't
/// touched them since the last migration attempt, it may still care about that
/// memory, and attempt to read what it put there again in the future --- with
/// unpleasant results, if we haven't transferred that memory.
///
/// There are two potential ways we can solve this, and --- as you're about to
/// discover --- we implement both of them:
///
/// 1. The obvious solution: we can just offer all RAM in the pre-pause RAM push
///    phase, clearing any dirty bits, and then offer only dirty pages in the
///    post-pause RAM push phase. This has the nice property that it's trivially
///    correct no matter how many migration attempts it takes before we actually
///    migrate a guest successfully. It also has the less nice property that
///    we're offering a bunch of pages that the guest has never actually touched
///    and doesn't care about.
/// 2. The clever solution: what if we had a way to put the dirty bit *back*
///    after we've cleared it? If such a thing existed, we could record which
///    pages were dirty when we performed the RAM transfers, and then, should
///    the migration fail, go back and put those dirty bits *back*, so that a
///    potential future migration attempt will still see that those pages are
///    dirty and offer them again.
///
/// The good news is that there is, in fact, a way to do that, using the
/// `VM_NPT_OPERATION` ioctl's `VNO_OP_SET_DIRTY` operation (which, in Propolis,
/// we pronounce like [`VmmHdl::set_dirty_pages`]). However, the less good news
/// is that this ioctl isn't available in all the bhyve versions that Propolis
/// supports, as it was added in bhyve v17. Therefore, we implement both
/// solutions, depending on which bhyve version is present. If we have
/// VM_NPT_OPERATION, both the pre-pause and post-pause RAM push phases will use
/// [`RamOfferDiscipline::OfferDirty`], and only offer dirty pages, recording
/// any dirty bits that were cleared by the `VM_TRACK_DIRTY_PAGES` ioctl. If we
/// don't have `VM_NPT_OPERATION`, we use [`RamOfferDiscipline::OfferAll`] in
/// the pre-pause phase, and [`RamOfferDiscipline::OfferDirty`] only in the
/// post-pause phase. Because we can't put the dirty bits back, we have to offer
/// all the memory in the first phase, but we can still use dirty page tracking
/// to avoid re-offering pages that were transferred in the first phase when we
/// do the second RAM offer after pausing the guest.
#[derive(Debug)]
enum RamOfferDiscipline {
    /// Offer all pages irrespective of whether they are dirty.
    OfferAll,

    /// Offer only pages that the hypervisor has marked as dirty.
    OfferDirty,
}

/// The interface to an arbitrary version of the source half of the live
/// migration protocol.
///
/// Implementations are produced by protocol negotiation and consumed by a
/// single call to `run`.
//
// Use `async_trait` here to help generate a `Send` bound on the futures
// returned by the functions in this trait.
#[async_trait::async_trait]
pub(crate) trait SourceProtocol {
    /// Runs live migration out of the supplied `vm_objects`, writing back any
    /// state that must be saved for future migration attempts to
    /// `persistent_state`.
    ///
    /// Migration state changes are reported through `publisher` as the
    /// migration progresses.
    ///
    /// This routine guarantees that the supplied `vm_objects` are paused on
    /// success and resumed on failure.
    async fn run(
        self,
        vm_objects: &VmObjects,
        publisher: &mut StatePublisher,
        persistent_state: &mut PersistentState,
    ) -> Result<(), MigrateError>;
}

/// Negotiates a live migration protocol version with a target who has connected
/// over `conn`. If this is successful, returns a `SourceProtocol`
/// implementation that can be used to run the requested migration.
///
/// The destination speaks first: it must send a text message listing the
/// protocols it supports. The source (this side) picks the most favorable
/// mutually supported protocol and echoes its offer string back.
///
/// # Errors
///
/// - `MigrateError::Initiate` if the first thing received from the destination
///   is not a text message. A best-effort close frame is sent first.
/// - `MigrateError::NoMatchingProtocol` if the offer parsed but contained no
///   protocol this side supports.
/// - `MigrateError::ProtocolParse` if the offer could not be parsed.
/// - Whatever error sending the selection back over `conn` converts into.
pub(crate) async fn initiate<T: MigrateConn>(
    log: &slog::Logger,
    migration_id: Uuid,
    mut conn: WebSocketStream<T>,
    vm_objects: &VmObjects,
    persistent_state: &PersistentState,
) -> Result<impl SourceProtocol, MigrateError> {
    // Create a new log context for the migration
    let log = log.new(slog::o!(
        "migration_id" => migration_id.to_string(),
        "migrate_role" => "source"
    ));
    info!(log, "negotiating migration as source");

    // The protocol should start with some text from the destination identifying
    // the protocol versions it supports.
    let dst_protocols = match conn.next().await {
        Some(Ok(tungstenite::Message::Text(dst_protocols))) => dst_protocols,
        x => {
            error!(
                log,
                "destination side did not begin migration version handshake: \
                 {:?}",
                x
            );

            // Tell the destination it misbehaved. This is best-effort.
            if let Err(e) = conn
                .send(tungstenite::Message::Close(Some(CloseFrame {
                    code: CloseCode::Protocol,
                    reason: "did not begin with version handshake.".into(),
                })))
                .await
            {
                warn!(
                    log,
                    "failed to send handshake failed message to destination";
                    "error" => ?e
                );
            }

            return Err(MigrateError::Initiate);
        }
    };

    // Pick the most favorable protocol from the list the destination supplied
    // and send it back to the destination.
    info!(log, "destination offered protocols: {}", dst_protocols);
    let selected = match protocol::select_protocol_from_offer(&dst_protocols) {
        Ok(Some(selected)) => {
            conn.send(tungstenite::Message::Text(selected.offer_string()))
                .await?;
            selected
        }
        Ok(None) => {
            // Report both sides' protocol sets so the mismatch can be
            // diagnosed from the error alone.
            let src_protocols = protocol::make_protocol_offer();
            error!(
                log,
                "no compatible destination protocols";
                "dst_protocols" => &dst_protocols,
                "src_protocols" => &src_protocols,
            );
            return Err(MigrateError::NoMatchingProtocol(
                src_protocols,
                dst_protocols,
            ));
        }
        Err(e) => {
            error!(log, "failed to parse destination protocol offer";
                           "dst_protocols" => &dst_protocols,
                           "error" => %e);
            return Err(MigrateError::ProtocolParse(
                dst_protocols,
                e.to_string(),
            ));
        }
    };

    info!(log, "selected protocol {:?}", selected);
    match selected {
        Protocol::RonV0 => Ok(RonV0::new(
            log,
            vm_objects,
            migration_id,
            conn,
            persistent_state,
        )
        .await),
    }
}

/// State which must be stored across multiple migration attempts.
///
/// This struct is stored on the [`VmController`] so that it may be accessed by
/// subsequent [`migrate`] invocations.
#[derive(Default)]
pub(crate) struct PersistentState {
    /// Set if we were unable to re-set dirty bits on guest pages after a failed
    /// migration attempt. If this occurs, we can no longer offer only dirty
    /// pages in a subsequent migration attempt, as some pages which should be
    /// marked as dirty may not be.
    pub(crate) has_redirtying_ever_failed: bool,
}

/// Context for the source side of protocol version 0 using the RON encoding.
struct RonV0<T: MigrateConn> {
    /// The logger to which to log messages from this migration attempt.
    log: slog::Logger,

    /// The migration's ID.
    migration_id: Uuid,

    /// Transport to the destination Instance.
    conn: WebSocketStream<T>,

    /// Guest page table dirty bits to restore in the event of a migration
    /// failure, so that a subsequent migration can attempt to offer only dirty
    /// pages. These dirty bits are accumulated across all RAM push phases, so
    /// that any pages which become dirty after the pre-pause RAM push are also
    /// added to these bitmaps.
    ///
    /// This is `Some` if we should attempt to re-set dirty bits on guest pages
    /// in the event of a migration failure, and `None` if we cannot do so. It
    /// will be `Some` if (and only if):
    ///
    /// - The current bhyve version supports the `VM_NPT_OPERATION` ioctl (i.e.
    ///   it is bhyve v17 or later), and
    /// - No previous attempt to re-dirty pages has failed (viz.
    ///   [`PersistentState::has_redirtying_ever_failed`]); after such a
    ///   failure, we can no longer trust that all previously dirtied pages
    ///   were re-dirtied correctly.
    ///
    /// Otherwise, we must fall back to always offering all pages in the initial
    /// pre-pause RAM push phase.
    dirt: Option<HashMap<GuestAddr, PageBitmap>>,
}

/// Size, in bytes, of one dirty-page bitmap. The RAM offer loop advances by
/// `bits.len() * 8 * PAGE_SIZE` guest-physical bytes per bitmap, i.e. each
/// bit stands for one guest page.
const PAGE_BITMAP_SIZE: usize = 4096;
/// A dirty-page bitmap covering one contiguous run of guest pages.
type PageBitmap = [u8; PAGE_BITMAP_SIZE];

impl<T: MigrateConn> RonV0<T> {
    async fn new(
        log: slog::Logger,
        vm: &VmObjects,
        migration_id: Uuid,
        conn: WebSocketStream<T>,
        persistent_state: &PersistentState,
    ) -> Self {
        // Create a (prospective) dirty page map if bhyve supports the NPT
        // API. If this map is present and the VM hasn't recorded that it's
        // possibly unhealthy, it will be used to offer only dirty pages during
        // the pre-pause RAM push.
        let dirt = {
            let can_npt_operate =
                vm.lock_shared().await.vmm_hdl().can_npt_operate();

            let has_redirtying_ever_failed =
                persistent_state.has_redirtying_ever_failed;
            if can_npt_operate && !has_redirtying_ever_failed {
                Some(Default::default())
            } else {
                info!(
                    log,
                    "guest pages not redirtyable, will offer all pages in pre-pause";
                    "can_npt_operate" => can_npt_operate,
                    "has_redirtying_ever_failed" => has_redirtying_ever_failed
                );
                None
            }
        };
        Self { log, migration_id, conn, dirt }
    }
}

#[async_trait::async_trait]
impl<T: MigrateConn> SourceProtocol for RonV0<T> {
    /// Consumes the negotiated context, pairs it with the VM and its state
    /// channels, and drives the migration to completion.
    async fn run(
        self,
        vm_objects: &VmObjects,
        publisher: &mut StatePublisher,
        persistent_state: &mut PersistentState,
    ) -> Result<(), MigrateError> {
        let Self { log, migration_id, conn, dirt } = self;

        // The VM is running when the migration starts.
        let mut runner = RonV0Runner {
            log,
            migration_id,
            conn,
            dirt,
            vm: vm_objects,
            state_publisher: publisher,
            persistent_state,
            paused: false,
        };

        runner.run().await
    }
}

/// The in-flight state of a single source-side RON v0 migration attempt: the
/// negotiated `RonV0` context plus borrowed access to the VM being migrated.
struct RonV0Runner<'vm, T: MigrateConn> {
    /// The logger for this migration attempt.
    log: slog::Logger,
    /// The migration's ID, attached to every published state update.
    migration_id: Uuid,
    /// Transport to the destination.
    conn: WebSocketStream<T>,
    /// Dirty bits to restore if the migration fails; `None` if they cannot be
    /// restored.
    dirt: Option<HashMap<GuestAddr, PageBitmap>>,
    /// The VM being migrated out.
    vm: &'vm VmObjects,
    /// Sink for migration state updates.
    state_publisher: &'vm mut StatePublisher,
    /// State that outlives this attempt; updated if restoring dirty bits
    /// fails.
    persistent_state: &'vm mut PersistentState,
    /// Whether this runner has paused the VM and not yet resumed it.
    paused: bool,
}

impl<T: MigrateConn> RonV0Runner<'_, T> {
    /// Returns the logger for this migration attempt.
    fn log(&self) -> &slog::Logger {
        &self.log
    }

    /// Publishes `state` as the current state of this migration, tagged with
    /// the migration's ID and the source role.
    fn update_state(&mut self, state: MigrationState) {
        let update = MigrationStateUpdate {
            state,
            id: self.migration_id,
            role: MigrateRole::Source,
        };

        self.state_publisher.update(ExternalStateUpdate::Migration(update));
    }

    /// Pauses the VM while holding its objects exclusively, and records that
    /// this runner did so.
    ///
    /// # Panics
    ///
    /// Panics if this runner has already paused the VM.
    async fn pause_vm(&mut self) {
        assert!(!self.paused);
        // The flag is set before the pause is awaited.
        self.paused = true;
        self.vm.lock_exclusive().await.pause().await;
    }

    /// Resumes the VM while holding its objects exclusively, and records that
    /// it is no longer paused by this runner.
    ///
    /// # Panics
    ///
    /// Panics if this runner has not paused the VM.
    async fn resume_vm(&mut self) {
        assert!(self.paused);
        self.paused = false;
        self.vm.lock_exclusive().await.resume();
    }

    /// Runs a single migration phase, dispatching to the method that
    /// implements it, and brackets it with the phase begin/end probes.
    ///
    /// The end probe fires whether or not the phase succeeded; the phase's
    /// result is returned unchanged.
    async fn run_phase(
        &mut self,
        step: MigratePhase,
    ) -> Result<(), MigrateError> {
        probes::migrate_phase_begin!(|| { step.to_string() });

        let res = match step {
            MigratePhase::MigrateSync => self.sync().await,
            MigratePhase::Pause => self.pause().await,
            // Both RAM push phases share one implementation, which inspects
            // the phase to decide what to offer.
            MigratePhase::RamPushPrePause | MigratePhase::RamPushPostPause => {
                self.ram_push(&step).await
            }
            MigratePhase::TimeData => self.time_data().await,
            MigratePhase::DeviceState => self.device_state().await,
            MigratePhase::RamPull => self.ram_pull().await,
            MigratePhase::ServerState => self.server_state().await,
            MigratePhase::Finish => self.finish().await,
        };

        probes::migrate_phase_end!(|| { step.to_string() });

        res
    }

    /// Runs every phase of the source side of the migration, in order,
    /// stopping at the first phase that fails.
    ///
    /// On failure this publishes the `Error` migration state, makes a
    /// best-effort attempt to tell the destination about the error, restores
    /// any recorded dirty bits, resumes the VM if this runner paused it, and
    /// returns the original error. On success the VM is left paused.
    ///
    /// # Panics
    ///
    /// Panics if every phase succeeded but the VM is not paused.
    async fn run(&mut self) -> Result<(), MigrateError> {
        info!(self.log(), "Entering Source Migration Task");

        // Run the phases inside an async block so that `?` short-circuits to
        // the cleanup below rather than out of this function.
        let result: Result<_, MigrateError> = async {
            self.run_phase(MigratePhase::MigrateSync).await?;
            self.run_phase(MigratePhase::RamPushPrePause).await?;
            self.run_phase(MigratePhase::Pause).await?;
            self.run_phase(MigratePhase::RamPushPostPause).await?;
            self.run_phase(MigratePhase::TimeData).await?;
            self.run_phase(MigratePhase::DeviceState).await?;
            self.run_phase(MigratePhase::RamPull).await?;
            self.run_phase(MigratePhase::ServerState).await?;
            self.run_phase(MigratePhase::Finish).await?;
            Ok(())
        }
        .await;

        if let Err(err) = result {
            self.update_state(MigrationState::Error);
            // Best-effort: a failure to notify the destination is ignored.
            let _ = self.send_msg(codec::Message::Error(err.clone())).await;

            // If we are capable of setting the dirty bit on guest page table
            // entries, re-dirty them, so that a later migration attempt can also
            // only offer dirty pages. If we can't use VM_NPT_OPERATION, a
            // subsequent migration attempt will offer all pages.
            //
            // See the lengthy comment on `RamOfferDiscipline` above for more
            // details about what's going on here.
            let vmm_hdl = self.vm.lock_shared().await.vmm_hdl().clone();
            // Iterating a `None` map yields nothing, so this loop is a no-op
            // when dirty bits are not being tracked.
            for (&GuestAddr(gpa), dirtiness) in self.dirt.iter().flatten() {
                if let Err(e) = vmm_hdl.set_dirty_pages(gpa, dirtiness) {
                    // Bad news! Our attempt to re-set the dirty bit on these
                    // pages has failed! Thus, subsequent migration attempts
                    // /!\ CAN NO LONGER RELY ON DIRTY PAGE TRACKING /!\
                    // and must always offer all pages in the initial RAM push
                    // phase.
                    //
                    // Record that now so we never try to do this again.
                    self.persistent_state.has_redirtying_ever_failed = true;
                    error!(
                        self.log(),
                        "failed to restore dirty bits: {e}";
                        "gpa" => gpa,
                    );
                    // No sense continuing to try putting back any remaining
                    // dirty bits, as we won't be using them any longer.
                    break;
                } else {
                    debug!(self.log(), "re-dirtied pages at {gpa:#x}",);
                }
            }

            // Only resume if the failure happened after this runner paused
            // the VM.
            if self.paused {
                self.resume_vm().await;
            }

            Err(err)
        } else {
            // The VM should be paused after successfully migrating out; the
            // state driver assumes as much when subsequently halting the
            // instance.
            assert!(self.paused);
            info!(self.log(), "Source Migration Successful");
            Ok(())
        }
    }

    /// Runs the sync phase: sends the destination a RON-serialized preamble
    /// carrying this VM's instance spec, then waits for its acknowledgement.
    async fn sync(&mut self) -> Result<(), MigrateError> {
        self.update_state(MigrationState::Sync);

        let spec = self.vm.lock_shared().await.instance_spec().clone().into();
        let preamble = Preamble::new(
            v1::instance_spec::VersionedInstanceSpec::V0(spec),
        );

        let serialized = ron::ser::to_string(&preamble)
            .map_err(codec::ProtocolError::from)?;
        self.send_msg(codec::Message::Serialized(serialized)).await?;

        self.read_ok().await
    }

    /// Runs one RAM push phase (pre-pause or post-pause).
    ///
    /// Publishes the matching migration state, reads the destination's memory
    /// query, offers it pages according to the applicable offer discipline,
    /// and then services `MemFetch` requests until the destination sends
    /// `MemDone`.
    ///
    /// # Errors
    ///
    /// Returns `MigrateError::Phase` if a fetch request carries an invalid
    /// bitmap, `MigrateError::UnexpectedMessage` if the destination sends
    /// anything other than `MemFetch` or `MemDone`, and otherwise propagates
    /// errors from the underlying reads, offers, and transfers.
    ///
    /// # Panics
    ///
    /// Panics if `phase` is not one of the two RAM push phases.
    async fn ram_push(
        &mut self,
        phase: &MigratePhase,
    ) -> Result<(), MigrateError> {
        match phase {
            MigratePhase::RamPushPrePause => {
                self.update_state(MigrationState::RamPush)
            }
            MigratePhase::RamPushPostPause => {
                self.update_state(MigrationState::RamPushDirty)
            }
            _ => unreachable!("should only push RAM in a RAM push phase"),
        }

        let vmm_ram_range = self.vmm_ram_bounds().await?;
        let req_ram_range = self.read_mem_query().await?;
        info!(
            self.log(),
            "ram_push ({:?}): got query for range {:#x?}, vm range {:#x?}",
            phase,
            req_ram_range,
            vmm_ram_range
        );

        // Determine whether we can offer only dirty pages, or if we must offer
        // all pages.
        //
        // Refer to the giant comment on `RamOfferDiscipline` above for more
        // details about this determination.
        let offer_discipline = match phase {
            // If we are in the pre-pause RAM push phase, and we don't have
            // VM_NPT_OPERATION to put back any dirty bits if the migration
            // fails, we have to offer all pages here.
            MigratePhase::RamPushPrePause if self.dirt.is_none() => {
                RamOfferDiscipline::OfferAll
            }
            // Otherwise, if we are in the post-pause phase, or if we *can* just
            // put back the dirty bits in the event of a migration failure, we
            // need only offer pages that have their dirty bit set.
            _ => RamOfferDiscipline::OfferDirty,
        };
        self.offer_ram(vmm_ram_range, req_ram_range, offer_discipline).await?;

        // Service the destination's fetch requests until it reports that it
        // is done.
        loop {
            let m = self.read_msg().await?;
            trace!(self.log(), "ram_push: source xfer phase recvd {:?}", m);
            match m {
                codec::Message::MemDone => break,
                codec::Message::MemFetch(start, end, bits) => {
                    if !memx::validate_bitmap(start, end, &bits) {
                        error!(self.log(), "invalid bitmap");
                        return Err(MigrateError::Phase);
                    }

                    // XXX: We should do stricter validation on the fetch
                    // request here.  For instance, we shouldn't "push" MMIO
                    // space or non-existent RAM regions.  While we de facto
                    // do not because of the way access is implemented, we
                    // should probably disallow it at the protocol level.
                    self.xfer_ram(start, end, &bits).await?;
                    // Probe arguments: page count, byte count, and 0 or 1 for
                    // the pre-pause or post-pause phase respectively.
                    probes::migrate_xfer_ram_region!(|| {
                        let bits = BitSlice::<_, Lsb0>::from_slice(&bits);
                        let pages = bits.count_ones() as u64;
                        (
                            pages,
                            pages * PAGE_SIZE as u64,
                            match phase {
                                MigratePhase::RamPushPrePause => 0,
                                MigratePhase::RamPushPostPause => 1,
                                _ => unreachable!(),
                            },
                        )
                    });
                }
                _ => return Err(MigrateError::UnexpectedMessage),
            };
        }
        info!(self.log(), "ram_push: done sending ram");
        // NOTE(review): `Pause` is published after both the pre-pause and the
        // post-pause push; confirm that is intended for the latter.
        self.update_state(MigrationState::Pause);
        Ok(())
    }

    /// Offers guest RAM to the destination one bitmap-sized chunk at a time,
    /// then sends a `MemEnd` message carrying the originally requested range.
    ///
    /// The range walked is the requested range clamped to the VMM's RAM
    /// bounds. For each chunk the dirty-page bitmap is always fetched via
    /// `track_dirty`; `offer_discipline` then decides whether every page in
    /// the chunk is offered (`OfferAll`) or only the pages whose dirty bit was
    /// set (`OfferDirty`).
    ///
    /// When `self.dirt` is `Some`, the dirty bits observed under `OfferDirty`
    /// are OR-ed into the saved bitmap keyed by the chunk's starting address.
    ///
    /// # Errors
    ///
    /// Propagates any error from `track_dirty` or `send_msg`.
    ///
    /// # Panics
    ///
    /// Panics if the clamped (inclusive) end address is `u64::MAX`.
    async fn offer_ram(
        &mut self,
        vmm_ram_range: RangeInclusive<GuestAddr>,
        req_ram_range: Range<u64>,
        offer_discipline: RamOfferDiscipline,
    ) -> Result<(), MigrateError> {
        info!(
            self.log(),
            "offering ram";
            "discipline" => ?offer_discipline,
            "can_redirty_pages" => self.dirt.is_some(),
        );
        let vmm_ram_start = *vmm_ram_range.start();
        let vmm_ram_end = *vmm_ram_range.end();
        // One bitmap buffer is reused for every chunk; `track_dirty` refills
        // it at the top of each loop iteration.
        let mut bits = [0u8; PAGE_BITMAP_SIZE];
        let req_start_gpa = req_ram_range.start;
        let req_end_gpa = req_ram_range.end;
        let start_gpa = req_start_gpa.max(vmm_ram_start.0);

        // The RAM bounds reported to this routine set the end of the range to
        // the last valid address in the address space (e.g. 0x6FFF for a
        // one-page range beginning at 0x6000), but this routine's callees
        // expect the end address to be the first invalid (page-aligned) address
        // (in our example, 0x7000 instead of 0x6FFF). Correct that here.
        //
        // N.B. This assumes that the guest address space is small enough to
        //      add 1 in this fashion without overflowing, i.e. the last valid
        //      GPA cannot be `u64::MAX`.
        //
        // NOTE(review): `read_mem_query` accepts a request end that is either
        //      page-aligned or `!0`. If a page-aligned end below the VMM's
        //      RAM end were ever requested, the `+ 1` below would produce an
        //      unaligned end address -- confirm that only `!0` is requested
        //      in practice, or that this is intended.
        let end_gpa = req_end_gpa.min(vmm_ram_end.0);
        assert!(end_gpa < u64::MAX);
        let end_gpa = end_gpa + 1;

        // Each bit in the bitmap describes one page, so a full bitmap spans
        // this many bytes of guest-physical address space.
        let step = bits.len() * 8 * PAGE_SIZE;
        for gpa in (start_gpa..end_gpa).step_by(step) {
            let mut pages_offered = 0;
            // Always capture the dirty page mask even if the offer discipline
            // says to offer all pages. This ensures that pages that are
            // transferred now and not touched again will not be offered again
            // by a subsequent phase.
            self.track_dirty(GuestAddr(gpa), &mut bits).await?;

            match offer_discipline {
                RamOfferDiscipline::OfferAll => {
                    // Overwrite whatever the dirty query returned: every page
                    // in this chunk is offered.
                    for byte in bits.iter_mut() {
                        *byte = 0xff;
                    }
                    pages_offered = PAGE_BITMAP_SIZE * 8;
                }
                RamOfferDiscipline::OfferDirty => {
                    let bits = BitSlice::<_, Lsb0>::from_slice(&bits);
                    let dirty_pages = bits.count_ones();
                    // Nothing dirty in this chunk: skip it without sending
                    // an offer.
                    if dirty_pages == 0 {
                        continue;
                    }

                    pages_offered += dirty_pages;

                    // If we're on a bhyve version that supports
                    // VM_NPT_OPERATION, we'll be able to put the dirty bits
                    // back in the event of a migration failure. Therefore,
                    // we need to hang onto any bytes and their indices so
                    // that we can rebuild the dirty page mask later, if
                    // necessary.
                    //
                    // If bhyve doesn't support VM_NPT_OPERATION, no sense
                    // hanging onto this. We'll just have to offer all pages
                    // in the initial RAM Offer phase, instead.
                    if let Some(ref mut dirt) = self.dirt {
                        let saved = dirt
                            .entry(GuestAddr(gpa))
                            .or_insert_with(|| [0u8; PAGE_BITMAP_SIZE]);
                        let saved = BitSlice::<_, Lsb0>::from_slice_mut(saved);
                        *saved |= bits;
                    }
                }
            }

            // The final chunk may be shorter than a full step.
            let end = end_gpa.min(gpa + step as u64);
            info!(
                self.log(),
                "ram_push: offering {pages_offered} pages between {gpa:#x} and {end:#x}"
            );
            if pages_offered > 0 {
                self.send_msg(memx::make_mem_offer(gpa, end, &bits)).await?;
            }
        }
        self.send_msg(codec::Message::MemEnd(req_start_gpa, req_end_gpa)).await
    }

    /// Transfers the pages selected by `bits` within `start..end` to the
    /// destination.
    ///
    /// A transfer header built by `memx::make_mem_xfer` is sent first. Then,
    /// for every address produced by `PageIter`, one page of guest memory is
    /// read and sent as a `Page` message.
    ///
    /// # Errors
    ///
    /// Propagates any error from `send_msg` or `read_guest_mem`.
    async fn xfer_ram(
        &mut self,
        start: u64,
        end: u64,
        bits: &[u8],
    ) -> Result<(), MigrateError> {
        info!(self.log(), "ram_push: xfer RAM between {start:#x} and {end:#x}",);

        // Announce which pages are about to follow.
        let header = memx::make_mem_xfer(start, end, bits);
        self.send_msg(header).await?;

        for page_gpa in PageIter::new(start, end, bits) {
            let mut page = [0u8; PAGE_SIZE];
            // The `GuestData` wrapper is a temporary, so its mutable borrow
            // of `page` ends with this statement.
            self.read_guest_mem(
                GuestAddr(page_gpa),
                &mut GuestData::from(page.as_mut_slice()),
            )
            .await?;
            self.send_msg(codec::Message::Page(page.into())).await?;
            probes::migrate_xfer_ram_page!(|| (page_gpa, PAGE_SIZE as u64));
        }

        Ok(())
    }

    /// Publishes the `Pause` migration state and waits for the VM to pause.
    ///
    /// This method has no failure path of its own and always returns `Ok`.
    async fn pause(&mut self) -> Result<(), MigrateError> {
        self.update_state(MigrationState::Pause);

        // Pausing the VM asks the instance to transition to the paused
        // state, which in turn informs each device that it should pause.
        info!(self.log(), "Pausing devices");
        self.pause_vm().await;

        Ok(())
    }

    /// Exports the state of every device and sends it to the destination.
    ///
    /// Each device's `migrate()` result decides how it is handled:
    /// `NonMigratable` aborts with an error, `Empty` contributes nothing,
    /// `Single` contributes one payload and `Multi` contributes one payload
    /// per exported part. Payloads are serialized with RON, and the whole
    /// collection is serialized with RON again and sent as one `Serialized`
    /// message. An `Okay` message is then sent and an `Okay` reply awaited.
    ///
    /// # Errors
    ///
    /// Returns `MigrateError::DeviceState` if a device is non-migratable, and
    /// propagates export, serialization, send and receive errors.
    async fn device_state(&mut self) -> Result<(), MigrateError> {
        self.update_state(MigrationState::Device);
        let mut device_states = vec![];
        {
            // The shared VM lock is scoped to this block, so it is released
            // before any message is sent below.
            let objects = self.vm.lock_shared().await;
            let migrate_ctx =
                MigrateCtx { mem: &objects.access_mem().unwrap() };

            // Collect together the serialized state for all the devices
            objects.for_each_device_fallible(|name, devop| {
                let mut dev = Device {
                    instance_name: name.to_string(),
                    payload: Vec::new(),
                };
                match devop.migrate() {
                    Migrator::NonMigratable => {
                        error!(self.log(),
                            "Can't migrate instance with non-migratable device ({})",
                            name);
                        return Err(MigrateError::DeviceState(
                                MigrateStateError::NonMigratable.to_string()));
                    },
                    // No device state needs to be transmitted for 'Empty' devices
                    Migrator::Empty => {},
                    Migrator::Single(mech) => {
                        let out = mech.export(&migrate_ctx)?;
                        dev.payload.push(DevicePayload {
                            kind: out.kind.to_owned(),
                            version: out.version,
                            data: ron::ser::to_string(&out.payload)
                                .map_err(codec::ProtocolError::from)?,
                        });
                        device_states.push(dev);
                    }
                    Migrator::Multi(mech) => {
                        let mut outputs = PayloadOutputs::new();
                        mech.export(&mut outputs, &migrate_ctx)?;

                        // One `DevicePayload` entry per exported part, all
                        // attached to the same device record.
                        for part in outputs {
                            dev.payload.push(DevicePayload {
                                kind: part.kind.to_owned(),
                                version: part.version,
                                data: ron::ser::to_string(&part.payload)
                                    .map_err(codec::ProtocolError::from)?,
                            });
                        }
                        device_states.push(dev);
                    }
                }
                Ok(())
            })?;
        }

        info!(self.log(), "Device States: {device_states:#?}");

        self.send_msg(codec::Message::Serialized(
            ron::ser::to_string(&device_states)
                .map_err(codec::ProtocolError::from)?,
        ))
        .await?;

        // Tell the destination the device state is complete, then wait for
        // it to acknowledge.
        self.send_msg(codec::Message::Okay).await?;
        self.read_ok().await
    }

    /// Reads the VMM's time data and sends it to the destination.
    ///
    /// The data is exported through `vmm::time::export_time_data`, serialized
    /// with RON and sent as a `Serialized` message, after which an `Okay`
    /// reply is awaited.
    ///
    /// # Errors
    ///
    /// Returns `MigrateError::TimeData` if the export fails, and propagates
    /// serialization, send and receive errors.
    async fn time_data(&mut self) -> Result<(), MigrateError> {
        // Clone the handle so that the shared VM lock is only held for the
        // duration of this statement.
        let vmm_hdl = self.vm.lock_shared().await.vmm_hdl().clone();
        let vm_time_data = match vmm::time::export_time_data(&vmm_hdl) {
            Ok(exported) => exported,
            Err(e) => {
                return Err(MigrateError::TimeData(format!(
                    "VMM Time Data export error: {e}"
                )));
            }
        };
        info!(self.log(), "VMM Time Data: {:#?}", vm_time_data);

        let encoded = ron::ser::to_string(&vm_time_data)
            .map_err(codec::ProtocolError::from)?;
        info!(self.log(), "VMM Time Data: {:#?}", encoded);

        let payload = codec::Message::Serialized(encoded);
        self.send_msg(payload).await?;

        self.read_ok().await
    }

    /// Runs the RAM pull exchange with the destination.
    ///
    /// Reads one message (logged as the query), answers it with
    /// `MemEnd(0, !0)`, and reads one more message (logged as the done
    /// notification). Neither received message is inspected beyond being
    /// logged. The migration state is published as `RamPush` on entry and as
    /// `Pause` followed by `RamPushDirty` before the reply is sent.
    ///
    /// # Errors
    ///
    /// Propagates any error from `read_msg` or `send_msg`.
    async fn ram_pull(&mut self) -> Result<(), MigrateError> {
        self.update_state(MigrationState::RamPush);
        let query = self.read_msg().await?;
        info!(self.log(), "ram_pull: got query {:?}", query);

        self.update_state(MigrationState::Pause);
        self.update_state(MigrationState::RamPushDirty);
        let reply = codec::Message::MemEnd(0, !0);
        self.send_msg(reply).await?;

        let done = self.read_msg().await?;
        info!(self.log(), "ram_pull: got done {:?}", done);

        Ok(())
    }

    /// Exchanges server-level state with the destination.
    ///
    /// Expects a `Serialized` message whose RON payload is decoded and handed
    /// to the COM1 device's `export_history`. The exported history is sent
    /// back as a `Serialized` message, and an `Okay` reply is then awaited.
    ///
    /// # Errors
    ///
    /// Returns `MigrateError::UnexpectedMessage` if the first message is not
    /// `Serialized`, and propagates decoding, export, send and receive
    /// errors.
    async fn server_state(&mut self) -> Result<(), MigrateError> {
        self.update_state(MigrationState::Server);

        let encoded_addr = match self.read_msg().await? {
            Message::Serialized(encoded) => encoded,
            _ => return Err(MigrateError::UnexpectedMessage),
        };
        let remote_addr =
            ron::from_str(&encoded_addr).map_err(codec::ProtocolError::from)?;

        // The lock guard is a temporary of this statement, so it is released
        // before the history is sent.
        let com1_history = self
            .vm
            .lock_shared()
            .await
            .com1()
            .export_history(remote_addr)
            .await?;

        let reply = codec::Message::Serialized(com1_history);
        self.send_msg(reply).await?;
        self.read_ok().await
    }

    /// Completes the migration by handing control of the guest to the
    /// destination.
    ///
    /// Waits for the destination's `Okay`, replies with `Okay` (the handoff),
    /// publishes `MigrationState::Finish`, and finally checks that no guest
    /// page is still marked dirty.
    ///
    /// # Errors
    ///
    /// Propagates errors from reading the destination's acknowledgement or
    /// sending the handoff message. No error is returned after the handoff
    /// message has been sent.
    ///
    /// # Panics
    ///
    /// After the handoff this method panics, rather than returning an error,
    /// if the VMM's RAM bounds or dirty-page bitmap cannot be read, or if any
    /// dirty page is found.
    async fn finish(&mut self) -> Result<(), MigrateError> {
        // Wait for the destination to acknowledge that it's ready to run the
        // VM.
        self.read_ok().await?;

        // Hand control over to the destination. If this send fails, the
        // destination won't run the VM and it can resume here.
        //
        // N.B. After this message is sent, this Propolis (and any of its
        //      overseers) must assume that the destination has begun running
        //      the guest.
        self.send_msg(codec::Message::Okay).await?;

        // Now that handoff is complete, publish that the migration has
        // succeeded.
        self.update_state(MigrationState::Finish);

        // This VMM is going away, so if any guest memory is still dirty, it
        // won't be transferred. Assert that there is no such memory.
        //
        // The unwraps in the block below amount to assertions that the VMM
        // exists at this point (it should). Note that returning an error here
        // is not permitted because that will cause migration to unwind and the
        // VM to resume, which is forbidden at this point (see above).
        let vmm_range = self.vmm_ram_bounds().await.unwrap();
        let mut bits = [0u8; PAGE_BITMAP_SIZE];
        // Bytes of guest-physical address space covered by one full bitmap
        // (one bit per page).
        let step = bits.len() * 8 * PAGE_SIZE;
        for gpa in (vmm_range.start().0..vmm_range.end().0).step_by(step) {
            self.track_dirty(GuestAddr(gpa), &mut bits).await.unwrap();
            let pages_left_behind =
                BitSlice::<_, Lsb0>::from_slice(&bits).count_ones() as u64;
            assert_eq!(
                0,
                pages_left_behind,
                "{pages_left_behind} dirty pages left behind between {:#x}..{:#x}",
                gpa,
                gpa + step as u64,
            );
        }

        Ok(())
    }

    /// Reads the next message from the migration connection and decodes it
    /// into a `codec::Message`.
    ///
    /// # Errors
    ///
    /// - A `BrokenPipe` I/O protocol error if the connection yields no
    ///   further items.
    /// - A `WebsocketError` protocol error if the connection yields an error.
    /// - Whatever error the conversion into `codec::Message` produces.
    /// - `MigrateError::RemoteError` (attributed to the destination) if the
    ///   decoded message is `codec::Message::Error`; the remote error is also
    ///   logged.
    async fn read_msg(&mut self) -> Result<codec::Message, MigrateError> {
        self.conn
            .next()
            .await
            // `None` means the stream has ended.
            .ok_or_else(|| {
                codec::ProtocolError::Io(io::Error::from(
                    io::ErrorKind::BrokenPipe,
                ))
            })?
            .map_err(|e| codec::ProtocolError::WebsocketError(Box::new(e)))
            // convert tungstenite::Message to codec::Message
            .and_then(std::convert::TryInto::try_into)
            // If this is an error message, lift that out
            .map(|msg| match msg {
                codec::Message::Error(err) => {
                    error!(
                        self.log(),
                        "migration failed due to error from target: {err}"
                    );
                    Err(MigrateError::RemoteError(
                        MigrateRole::Destination,
                        err.to_string(),
                    ))
                }
                msg => Ok(msg),
            })?
        // The value here is a nested `Result`: the trailing `?` above
        // propagates the outer (protocol) error and leaves the inner
        // `Result` as this function's return value.
    }

    /// Reads the next message and requires it to be `Okay`.
    ///
    /// # Errors
    ///
    /// Propagates errors from `read_msg`. Any message other than `Okay` is
    /// logged and reported as `MigrateError::UnexpectedMessage`.
    async fn read_ok(&mut self) -> Result<(), MigrateError> {
        let msg = self.read_msg().await?;
        if matches!(msg, codec::Message::Okay) {
            return Ok(());
        }

        error!(self.log(), "expected `Okay` but received: {msg:?}");
        Err(MigrateError::UnexpectedMessage)
    }

    /// Reads a `MemQuery` message and returns the address range it requests.
    ///
    /// The start must be a multiple of `PAGE_SIZE`. The end must either be a
    /// multiple of `PAGE_SIZE` or be `!0`.
    ///
    /// # Errors
    ///
    /// Propagates errors from `read_msg`. Returns
    /// `MigrateError::UnexpectedMessage` (after logging) if the message is
    /// not a `MemQuery`, and `MigrateError::Phase` if the bounds fail the
    /// alignment rules above.
    async fn read_mem_query(&mut self) -> Result<Range<u64>, MigrateError> {
        let (start, end) = match self.read_msg().await? {
            codec::Message::MemQuery(start, end) => (start, end),
            msg => {
                error!(self.log(), "expected `MemQuery` but received: {msg:?}");
                return Err(MigrateError::UnexpectedMessage);
            }
        };

        let page_size = PAGE_SIZE as u64;
        let start_is_aligned = start % page_size == 0;
        let end_is_acceptable = end % page_size == 0 || end == !0;
        if start_is_aligned && end_is_acceptable {
            Ok(start..end)
        } else {
            Err(MigrateError::Phase)
        }
    }

    /// Converts `m` into the connection's message type and sends it.
    ///
    /// # Errors
    ///
    /// Propagates a failure to convert `m` as well as a failure to send it.
    async fn send_msg(
        &mut self,
        m: codec::Message,
    ) -> Result<(), MigrateError> {
        self.conn.send(m.try_into()?).await?;
        Ok(())
    }

    /// Returns the guest memory bounds reported by the VM's memory context.
    ///
    /// # Errors
    ///
    /// Returns `MigrateError::InvalidInstanceState` if the memory context
    /// reports no bounds.
    ///
    /// # Panics
    ///
    /// Panics if the memory context cannot be accessed.
    async fn vmm_ram_bounds(
        &mut self,
    ) -> Result<RangeInclusive<GuestAddr>, MigrateError> {
        let vm_objects = self.vm.lock_shared().await;
        let mem = vm_objects.access_mem().unwrap();
        let bounds = mem.mem_bounds();
        match bounds {
            Some(range) => Ok(range),
            None => Err(MigrateError::InvalidInstanceState),
        }
    }

    /// Asks the VMM handle to fill `bits` with the dirty-page bitmap for the
    /// region beginning at `start_gpa`.
    ///
    /// # Errors
    ///
    /// Any failure reported by the VMM handle is mapped to
    /// `MigrateError::InvalidInstanceState`; the underlying error is
    /// discarded.
    async fn track_dirty(
        &mut self,
        start_gpa: GuestAddr,
        bits: &mut [u8],
    ) -> Result<(), MigrateError> {
        let vm_objects = self.vm.lock_shared().await;
        let tracked = vm_objects.vmm_hdl().track_dirty_pages(start_gpa.0, bits);
        tracked.map_err(|_| MigrateError::InvalidInstanceState)
    }

    /// Reads `buf.len()` bytes of guest memory starting at `addr` into `buf`.
    ///
    /// This method always returns `Ok`.
    ///
    /// NOTE(review): whatever `direct_read_into` returns is discarded here,
    /// so a failed or short read would not be reported -- confirm that this
    /// is intended.
    ///
    /// # Panics
    ///
    /// Panics if the memory context cannot be accessed.
    async fn read_guest_mem(
        &mut self,
        addr: GuestAddr,
        buf: &mut GuestData<&mut [u8]>,
    ) -> Result<(), MigrateError> {
        let vm_objects = self.vm.lock_shared().await;
        let mem = vm_objects.access_mem().unwrap();

        // Capture the length up front: `buf` is handed over mutably in the
        // call below.
        let nbytes = buf.len();
        mem.direct_read_into(addr, buf, nbytes);

        Ok(())
    }
}


================================================
FILE: bin/propolis-server/src/lib/serial/history_buffer.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Maintains a buffer of an instance's serial console data, holding both the
//! first mebibyte and the most recent mebibyte of console output.

use dropshot::HttpError;
use propolis_api_types::serial::{
    InstanceSerialConsoleHistoryRequest, InstanceSerialConsoleStreamRequest,
};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::convert::TryFrom;

/// Errors returned when querying a [`HistoryBuffer`].
#[derive(thiserror::Error, Debug)]
pub enum Error {
    /// The requested offset falls in the gap between the retained beginning
    /// of the output and the retained most-recent output, so the data is no
    /// longer available.
    #[error("Requested byte offset {requested_from_start} (-{requested_from_end}) is no longer cached: Only have first {beginning_len} + most recent {rolling_len} saved out of a total {total_bytes} of output")]
    ExpiredRange {
        /// The requested offset, measured from the start of the output.
        requested_from_start: usize,
        /// The same offset, measured backwards from the end of the output.
        requested_from_end: usize,
        /// Number of bytes retained from the beginning of the output.
        beginning_len: usize,
        /// Number of most-recent bytes retained.
        rolling_len: usize,
        /// Total number of bytes fed to the buffer so far.
        total_bytes: usize,
    },
}

/// Buffer size (1 MiB) used by `HistoryBuffer::default()`.
const TTY_BUFFER_SIZE: usize = 1024 * 1024;
/// Maximum number of bytes (16 KiB) returned by `HistoryBuffer::contents_vec`
/// when the caller does not supply a limit.
const DEFAULT_MAX_LENGTH: isize = 16 * 1024;

/// An abstraction for storing the contents of the instance's serial console
/// output, intended for retrieval by the web console or other monitoring or
/// troubleshooting tools.
#[derive(Deserialize, Serialize, Clone)]
pub(crate) struct HistoryBuffer {
    /// The earliest bytes of output. Filled from bytes evicted from
    /// `rolling`, up to `buffer_size` bytes.
    beginning: Vec<u8>,
    /// The most recent bytes of output, trimmed back to at most
    /// `buffer_size` bytes whenever new data is consumed.
    rolling: VecDeque<u8>,
    /// Total number of bytes ever consumed, whether or not they are still
    /// retained.
    total_bytes: usize,
    /// Size limit, in bytes, applied to `beginning` and to `rolling`
    /// individually.
    buffer_size: usize,
}

/// Identifies a position in the serial console output, either relative to
/// the start of the output or relative to its end.
#[derive(Copy, Clone)]
pub(crate) enum SerialHistoryOffset {
    /// The byte index since instance start.
    FromStart(usize),
    /// The byte index *backwards* from the most recently buffered data.
    MostRecent(usize),
}

impl TryFrom<&InstanceSerialConsoleStreamRequest> for SerialHistoryOffset {
    type Error = ();
    fn try_from(req: &InstanceSerialConsoleStreamRequest) -> Result<Self, ()> {
        match req {
            InstanceSerialConsoleStreamRequest {
                from_start: Some(offset),
                most_recent: None,
            } => Ok(SerialHistoryOffset::FromStart(*offset as usize)),
            InstanceSerialConsoleStreamRequest {
                from_start: None,
                most_recent: Some(offset),
            } => Ok(SerialHistoryOffset::MostRecent(*offset as usize)),
            _ => Err(()),
        }
    }
}

/// Converts a history request into an offset. Exactly one of `from_start`
/// and `most_recent` must be set; anything else is rejected as a bad
/// request.
impl TryFrom<&InstanceSerialConsoleHistoryRequest> for SerialHistoryOffset {
    type Error = HttpError;

    fn try_from(
        req: &InstanceSerialConsoleHistoryRequest,
    ) -> Result<SerialHistoryOffset, HttpError> {
        match (req.from_start, req.most_recent) {
            (Some(offset), None) => Ok(Self::FromStart(offset as usize)),
            (None, Some(offset)) => Ok(Self::MostRecent(offset as usize)),
            // Both or neither were supplied.
            _ => {
                let message =
                    "Exactly one of 'from_start' or 'most_recent' must be specified."
                        .to_string();
                Err(HttpError::for_bad_request(None, message))
            }
        }
    }
}

impl Default for HistoryBuffer {
    fn default() -> Self {
        Self::new(TTY_BUFFER_SIZE)
    }
}

impl HistoryBuffer {
    pub fn new(buffer_size: usize) -> Self {
        HistoryBuffer {
            beginning: Vec::with_capacity(buffer_size),
            rolling: VecDeque::with_capacity(buffer_size),
            buffer_size,
            total_bytes: 0,
        }
    }

    /// Feeds the buffer new bytes from the serial console.
    pub fn consume(&mut self, data: &[u8]) {
        self.rolling.extend(data);
        if self.rolling.len() > self.buffer_size {
            let to_drain = self.rolling.len() - self.buffer_size;
            let to_capture = self.buffer_size - self.beginning.len();
            let drain = self.rolling.drain(0..to_drain).take(to_capture);
            self.beginning.extend(drain);
        }
        self.total_bytes += data.len();
        super::probes::serial_buffer_size!(|| self.total_bytes);
    }

    /// Returns a tuple containing:
    /// - an iterator of serial console bytes from the live buffer.
    /// - the absolute byte index since instance start at which the iterator
    ///   *begins*.
    pub fn contents_iter(
        &self,
        byte_offset: SerialHistoryOffset,
    ) -> Result<(Box<dyn Iterator<Item = u8> + '_>, usize), Error> {
        let (from_start, from_end) =
            self.offsets_from_start_and_end(byte_offset);

        // determine whether we should pull from beginning or rolling (or if
        // we're straddling both)
        if self.total_bytes == self.rolling.len() + self.beginning.len() {
            // still contiguous
            Ok((
                Box::new(
                    self.beginning
                        .iter()
                        .chain(self.rolling.iter())
                        .skip(from_start)
                        .copied(),
                ),
                from_start,
            ))
        } else if from_start < self.beginning.len() {
            // requesting from beginning buffer
            Ok((
                Box::new(self.beginning.iter().copied().skip(from_start)),
                from_start,
            ))
        } else if from_end <= self.rolling.len() {
            // (apologies to Takenobu Mitsuyoshi)
            let rolling_start = self.rolling.len() - from_end;
            Ok((
                Box::new(self.rolling.iter().copied().skip(rolling_start)),
                from_start,
            ))
        } else {
            Err(Error::ExpiredRange {
                requested_from_start: from_start,
                requested_from_end: from_end,
                beginning_len: self.beginning.len(),
                rolling_len: self.rolling.len(),
                total_bytes: self.total_bytes,
            })
        }
    }

    /// Returns a tuple containing:
    /// - a `Vec` of the requested range of serial console bytes from the live
    ///   buffer.
    /// - the absolute byte index since instance start at which the `Vec<u8>`
    ///   *ends*.
    ///
    /// This given that `byte_offset` indicates the index from which the
    /// returned `Vec<u8>` should start, and a `max_bytes` parameter, specifies
    /// a maximum length for the returned `Vec<u8>`, which will be
    /// `DEFAULT_MAX_LENGTH` if left unspecified.
    pub fn contents_vec(
        &self,
        byte_offset: SerialHistoryOffset,
        max_bytes: Option<usize>,
    ) -> Result<(Vec<u8>, usize), Error> {
        let (iter, from_start) = self.contents_iter(byte_offset)?;
        let data: Vec<u8> = iter
            .take(max_bytes.unwrap_or(DEFAULT_MAX_LENGTH as usize))
            .collect();
        let end_offset = from_start + data.len();
        Ok((data, end_offset))
    }

    fn offsets_from_start_and_end(
        &self,
        byte_offset: SerialHistoryOffset,
    ) -> (usize, usize) {
        match byte_offset {
            SerialHistoryOffset::FromStart(offset) => {
                if self.total_bytes > offset {
                    (offset, self.total_bytes - offset)
                } else {
                    // if asking for a byte offset we haven't reached yet, just
                    // start from the end.
                    (self.total_bytes, 0)
                }
            }
            SerialHistoryOffset::MostRecent(offset) => {
                if self.total_bytes > offset {
                    (self.total_bytes - offset, offset)
                } else {
                    // if asking for the most recent N > total_bytes, just start
                    // from the beginning.
                    (0, self.total_bytes)
                }
            }
        }
    }

    /// Returns the number of bytes output since instance boot.
    pub fn bytes_from_start(&self) -> usize {
        self.total_bytes
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use SerialHistoryOffset::*;

    // for more legible assertions
    fn sugar(
        buf: &HistoryBuffer,
        byte_offset: SerialHistoryOffset,
        max_bytes: usize,
    ) -> (String, usize) {
        buf.contents_vec(byte_offset, Some(max_bytes))
            .map(|x| (String::from_utf8(x.0).expect("invalid utf-8"), x.1))
            .expect("serial range query failed")
    }

    // Exercises range queries against a small (16-byte) buffer in three
    // situations: while it is empty, while all of the output is still
    // retained contiguously, and after bytes in the middle of the output have
    // been discarded.
    #[test]
    fn test_continuous_buffer_range_abstraction() {
        let mut buf = HistoryBuffer::new(16);

        // An empty buffer yields nothing, whatever the offset or limit.
        assert_eq!(buf.contents_vec(FromStart(0), None).unwrap(), (vec![], 0));
        assert_eq!(sugar(&buf, FromStart(0), 0), (String::new(), 0));
        assert_eq!(sugar(&buf, FromStart(0), 11), (String::new(), 0));
        assert_eq!(sugar(&buf, FromStart(11), 0), (String::new(), 0));
        assert_eq!(sugar(&buf, FromStart(11), 11), (String::new(), 0));

        let line = "This is an example of text.";
        let line_bytes = line.as_bytes().to_vec();

        // Feed the line in two pieces; a query may return fewer bytes than
        // the limit when it reaches the end of the data seen so far.
        buf.consume(&Vec::from(&line_bytes[..9]));
        assert_eq!(sugar(&buf, FromStart(8), 5), ("a".to_string(), 9));
        buf.consume(&Vec::from(&line_bytes[9..]));

        // The whole 27-byte line is still retained, split between the
        // beginning and rolling buffers, so every offset is reachable.
        assert_eq!(
            buf.contents_vec(FromStart(0), None).unwrap(),
            (line_bytes, line.len())
        );
        assert_eq!(
            sugar(&buf, FromStart(0), line.len() + 10),
            (line.to_string(), line.len())
        );
        assert_eq!(sugar(&buf, FromStart(8), 5), ("an ex".to_string(), 13));
        assert_eq!(
            sugar(&buf, FromStart(100), 10),
            (String::new(), line.len())
        );
        assert_eq!(sugar(&buf, MostRecent(10), 4), ("e of".to_string(), 21));
        assert_eq!(
            sugar(&buf, MostRecent(10), 400),
            ("e of text.".to_string(), line.len())
        );
        assert_eq!(sugar(&buf, MostRecent(100), 4), ("This".to_string(), 4));

        // After this, only the first 16 and the last 16 bytes of the 52
        // consumed are retained.
        buf.consume("\nNo thing beside remains.".as_bytes());
        assert_eq!(sugar(&buf, MostRecent(10), 4), ("e re".to_string(), 46));
        // Reads that start in the retained beginning stop where it ends.
        assert_eq!(sugar(&buf, FromStart(8), 8), ("an examp".to_string(), 16));
        assert_eq!(sugar(&buf, FromStart(8), 12), ("an examp".to_string(), 16));

        // Offset 16 lies in the discarded middle of the output.
        assert!(buf.contents_vec(FromStart(16), None).is_err());
    }
}


================================================
FILE: bin/propolis-server/src/lib/serial/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Routines to expose a connection to an instance's serial port.

use crate::migrate::MigrateError;

use std::collections::HashMap;
use std::net::SocketAddr;
use std::num::NonZeroUsize;
use std::ops::Range;
use std::sync::Arc;
use std::time::Duration;

use crate::serial::history_buffer::{HistoryBuffer, SerialHistoryOffset};
use futures::future::Fuse;
use futures::stream::SplitSink;
use futures::{FutureExt, SinkExt, StreamExt};
use propolis::chardev::{pollers, Sink, Source};
use propolis_api_types::serial::InstanceSerialConsoleControlMessage;
use slog::{info, warn, Logger};
use thiserror::Error;
use tokio::sync::{mpsc, oneshot, Mutex, RwLock as AsyncRwLock};
use tokio::task::JoinHandle;
use tokio_tungstenite::tungstenite::protocol::{
    frame::coding::CloseCode, CloseFrame,
};
use tokio_tungstenite::tungstenite::Message;
use tokio_tungstenite::{tungstenite, WebSocketStream};

pub(crate) mod history_buffer;

// USDT probes for the serial console code, published under the "propolis"
// provider.
#[usdt::provider(provider = "propolis")]
mod probes {
    // Fired when the serial task receives something on its control channel.
    fn serial_close_recv() {}
    // Fired when the serial task receives something on its new-websocket
    // channel.
    fn serial_new_ws() {}
    // Fired after a write into the UART; `n` is the number of bytes written
    // (0 if the write reported none).
    fn serial_uart_write(n: usize) {}
    // Fired when a send of UART output to the connected websockets finishes.
    fn serial_uart_out() {}
    fn serial_uart_read(n: usize) {}
    fn serial_inject_uart() {}
    fn serial_ws_recv() {}
    // Fired by the history buffer after consuming data; `n` is the total
    // number of bytes consumed so far.
    fn serial_buffer_size(n: usize) {}
}

/// Errors which may occur during the course of a serial connection.
#[derive(Error, Debug)]
pub enum SerialTaskError {
    /// Upgrading an HTTP request to a WebSocket connection failed.
    #[error("Cannot upgrade HTTP request to WebSockets: {0}")]
    Upgrade(#[from] hyper::Error),

    /// The WebSocket layer reported an error.
    #[error("WebSocket Error: {0}")]
    WebSocket(#[from] tungstenite::Error),

    /// An I/O operation failed.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// While closing client connections, the sink and stream halves of a
    /// WebSocket could not be paired back up.
    #[error("Mismatched websocket streams while closing")]
    MismatchedStreams,

    /// Receiving from a oneshot notification channel failed.
    #[error("Error while waiting for notification: {0}")]
    OneshotRecv(#[from] oneshot::error::RecvError),

    /// A control message could not be converted to or from JSON.
    #[error("JSON marshalling error while processing control message: {0}")]
    Json(#[from] serde_json::Error),
}

/// Control messages accepted by the serial task. The task terminates after
/// handling either variant.
pub enum SerialTaskControlMessage {
    /// The VM is stopping: client connections are closed with a
    /// "VM stopped" close frame.
    Stopping,
    /// The VM is migrating: every connected client is sent a `Migrating`
    /// control message carrying these two values.
    ///
    /// NOTE(review): `from_start` is presumably the console byte offset at
    /// which clients should resume on the destination -- confirm against
    /// the sender.
    Migration { destination: SocketAddr, from_start: u64 },
}

/// The join handle of a serial console task together with the channels used
/// to communicate with it.
pub struct SerialTask {
    /// Handle to attached serial session
    pub task: JoinHandle<()>,
    /// Channel used to signal the task to terminate gracefully or notify
    /// clients of a migration
    pub control_ch: mpsc::Sender<SerialTaskControlMessage>,
    /// Channel used to send new client connections to the streaming task
    pub websocks_ch:
        mpsc::Sender<WebSocketStream<dropshot::WebsocketConnectionRaw>>,
}

pub async fn instance_serial_task<Device: Sink + Source>(
    mut websocks_recv: mpsc::Receiver<
        WebSocketStream<dropshot::WebsocketConnectionRaw>,
    >,
    mut control_recv: mpsc::Receiver<SerialTaskControlMessage>,
    serial: Arc<Serial<Device>>,
    log: Logger,
) -> Result<(), SerialTaskError> {
    info!(log, "Entered serial task");
    let mut output = [0u8; 1024];
    let mut cur_output: Option<Range<usize>> = None;
    let mut cur_input: Option<(Vec<u8>, usize)> = None;

    let mut ws_sinks: HashMap<
        usize,
        SplitSink<WebSocketStream<dropshot::WebsocketConnectionRaw>, Message>,
    > = HashMap::new();
    let mut ws_streams: HashMap<
        usize,
        futures::stream::SplitStream<
            WebSocketStream<dropshot::WebsocketConnectionRaw>,
        >,
    > = HashMap::new();

    let (send_ch, mut recv_ch) = mpsc::channel(4);

    let mut next_stream_id = 0usize;

    loop {
        let (uart_read, ws_send) =
            if ws_sinks.is_empty() || cur_output.is_none() {
                (serial.read_source(&mut output).fuse(), Fuse::terminated())
            } else {
                let range = cur_output.clone().unwrap();
                (
                    Fuse::terminated(),
                    if !ws_sinks.is_empty() {
                        futures::stream::iter(
                            ws_sinks.iter_mut().zip(std::iter::repeat(
                                Vec::from(&output[range]),
                            )),
                        )
                        .for_each_concurrent(4, |((_i, ws), bin)| {
                            ws.send(Message::binary(bin)).map(|_| ())
                        })
                        .fuse()
                    } else {
                        Fuse::terminated()
                    },
                )
            };

        let (ws_recv, uart_write) = match &cur_input {
            None => (
                if !ws_streams.is_empty() {
                    futures::stream::iter(ws_streams.iter_mut())
                        .for_each_concurrent(4, |(i, ws)| {
                            ws.next()
                                .then(|msg| send_ch.send((*i, msg)))
                                .map(|_| ())
                        })
                        .fuse()
                } else {
                    Fuse::terminated()
                },
                Fuse::terminated(),
            ),
            Some((data, consumed)) => (
                Fuse::terminated(),
                serial.write_sink(&data[*consumed..]).fuse(),
            ),
        };

        let input_recv_ch_fut = recv_ch.recv().fuse();
        let new_ws_recv = websocks_recv.recv().fuse();
        let control_recv_fut = control_recv.recv().fuse();

        tokio::select! {
            // Poll in the order written
            biased;

            // It's important we always poll the close channel first
            // so that a constant stream of incoming/outgoing messages
            // don't cause us to ignore it
            message = control_recv_fut => {
                probes::serial_close_recv!(|| {});
                match message {
                    Some(SerialTaskControlMessage::Stopping) | None => {
                        // Gracefully close the connections to any clients
                        for (i, ws0) in ws_sinks.into_iter() {
                            let ws1 = ws_streams.remove(&i).ok_or(SerialTaskError::MismatchedStreams)?;
                            let mut ws = ws0.reunite(ws1).map_err(|_| SerialTaskError::MismatchedStreams)?;
                            let _ = ws.close(Some(CloseFrame {
                                code: CloseCode::Away,
                                reason: "VM stopped".into(),
                            })).await;
                        }
                    }
                    Some(SerialTaskControlMessage::Migration { destination, from_start }) => {
                        let mut failures = 0;
                        for sink in ws_sinks.values_mut() {
                            if sink.send(Message::Text(serde_json::to_string(
                                &InstanceSerialConsoleControlMessage::Migrating {
                                    destination,
                                    from_start,
                                }
                            )?)).await.is_err() {
                                failures += 1;
                            }
                        }
                        if failures > 0 {
                            warn!(log, "Failed to send migration info to {} connected clients.", failures);
                        }
                    }
                }
                info!(log, "Terminating serial task");
                break;
            }

            new_ws = new_ws_recv => {
                probes::serial_new_ws!(|| {});
                if let Some(ws) = new_ws {
                    let (ws_sink, ws_stream) = ws.split();
                    ws_sinks.insert(next_stream_id, ws_sink);
                    ws_streams.insert(next_stream_id, ws_stream);
                    next_stream_id += 1;
                }
            }

            // Write bytes into the UART from the WS
            written = uart_write => {
                probes::serial_uart_write!(|| { written.unwrap_or(0) });
                match written {
                    Some(0) | None => {
                        break;
                    }
                    Some(n) => {
                        let (data, consumed) = cur_input.as_mut().unwrap();
                        *consumed += n;
                        if *consumed == data.len() {
                            cur_input = None;
                        }
                    }
                }
            }

            // Transmit bytes from the UART through the WS
            _ = ws_send => {
                probes::serial_uart_out!(|| {});
                cur_output = None;
            }

            // Read bytes from the UART to be transmitted out the WS
            nread = uart_read => {
                // N.B. Putting this probe inside the match arms below causes
                //      the `break` arm to be taken unexpectedly. See
                //      propolis#292 for details.
                probes::serial_uart_read!(|| { nread.unwrap_or(0) });
                match nread {
                    Some(0) | None => {
                        break;
                    }
                    Some(n) => {
                        cur_output = Some(0..n)
                    }
                }
            }

            // Receive bytes from the intermediate channel to be injected into
            // the UART. This needs to be checked before `ws_recv` so that
            // "close" messages can be processed and their indicated
            // sinks/streams removed before they are polled again.
            pair = input_recv_ch_fut => {
                probes::serial_inject_uart!(|| {});
                if let Some((i, msg)) = pair {
                    match msg {
                        Some(Ok(Message::Binary(input))) => {
                            cur_input = Some((input, 0));
                        }
                        Some(Ok(Message::Close(..))) | None => {
                            info!(log, "Removing closed serial connection {}.", i);
                            let sink = ws_sinks.remove(&i).ok_or(SerialTaskError::MismatchedStreams)?;
                            let stream = ws_streams.remove(&i).ok_or(SerialTaskError::MismatchedStreams)?;
                            if let Err(e) = sink.reunite(stream).map_err(|_| SerialTaskError::MismatchedStreams)?.close(None).await {
                                warn!(log, "Failed while closing stream {}: {}", i, e);
                            }
                        },
                        _ => continue,
                    }
                }
            }

            // Receive bytes from connected WS clients to feed to the
            // intermediate recv_ch
            _ = ws_recv => {
                probes::serial_ws_recv!(|| {});
            }
        }
    }
    info!(log, "Returning from serial task");
    Ok(())
}

/// Represents a serial connection into the VM.
///
/// Wraps a UART-like `Device` together with one buffered poller per
/// direction and a history of the bytes that have been read from the device.
pub struct Serial<Device: Sink + Source> {
    /// The device which data is read from / written to.
    uart: Arc<Device>,

    /// Sender used to pass control messages (such as migration notices) to
    /// the serial task. `None` until `set_task_control_sender` is called.
    task_control_ch: Mutex<Option<mpsc::Sender<SerialTaskControlMessage>>>,

    /// Buffer used by `write_sink` for data headed to the device.
    sink_poller: Arc<pollers::SinkBuffer>,
    /// Buffer used by `read_source` for data coming from the device.
    source_poller: Arc<pollers::SourceBuffer>,
    /// Record of the bytes returned by `read_source`.
    history: AsyncRwLock<HistoryBuffer>,
}

impl<Device: Sink + Source> Serial<Device> {
    /// Creates a new buffered serial connection on top of `uart`.
    ///
    /// Creation of this object disables "autodiscard", and destruction
    /// of the object re-enables "autodiscard" mode.
    ///
    /// # Arguments
    ///
    /// * `uart` - The device which data will be read from / written to.
    /// * `sink_size` - A lower bound on the size of the writeback buffer.
    /// * `source_size` - A lower bound on the size of the read buffer.
    pub fn new(
        uart: Arc<Device>,
        sink_size: NonZeroUsize,
        source_size: NonZeroUsize,
    ) -> Serial<Device> {
        let sink_poller = pollers::SinkBuffer::new(sink_size);
        let source_poller = pollers::SourceBuffer::new(pollers::Params {
            buf_size: source_size,
            poll_interval: Duration::from_millis(10),
            poll_miss_thresh: 5,
        });
        let history = Default::default();
        sink_poller.attach(uart.as_ref());
        source_poller.attach(uart.as_ref());
        uart.set_autodiscard(false);

        // No control channel exists yet; it is installed later through
        // `set_task_control_sender`.
        let task_control_ch = Default::default();

        Serial { uart, task_control_ch, sink_poller, source_poller, history }
    }

    /// Reads bytes from the source buffer into `buf` and appends whatever was
    /// read to the history buffer.
    ///
    /// Returns the number of bytes read, or `None` if the source poller's
    /// read yields `None`.
    pub async fn read_source(&self, buf: &mut [u8]) -> Option<usize> {
        // Borrow the device through `self`; cloning the `Arc` just to take a
        // reference would only add refcount traffic on every read.
        let bytes_read =
            self.source_poller.read(buf, self.uart.as_ref()).await?;
        self.history.write().await.consume(&buf[..bytes_read]);
        Some(bytes_read)
    }

    /// Writes bytes from `buf` through the sink buffer to the device,
    /// returning whatever the sink poller's write reports.
    pub async fn write_sink(&self, buf: &[u8]) -> Option<usize> {
        self.sink_poller.write(buf, self.uart.as_ref()).await
    }

    /// Returns a copy of the serial history selected by `byte_offset`,
    /// optionally limited to `max_bytes`.
    ///
    /// The second element of the returned tuple is used by callers in this
    /// crate as the byte offset at which to continue reading.
    pub(crate) async fn history_vec(
        &self,
        byte_offset: SerialHistoryOffset,
        max_bytes: Option<usize>,
    ) -> Result<(Vec<u8>, usize), history_buffer::Error> {
        self.history.read().await.contents_vec(byte_offset, max_bytes)
    }

    /// Provides the channel through which we inform connected websocket
    /// clients that a migration has occurred, and where to reconnect.
    ///
    /// (The server's serial-to-websocket task -- and thus the receiving end
    /// of this channel -- are spawned in `instance_ensure_common`, after the
    /// construction of `Serial`.) Any previously installed sender is
    /// replaced.
    pub(crate) async fn set_task_control_sender(
        &self,
        control_ch: mpsc::Sender<SerialTaskControlMessage>,
    ) {
        self.task_control_ch.lock().await.replace(control_ch);
    }

    /// Serializes the history buffer for migration and, if a control channel
    /// has been installed, tells the serial task that the instance is
    /// migrating to `destination`.
    ///
    /// # Errors
    ///
    /// * `MigrateError::Codec` if the history cannot be serialized.
    /// * `MigrateError::InvalidInstanceState` if the control message cannot
    ///   be sent.
    pub(crate) async fn export_history(
        &self,
        destination: SocketAddr,
    ) -> Result<String, MigrateError> {
        let read_hist = self.history.read().await;
        let from_start = read_hist.bytes_from_start() as u64;
        let encoded = ron::to_string(&*read_hist)
            .map_err(|e| MigrateError::Codec(e.to_string()))?;
        // Release the history lock before waiting on the control channel.
        drop(read_hist);
        if let Some(ch) = self.task_control_ch.lock().await.as_ref() {
            ch.send(SerialTaskControlMessage::Migration {
                destination,
                from_start,
            })
            .await
            .map_err(|_| MigrateError::InvalidInstanceState)?;
        }
        Ok(encoded)
    }

    /// Re-attaches the pollers to the UART, disables "autodiscard", and
    /// replaces the history buffer with the one decoded from
    /// `serialized_hist`.
    ///
    /// # Errors
    ///
    /// Returns `MigrateError::Codec` if `serialized_hist` cannot be decoded;
    /// in that case the existing history is left untouched.
    pub(crate) async fn import(
        &self,
        serialized_hist: &str,
    ) -> Result<(), MigrateError> {
        self.sink_poller.attach(self.uart.as_ref());
        self.source_poller.attach(self.uart.as_ref());
        self.uart.set_autodiscard(false);
        let decoded = ron::from_str(serialized_hist)
            .map_err(|e| MigrateError::Codec(e.to_string()))?;
        let mut write_hist = self.history.write().await;
        *write_hist = decoded;
        Ok(())
    }
}

impl<Device: Sink + Source> Drop for Serial<Device> {
    /// Re-enables "autodiscard" on the UART, reversing what `Serial::new`
    /// (and `Serial::import`) set when taking over the device.
    fn drop(&mut self) {
        self.uart.set_autodiscard(true);
    }
}


================================================
FILE: bin/propolis-server/src/lib/server.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! HTTP server callback functions.
//!
//! Functions in this module verify parameters and convert between types (API
//! request types to Propolis-native types and Propolis-native error types to
//! HTTP error codes) before sending operations to the VM state machine for
//! processing.

use std::convert::TryFrom;
use std::error::Error;
use std::net::IpAddr;
use std::net::Ipv6Addr;
use std::net::SocketAddr;
use std::net::SocketAddrV6;
use std::path::PathBuf;
use std::sync::Arc;

use crate::migrate::destination::MigrationTargetInfo;
use crate::vm::ensure::VmInitializationMethod;
use crate::{
    serial::history_buffer::SerialHistoryOffset,
    vm::{ensure::VmEnsureRequest, VmError},
    vnc::{self, VncServer},
};

use dropshot::{
    ApiDescription, ClientErrorStatusCode, HttpError, HttpResponseCreated,
    HttpResponseOk, HttpResponseUpdatedNoContent, Path, Query, RequestContext,
    TypedBody, WebsocketConnection,
};
use futures::SinkExt;
use internal_dns_resolver::{ResolveError, Resolver};
use internal_dns_types::names::ServiceName;
pub use nexus_client::Client as NexusClient;
use oximeter::types::ProducerRegistry;
use propolis::attestation::server::AttestationServerConfig;
use propolis_api_types::disk::{
    InstanceVCRReplace, SnapshotRequestPathParams, VCRRequestPathParams,
    VolumeStatus, VolumeStatusPathParams,
};
use propolis_api_types::instance::{
    ErrorCode, Instance, InstanceEnsureRequest, InstanceEnsureResponse,
    InstanceGetResponse, InstanceInitializationMethod,
    InstanceStateMonitorRequest, InstanceStateMonitorResponse,
    InstanceStateRequested,
};
use propolis_api_types::instance_spec::{InstanceSpecGetResponse, SpecKey};
use propolis_api_types::migration::{
    InstanceMigrateStartRequest, InstanceMigrateStatusResponse,
};
use propolis_api_types::serial::{
    InstanceSerialConsoleHistoryRequest, InstanceSerialConsoleHistoryResponse,
    InstanceSerialConsoleStreamRequest,
};
use propolis_api_types_versions::v1::disk::VolumeStatus as VolumeStatusV1;
use propolis_server_api::PropolisServerApi;
use rfb::tungstenite::BinaryWs;
use slog::{error, warn, Logger};
use tokio::sync::MutexGuard;
use tokio_tungstenite::{
    tungstenite::protocol::{Role, WebSocketConfig},
    WebSocketStream,
};

/// Configuration used to set this server up to provide Oximeter metrics.
#[derive(Debug, Clone)]
pub struct MetricsEndpointConfig {
    /// The address at which the Oximeter endpoint will be hosted (i.e., this
    /// server's address).
    pub listen_addr: IpAddr,

    /// The address of the Nexus instance with which we should register our own
    /// server's address.
    ///
    /// If this is `None` _and the listen address is IPv6_, then internal DNS
    /// will be used to register for metrics.
    pub registration_addr: Option<SocketAddr>,
}

/// Static configuration for objects owned by this server. The server obtains
/// this configuration at startup time and refers to it when manipulating its
/// objects.
pub struct StaticConfig {
    /// The path to the bootrom image to expose to the guest.
    pub bootrom_path: PathBuf,

    /// The bootrom version string to expose to the guest. If None, machine
    /// initialization chooses a default value.
    pub bootrom_version: Option<String>,

    /// Whether to use the host's guest memory reservoir to back guest memory.
    pub use_reservoir: bool,

    /// The configuration to use when setting up this server's Oximeter
    /// endpoint.
    metrics: Option<MetricsEndpointConfig>,

    /// Optional attestation server configuration. It is handed unchanged to
    /// the VM's `EnsureOptions` when an instance is ensured.
    // NOTE(review): the exact meaning of `None` (attestation disabled?) is
    // decided by the VM initialization code -- confirm there.
    attest_config: Option<AttestationServerConfig>,
}

/// Context accessible from HTTP callbacks.
pub struct DropshotEndpointContext {
    /// Configuration fixed at server startup.
    static_config: StaticConfig,
    /// The VNC server that `instance_vnc` connections are handed to.
    pub vnc_server: Arc<VncServer>,
    /// Handle through which the API handlers reach the VM (if one exists).
    pub(crate) vm: Arc<crate::vm::Vm>,
    /// Logger passed to the VM when an instance is ensured.
    log: Logger,
}

impl DropshotEndpointContext {
    /// Builds the context shared by all HTTP callbacks.
    ///
    /// The bootrom, reservoir, metrics and attestation settings are bundled
    /// into the server's [`StaticConfig`]; a fresh VNC server and an empty VM
    /// handle are created alongside it.
    pub fn new(
        bootrom_path: PathBuf,
        bootrom_version: Option<String>,
        use_reservoir: bool,
        log: slog::Logger,
        metric_config: Option<MetricsEndpointConfig>,
        attest_config: Option<AttestationServerConfig>,
    ) -> Self {
        let static_config = StaticConfig {
            bootrom_path,
            bootrom_version,
            use_reservoir,
            metrics: metric_config,
            attest_config,
        };

        // The VNC server gets its own clone of the logger; the VM only
        // borrows it, so the original can be stored in the context.
        let vnc_server = VncServer::new(log.clone());
        let vm = crate::vm::Vm::new(&log);

        Self { static_config, vnc_server, vm, log }
    }
}

/// Shared state behind a [`LazyNexusClient`]: the logger handed to each
/// client that is built and the resolver used to locate Nexus.
struct LazyNexusClientInner {
    log: Logger,
    resolver: Resolver,
}
/// Wrapper around a [`NexusClient`] object, which allows deferring
/// the DNS lookup until accessed.
///
/// Without the assistance of OS-level DNS lookups, the [`NexusClient`]
/// interface requires knowledge of the target service IP address.
/// For some services, like Nexus, this can be painful, as the IP address
/// may not have even been allocated when the Sled Agent starts.
///
/// This structure allows clients to access the client on-demand, performing
/// the DNS lookup only once it is actually needed.
#[derive(Clone)]
pub struct LazyNexusClient {
    inner: Arc<LazyNexusClientInner>,
}

impl LazyNexusClient {
    /// Creates a lazy client whose resolver is constructed from `addr`.
    ///
    /// Fails if the resolver itself cannot be created.
    pub fn new(log: Logger, addr: Ipv6Addr) -> Result<Self, ResolveError> {
        let resolver = Resolver::new_from_ip(log.clone(), addr)?;
        let inner = Arc::new(LazyNexusClientInner { log, resolver });
        Ok(Self { inner })
    }

    /// Looks up the IPv6 socket address of the Nexus service.
    pub async fn get_ip(&self) -> Result<SocketAddrV6, ResolveError> {
        let resolver = &self.inner.resolver;
        resolver.lookup_socket_v6(ServiceName::Nexus).await
    }

    /// Resolves Nexus and returns a client pointed at the resolved address.
    pub async fn get(&self) -> Result<NexusClient, ResolveError> {
        let address = self.get_ip().await?;
        let base_url = format!("http://{address}");
        let log = self.inner.log.clone();

        Ok(NexusClient::new(&base_url, log))
    }
}

// Derives a Nexus endpoint from this server's own address and returns a
// client for it. Every failure is logged as a warning and reported as `None`.
async fn find_local_nexus_client(
    local_addr: SocketAddr,
    log: Logger,
) -> Option<NexusClient> {
    // Only IPv6 addresses can currently be converted into a Nexus endpoint.
    let address = match local_addr {
        SocketAddr::V4(_) => {
            warn!(log, "Unable to determine Nexus endpoint for IPv4 addresses");
            return None;
        }
        SocketAddr::V6(v6_addr) => *v6_addr.ip(),
    };

    // With an IPv6 address we could be in a rack, so check whether there is
    // a Nexus at the expected location.
    let lazy_client = match LazyNexusClient::new(log.clone(), address) {
        Ok(lazy_client) => lazy_client,
        Err(e) => {
            warn!(log, "Failed to create Nexus client"; "error" => ?e);
            return None;
        }
    };

    match lazy_client.get().await {
        Ok(client) => Some(client),
        Err(e) => {
            warn!(log, "Failed to resolve Nexus endpoint"; "error" => ?e);
            None
        }
    }
}

async fn instance_get(
    rqctx: &RequestContext<Arc<DropshotEndpointContext>>,
) -> Result<InstanceSpecGetResponse, HttpError> {
    let ctx = rqctx.context();
    ctx.vm.get().await.ok_or_else(not_created_error)
}

/// Uninhabited marker type that carries this crate's implementation of
/// [`PropolisServerApi`]; it has no variants, so no value of it is ever
/// constructed.
enum PropolisServerImpl {}

impl PropolisServerApi for PropolisServerImpl {
    type Context = Arc<DropshotEndpointContext>;

    async fn instance_ensure(
        rqctx: RequestContext<Self::Context>,
        request: TypedBody<InstanceEnsureRequest>,
    ) -> Result<HttpResponseCreated<InstanceEnsureResponse>, HttpError> {
        let server_context = rqctx.context();
        let InstanceEnsureRequest { properties, init } = request.into_inner();
        let oximeter_registry = server_context
            .static_config
            .metrics
            .as_ref()
            .map(|_| ProducerRegistry::with_id(properties.id));

        let nexus_client =
            find_local_nexus_client(rqctx.server.local_addr, rqctx.log.clone())
                .await;

        let ensure_options = crate::vm::EnsureOptions {
            bootrom_path: server_context.static_config.bootrom_path.clone(),
            bootrom_version: server_context
                .static_config
                .bootrom_version
                .clone(),
            use_reservoir: server_context.static_config.use_reservoir,
            metrics_config: server_context.static_config.metrics.clone(),
            oximeter_registry,
            nexus_client,
            vnc_server: server_context.vnc_server.clone(),
            local_server_addr: rqctx.server.local_addr,
            attest_config: server_context.static_config.attest_config,
        };

        let vm_init = match init {
            InstanceInitializationMethod::Spec { spec } => spec
                .try_into()
                .map(|s| VmInitializationMethod::Spec(Box::new(s)))
                .map_err(|e| {
                    if let Some(s) = e.source() {
                        format!("{e}: {s}")
                    } else {
                        e.to_string()
                    }
                }),
            InstanceInitializationMethod::MigrationTarget {
                migration_id,
                src_addr,
                replace_components,
            } => Ok(VmInitializationMethod::Migration(MigrationTargetInfo {
                migration_id,
                src_addr,
                replace_components,
            })),
        }
        .map_err(|e| {
            HttpError::for_bad_request(
                None,
                format!("failed to generate internal instance spec: {e}"),
            )
        })?;

        let request = VmEnsureRequest { properties, init: vm_init };
        server_context
            .vm
            .ensure(&server_context.log, request, ensure_options)
            .await
            .map(HttpResponseCreated)
            .map_err(|e| match e {
                VmError::ResultChannelClosed => HttpError::for_internal_error(
                    "state driver unexpectedly dropped result channel"
                        .to_string(),
                ),
                VmError::WaitingToInitialize
                | VmError::AlreadyInitialized
                | VmError::RundownInProgress => HttpError::for_client_error(
                    Some(ErrorCode::AlreadyInitialized.to_string()),
                    ClientErrorStatusCode::CONFLICT,
                    "instance already initialized".to_string(),
                ),
                VmError::InitializationFailed(e) => {
                    HttpError::for_internal_error(format!(
                        "VM initialization failed: {e}"
                    ))
                }
                _ => HttpError::for_internal_error(format!(
                    "unexpected error from VM controller: {e}"
                )),
            })
    }

    async fn instance_spec_get(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<HttpResponseOk<InstanceSpecGetResponse>, HttpError> {
        Ok(HttpResponseOk(instance_get(&rqctx).await?))
    }

    /// Returns only the instance's properties and current state, taken from
    /// the full instance description.
    async fn instance_get(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<HttpResponseOk<InstanceGetResponse>, HttpError> {
        let full = instance_get(&rqctx).await?;
        let instance =
            Instance { properties: full.properties, state: full.state };
        Ok(HttpResponseOk(InstanceGetResponse { instance }))
    }

    /// Waits until the instance state's generation number is at least the
    /// one named in the request, then returns that state.
    ///
    /// Responds with 410 (Gone) if the state sender goes away before the
    /// requested generation is reached.
    async fn instance_state_monitor(
        rqctx: RequestContext<Self::Context>,
        request: TypedBody<InstanceStateMonitorRequest>,
    ) -> Result<HttpResponseOk<InstanceStateMonitorResponse>, HttpError> {
        let ctx = rqctx.context();
        let gen = request.into_inner().gen;
        let mut state_watcher =
            ctx.vm.state_watcher().await.ok_or_else(not_created_error)?;

        loop {
            // Clone the value so the watch borrow is released before the
            // await below.
            let current = state_watcher.borrow().clone();
            if current.gen >= gen {
                return Ok(HttpResponseOk(current));
            }

            // An error from `changed` means the sender was destroyed, so the
            // generation number will never change again and can never reach
            // the one the client asked for. Tell the client instead of
            // letting it wait forever.
            if state_watcher.changed().await.is_err() {
                return Err(HttpError::for_client_error(
                    Some(ErrorCode::NoInstance.to_string()),
                    ClientErrorStatusCode::GONE,
                    format!(
                        "No instance present; will never reach generation {gen}",
                    ),
                ));
            }
        }
    }

    /// Asks the active VM to move to the requested state.
    ///
    /// Responds with 503 while the instance is still initializing and 403 if
    /// the VM refuses the state change. A successful reboot request also
    /// bumps the reset counter in the Oximeter stats, when stats exist.
    async fn instance_state_put(
        rqctx: RequestContext<Self::Context>,
        request: TypedBody<InstanceStateRequested>,
    ) -> Result<HttpResponseUpdatedNoContent, HttpError> {
        let ctx = rqctx.context();
        let requested_state = request.into_inner();
        let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?;

        if let Err(e) = vm.put_state(requested_state) {
            return Err(match e {
                VmError::WaitingToInitialize => HttpError::for_unavail(
                    None,
                    "instance is still initializing".to_string(),
                ),
                VmError::ForbiddenStateChange(reason) => {
                    HttpError::for_client_error_with_status(
                        Some(format!(
                            "instance state change not allowed: {reason}"
                        )),
                        ClientErrorStatusCode::FORBIDDEN,
                    )
                }
                _ => HttpError::for_internal_error(format!(
                    "unexpected error from VM controller: {e}"
                )),
            });
        }

        // The request was accepted; record reboots in the metrics.
        if matches!(requested_state, InstanceStateRequested::Reboot) {
            let stats = MutexGuard::map(
                vm.services().oximeter.lock().await,
                |state| &mut state.stats,
            );
            if let Some(stats) = stats.as_ref() {
                stats.count_reset();
            }
        }

        Ok(HttpResponseUpdatedNoContent {})
    }

    /// Returns a slice of the COM1 serial history selected by the query
    /// parameters, together with the offset reported by the history buffer.
    ///
    /// Responds with 400 if the history buffer rejects the request.
    async fn instance_serial_history_get(
        rqctx: RequestContext<Self::Context>,
        query: Query<InstanceSerialConsoleHistoryRequest>,
    ) -> Result<HttpResponseOk<InstanceSerialConsoleHistoryResponse>, HttpError>
    {
        let ctx = rqctx.context();
        let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?;
        let serial = vm.objects().lock_shared().await.com1().clone();

        let params = query.into_inner();
        let byte_offset = SerialHistoryOffset::try_from(&params)?;
        let max_bytes = params.max_bytes.map(|limit| limit as usize);

        match serial.history_vec(byte_offset, max_bytes).await {
            Ok((data, end)) => {
                Ok(HttpResponseOk(InstanceSerialConsoleHistoryResponse {
                    data,
                    last_byte_offset: end as u64,
                }))
            }
            Err(e) => Err(HttpError::for_bad_request(None, e.to_string())),
        }
    }

    /// Upgrades the request to a websocket attached to the COM1 serial
    /// console.
    ///
    /// If the query names a valid history offset, the buffered history from
    /// that offset is replayed to the client first; the socket is then handed
    /// to the serial task.
    async fn instance_serial(
        rqctx: RequestContext<Self::Context>,
        query: Query<InstanceSerialConsoleStreamRequest>,
        websock: WebsocketConnection,
    ) -> dropshot::WebsocketChannelResult {
        let ctx = rqctx.context();
        let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?;
        let serial = vm.objects().lock_shared().await.com1().clone();

        // Use the default buffering parameters for the websocket
        // configuration.
        //
        // Because messages are written with [`SinkExt::send`], the buffer on
        // the websocket is flushed for every message, preventing both
        // unnecessary delays of messages and the potential for the buffer to
        // grow without bound.
        let config = WebSocketConfig::default();

        let mut ws_stream = WebSocketStream::from_raw_socket(
            websock.into_inner(),
            Role::Server,
            Some(config),
        )
        .await;

        // A query that does not yield a usable offset simply skips the
        // history replay.
        if let Ok(mut cursor) =
            SerialHistoryOffset::try_from(&query.into_inner())
        {
            loop {
                let (data, offset) = serial.history_vec(cursor, None).await?;
                if data.is_empty() {
                    break;
                }
                ws_stream
                    .send(tokio_tungstenite::tungstenite::Message::Binary(data))
                    .await?;
                // Continue from the offset the history buffer reported.
                cursor = SerialHistoryOffset::FromStart(offset);
            }
        }

        // Get the serial task's handle and send it the websocket stream.
        let task_guard = vm.services().serial_task.lock().await;
        let task =
            task_guard.as_ref().ok_or("Instance has no serial task")?;
        task.websocks_ch
            .send(ws_stream)
            .await
            .map_err(|e| format!("Serial socket hand-off failed: {e}").into())
    }

    /// Upgrades the request to a websocket and hands it to the VNC server.
    ///
    /// Always returns `Ok(())`; a failure to set up the VNC session is only
    /// logged.
    async fn instance_vnc(
        rqctx: RequestContext<Self::Context>,
        _query: Query<()>,
        websock: WebsocketConnection,
    ) -> dropshot::WebsocketChannelResult {
        let ctx = rqctx.context();

        let ws_stream = WebSocketStream::from_raw_socket(
            websock.into_inner(),
            Role::Server,
            None,
        )
        .await;

        let connection =
            Box::new(BinaryWs::new(ws_stream)) as Box<dyn vnc::Connection>;
        let outcome = ctx
            .vnc_server
            .connect(connection, rqctx.request_id.clone())
            .await;

        if let Err(e) = outcome {
            // The request has already been upgraded, so there is no point in
            // trying to return a formal error to the client; just log it.
            error!(rqctx.log, "VNC initialization failed: {:?}", e);
        }

        Ok(())
    }

    async fn instance_migrate_start(
        rqctx: RequestContext<Self::Context>,
        path_params: Path<InstanceMigrateStartRequest>,
        websock: WebsocketConnection,
    ) -> dropshot::WebsocketChannelResult {
        let ctx = rqctx.context();
        let migration_id = path_params.into_inner().migration_id;
        let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?;
        Ok(vm.request_migration_out(migration_id, websock).await?)
    }

    async fn instance_migrate_status(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<HttpResponseOk<InstanceMigrateStatusResponse>, HttpError> {
        let ctx = rqctx.context();
        ctx.vm
            .state_watcher()
            .await
            .map(|rx| HttpResponseOk(rx.borrow().migration.clone()))
            .ok_or_else(not_created_error)
    }

    /// Asks the Crucible backend named in the path to take a snapshot with
    /// the given snapshot ID.
    ///
    /// Responds with 404 if no such backend exists and 400 if the backend
    /// reports an error.
    async fn instance_issue_crucible_snapshot_request(
        rqctx: RequestContext<Self::Context>,
        path_params: Path<SnapshotRequestPathParams>,
    ) -> Result<HttpResponseOk<()>, HttpError> {
        let ctx = rqctx.context();
        let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?;
        let objects = vm.objects().lock_shared().await;
        let params = path_params.into_inner();

        let key = SpecKey::from(params.id.clone());
        let backend = match objects.crucible_backends().get(&key) {
            Some(backend) => backend,
            None => {
                let s = format!("no disk with id {}!", params.id);
                return Err(HttpError::for_not_found(Some(s.clone()), s));
            }
        };

        if let Err(e) = backend.snapshot(params.snapshot_id).await {
            return Err(HttpError::for_bad_request(
                Some(e.to_string()),
                e.to_string(),
            ));
        }

        Ok(HttpResponseOk(()))
    }

    /// Version-1 volume status: reports whether the named Crucible volume is
    /// active.
    ///
    /// Responds with 404 if no such backend exists and 400 if the backend
    /// reports an error.
    async fn disk_volume_status_v1(
        rqctx: RequestContext<Self::Context>,
        path_params: Path<VolumeStatusPathParams>,
    ) -> Result<HttpResponseOk<VolumeStatusV1>, HttpError> {
        let params = path_params.into_inner();
        let ctx = rqctx.context();
        let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?;
        let objects = vm.objects().lock_shared().await;

        let key = SpecKey::from(params.id.clone());
        let backend = match objects.crucible_backends().get(&key) {
            Some(backend) => backend,
            None => {
                let s = format!("No crucible backend for id {}", params.id);
                return Err(HttpError::for_not_found(Some(s.clone()), s));
            }
        };

        let active = match backend.volume_is_active().await {
            Ok(active) => active,
            Err(e) => {
                return Err(HttpError::for_bad_request(
                    Some(e.to_string()),
                    e.to_string(),
                ));
            }
        };

        Ok(HttpResponseOk(VolumeStatusV1 { active }))
    }

    /// Returns the volume information reported by the named Crucible
    /// backend.
    ///
    /// Responds with 404 if no such backend exists and 400 if the backend
    /// reports an error.
    async fn disk_volume_status(
        rqctx: RequestContext<Self::Context>,
        path_params: Path<VolumeStatusPathParams>,
    ) -> Result<HttpResponseOk<VolumeStatus>, HttpError> {
        let params = path_params.into_inner();
        let ctx = rqctx.context();
        let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?;
        let objects = vm.objects().lock_shared().await;

        let key = SpecKey::from(params.id.clone());
        let backend = match objects.crucible_backends().get(&key) {
            Some(backend) => backend,
            None => {
                let s = format!("No crucible backend for id {}", params.id);
                return Err(HttpError::for_not_found(Some(s.clone()), s));
            }
        };

        match backend.query_volume_info().await {
            Ok(volume_info) => Ok(HttpResponseOk(VolumeStatus { volume_info })),
            Err(e) => Err(HttpError::for_bad_request(
                Some(e.to_string()),
                e.to_string(),
            )),
        }
    }

    /// Submits a volume construction request replacement for the disk named
    /// in the path and waits for the VM to report the outcome.
    ///
    /// Responds with 403 if the VM refuses the request, and 500 if the
    /// request fails in another way or the result channel is dropped.
    async fn instance_issue_crucible_vcr_request(
        rqctx: RequestContext<Self::Context>,
        path_params: Path<VCRRequestPathParams>,
        request: TypedBody<InstanceVCRReplace>,
    ) -> Result<HttpResponseOk<crucible_client_types::ReplaceResult>, HttpError>
    {
        let disk_id = path_params.into_inner().id;
        let new_vcr_json = request.into_inner().vcr_json;

        // The VM reports the result of the replacement over this channel.
        let (tx, rx) = tokio::sync::oneshot::channel();
        let ctx = rqctx.context();
        let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?;

        let submitted = vm.reconfigure_crucible_volume(
            SpecKey::from(disk_id),
            new_vcr_json,
            tx,
        );
        if let Err(e) = submitted {
            return Err(match e {
                VmError::ForbiddenStateChange(reason) => {
                    HttpError::for_client_error_with_status(
                        Some(format!(
                            "instance state change not allowed: {reason}"
                        )),
                        ClientErrorStatusCode::FORBIDDEN,
                    )
                }
                _ => HttpError::for_internal_error(format!(
                    "unexpected error from VM controller: {e}"
                )),
            });
        }

        match rx.await {
            Ok(result) => result.map(HttpResponseOk),
            Err(_) => Err(HttpError::for_internal_error(
                "VM worker task unexpectedly dropped result channel"
                    .to_string(),
            )),
        }
    }

    /// Injects an NMI into the active VM's machine.
    ///
    /// The result of the injection itself is ignored; the handler succeeds
    /// whenever an active VM exists.
    async fn instance_issue_nmi(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<HttpResponseOk<()>, HttpError> {
        let ctx = rqctx.context();
        let vm = ctx.vm.active_vm().await.ok_or_else(not_created_error)?;

        {
            // Hold the shared objects lock only for the injection.
            let objects = vm.objects().lock_shared().await;
            let _ = objects.machine().inject_nmi();
        }

        Ok(HttpResponseOk(()))
    }
}

/// Returns a Dropshot [`ApiDescription`] object to launch a server.
///
/// # Panics
///
/// Panics if the API description cannot be generated for
/// `PropolisServerImpl`.
pub fn api() -> ApiDescription<Arc<DropshotEndpointContext>> {
    propolis_server_api::propolis_server_api_mod::api_description::<
        PropolisServerImpl,
    >()
    .expect("propolis-server API description should be buildable")
}

/// Builds the error returned by endpoints that need an active VM when none
/// exists: a Failed Dependency client error carrying the
/// `ErrorCode::NoInstance` error code.
fn not_created_error() -> HttpError {
    let error_code = Some(ErrorCode::NoInstance.to_string());
    let message = String::from("Server not initialized (no instance)");

    HttpError::for_client_error(
        error_code,
        ClientErrorStatusCode::FAILED_DEPENDENCY,
        message,
    )
}


================================================
FILE: bin/propolis-server/src/lib/spec/api_spec_v0.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Conversions from version-0 instance specs in the [`propolis_api_types`]
//! crate to the internal [`super::Spec`] representation.

use std::collections::BTreeMap;

use propolis_api_types::instance_spec::{
    components::{
        backends::{DlpiNetworkBackend, VirtioNetworkBackend},
        board::Board as InstanceSpecBoard,
        devices::{BootSettings, SerialPort as SerialPortDesc},
    },
    SpecKey,
};
use propolis_api_types_versions::v1;
use thiserror::Error;

#[cfg(feature = "falcon")]
use propolis_api_types::instance_spec::components::devices::SoftNpuPort as SoftNpuPortSpec;

use super::{
    builder::{SpecBuilder, SpecBuilderError},
    Disk, Nic, QemuPvpanic, SerialPortDevice, Spec, StorageBackend,
    StorageDevice,
};

#[cfg(feature = "failure-injection")]
use super::MigrationFailure;

#[cfg(feature = "falcon")]
use super::SoftNpuPort;

/// Errors that can arise while converting a wire-format instance spec into
/// the internal [`Spec`] representation.
#[derive(Debug, Error)]
pub(crate) enum ApiSpecError {
    /// The spec builder rejected one of the spec's components.
    #[error(transparent)]
    Builder(#[from] SpecBuilderError),

    /// A storage device referred to a backend that could not be found: it is
    /// either absent from the spec or was already claimed by another device.
    #[error("storage backend {backend} not found for device {device}")]
    StorageBackendNotFound { backend: SpecKey, device: SpecKey },

    /// A network device referred to a backend that could not be found: it is
    /// either absent from the spec or was already claimed by another device.
    #[error("network backend {backend} not found for device {device}")]
    NetworkBackendNotFound { backend: SpecKey, device: SpecKey },

    /// The spec contains a component whose support is gated behind a Cargo
    /// feature that is disabled in this build.
    //
    // This variant is only constructed in match arms that are compiled when
    // the `failure-injection` or `falcon` feature is disabled, so it is
    // unused when both features are enabled.
    #[allow(dead_code)]
    #[error("support for component {component} compiled out via {feature}")]
    FeatureCompiledOut { component: SpecKey, feature: &'static str },

    /// The spec contains a backend that no device refers to.
    #[error("backend {0} not used by any device")]
    BackendNotUsed(SpecKey),
}

impl From<Spec> for v1::instance_spec::InstanceSpec {
    fn from(val: Spec) -> Self {
        // Exhaustively destructure the input spec so that adding a new field
        // without considering it here will break the build.
        let Spec {
            board,
            cpuid,
            disks,
            nics,
            boot_settings,
            serial,
            pci_pci_bridges,
            pvpanic,
            #[cfg(feature = "failure-injection")]
            migration_failure,
            #[cfg(feature = "falcon")]
            softnpu,

            // Not part of `v1::instance_spec::InstanceSpec`. Added in
            // `InstanceSpec` in API Version 2.0.0.
            smbios_type1_input: _,

            // Not part of `v1::instance_spec::InstanceSpec`. Added in
            // `InstanceSpec` in API Version 3.0.0.
            vsock: _,
        } = val;

        // Inserts a component entry into the supplied map, asserting first that
        // the supplied key is not present in that map.
        //
        // This assertion is valid because internal instance specs should assign
        // a unique name to each component they describe. The spec builder
        // upholds this invariant at spec creation time.
        #[track_caller]
        fn insert_component(
            spec: &mut v1::instance_spec::InstanceSpec,
            key: SpecKey,
            val: v1::instance_spec::Component,
        ) {
            assert!(
                !spec.components.contains_key(&key),
                "component name {} already exists in output spec",
                &key
            );
            spec.components.insert(key, val);
        }

        let board = InstanceSpecBoard {
            cpus: board.cpus,
            memory_mb: board.memory_mb,
            chipset: board.chipset,
            guest_hv_interface: board.guest_hv_interface,
            cpuid: Some(cpuid.into_instance_spec_cpuid()),
        };
        let mut spec = v1::instance_spec::InstanceSpec {
            board,
            components: Default::default(),
        };

        for (disk_id, disk) in disks {
            let backend_id = disk.device_spec.backend_id().to_owned();
            insert_component(&mut spec, disk_id, disk.device_spec.into());
            insert_component(&mut spec, backend_id, disk.backend_spec.into());
        }

        for (nic_id, nic) in nics {
            let backend_id = nic.device_spec.backend_id.clone();
            insert_component(
                &mut spec,
                nic_id,
                v1::instance_spec::Component::VirtioNic(nic.device_spec),
            );

            insert_component(
                &mut spec,
                backend_id,
                v1::instance_spec::Component::VirtioNetworkBackend(
                    nic.backend_spec,
                ),
            );
        }

        for (name, desc) in serial {
            if desc.device == SerialPortDevice::Uart {
                insert_component(
                    &mut spec,
                    name,
                    v1::instance_spec::Component::SerialPort(SerialPortDesc {
                        num: desc.num,
                    }),
                );
            }
        }

        for (bridge_name, bridge) in pci_pci_bridges {
            insert_component(
                &mut spec,
                bridge_name,
                v1::instance_spec::Component::PciPciBridge(bridge),
            );
        }

        if let Some(pvpanic) = pvpanic {
            insert_component(
                &mut spec,
                pvpanic.id,
                v1::instance_spec::Component::QemuPvpanic(pvpanic.spec),
            );
        }

        if let Some(settings) = boot_settings {
            insert_component(
                &mut spec,
                settings.name,
                v1::instance_spec::Component::BootSettings(BootSettings {
                    order: settings.order.into_iter().map(Into::into).collect(),
                }),
            );
        }

        #[cfg(feature = "failure-injection")]
        if let Some(mig) = migration_failure {
            insert_component(
                &mut spec,
                mig.id,
                v1::instance_spec::Component::MigrationFailureInjector(
                    mig.spec,
                ),
            );
        }

        #[cfg(feature = "falcon")]
        {
            if let Some(softnpu_pci) = softnpu.pci_port {
                insert_component(
                    &mut spec,
                    SpecKey::Name(format!(
                        "softnpu-pci-{}",
                        softnpu_pci.pci_path
                    )),
                    v1::instance_spec::Component::SoftNpuPciPort(softnpu_pci),
                );
            }

            if let Some(p9) = softnpu.p9_device {
                insert_component(
                    &mut spec,
                    SpecKey::Name(format!("softnpu-p9-{}", p9.pci_path)),
                    v1::instance_spec::Component::SoftNpuP9(p9),
                );
            }

            if let Some(p9fs) = softnpu.p9fs {
                insert_component(
                    &mut spec,
                    SpecKey::Name(format!("p9fs-{}", p9fs.pci_path)),
                    v1::instance_spec::Component::P9fs(p9fs),
                );
            }

            for (port_name, port) in softnpu.ports {
                insert_component(
                    &mut spec,
                    port_name.clone(),
                    v1::instance_spec::Component::SoftNpuPort(
                        SoftNpuPortSpec {
                            link_name: port.link_name,
                            backend_id: port.backend_name.clone(),
                        },
                    ),
                );

                insert_component(
                    &mut spec,
                    port.backend_name,
                    v1::instance_spec::Component::DlpiNetworkBackend(
                        port.backend_spec,
                    ),
                );
            }
        }

        spec
    }
}

impl TryFrom<v1::instance_spec::InstanceSpec> for Spec {
    type Error = ApiSpecError;

    fn try_from(
        value: v1::instance_spec::InstanceSpec,
    ) -> Result<Self, Self::Error> {
        Ok(v1_to_spec_builder(value)?.finish())
    }
}

/// Parses a v1 instance spec into a [`SpecBuilder`], validating component
/// names, PCI paths, and backend references along the way. Callers can add
/// additional (non-v1) components to the builder before calling `finish()`.
pub(crate) fn v1_to_spec_builder(
    value: v1::instance_spec::InstanceSpec,
) -> Result<SpecBuilder, ApiSpecError> {
    let mut builder = SpecBuilder::with_instance_spec_board(value.board)?;
    let mut devices: Vec<(SpecKey, v1::instance_spec::Component)> = vec![];
    let mut boot_settings = None;
    let mut storage_backends: BTreeMap<SpecKey, StorageBackend> =
        BTreeMap::new();
    let mut viona_backends: BTreeMap<SpecKey, VirtioNetworkBackend> =
        BTreeMap::new();
    let mut dlpi_backends: BTreeMap<SpecKey, DlpiNetworkBackend> =
        BTreeMap::new();

    for (id, component) in value.components.into_iter() {
        match component {
            v1::instance_spec::Component::CrucibleStorageBackend(_)
            | v1::instance_spec::Component::FileStorageBackend(_)
            | v1::instance_spec::Component::BlobStorageBackend(_) => {
                storage_backends.insert(
                    id,
                    component
                        .try_into()
                        .expect("component is known to be a storage backend"),
                );
            }
            v1::instance_spec::Component::VirtioNetworkBackend(viona) => {
                viona_backends.insert(id, viona);
            }
            v1::instance_spec::Component::DlpiNetworkBackend(dlpi) => {
                dlpi_backends.insert(id, dlpi);
            }
            device => {
                devices.push((id, device));
            }
        }
    }

    for (device_id, device_spec) in devices {
        match device_spec {
            v1::instance_spec::Component::VirtioDisk(_)
            | v1::instance_spec::Component::NvmeDisk(_) => {
                let device_spec = StorageDevice::try_from(device_spec)
                    .expect("component is known to be a disk");

                let (_, backend_spec) = storage_backends
                    .remove_entry(device_spec.backend_id())
                    .ok_or_else(|| ApiSpecError::StorageBackendNotFound {
                        backend: device_spec.backend_id().to_owned(),
                        device: device_id.clone(),
                    })?;

                builder.add_storage_device(
                    device_id,
                    Disk { device_spec, backend_spec },
                )?;
            }
            v1::instance_spec::Component::VirtioNic(nic) => {
                let (_, backend_spec) = viona_backends
                    .remove_entry(&nic.backend_id)
                    .ok_or_else(|| ApiSpecError::NetworkBackendNotFound {
                        backend: nic.backend_id.clone(),
                        device: device_id.clone(),
                    })?;

                builder.add_network_device(
                    device_id,
                    Nic { device_spec: nic, backend_spec },
                )?;
            }
            v1::instance_spec::Component::SerialPort(port) => {
                builder.add_serial_port(device_id, port.num)?;
            }
            v1::instance_spec::Component::PciPciBridge(bridge) => {
                builder.add_pci_bridge(device_id, bridge)?;
            }
            v1::instance_spec::Component::QemuPvpanic(pvpanic) => {
                builder.add_pvpanic_device(QemuPvpanic {
                    id: device_id,
                    spec: pvpanic,
                })?;
            }
            v1::instance_spec::Component::BootSettings(settings) => {
                // The builder returns an error if its caller tries to add
                // a boot option that isn't in the set of attached disks.
                // Since there may be more disk devices left in the
                // component map, just capture the boot order for now and
                // apply it to the builder later.
                boot_settings = Some((device_id, settings));
            }
            #[cfg(not(feature = "failure-injection"))]
            v1::instance_spec::Component::MigrationFailureInjector(_) => {
                return Err(ApiSpecError::FeatureCompiledOut {
                    component: device_id,
                    feature: "failure-injection",
                });
            }
            #[cfg(feature = "failure-injection")]
            v1::instance_spec::Component::MigrationFailureInjector(mig) => {
                builder.add_migration_failure_device(MigrationFailure {
                    id: device_id,
                    spec: mig,
                })?;
            }
            #[cfg(not(feature = "falcon"))]
            v1::instance_spec::Component::SoftNpuPciPort(_)
            | v1::instance_spec::Component::SoftNpuPort(_)
            | v1::instance_spec::Component::SoftNpuP9(_)
            | v1::instance_spec::Component::P9fs(_) => {
                return Err(ApiSpecError::FeatureCompiledOut {
                    component: device_id,
                    feature: "falcon",
                });
            }
            #[cfg(feature = "falcon")]
            v1::instance_spec::Component::SoftNpuPciPort(port) => {
                builder.set_softnpu_pci_port(port)?;
            }
            #[cfg(feature = "falcon")]
            v1::instance_spec::Component::SoftNpuPort(port) => {
                let (_, backend_spec) = dlpi_backends
                    .remove_entry(&port.backend_id)
                    .ok_or_else(|| ApiSpecError::NetworkBackendNotFound {
                        backend: port.backend_id.clone(),
                        device: device_id.clone(),
                    })?;

                let port = SoftNpuPort {
                    link_name: port.link_name,
                    backend_name: port.backend_id,
                    backend_spec,
                };

                builder.add_softnpu_port(device_id, port)?;
            }
            #[cfg(feature = "falcon")]
            v1::instance_spec::Component::SoftNpuP9(p9) => {
                builder.set_softnpu_p9(p9)?;
            }
            #[cfg(feature = "falcon")]
            v1::instance_spec::Component::P9fs(p9fs) => {
                builder.set_p9fs(p9fs)?;
            }
            v1::instance_spec::Component::CrucibleStorageBackend(_)
            | v1::instance_spec::Component::FileStorageBackend(_)
            | v1::instance_spec::Component::BlobStorageBackend(_)
            | v1::instance_spec::Component::VirtioNetworkBackend(_)
            | v1::instance_spec::Component::DlpiNetworkBackend(_) => {
                unreachable!("already filtered out backends")
            }
        }
    }

    // Now that all disks have been attached, try to establish the boot
    // order if one was supplied.
    if let Some(settings) = boot_settings {
        builder.add_boot_order(
            settings.0,
            settings.1.order.into_iter().map(Into::into),
        )?;
    }

    if let Some(backend) = storage_backends.into_keys().next() {
        return Err(ApiSpecError::BackendNotUsed(backend));
    }

    if let Some(backend) = viona_backends.into_keys().next() {
        return Err(ApiSpecError::BackendNotUsed(backend));
    }

    if let Some(backend) = dlpi_backends.into_keys().next() {
        return Err(ApiSpecError::BackendNotUsed(backend));
    }

    Ok(builder)
}


================================================
FILE: bin/propolis-server/src/lib/spec/builder.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! A builder for instance specs.

use std::collections::{BTreeSet, HashSet};

use propolis_api_types::instance_spec::{
    components::{
        board::Board as InstanceSpecBoard,
        devices::{PciPciBridge, SerialPortNumber},
    },
    PciPath, SpecKey,
};
use thiserror::Error;

#[cfg(feature = "falcon")]
use propolis_api_types::instance_spec::components::devices::{
    P9fs, SoftNpuP9, SoftNpuPciPort,
};

use crate::spec::SerialPortDevice;

use super::{
    Board, BootOrderEntry, BootSettings, Disk, Nic, QemuPvpanic, SerialPort,
    VirtioSocket,
};

#[cfg(feature = "failure-injection")]
use super::MigrationFailure;

#[cfg(feature = "falcon")]
use super::SoftNpuPort;

/// Errors that can arise while building an instance spec from component parts.
#[derive(Debug, Error)]
pub(crate) enum SpecBuilderError {
    /// A device was given the same name as the backend it refers to.
    #[error("device {0} has the same name as its backend")]
    DeviceAndBackendNamesIdentical(SpecKey),

    /// The builder has already recorded a component with this name.
    #[error("a component with name {0} already exists")]
    ComponentNameInUse(SpecKey),

    /// Another device has already claimed this PCI path.
    #[error("a PCI device is already attached at {0:?}")]
    PciPathInUse(PciPath),

    /// Another component has already claimed this serial port number.
    #[error("serial port {0:?} is already specified")]
    SerialPortInUse(SerialPortNumber),

    /// The spec already has a pvpanic device; at most one is allowed.
    #[error("pvpanic device already specified")]
    PvpanicInUse,

    /// The spec already has a vsock device; at most one is allowed.
    #[error("vsock device already specified")]
    VsockInUse,

    /// The spec already has a migration failure injector; at most one is
    /// allowed.
    #[cfg(feature = "failure-injection")]
    #[error("migration failure injection already enabled")]
    MigrationFailureInjectionInUse,

    /// The spec already has boot settings; they can only be set once.
    #[error("boot settings were already specified")]
    BootSettingsInUse,

    /// A boot order entry names a device that is not among the spec's disks.
    #[error("boot option {0} is not an attached device")]
    BootOptionMissing(SpecKey),

    /// The CPUID entries supplied with the board could not be converted.
    #[error("instance spec's CPUID entries are invalid")]
    CpuidEntriesInvalid(#[from] cpuid_utils::CpuidMapConversionError),

    /// The board supplied no CPUID entries and querying the host for default
    /// values failed.
    #[error("failed to read default CPUID settings from the host")]
    DefaultCpuidReadFailed(#[from] cpuid_utils::host::GetHostCpuidError),
}

/// Incrementally assembles a [`super::Spec`], rejecting components that
/// conflict with ones added earlier.
#[derive(Debug, Default)]
pub(crate) struct SpecBuilder {
    /// The spec under construction; returned by `finish`.
    spec: super::Spec,
    /// PCI paths already claimed via `register_pci_device`.
    pci_paths: BTreeSet<PciPath>,
    /// Serial port numbers recorded so far, consulted to reject duplicates.
    serial_ports: HashSet<SerialPortNumber>,
    /// Component names recorded so far, consulted to reject duplicates.
    component_names: BTreeSet<SpecKey>,
}

impl SpecBuilder {
    pub(super) fn with_instance_spec_board(
        board: InstanceSpecBoard,
    ) -> Result<Self, SpecBuilderError> {
        let cpuid = match board.cpuid {
            Some(cpuid) => cpuid_utils::CpuidSet::from_map(
                cpuid.entries.try_into()?,
                cpuid.vendor,
            ),
            None => cpuid_utils::host::query_complete(
                cpuid_utils::host::CpuidSource::BhyveDefault,
            )?,
        };

        Ok(Self {
            spec: super::Spec {
                board: Board {
                    cpus: board.cpus,
                    memory_mb: board.memory_mb,
                    chipset: board.chipset,
                    guest_hv_interface: board.guest_hv_interface,
                },
                cpuid,
                ..Default::default()
            },
            ..Default::default()
        })
    }

    /// Sets the spec's boot order to the list of disk devices specified in
    /// `boot_options`.
    ///
    /// All of the items in the supplied `boot_options` must already be present
    /// in the spec's disk map.
    ///
    /// On success, `component_id` is recorded as an in-use component name so
    /// that components added afterwards cannot reuse it.
    ///
    /// # Errors
    ///
    /// - [`SpecBuilderError::ComponentNameInUse`] if `component_id` is already
    ///   the name of another component.
    /// - [`SpecBuilderError::BootSettingsInUse`] if boot settings were already
    ///   added.
    /// - [`SpecBuilderError::BootOptionMissing`] if an entry names a device
    ///   that is not one of the spec's disks.
    ///
    /// The builder is left unmodified if an error is returned.
    pub fn add_boot_order(
        &mut self,
        component_id: SpecKey,
        boot_options: impl Iterator<Item = BootOrderEntry>,
    ) -> Result<(), SpecBuilderError> {
        if self.component_names.contains(&component_id) {
            return Err(SpecBuilderError::ComponentNameInUse(component_id));
        }

        if self.spec.boot_settings.is_some() {
            return Err(SpecBuilderError::BootSettingsInUse);
        }

        // Validate every entry before touching any builder state; collecting
        // into a `Result` stops at the first entry that is not a known disk.
        let order = boot_options
            .map(|entry| {
                if self.spec.disks.contains_key(&entry.device_id) {
                    Ok(entry)
                } else {
                    Err(SpecBuilderError::BootOptionMissing(entry.device_id))
                }
            })
            .collect::<Result<Vec<_>, _>>()?;

        // Reserve the name, as the other `add_*` routines do, so that the
        // uniqueness check above also protects the boot settings component.
        self.component_names.insert(component_id.clone());
        self.spec.boot_settings =
            Some(BootSettings { name: component_id, order });
        Ok(())
    }

    /// Claims `pci_path` on behalf of a device being added to the spec.
    ///
    /// Returns [`SpecBuilderError::PciPathInUse`] if an earlier device already
    /// claimed the same path.
    fn register_pci_device(
        &mut self,
        pci_path: PciPath,
    ) -> Result<(), SpecBuilderError> {
        // `insert` reports whether the path was newly added, so it serves as
        // both the duplicate check and the registration.
        let newly_claimed = self.pci_paths.insert(pci_path);
        if newly_claimed {
            Ok(())
        } else {
            Err(SpecBuilderError::PciPathInUse(pci_path))
        }
    }

    /// Adds a storage device together with the backend it refers to.
    ///
    /// Returns an error, leaving the builder unmodified, if the device and
    /// its backend share a name, if either name is already in use, or if the
    /// device's PCI path is already claimed.
    pub(super) fn add_storage_device(
        &mut self,
        disk_id: SpecKey,
        disk: Disk,
    ) -> Result<&Self, SpecBuilderError> {
        let backend_id = disk.device_spec.backend_id().to_owned();

        if disk_id == backend_id {
            return Err(SpecBuilderError::DeviceAndBackendNamesIdentical(
                disk_id,
            ));
        }

        if self.component_names.contains(&disk_id) {
            return Err(SpecBuilderError::ComponentNameInUse(disk_id));
        }

        if self.component_names.contains(&backend_id) {
            return Err(SpecBuilderError::ComponentNameInUse(backend_id));
        }

        // All checks on names have passed; claiming the PCI path is the last
        // step that can fail.
        self.register_pci_device(disk.device_spec.pci_path())?;

        self.component_names.extend([disk_id.clone(), backend_id]);
        let previous = self.spec.disks.insert(disk_id, disk);
        assert!(previous.is_none());
        Ok(self)
    }

    /// Adds a network device together with the backend it refers to.
    ///
    /// Returns an error, leaving the builder unmodified, if the device and
    /// its backend share a name, if either name is already in use, or if the
    /// device's PCI path is already claimed.
    pub(super) fn add_network_device(
        &mut self,
        nic_id: SpecKey,
        nic: Nic,
    ) -> Result<&Self, SpecBuilderError> {
        let backend_id = nic.device_spec.backend_id.clone();

        if nic_id == backend_id {
            return Err(SpecBuilderError::DeviceAndBackendNamesIdentical(
                nic_id,
            ));
        }

        if self.component_names.contains(&nic_id) {
            return Err(SpecBuilderError::ComponentNameInUse(nic_id));
        }

        if self.component_names.contains(&backend_id) {
            return Err(SpecBuilderError::ComponentNameInUse(backend_id));
        }

        // All checks on names have passed; claiming the PCI path is the last
        // step that can fail.
        self.register_pci_device(nic.device_spec.pci_path)?;

        self.component_names.extend([nic_id.clone(), backend_id]);
        let previous = self.spec.nics.insert(nic_id, nic);
        assert!(previous.is_none());
        Ok(self)
    }

    /// Adds a PCI-PCI bridge named `id`.
    ///
    /// Returns an error if the name is already in use or if the bridge's PCI
    /// path is already claimed.
    pub fn add_pci_bridge(
        &mut self,
        id: SpecKey,
        bridge: PciPciBridge,
    ) -> Result<&Self, SpecBuilderError> {
        let name_taken = self.component_names.contains(&id);
        if name_taken {
            return Err(SpecBuilderError::ComponentNameInUse(id));
        }

        let pci_path = bridge.pci_path;
        self.register_pci_device(pci_path)?;

        self.component_names.insert(id.clone());
        let previous = self.spec.pci_pci_bridges.insert(id, bridge);
        assert!(previous.is_none());
        Ok(self)
    }

    /// Adds a UART-backed serial port named `id` at port number `num`.
    ///
    /// Returns an error if the name or the port number is already in use.
    pub fn add_serial_port(
        &mut self,
        id: SpecKey,
        num: SerialPortNumber,
    ) -> Result<&Self, SpecBuilderError> {
        if self.component_names.contains(&id) {
            return Err(SpecBuilderError::ComponentNameInUse(id));
        }

        if self.serial_ports.contains(&num) {
            return Err(SpecBuilderError::SerialPortInUse(num));
        }

        // Reserve the port number and the name, then record the port itself.
        self.serial_ports.insert(num);
        self.component_names.insert(id.clone());
        self.spec
            .serial
            .insert(id, SerialPort { num, device: SerialPortDevice::Uart });
        Ok(self)
    }

    /// Adds the spec's pvpanic device.
    ///
    /// Returns an error if the device's name is already in use or if a
    /// pvpanic device was already added.
    pub fn add_pvpanic_device(
        &mut self,
        pvpanic: QemuPvpanic,
    ) -> Result<&Self, SpecBuilderError> {
        let name_taken = self.component_names.contains(&pvpanic.id);
        if name_taken {
            return Err(SpecBuilderError::ComponentNameInUse(pvpanic.id));
        }

        if self.spec.pvpanic.is_some() {
            return Err(SpecBuilderError::PvpanicInUse);
        }

        let id = pvpanic.id.clone();
        self.spec.pvpanic = Some(pvpanic);
        self.component_names.insert(id);
        Ok(self)
    }

    /// Adds the spec's vsock device.
    ///
    /// Returns an error if the device's name is already in use, if a vsock
    /// device was already added, or if the device's PCI path is already
    /// claimed.
    pub fn add_vsock_device(
        &mut self,
        vsock: VirtioSocket,
    ) -> Result<&Self, SpecBuilderError> {
        let name_taken = self.component_names.contains(&vsock.id);
        if name_taken {
            return Err(SpecBuilderError::ComponentNameInUse(vsock.id));
        }

        if self.spec.vsock.is_some() {
            return Err(SpecBuilderError::VsockInUse);
        }

        let pci_path = vsock.spec.pci_path;
        self.register_pci_device(pci_path)?;

        let id = vsock.id.clone();
        self.spec.vsock = Some(vsock);
        self.component_names.insert(id);
        Ok(self)
    }

    /// Adds the spec's migration failure injector.
    ///
    /// Returns an error if the injector's name is already in use or if an
    /// injector was already added.
    #[cfg(feature = "failure-injection")]
    pub fn add_migration_failure_device(
        &mut self,
        mig: MigrationFailure,
    ) -> Result<&Self, SpecBuilderError> {
        let name_taken = self.component_names.contains(&mig.id);
        if name_taken {
            return Err(SpecBuilderError::ComponentNameInUse(mig.id));
        }

        if self.spec.migration_failure.is_some() {
            return Err(SpecBuilderError::MigrationFailureInjectionInUse);
        }

        let id = mig.id.clone();
        self.spec.migration_failure = Some(mig);
        self.component_names.insert(id);
        Ok(self)
    }

    /// Sets the spec's SoftNPU PCI port and adds the SoftNPU-backed serial
    /// port, named `com4` at port number COM4, that accompanies it.
    ///
    /// On success the `com4` name and the COM4 port number are recorded as
    /// in use, so a serial port added afterwards cannot displace the SoftNPU
    /// entry or claim the same port number.
    ///
    /// # Errors
    ///
    /// Returns an error, leaving the builder unmodified, if the `com4` name
    /// or the COM4 port number is already in use, or if the port's PCI path
    /// is already claimed.
    #[cfg(feature = "falcon")]
    pub fn set_softnpu_pci_port(
        &mut self,
        pci_port: SoftNpuPciPort,
    ) -> Result<&Self, SpecBuilderError> {
        // SoftNPU squats on COM4.
        let id = SpecKey::Name("com4".to_string());
        let num = SerialPortNumber::Com4;
        if self.component_names.contains(&id) {
            return Err(SpecBuilderError::ComponentNameInUse(id));
        }

        if self.serial_ports.contains(&num) {
            return Err(SpecBuilderError::SerialPortInUse(num));
        }

        self.register_pci_device(pci_port.pci_path)?;

        // Reserve the name and port number. Without this, the checks above
        // only catch serial ports added *before* the SoftNPU port, and a
        // later `add_serial_port` call could overwrite the entry added below.
        self.component_names.insert(id.clone());
        self.serial_ports.insert(num);

        self.spec.softnpu.pci_port = Some(pci_port);
        self.spec
            .serial
            .insert(id, SerialPort { num, device: SerialPortDevice::SoftNpu });
        Ok(self)
    }

    /// Sets the spec's SoftNPU P9 device, replacing any that was set before.
    ///
    /// Returns an error if the device's PCI path is already claimed.
    #[cfg(feature = "falcon")]
    pub fn set_softnpu_p9(
        &mut self,
        p9: SoftNpuP9,
    ) -> Result<&Self, SpecBuilderError> {
        let pci_path = p9.pci_path;
        self.register_pci_device(pci_path)?;

        let softnpu = &mut self.spec.softnpu;
        softnpu.p9_device = Some(p9);
        Ok(self)
    }

    /// Sets the spec's P9 filesystem device, replacing any that was set
    /// before.
    ///
    /// Returns an error if the device's PCI path is already claimed.
    #[cfg(feature = "falcon")]
    pub fn set_p9fs(&mut self, p9fs: P9fs) -> Result<&Self, SpecBuilderError> {
        let pci_path = p9fs.pci_path;
        self.register_pci_device(pci_path)?;

        let softnpu = &mut self.spec.softnpu;
        softnpu.p9fs = Some(p9fs);
        Ok(self)
    }

    /// Adds a SoftNPU port together with the backend it refers to.
    ///
    /// On success both the port's name and its backend's name are recorded as
    /// in use, so components added afterwards cannot reuse either one.
    ///
    /// # Errors
    ///
    /// Returns an error, leaving the builder unmodified, if the port and its
    /// backend share a name or if either name is already in use.
    #[cfg(feature = "falcon")]
    pub fn add_softnpu_port(
        &mut self,
        port_name: SpecKey,
        port: SoftNpuPort,
    ) -> Result<&Self, SpecBuilderError> {
        if port_name == port.backend_name {
            return Err(SpecBuilderError::DeviceAndBackendNamesIdentical(
                port_name,
            ));
        }

        if self.component_names.contains(&port_name) {
            return Err(SpecBuilderError::ComponentNameInUse(port_name));
        }

        if self.component_names.contains(&port.backend_name) {
            return Err(SpecBuilderError::ComponentNameInUse(
                port.backend_name,
            ));
        }

        // Reserve both names, as the disk and NIC routines do. Without this
        // the checks above would never see names introduced by SoftNPU ports.
        self.component_names.insert(port_name.clone());
        self.component_names.insert(port.backend_name.clone());

        let _old = self.spec.softnpu.ports.insert(port_name, port);
        assert!(_old.is_none());
        Ok(self)
    }

    /// Yields the completed spec, consuming the builder.
    pub fn finish(self) -> super::Spec {
        self.spec
    }
}

#[cfg(test)]
mod test {
    use propolis_api_types::instance_spec::components::{
        backends::{BlobStorageBackend, VirtioNetworkBackend},
        board::{Chipset, GuestHypervisorInterface, I440Fx},
        devices::{VirtioDisk, VirtioNic},
    };
    use uuid::Uuid;

    use crate::spec::{StorageBackend, StorageDevice};

    use super::*;

    /// Shorthand for a name-typed component key.
    fn key(name: &str) -> SpecKey {
        SpecKey::Name(name.to_owned())
    }

    /// Returns a builder with a small fixed board and no other components.
    fn test_builder() -> SpecBuilder {
        let mut builder = SpecBuilder::default();
        builder.spec.board = Board {
            cpus: 4,
            memory_mb: 512,
            chipset: Chipset::I440Fx(I440Fx { enable_pcie: false }),
            guest_hv_interface: GuestHypervisorInterface::Bhyve,
        };
        builder
    }

    /// Returns a virtio disk at `pci_path` backed by an empty, writable blob
    /// backend named `backend`.
    fn blob_disk(backend: &str, pci_path: PciPath) -> Disk {
        Disk {
            device_spec: StorageDevice::Virtio(VirtioDisk {
                backend_id: key(backend),
                pci_path,
            }),
            backend_spec: StorageBackend::Blob(BlobStorageBackend {
                base64: String::new(),
                readonly: false,
            }),
        }
    }

    /// Returns a virtio NIC at `pci_path` whose backend is named `backend`.
    fn viona_nic(backend: &str, pci_path: PciPath) -> Nic {
        Nic {
            device_spec: VirtioNic {
                backend_id: key(backend),
                interface_id: Uuid::nil(),
                pci_path,
            },
            backend_spec: VirtioNetworkBackend {
                vnic_name: "vnic0".to_owned(),
            },
        }
    }

    #[test]
    fn duplicate_pci_slot() {
        let mut builder = test_builder();
        let shared_slot = PciPath::new(0, 4, 0).unwrap();

        let disk_added = builder.add_storage_device(
            key("storage"),
            blob_disk("storage-backend", shared_slot),
        );
        assert!(disk_added.is_ok());

        // The NIC asks for the slot the disk already occupies.
        let nic_added = builder.add_network_device(
            key("network"),
            viona_nic("network-backend", shared_slot),
        );
        assert!(nic_added.is_err());
    }

    #[test]
    fn duplicate_serial_port() {
        let mut builder = test_builder();

        let distinct_ports = [
            ("com1", SerialPortNumber::Com1),
            ("com2", SerialPortNumber::Com2),
            ("com3", SerialPortNumber::Com3),
            ("com4", SerialPortNumber::Com4),
        ];
        for (name, num) in distinct_ports {
            assert!(builder.add_serial_port(key(name), num).is_ok());
        }

        // Adding COM1 a second time must be rejected.
        assert!(builder
            .add_serial_port(key("com1"), SerialPortNumber::Com1)
            .is_err());
    }

    #[test]
    fn device_with_same_name_as_backend() {
        let mut builder = test_builder();

        let disk_added = builder.add_storage_device(
            key("storage"),
            blob_disk("storage", PciPath::new(0, 4, 0).unwrap()),
        );
        assert!(disk_added.is_err());

        let nic_added = builder.add_network_device(
            key("network"),
            viona_nic("network", PciPath::new(0, 5, 0).unwrap()),
        );
        assert!(nic_added.is_err());
    }
}


================================================
FILE: bin/propolis-server/src/lib/spec/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Instance specs describe how to configure a VM and what components it has.
//!
//! This module defines a crate-internal instance spec type, [`Spec`], and its
//! constituent types, like [`Disk`] and [`Nic`]. Unlike the types in
//! [`propolis_api_types::instance_spec`], these internal types are not
//! `Serialize` and are never meant to be used over the wire in API requests or
//! the migration protocol. This allows them to change freely between Propolis
//! versions, so long as they can consistently be converted to and from the
//! wire-format types in the [`propolis_api_types`] crate. This, in turn, allows
//! [`Spec`] and its component types to take forms that might otherwise be hard
//! to change in a backward-compatible way.

use std::collections::BTreeMap;

use crate::spec::api_spec_v0::ApiSpecError;
use cpuid_utils::CpuidSet;
use propolis_api_types::instance_spec::{
    components::{
        backends::{
            BlobStorageBackend, CrucibleStorageBackend, FileStorageBackend,
            VirtioNetworkBackend,
        },
        board::{Chipset, GuestHypervisorInterface, I440Fx},
        devices::{
            NvmeDisk, PciPciBridge, QemuPvpanic as QemuPvpanicDesc,
            SerialPortNumber, VirtioDisk, VirtioNic,
            VirtioSocket as VirtioSocketDesc,
        },
    },
    PciPath, SpecKey,
};
use propolis_api_types::instance_spec::{
    Component, InstanceSpec, SmbiosType1Input,
};
use propolis_api_types_versions::{v1, v2};
use thiserror::Error;

#[cfg(feature = "failure-injection")]
use propolis_api_types::instance_spec::components::devices::MigrationFailureInjector;

#[cfg(feature = "falcon")]
use propolis_api_types::instance_spec::components::{
    backends::DlpiNetworkBackend,
    devices::{P9fs, SoftNpuP9, SoftNpuPciPort},
};

// mod api_request;
pub(crate) mod api_spec_v0;
pub(crate) mod builder;

/// The code related to latest types does not go into a versioned module
impl From<Spec> for InstanceSpec {
    fn from(val: Spec) -> Self {
        let smbios = val.smbios_type1_input.clone();
        let vsock = val.vsock.clone();

        let v1_spec: v1::instance_spec::InstanceSpec = val.into();
        let v2_spec =
            v2::instance_spec::InstanceSpec { smbios, ..v1_spec.into() };
        let mut spec: InstanceSpec = v2_spec.into();

        if let Some(vsock) = vsock {
            spec.components
                .insert(vsock.id, Component::VirtioSocket(vsock.spec));
        }
        spec
    }
}

/// Converts the latest API [`InstanceSpec`] into an internal [`Spec`].
///
/// The code related to latest types does not go into a versioned module.
impl TryFrom<InstanceSpec> for Spec {
    type Error = ApiSpecError;

    fn try_from(value: InstanceSpec) -> Result<Self, Self::Error> {
        // Extract vsock before conversion since it's v3-only and will be
        // filtered out during the v3→v2→v1 chain. Only the first vsock
        // component encountered is kept.
        let vsock_entry =
            value.components.iter().find_map(|(id, component)| {
                if let Component::VirtioSocket(desc) = component {
                    Some(VirtioSocket { id: id.clone(), spec: *desc })
                } else {
                    None
                }
            });

        // Step down the version chain, holding on to the SMBIOS input that
        // is set on the finished spec below.
        let as_v2: v2::instance_spec::InstanceSpec = value.into();
        let smbios_input = as_v2.smbios.clone();
        let as_v1: v1::instance_spec::InstanceSpec = as_v2.into();

        let mut builder = api_spec_v0::v1_to_spec_builder(as_v1)?;
        if let Some(vsock) = vsock_entry {
            builder.add_vsock_device(vsock)?;
        }

        let mut spec = builder.finish();
        spec.smbios_type1_input = smbios_input;
        Ok(spec)
    }
}

/// Error returned when a component cannot be converted into the requested
/// internal type because it is a different kind of component (for example,
/// converting a non-disk component into a [`StorageDevice`]).
#[derive(Debug, Error)]
#[error("input component type can't convert to output type")]
pub struct ComponentTypeMismatch;

/// An instance specification that describes a VM's configuration and
/// components.
///
/// NOTE: This struct's fields are `pub` to make it convenient to access the
/// individual parts of a fully-constructed spec. Modules that consume specs may
/// assert that they are valid (no duplicate component names, no duplicate PCI
/// device paths, etc.). When constructing a new spec, use the
/// [`builder::SpecBuilder`] struct to catch requests that violate these
/// invariants.
#[derive(Clone, Debug, Default)]
pub(crate) struct Spec {
    /// The VM's mainboard.
    pub board: Board,
    /// CPUID information, kept separately from `board`.
    pub cpuid: CpuidSet,
    /// Disks (each a storage device paired with its backend), keyed by
    /// `SpecKey`.
    pub disks: BTreeMap<SpecKey, Disk>,
    /// NICs (each a network device paired with its backend), keyed by
    /// `SpecKey`.
    pub nics: BTreeMap<SpecKey, Nic>,
    /// Optional boot settings: a named list of boot order entries.
    pub boot_settings: Option<BootSettings>,

    /// Serial ports, keyed by `SpecKey`.
    pub serial: BTreeMap<SpecKey, SerialPort>,

    /// PCI-PCI bridges, keyed by `SpecKey`.
    pub pci_pci_bridges: BTreeMap<SpecKey, PciPciBridge>,
    /// Optional pvpanic device.
    pub pvpanic: Option<QemuPvpanic>,

    /// Optional virtio socket device.
    pub vsock: Option<VirtioSocket>,

    /// Optional migration failure injector; only present when the
    /// `failure-injection` feature is enabled.
    #[cfg(feature = "failure-injection")]
    pub migration_failure: Option<MigrationFailure>,

    /// SoftNPU components; only present when the `falcon` feature is enabled.
    #[cfg(feature = "falcon")]
    pub softnpu: SoftNpu,

    // TODO: This is an option because there is no good way to generate a
    // default implementation of `SmbiosType1Input`. The default `serial_number`
    // field of `SmbiosType1Input` should be equivalent to the VM UUID for
    // backwards compatibility, but that isn't currently possible.
    //
    // One way to fix this would be to remove the `Builder` and directly
    // construct `Spec` from a function that takes a
    // `v1::instance_spec::InstanceSpec` and the VM UUID. This would replace
    // `impl TryFrom<v1::instance_spec::InstanceSpec> for Spec`, and would
    // allow removing the `Default` derive on `Spec`, and the `Option` from the
    // `smbios_type1_input` field.
    pub smbios_type1_input: Option<SmbiosType1Input>,
}

/// The VM's mainboard.
///
/// This is distinct from the [instance spec `Board`] so that it can exclude
/// fields (such as CPUID information) that need to be checked for validity
/// before being included in an internal spec.
///
/// [instance spec `Board`]: propolis_api_types::instance_spec::components::board::Board
#[derive(Clone, Debug)]
pub(crate) struct Board {
    /// Number of vCPUs.
    pub cpus: u8,
    /// Guest memory size. NOTE(review): presumably in MiB, going by the field
    /// name; confirm the exact unit against the consumers.
    pub memory_mb: u64,
    /// The chipset to expose to the guest.
    pub chipset: Chipset,
    /// The hypervisor interface presented to the guest.
    pub guest_hv_interface: GuestHypervisorInterface,
}

impl Default for Board {
    fn default() -> Self {
        Self {
            cpus: 0,
            memory_mb: 0,
            chipset: Chipset::I440Fx(I440Fx { enable_pcie: false }),
            guest_hv_interface: GuestHypervisorInterface::Bhyve,
        }
    }
}

/// Boot settings for a VM: the key identifying the settings component and the
/// list of boot order entries it carries.
#[derive(Clone, Debug)]
pub(crate) struct BootSettings {
    /// Key identifying this boot settings component.
    pub name: SpecKey,
    /// The boot order entries, in the order they were supplied.
    pub order: Vec<BootOrderEntry>,
}

/// A single entry in a boot order, identifying one device by its key.
#[derive(Clone, Debug)]
pub(crate) struct BootOrderEntry {
    /// Key of the device this entry refers to.
    pub device_id: SpecKey,
}

/// Converts an API boot order entry into the internal representation.
impl
    From<propolis_api_types::instance_spec::components::devices::BootOrderEntry>
    for BootOrderEntry
{
    fn from(
        value: propolis_api_types::instance_spec::components::devices::BootOrderEntry,
    ) -> Self {
        // `value` is owned, so its ID can be moved rather than cloned.
        Self { device_id: value.id }
    }
}

impl From<BootOrderEntry>
    for propolis_api_types::instance_spec::components::devices::BootOrderEntry
{
    fn from(value: BootOrderEntry) -> Self {
        Self { id: value.device_id }
    }
}

/// Describes the device half of a [`Disk`].
#[derive(Clone, Debug)]
pub enum StorageDevice {
    /// A virtio disk device.
    Virtio(VirtioDisk),
    /// An NVMe disk device.
    Nvme(NvmeDisk),
}

impl StorageDevice {
    pub fn kind(&self) -> &'static str {
        match self {
            StorageDevice::Virtio(_) => "virtio",
            StorageDevice::Nvme(_) => "nvme",
        }
    }

    pub fn pci_path(&self) -> PciPath {
        match self {
            StorageDevice::Virtio(disk) => disk.pci_path,
            StorageDevice::Nvme(disk) => disk.pci_path,
        }
    }

    pub fn backend_id(&self) -> &SpecKey {
        match self {
            StorageDevice::Virtio(disk) => &disk.backend_id,
            StorageDevice::Nvme(disk) => &disk.backend_id,
        }
    }
}

/// Wraps a storage device in the matching v1 component variant.
impl From<StorageDevice> for v1::instance_spec::Component {
    fn from(value: StorageDevice) -> Self {
        match value {
            StorageDevice::Virtio(virtio) => Self::VirtioDisk(virtio),
            StorageDevice::Nvme(nvme) => Self::NvmeDisk(nvme),
        }
    }
}

/// Extracts a storage device from a v1 component, failing with
/// [`ComponentTypeMismatch`] if the component is not a disk device.
impl TryFrom<v1::instance_spec::Component> for StorageDevice {
    type Error = ComponentTypeMismatch;

    fn try_from(
        value: v1::instance_spec::Component,
    ) -> Result<Self, Self::Error> {
        let device = match value {
            v1::instance_spec::Component::VirtioDisk(disk) => {
                Self::Virtio(disk)
            }
            v1::instance_spec::Component::NvmeDisk(disk) => Self::Nvme(disk),
            // Every other component kind is not a storage device.
            _ => return Err(ComponentTypeMismatch),
        };
        Ok(device)
    }
}

/// Describes the backend half of a [`Disk`].
#[derive(Clone, Debug)]
pub enum StorageBackend {
    /// A Crucible storage backend.
    Crucible(CrucibleStorageBackend),
    /// A file storage backend.
    File(FileStorageBackend),
    /// A blob storage backend.
    Blob(BlobStorageBackend),
}

impl StorageBackend {
    pub fn kind(&self) -> &'static str {
        match self {
            StorageBackend::Crucible(_) => "crucible",
            StorageBackend::File(_) => "file",
            StorageBackend::Blob(_) => "backend",
        }
    }

    pub fn read_only(&self) -> bool {
        match self {
            StorageBackend::Crucible(be) => be.readonly,
            StorageBackend::File(be) => be.readonly,
            StorageBackend::Blob(be) => be.readonly,
        }
    }
}

/// Wraps a storage backend in the matching v1 component variant.
impl From<StorageBackend> for v1::instance_spec::Component {
    fn from(value: StorageBackend) -> Self {
        match value {
            StorageBackend::Crucible(crucible) => {
                Self::CrucibleStorageBackend(crucible)
            }
            StorageBackend::File(file) => Self::FileStorageBackend(file),
            StorageBackend::Blob(blob) => Self::BlobStorageBackend(blob),
        }
    }
}

/// Extracts a storage backend from a v1 component, failing with
/// [`ComponentTypeMismatch`] if the component is not a storage backend.
impl TryFrom<v1::instance_spec::Component> for StorageBackend {
    type Error = ComponentTypeMismatch;

    fn try_from(
        value: v1::instance_spec::Component,
    ) -> Result<Self, Self::Error> {
        let backend = match value {
            v1::instance_spec::Component::CrucibleStorageBackend(crucible) => {
                Self::Crucible(crucible)
            }
            v1::instance_spec::Component::FileStorageBackend(file) => {
                Self::File(file)
            }
            v1::instance_spec::Component::BlobStorageBackend(blob) => {
                Self::Blob(blob)
            }
            // Every other component kind is not a storage backend.
            _ => return Err(ComponentTypeMismatch),
        };
        Ok(backend)
    }
}

/// A disk: a guest-visible storage device paired with the backend it uses.
#[derive(Clone, Debug)]
pub struct Disk {
    /// The device half of the disk.
    pub device_spec: StorageDevice,
    /// The backend half of the disk.
    pub backend_spec: StorageBackend,
}

/// A network interface: a virtio NIC device paired with its network backend.
#[derive(Clone, Debug)]
pub struct Nic {
    /// The device half of the NIC.
    pub device_spec: VirtioNic,
    /// The backend half of the NIC.
    pub backend_spec: VirtioNetworkBackend,
}

/// A kind of device to install as the listener on a COM port.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SerialPortDevice {
    /// A UART device.
    Uart,

    /// A SoftNPU device; only available with the `falcon` feature.
    #[cfg(feature = "falcon")]
    SoftNpu,
}

/// Formats the device kind as its lowercase name.
impl std::fmt::Display for SerialPortDevice {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Uart => "uart",

            #[cfg(feature = "falcon")]
            Self::SoftNpu => "softnpu",
        };
        f.write_str(label)
    }
}

/// A serial port: the COM port number and the kind of device attached to it.
#[derive(Clone, Debug)]
pub struct SerialPort {
    /// Which COM port this is.
    pub num: SerialPortNumber,
    /// The kind of device installed as the listener on this port.
    pub device: SerialPortDevice,
}

/// A pvpanic device description together with its key.
#[derive(Clone, Debug)]
pub struct QemuPvpanic {
    /// Key identifying this component. Marked `allow(dead_code)`, so it may
    /// not be read in every build configuration.
    #[allow(dead_code)]
    pub id: SpecKey,
    /// The device's API-level description.
    pub spec: QemuPvpanicDesc,
}

/// A virtio socket device description together with its key.
#[derive(Clone, Debug)]
pub struct VirtioSocket {
    /// Key under which the device appears in the instance spec's component
    /// map.
    pub id: SpecKey,
    /// The device's API-level description.
    pub spec: VirtioSocketDesc,
}

/// A migration failure injector description together with its key.
///
/// Only compiled when the `failure-injection` feature is enabled.
#[cfg(feature = "failure-injection")]
#[derive(Clone, Debug)]
pub struct MigrationFailure {
    /// Key identifying this component.
    pub id: SpecKey,
    /// The injector's API-level description.
    pub spec: MigrationFailureInjector,
}

/// A SoftNPU port and the DLPI network backend it uses.
///
/// Only compiled when the `falcon` feature is enabled.
#[cfg(feature = "falcon")]
#[derive(Clone, Debug)]
pub struct SoftNpuPort {
    /// Name of the link associated with this port.
    pub link_name: String,
    /// Key of the port's backend component.
    pub backend_name: SpecKey,
    /// Description of the port's DLPI network backend.
    pub backend_spec: DlpiNetworkBackend,
}

/// The set of SoftNPU-related components in a spec; every part is optional or
/// may be empty.
///
/// Only compiled when the `falcon` feature is enabled.
#[cfg(feature = "falcon")]
#[derive(Clone, Debug, Default)]
pub struct SoftNpu {
    /// Optional SoftNPU PCI port.
    pub pci_port: Option<SoftNpuPciPort>,
    /// SoftNPU ports, keyed by `SpecKey`.
    pub ports: BTreeMap<SpecKey, SoftNpuPort>,
    /// Optional SoftNPU P9 device.
    pub p9_device: Option<SoftNpuP9>,
    /// Optional P9 filesystem device.
    pub p9fs: Option<P9fs>,
}


================================================
FILE: bin/propolis-server/src/lib/stats/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Methods for starting an Oximeter endpoint and gathering server-level stats.

use std::net::SocketAddr;
use std::sync::{Arc, Mutex};

use omicron_common::api::internal::nexus::{ProducerEndpoint, ProducerKind};
use oximeter::{
    types::{ProducerRegistry, Sample},
    MetricsError, Producer,
};
use oximeter_instruments::kstat::KstatSampler;
use oximeter_producer::{Config, Error, Server};
use slog::Logger;
use uuid::Uuid;

use crate::spec::Spec;
use crate::{server::MetricsEndpointConfig, vm::NetworkInterfaceIds};

mod network_interface;
mod pvpanic;
mod virtual_disk;
mod virtual_machine;

#[cfg(all(not(test), target_os = "illumos"))]
use self::network_interface::InstanceNetworkInterfaces;
pub(crate) use self::pvpanic::PvpanicProducer;
pub(crate) use self::virtual_disk::{BlockMetrics, VirtualDisk};
pub(crate) use self::virtual_machine::VirtualMachine;

/// Interval on which we ask `oximeter` to poll us for metric data.
//
// Note that some statistics, like those based on kstats, are sampled more
// densely than this proactively. Their sampling rate is decoupled from this
// poll interval. Others, like the virtual disk stats, are updated all the time,
// but we only generate _samples_ from that when `oximeter` comes polling.
//
// In short, set this to the minimum interval on which you'd like those
// statistics to be sampled.
const OXIMETER_COLLECTION_INTERVAL: tokio::time::Duration =
    tokio::time::Duration::from_secs(10);

/// Interval on which we sample instance/guest network interface metrics.
///
/// This matches what we're currently using for sampling
/// sled link metrics.
#[cfg(all(not(test), target_os = "illumos"))]
const NETWORK_INTERFACE_SAMPLE_INTERVAL: std::time::Duration =
    std::time::Duration::from_secs(10);

/// Interval on which we produce vCPU metrics.
#[cfg(all(not(test), target_os = "illumos"))]
const VCPU_KSTAT_INTERVAL: std::time::Duration =
    std::time::Duration::from_secs(5);

/// Number of buffered samples budgeted per tracked item when sizing the kstat
/// sampler's sample limit.
///
/// The kstat sampler includes a limit to its internal buffers for each target,
/// to avoid growing without bound. This value is used as a multiplier when
/// computing that limit: once per vCPU microstate in
/// [`KSTAT_LIMIT_PER_VCPU`], and once per network interface in
/// [`create_kstat_sampler`].
const SAMPLE_BUFFER: u32 = 64;

/// The kstat sampler includes a limit to its internal buffers for each target,
/// to avoid growing without bound. This defaults to 500 samples. Since we have 5
/// vCPU microstates for which we track occupancy and up to 64 vCPUs, we can
/// easily run up against this default.
///
/// This limit provides extra space for up to 64 samples per vCPU per microstate,
/// to ensure we don't throw away too much data if oximeter cannot reach us.
const KSTAT_LIMIT_PER_VCPU: u32 =
    crate::stats::virtual_machine::N_VCPU_MICROSTATES * SAMPLE_BUFFER;

/// Shared type for tracking metrics about the Propolis API server itself.
///
/// This is the state held behind the mutex in [`ServerStats`].
#[derive(Clone, Debug)]
struct ServerStatsInner {
    /// The oximeter Target identifying this instance as the source of metric
    /// data.
    virtual_machine: VirtualMachine,

    /// The reset count for the relevant instance.
    run_count: virtual_machine::Reset,
}

impl ServerStatsInner {
    /// Creates the inner stats for `virtual_machine`, with the reset counter
    /// at its default value.
    pub fn new(virtual_machine: VirtualMachine) -> Self {
        let run_count = virtual_machine::Reset { datum: Default::default() };
        Self { virtual_machine, run_count }
    }
}

/// Type publishing metrics about the Propolis API server itself.
//
// NOTE: This type is shared with the server and the oximeter producer. The
// former updates stats as API requests are handled or other actions taken, and
// the latter collects the stats when oximeter requests them.
#[derive(Clone, Debug)]
pub struct ServerStats {
    /// Mutex-protected stats, reference-counted so that clones of this type
    /// all refer to the same underlying state.
    inner: Arc<Mutex<ServerStatsInner>>,
}

impl ServerStats {
    /// Create new server stats, representing the provided instance.
    pub fn new(vm: VirtualMachine) -> Self {
        let inner = ServerStatsInner::new(vm);
        Self { inner: Arc::new(Mutex::new(inner)) }
    }

    /// Increments the number of times the managed instance was reset.
    ///
    /// # Panics
    ///
    /// Panics if the internal mutex is poisoned.
    pub fn count_reset(&self) {
        let mut stats = self.inner.lock().unwrap();
        stats.run_count.datum.increment();
    }
}

impl Producer for ServerStats {
    /// Produces a single sample carrying the instance's current reset count.
    ///
    /// Panics if the internal mutex is poisoned.
    fn produce(
        &mut self,
    ) -> Result<Box<dyn Iterator<Item = Sample> + 'static>, MetricsError> {
        // Hold the lock only long enough to build the sample.
        let samples = {
            let stats = self.inner.lock().unwrap();
            let reset_sample =
                Sample::new(&stats.virtual_machine, &stats.run_count)?;
            std::iter::once(reset_sample)
        };
        Ok(Box::new(samples))
    }
}

/// Launches and returns an Oximeter metrics server.
///
/// # Parameters
///
/// - `id`: The ID of the instance for whom this server is being started.
/// - `config`: The metrics config options, including our address (on which we
///   serve metrics for oximeter to collect), and the registration address (a
///   Nexus instance through which we request registration as an oximeter
///   producer).
/// - `log`: A logger to use when logging from this routine.
/// - `registry`: The oximeter [`ProducerRegistry`] that the spawned server will
///   use to return metric data to oximeter on request.
///
/// The returned server will attempt to register with Nexus in a background
/// task, and will periodically renew that registration. The returned server is
/// running, and need not be poked or renewed to successfully serve metric data.
pub fn start_oximeter_server(
    id: Uuid,
    config: &MetricsEndpointConfig,
    log: &Logger,
    registry: &ProducerRegistry,
) -> Result<Server, Error> {
    // The maximum size of a single Dropshot request.
    //
    // This is a pretty arbitrary limit, but one that should be big enough for
    // the statistics we serve today (vCPU usage and panic counts), with
    // headroom for adding quite a few more.
    const MAX_REQUEST_SIZE: usize = 1024 * 1024;

    let server_info = ProducerEndpoint {
        id,
        kind: ProducerKind::Instance,
        // Port 0 requests an ephemeral port on which to serve metrics.
        address: SocketAddr::new(config.listen_addr, 0),
        interval: OXIMETER_COLLECTION_INTERVAL,
    };

    // Use a child logger, to avoid intermingling the producer server output
    // with the main Propolis server.
    let child_log = log.new(slog::o!("component" => "oximeter-producer"));

    let producer_config = Config {
        server_info,
        registration_address: config.registration_addr,
        default_request_body_max_bytes: MAX_REQUEST_SIZE,
        log: oximeter_producer::LogConfig::Logger(child_log),
    };

    // Create the server which will attempt to register with Nexus.
    Server::with_registry(registry.clone(), &producer_config)
}

/// Create an object that can be used to sample kstat-based metrics.
///
/// The sampler's sample limit is sized from the spec: `KSTAT_LIMIT_PER_VCPU`
/// samples for each vCPU plus `SAMPLE_BUFFER` samples for each NIC.
///
/// Returns `None`, after logging an error, if the sampler cannot be created.
pub(crate) fn create_kstat_sampler(
    log: &Logger,
    spec: &Spec,
) -> Option<KstatSampler> {
    let vcpu_samples = u32::from(spec.board.cpus) * KSTAT_LIMIT_PER_VCPU;
    let nic_samples = spec.nics.len() as u32 * SAMPLE_BUFFER;
    let kstat_limit = usize::try_from(vcpu_samples + nic_samples).unwrap();

    KstatSampler::with_sample_limit(log, kstat_limit)
        .map_err(|e| {
            slog::error!(
                log,
                "failed to create KstatSampler, \
                kstat-based stats will be unavailable";
                "error" => ?e,
            );
        })
        .ok()
}

/// Track kstats required to publish vCPU metrics for this instance.
///
/// This variant is compiled for tests and non-illumos targets; it only logs
/// an error and tracks nothing.
#[cfg(any(test, not(target_os = "illumos")))]
pub(crate) async fn track_vcpu_kstats(
    log: &Logger,
    _sampler: &KstatSampler,
    _virtual_machine: &VirtualMachine,
) {
    slog::error!(log, "vCPU stats are not supported on this platform");
}

/// Track kstats required to publish vCPU metrics for this instance.
///
/// Registers a clone of `virtual_machine` as a target with `sampler`, sampled
/// every `VCPU_KSTAT_INTERVAL`. A failure to register is logged, not returned.
#[cfg(all(not(test), target_os = "illumos"))]
pub(crate) async fn track_vcpu_kstats(
    log: &Logger,
    sampler: &KstatSampler,
    virtual_machine: &VirtualMachine,
) {
    use oximeter_instruments::kstat::CollectionDetails;

    let details = CollectionDetails::never(VCPU_KSTAT_INTERVAL);
    let outcome = sampler.add_target(virtual_machine.clone(), details).await;
    if let Err(e) = outcome {
        slog::error!(
            log,
            "failed to add VirtualMachine target, \
            vCPU stats will be unavailable";
            "error" => ?e,
        );
    }
}

/// Track kstats required to publish network interface metrics for this instance.
///
/// This variant is compiled for tests and non-illumos targets; it only logs
/// an error and tracks nothing.
#[cfg(any(test, not(target_os = "illumos")))]
pub(crate) async fn track_network_interface_kstats(
    log: &Logger,
    _sampler: &KstatSampler,
    _virtual_machine: &VirtualMachine,
    _interface_ids: NetworkInterfaceIds,
) {
    slog::error!(
        log,
        "network interface stats are not supported on this platform"
    );
}

/// Track kstats required to publish network interface metrics for this instance.
///
/// Registers a single target covering all of `interface_ids` with `sampler`,
/// sampled every `NETWORK_INTERFACE_SAMPLE_INTERVAL`. A failure to register is
/// logged, along with the affected interfaces, rather than returned.
#[cfg(all(not(test), target_os = "illumos"))]
pub(crate) async fn track_network_interface_kstats(
    log: &Logger,
    sampler: &KstatSampler,
    virtual_machine: &VirtualMachine,
    interface_ids: NetworkInterfaceIds,
) {
    use oximeter_instruments::kstat::CollectionDetails;

    let nics = InstanceNetworkInterfaces::new(virtual_machine, &interface_ids);
    let details = CollectionDetails::never(NETWORK_INTERFACE_SAMPLE_INTERVAL);
    let outcome = sampler.add_target(nics, details).await;

    if let Err(e) = outcome {
        // Describe every interface we failed to track, for the log record.
        let described: Vec<String> = interface_ids
            .iter()
            .map(|(uuid, device_id)| {
                format!("{uuid} (kstat-instance: {device_id})")
            })
            .collect();
        let network_interface_ids = described.join(", ");

        slog::error!(
            log,
            "failed to add network interface targets, \
            network interface stats will be unavailable";
            "network_interface_ids" => network_interface_ids,
            "error" => ?e,
        );
    }
}

/// Re-exports of the `kstat-rs` and `oximeter_instruments::kstat` types used by
/// the stats submodules.
///
/// Compiled only for non-test illumos builds; other configurations use the
/// mock module of the same name.
#[cfg(all(not(test), target_os = "illumos"))]
mod kstat_types {
    pub(crate) use kstat_rs::{Data, Kstat, Named, NamedData};
    pub(crate) use oximeter_instruments::kstat::{
        hrtime_to_utc, ConvertNamedData, Error, KstatList, KstatTarget,
    };
}

/// Mock the relevant subset of `kstat-rs` types needed for tests.
///
/// Compiled for test builds and for non-illumos targets, in place of the
/// re-export module of the same name.
#[cfg(not(all(not(test), target_os = "illumos")))]
#[allow(dead_code, unused)]
mod kstat_types {
    use chrono::{DateTime, Utc};
    use oximeter::{Sample, Target};

    /// A borrowed list of (creation time, kstat, data) tuples.
    pub(crate) type KstatList<'a, 'k> =
        &'a [(DateTime<Utc>, Kstat<'k>, Data<'k>)];

    /// A target that can select kstats and turn them into samples.
    pub(crate) trait KstatTarget:
        Target + Send + Sync + 'static + std::fmt::Debug
    {
        /// Return true for any kstat you're interested in.
        fn interested(&self, kstat: &Kstat<'_>) -> bool;

        /// Convert from a kstat and its data to a list of samples.
        fn to_samples(
            &self,
            kstats: KstatList<'_, '_>,
        ) -> Result<Vec<Sample>, Error>;
    }

    /// The data carried by a kstat: a list of named values, or nothing.
    #[derive(Debug, Clone)]
    pub(crate) enum Data<'a> {
        Named(Vec<Named<'a>>),
        Null,
    }

    /// The value of a single named kstat entry.
    #[derive(Debug, Clone)]
    pub(crate) enum NamedData<'a> {
        UInt32(u32),
        UInt64(u64),
        String(&'a str),
    }

    /// Identifying information and snapshot time for a kstat.
    #[derive(Debug)]
    pub(crate) struct Kstat<'a> {
        pub ks_module: &'a str,
        pub ks_instance: i32,
        pub ks_name: &'a str,
        pub ks_snaptime: i64,
    }

    /// A named kstat entry: its name and its value.
    #[derive(Debug, Clone)]
    pub(crate) struct Named<'a> {
        pub name: &'a str,
        pub value: NamedData<'a>,
    }

    /// Conversions from a named value to a concrete integer type.
    #[allow(unused)]
    pub(crate) trait ConvertNamedData {
        fn as_i32(&self) -> Result<i32, Error>;
        fn as_u32(&self) -> Result<u32, Error>;
        fn as_i64(&self) -> Result<i64, Error>;
        fn as_u64(&self) -> Result<u64, Error>;
    }

    impl ConvertNamedData for NamedData<'_> {
        // The mock has no signed variants, so this is left unimplemented.
        fn as_i32(&self) -> Result<i32, Error> {
            unimplemented!()
        }

        fn as_u32(&self) -> Result<u32, Error> {
            match self {
                NamedData::UInt32(x) => Ok(*x),
                _ => Err(Error::InvalidNamedData),
            }
        }

        // The mock has no signed variants, so this is left unimplemented.
        fn as_i64(&self) -> Result<i64, Error> {
            unimplemented!()
        }

        fn as_u64(&self) -> Result<u64, Error> {
            match self {
                NamedData::UInt64(x) => Ok(*x),
                _ => Err(Error::InvalidNamedData),
            }
        }
    }

    /// Errors the mock kstat types can report.
    #[derive(thiserror::Error, Clone, Debug)]
    pub(crate) enum Error {
        #[error("No such kstat")]
        NoSuchKstat,
        #[error("Expected a named kstat")]
        ExpectedNamedKstat,
        #[error("Invalid named data")]
        InvalidNamedData,
        #[error("Sample error")]
        Sample(#[from] oximeter::MetricsError),
    }

    /// Mock conversion: ignores its argument and returns the current time.
    pub(crate) fn hrtime_to_utc(_: i64) -> Result<DateTime<Utc>, Error> {
        Ok(Utc::now())
    }
}


================================================
FILE: bin/propolis-server/src/lib/stats/network_interface.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Copyright 2024 Oxide Computer Company

//! Types and functions for tracking statistics about an instance's network
//! interfaces.

// Propolis is built in a variety of configurations, including checks and tests
// run on non-illumos machines where kstats are meaningless. This is a big
// hammer, but a large number of the values in this module are not referenced in
// those configurations, and so this is more straightforward than littering the
// code with cfg directives.
#![cfg_attr(any(test, not(target_os = "illumos")), allow(dead_code))]

use chrono::{DateTime, Utc};
use oximeter::{types::Cumulative, FieldType, FieldValue, Sample, Target};

use super::kstat_types::{
    hrtime_to_utc, ConvertNamedData, Data, Error, Kstat, KstatList,
    KstatTarget, Named,
};
use crate::vm::NetworkInterfaceIds;

// NOTE: TOML definitions of timeseries are centralized in Omicron, so the
// schema file referenced below lives in that repo, at
// `./omicron/oximeter/oximeter/schema/instance-network-interface.toml`.
oximeter::use_timeseries!("instance-network-interface.toml");
use self::instance_network_interface::{
    BytesReceived, BytesSent, ErrorsReceived, ErrorsSent,
    InstanceNetworkInterface, PacketsDropped, PacketsReceived, PacketsSent,
};

// Names of the individual named-data entries within the kstat, each of which
// is mapped to one metric in `extract_nic_kstats`.
const KSTAT_RX_BYTES: &str = "rx_bytes";
const KSTAT_TX_BYTES: &str = "tx_bytes";
const KSTAT_RX_PACKETS: &str = "rx_packets";
const KSTAT_TX_PACKETS: &str = "tx_packets";
const KSTAT_RX_DROPS: &str = "rx_drops";
const KSTAT_RX_ERRORS: &str = "rx_errors";
const KSTAT_TX_ERRORS: &str = "tx_errors";

/// The names of the kstat fields that represent the instance network interface
/// metrics we are interested in tracking.
const KSTAT_FIELDS: &[&str] = &[
    KSTAT_RX_BYTES,
    KSTAT_TX_BYTES,
    KSTAT_RX_PACKETS,
    KSTAT_TX_PACKETS,
    KSTAT_RX_DROPS,
    KSTAT_RX_ERRORS,
    KSTAT_TX_ERRORS,
];

/// The name of the kstat module that contains the instance network interface
/// metrics.
const KSTAT_MODULE_NAME: &str = "viona";

/// The name of the kstat that contains the instance network interface metrics.
const KSTAT_NAME: &str = "viona_stat";

/// Helper function to extract the same kstat metrics from all link targets.
fn extract_nic_kstats(
    target: &InstanceNetworkInterface,
    named_data: &Named,
    creation_time: DateTime<Utc>,
    snapshot_time: DateTime<Utc>,
) -> Option<Result<Sample, Error>> {
    let Named { name, value } = named_data;
    if *name == KSTAT_RX_BYTES {
        Some(value.as_u64().and_then(|x| {
            let metric = BytesReceived {
                datum: Cumulative::with_start_time(creation_time, x),
            };
            Sample::new_with_timestamp(snapshot_time, target, &metric)
                .map_err(Error::Sample)
        }))
    } else if *name == KSTAT_TX_BYTES {
        Some(value.as_u64().and_then(|x| {
            let metric = BytesSent {
                datum: Cumulative::with_start_time(creation_time, x),
            };
            Sample::new_with_timestamp(snapshot_time, target, &metric)
                .map_err(Error::Sample)
        }))
    } else if *name == KSTAT_RX_PACKETS {
        Some(value.as_u64().and_then(|x| {
            let metric = PacketsReceived {
                datum: Cumulative::with_start_time(creation_time, x),
            };
            Sample::new_with_timestamp(snapshot_time, target, &metric)
                .map_err(Error::Sample)
        }))
    } else if *name == KSTAT_TX_PACKETS {
        Some(value.as_u64().and_then(|x| {
            let metric = PacketsSent {
                datum: Cumulative::with_start_time(creation_time, x),
            };
            Sample::new_with_timestamp(snapshot_time, target, &metric)
                .map_err(Error::Sample)
        }))
    } else if *name == KSTAT_RX_DROPS {
        Some(value.as_u64().and_then(|x| {
            let metric = PacketsDropped {
                datum: Cumulative::with_start_time(creation_time, x),
            };
            Sample::new_with_timestamp(snapshot_time, target, &metric)
                .map_err(Error::Sample)
        }))
    } else if *name == KSTAT_RX_ERRORS {
        Some(value.as_u64().and_then(|x| {
            let metric = ErrorsReceived {
                datum: Cumulative::with_start_time(creation_time, x),
            };
            Sample::new_with_timestamp(snapshot_time, target, &metric)
                .map_err(Error::Sample)
        }))
    } else if *name == KSTAT_TX_ERRORS {
        Some(value.as_u64().and_then(|x| {
            let metric = ErrorsSent {
                datum: Cumulative::with_start_time(creation_time, x),
            };
            Sample::new_with_timestamp(snapshot_time, target, &metric)
                .map_err(Error::Sample)
        }))
    } else {
        None
    }
}

/// A wrapper around the `oximeter::Target` representing all instance network
/// interfaces.
#[derive(Clone, Debug)]
pub(crate) struct InstanceNetworkInterfaces {
    /// The `oximeter::Target` itself, storing the target fields for the
    /// timeseries.
    ///
    /// **NOTE**: While this struct represents multiple instance network
    /// interfaces, they all share the same target fields apart from
    /// `interface_id`.
    ///
    /// `interface_id` is set to `uuid::Uuid::nil()` on creation; the
    /// `to_samples` method builds a separate target per interface with the
    /// real interface ID filled in.
    pub(crate) target: InstanceNetworkInterface,

    /// A tuple-mapping of the interface UUIDs to the kstat instance IDs.
    pub(crate) interface_ids: NetworkInterfaceIds,
}

impl InstanceNetworkInterfaces {
    /// Create a new instance network interface metrics target from the given
    /// instance properties and add the interface_ids to match and gather
    /// metrics from.
    #[cfg(all(not(test), target_os = "illumos"))]
    pub(crate) fn new(
        virtual_machine: &super::VirtualMachine,
        interface_ids: &NetworkInterfaceIds,
    ) -> Self {
        let vm_target = &virtual_machine.target;
        let target = InstanceNetworkInterface {
            // Use the nil UUID as a placeholder `interface_id`: the
            // `to_samples` method creates one target per interface and
            // overrides this.
            interface_id: uuid::Uuid::nil(),
            instance_id: vm_target.instance_id,
            project_id: vm_target.project_id,
            silo_id: vm_target.silo_id,
        };
        Self { target, interface_ids: interface_ids.to_vec() }
    }
}

impl KstatTarget for InstanceNetworkInterfaces {
    /// Report whether `kstat` belongs to one of the tracked interfaces.
    ///
    /// A kstat is of interest when its module and name are the ones this
    /// file collects from, and its instance number equals the device
    /// instance ID of any entry in `interface_ids`.
    fn interested(&self, kstat: &Kstat<'_>) -> bool {
        if kstat.ks_module != KSTAT_MODULE_NAME || kstat.ks_name != KSTAT_NAME
        {
            return false;
        }

        let instance = kstat.ks_instance as u32;
        self.interface_ids
            .iter()
            .any(|(_, device_instance_id)| *device_instance_id == instance)
    }

    /// Convert the kstats of all tracked interfaces into oximeter samples.
    ///
    /// Kstats whose instance number matches no tracked interface are
    /// skipped. For each remaining kstat, a per-interface target is built
    /// and every named value that yields a sample is collected.
    ///
    /// # Errors
    ///
    /// Returns an error if a matching kstat's snapshot time cannot be
    /// converted with `hrtime_to_utc`. Named values that fail to convert
    /// into a sample are dropped rather than reported.
    fn to_samples(
        &self,
        kstats: KstatList<'_, '_>,
    ) -> Result<Vec<Sample>, Error> {
        // Capacity is determined by the number of interfaces times the number
        // of kstat fields we track.
        let mut samples =
            Vec::with_capacity(self.interface_ids.len() * KSTAT_FIELDS.len());

        for (creation_time, kstat, data) in kstats.iter() {
            // Find the interface this kstat describes, if we track it.
            let instance = kstat.ks_instance as u32;
            let matched = self
                .interface_ids
                .iter()
                .find(|(_, device_instance_id)| *device_instance_id == instance);
            let interface_id = match matched {
                Some((id, _)) => *id,
                None => continue,
            };

            // All interfaces share the instance-level fields; only the
            // interface ID differs between targets.
            let target = InstanceNetworkInterface {
                interface_id,
                instance_id: self.target.instance_id,
                project_id: self.target.project_id,
                silo_id: self.target.silo_id,
            };

            let snapshot_time = hrtime_to_utc(kstat.ks_snaptime)?;
            if let Data::Named(named) = data {
                samples.extend(named.iter().filter_map(|nd| {
                    extract_nic_kstats(
                        &target,
                        nd,
                        *creation_time,
                        snapshot_time,
                    )
                    .and_then(|sample_result| sample_result.ok())
                }));
            }
        }

        Ok(samples)
    }
}

// Implement the `oximeter::Target` trait for `InstanceNetworkInterfaces` by
// delegating every method to the single inner `InstanceNetworkInterface`
// target, since all interfaces share the same target name and field layout.
impl Target for InstanceNetworkInterfaces {
    // Target name, as reported by the inner target.
    fn name(&self) -> &'static str {
        self.target.name()
    }
    // Field names, as reported by the inner target.
    fn field_names(&self) -> &'static [&'static str] {
        self.target.field_names()
    }

    // Field types, as reported by the inner target.
    fn field_types(&self) -> Vec<FieldType> {
        self.target.field_types()
    }

    // Field values of the inner target. Note that this includes the inner
    // target's own `interface_id`, not those of the individual interfaces.
    fn field_values(&self) -> Vec<FieldValue> {
        self.target.field_values()
    }
}

#[cfg(test)]
mod test {
    use std::collections::HashSet;

    use uuid::Uuid;

    use super::*;
    use crate::stats::kstat_types::NamedData;

    /// Target with fixed IDs, shared by the tests in this module.
    fn test_network_interface() -> InstanceNetworkInterface {
        InstanceNetworkInterface {
            interface_id: uuid::uuid!("f4b3b3b3-3b3b-3b3b-3b3b-3b3b3b3b3b3b"),
            instance_id: uuid::uuid!("96d6ec78-543a-4188-830e-37e2a0eeff16"),
            project_id: uuid::uuid!("7b61df02-0794-4b37-93bc-89f03c7289ca"),
            silo_id: uuid::uuid!("6a4bd4b6-e9aa-44d1-b616-399d48baa173"),
        }
    }

    #[test]
    fn test_kstat_interested() {
        let target = InstanceNetworkInterfaces {
            target: test_network_interface(),
            interface_ids: vec![(Uuid::new_v4(), 2), (Uuid::new_v4(), 3)],
        };

        // Both tracked device instances match on the expected module/name.
        for tracked_instance in [2, 3] {
            let kstat = Kstat {
                ks_module: KSTAT_MODULE_NAME,
                ks_instance: tracked_instance,
                ks_snaptime: 0,
                ks_name: KSTAT_NAME,
            };
            assert!(target.interested(&kstat));
        }

        // A tracked instance number under a different module is ignored.
        let wrong_module = Kstat {
            ks_module: "not-viona",
            ks_instance: 2,
            ks_snaptime: 0,
            ks_name: KSTAT_NAME,
        };
        assert!(!target.interested(&wrong_module));

        // So is the right module with an instance number we do not track.
        let untracked_instance = Kstat {
            ks_module: KSTAT_MODULE_NAME,
            ks_instance: 4,
            ks_snaptime: 0,
            ks_name: KSTAT_NAME,
        };
        assert!(!target.interested(&untracked_instance));
    }

    #[test]
    fn test_kstat_to_samples() {
        let target = InstanceNetworkInterfaces {
            target: test_network_interface(),
            interface_ids: vec![(Uuid::new_v4(), 2), (Uuid::new_v4(), 3)],
        };

        // One named value for every kstat field we track.
        let data = Data::Named(vec![
            Named { name: KSTAT_RX_BYTES, value: NamedData::UInt64(100) },
            Named { name: KSTAT_TX_BYTES, value: NamedData::UInt64(200) },
            Named { name: KSTAT_RX_PACKETS, value: NamedData::UInt64(4) },
            Named { name: KSTAT_TX_PACKETS, value: NamedData::UInt64(4) },
            Named { name: KSTAT_RX_DROPS, value: NamedData::UInt64(0) },
            Named { name: KSTAT_RX_ERRORS, value: NamedData::UInt64(0) },
            Named { name: KSTAT_TX_ERRORS, value: NamedData::UInt64(0) },
        ]);

        // The same data is reported for each of the two tracked instances.
        let kstat_list: Vec<_> = [2, 3]
            .into_iter()
            .map(|instance| {
                let kstat = Kstat {
                    ks_module: KSTAT_MODULE_NAME,
                    ks_instance: instance,
                    ks_snaptime: 0,
                    ks_name: KSTAT_NAME,
                };
                (Utc::now(), kstat, data.clone())
            })
            .collect();

        let samples = target.to_samples(kstat_list.as_slice()).unwrap();
        assert_eq!(samples.len(), 2 * KSTAT_FIELDS.len());

        let expected_fields = target.field_names();
        let mut interface_uuids = HashSet::new();
        for sample in samples {
            assert_eq!(sample.target_name(), "instance_network_interface");
            for field in sample.fields() {
                assert!(expected_fields.contains(&field.name.as_str()));
                if field.name == "interface_id" {
                    interface_uuids.insert(field.value);
                }
            }
        }
        // We should have two unique interface UUIDs.
        assert_eq!(interface_uuids.len(), 2);
    }
}


================================================
FILE: bin/propolis-server/src/lib/stats/pvpanic.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::virtual_machine::VirtualMachine;
use chrono::Utc;
use oximeter::{types::Sample, Metric, MetricsError, Producer};
use propolis::hw::qemu::pvpanic;
use std::sync::Arc;

// NOTE: TOML definitions of timeseries are centralized in Omicron, so this file
// lives in that repo, at
// `./omicron/oximeter/oximeter/schema/virtual-machine.toml`.
oximeter::use_timeseries!("virtual-machine.toml");
use self::virtual_machine::{PvPanicGuestHandled, PvPanicHostHandled};

/// An oximeter producer that reports the panic counts read from an
/// instance's pvpanic device.
#[derive(Clone, Debug)]
pub struct PvpanicProducer {
    /// The oximeter Target identifying this instance as the source of metric
    /// data.
    virtual_machine: VirtualMachine,

    /// Kernel panic counts for the relevant instance.
    ///
    /// This metric holds the device's `host_handled` count.
    host_handled_panics: PvPanicHostHandled,
    /// This metric holds the device's `guest_handled` count.
    guest_handled_panics: PvPanicGuestHandled,

    /// The pvpanic device whose counts are read each time samples are
    /// produced.
    pvpanic: Arc<pvpanic::QemuPvpanic>,
}

impl PvpanicProducer {
    /// Create a producer that reports the panic counts of `pvpanic` against
    /// the `virtual_machine` target.
    pub fn new(
        virtual_machine: VirtualMachine,
        pvpanic: Arc<pvpanic::QemuPvpanic>,
    ) -> Self {
        // Both counters are initialised from one shared default datum, so
        // the two timeseries are aligned to the same start time.
        let datum = Default::default();
        let host_handled_panics = PvPanicHostHandled { datum };
        let guest_handled_panics = PvPanicGuestHandled { datum };

        Self {
            virtual_machine,
            host_handled_panics,
            guest_handled_panics,
            pvpanic,
        }
    }
}

impl Producer for PvpanicProducer {
    /// Emit one sample for each of the guest- and host-handled panic
    /// counters.
    ///
    /// The counters are first refreshed from the pvpanic device. The
    /// guest-handled sample is yielded before the host-handled one.
    ///
    /// # Errors
    ///
    /// Returns an error if either sample cannot be constructed.
    fn produce(
        &mut self,
    ) -> Result<Box<dyn Iterator<Item = Sample> + 'static>, MetricsError> {
        let counts = self.pvpanic.panic_counts();
        self.host_handled_panics.datum_mut().set(counts.host_handled as u64);
        self.guest_handled_panics.datum_mut().set(counts.guest_handled as u64);

        // Provide both the same timestamp, to simplify alignment.
        let timestamp = Utc::now();
        let guest_sample = Sample::new_with_timestamp(
            timestamp,
            &self.virtual_machine,
            &self.guest_handled_panics,
        )?;
        let host_sample = Sample::new_with_timestamp(
            timestamp,
            &self.virtual_machine,
            &self.host_handled_panics,
        )?;

        Ok(Box::new([guest_sample, host_sample].into_iter()))
    }
}


================================================
FILE: bin/propolis-server/src/lib/stats/virtual_disk.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Copyright 2024 Oxide Computer Company

//! Types for tracking statistics about virtual disks.

use std::{
    num::NonZeroUsize,
    sync::{Arc, Mutex},
    time::Duration,
};

// NOTE: TOML definitions of timeseries are centralized in Omicron, so this file
// lives in that repo, at
// `./omicron/oximeter/oximeter/schema/virtual-disk.toml`.
oximeter::use_timeseries!("virtual-disk.toml");
use chrono::Utc;
use oximeter::{
    histogram::{Histogram, Record as _},
    types::Cumulative,
    MetricsError, Producer, Sample,
};
use propolis::block::{self, Operation};

pub use self::virtual_disk::VirtualDisk;
use self::virtual_disk::{
    BytesRead, BytesWritten, FailedFlushes, FailedReads, FailedWrites, Flushes,
    IoLatency, IoSize, Reads, Writes,
};

/// Type for tracking virtual disk stats.
///
/// These are the consolidated totals for one disk. They are updated from
/// buffered I/O completion samples and read out when oximeter samples are
/// produced. This type is not shared directly: [`BlockMetrics`] keeps it
/// behind a `Mutex` and is itself handed out inside an `Arc`.
#[derive(Debug)]
struct VirtualDiskStats {
    /// The oximeter::Target representing this disk.
    disk: VirtualDisk,
    /// Cumulative number of reads.
    reads: Reads,
    /// Cumulative number of bytes read.
    bytes_read: BytesRead,
    /// Cumulative number of failed reads, by failure reason.
    ///
    /// Indexed by `FAILURE_INDEX`, `READONLY_INDEX` and `UNSUPPORTED_INDEX`.
    failed_reads: [FailedReads; N_FAILURE_KINDS],
    /// Cumulative number of writes.
    writes: Writes,
    /// Cumulative number of bytes written.
    bytes_written: BytesWritten,
    /// Cumulative number of failed writes, by failure reason.
    ///
    /// Indexed by `FAILURE_INDEX`, `READONLY_INDEX` and `UNSUPPORTED_INDEX`.
    failed_writes: [FailedWrites; N_FAILURE_KINDS],
    /// Cumulative number of flushes.
    flushes: Flushes,
    /// Cumulative number of failed flushes, by failure reason.
    ///
    /// Indexed by `FAILURE_INDEX`, `READONLY_INDEX` and `UNSUPPORTED_INDEX`.
    failed_flushes: [FailedFlushes; N_FAILURE_KINDS],
    /// Histogram tracking I/O latency, by the I/O kind.
    ///
    /// Indexed by `READ_INDEX`, `WRITE_INDEX` and `FLUSH_INDEX`.
    io_latency: [IoLatency; N_IO_KINDS],
    /// Histogram tracking I/O sizes, by the I/O kind.
    ///
    /// Note that we have 1 fewer histogram here, since flush operations do not
    /// have a size. Indexed by `READ_INDEX` and `WRITE_INDEX`.
    io_size: [IoSize; N_IO_KINDS - 1],
}

impl VirtualDiskStats {
    /// Update the tracked statistics with the result of an I/O completion.
    fn on_completion(&mut self, sample: BlockSample) {
        let BlockSample { op, result, duration } = sample;
        match op {
            Operation::Read(_, len) => {
                self.on_read_completion(result, len, duration)
            }
            Operation::Write(_, len) => {
                self.on_write_completion(result, len, duration)
            }
            Operation::Flush => self.on_flush_completion(result, duration),
            Operation::Discard => {
                // Discard is now wired up for local disks. We need to add support for it to the
                // schema in Omicron before we can report stats for it. For now, just ignore it.
            }
        }
    }

    fn on_read_completion(
        &mut self,
        result: block::Result,
        len: usize,
        duration: Duration,
    ) {
        let index = match result {
            block::Result::Success => {
                let _ = self.io_latency[READ_INDEX]
                    .datum
                    .sample(duration.as_nanos() as u64);
                let _ = self.io_size[READ_INDEX].datum.sample(len as u64);
                self.reads.datum += 1;
                self.bytes_read.datum += len as u64;
                return;
            }
            block::Result::Failure => FAILURE_INDEX,
            block::Result::ReadOnly => READONLY_INDEX,
            block::Result::Unsupported => UNSUPPORTED_INDEX,
        };
        self.failed_reads[index].datum.increment();
    }

    fn on_write_completion(
        &mut self,
        result: block::Result,
        len: usize,
        duration: Duration,
    ) {
        let index = match result {
            block::Result::Success => {
                let _ = self.io_latency[WRITE_INDEX]
                    .datum
                    .sample(duration.as_nanos() as u64);
                let _ = self.io_size[WRITE_INDEX].datum.sample(len as u64);
                self.writes.datum += 1;
                self.bytes_written.datum += len as u64;
                return;
            }
            block::Result::Failure => FAILURE_INDEX,
            block::Result::ReadOnly => READONLY_INDEX,
            block::Result::Unsupported => UNSUPPORTED_INDEX,
        };
        self.failed_writes[index].datum.increment();
    }

    fn on_flush_completion(
        &mut self,
        result: block::Result,
        duration: Duration,
    ) {
        let index = match result {
            block::Result::Success => {
                let _ = self.io_latency[FLUSH_INDEX]
                    .datum
                    .sample(duration.as_nanos() as u64);
                self.flushes.datum += 1;
                return;
            }
            block::Result::Failure => FAILURE_INDEX,
            block::Result::ReadOnly => READONLY_INDEX,
            block::Result::Unsupported => UNSUPPORTED_INDEX,
        };
        self.failed_flushes[index].datum.increment();
    }
}

/// Number of I/O kinds we track (read, write and flush).
const N_IO_KINDS: usize = 3;

/// Indices into arrays tracking operations broken out by I/O kind.
///
/// The flush index is the largest, so the size histograms, which have no
/// flush entry, can be indexed with the same read and write indices.
const READ_INDEX: usize = 0;
const WRITE_INDEX: usize = 1;
const FLUSH_INDEX: usize = 2;

/// String representations of I/O kinds we report to Oximeter.
const READ_KIND: &str = "read";
const WRITE_KIND: &str = "write";
const FLUSH_KIND: &str = "flush";

/// Number of failure kinds we track.
const N_FAILURE_KINDS: usize = 3;

/// Indices into arrays tracking operations broken out by failure kind.
const FAILURE_INDEX: usize = 0;
const READONLY_INDEX: usize = 1;
const UNSUPPORTED_INDEX: usize = 2;

/// String representations of failure kinds we report to Oximeter.
const FAILURE_KIND: &str = "failed";
const READONLY_KIND: &str = "read-only";
const UNSUPPORTED_KIND: &str = "unsupported";

/// Powers of 10 bounding the latency histogram.
///
/// Latency is measured in nanoseconds. We want to track between 1 microsecond,
/// which is 10 ** 3 nanos, and 10s, which is 10 * 1e9 == 10 ** 10 nanoseconds.
const LATENCY_POWERS: (u16, u16) = (3, 10);

/// Powers of 2 bounding the I/O size histogram.
///
/// Sizes are measured in powers of 2 from 512B (2 ** 9) to 1GiB (2 ** 30).
///
/// We use 512B as the minimum since that is the minimum supported block size.
const SIZE_POWERS: (u16, u16) = (9, 30);

/// Maximum number of samples to buffer for a given device queue before
/// consolidating them into the totals for the device.
///
/// Consolidation is triggered once a queue's buffer holds more than this many
/// samples. This value was arbitrarily chosen.
const MAX_BUFFERED_SAMPLES: usize = 512;

/// Collection point for statistics about a virtual disk.
///
/// This type is shared between the block device that handles guest I/Os, and
/// the oximeter producer server, which publishes data to oximeter when it
/// polls us. As I/Os are completed, the
/// [`block::MetricConsumer::request_completed()`] method is called, which
/// buffers a sample for the queue the I/O completed on. Buffered samples are
/// consolidated into the disk's totals once a queue's buffer grows past
/// `MAX_BUFFERED_SAMPLES`.
///
/// As oximeter polls us, the [`VirtualDiskProducer`] consolidates all
/// remaining buffered samples and reports the updated statistics.
#[derive(Debug)]
pub(crate) struct BlockMetrics {
    /// One buffer of not-yet-consolidated samples per device queue, indexed
    /// by queue ID.
    sample_buffer: Vec<Mutex<Vec<BlockSample>>>,
    /// Consolidated totals for the disk.
    stats: Mutex<VirtualDiskStats>,
}
impl BlockMetrics {
    pub fn new(disk: VirtualDisk, max_queues: NonZeroUsize) -> Arc<Self> {
        let now = Utc::now();
        let datum = Cumulative::with_start_time(now, 0);
        let latency_histogram = Self::latency_histogram();
        let size_histogram = Self::size_histogram();
        let stats = VirtualDiskStats {
            disk,
            reads: Reads { datum },
            bytes_read: BytesRead { datum },
            failed_reads: [
                FailedReads { failure_reason: FAILURE_KIND.into(), datum },
                FailedReads { failure_reason: READONLY_KIND.into(), datum },
                FailedReads { failure_reason: UNSUPPORTED_KIND.into(), datum },
            ],
            writes: Writes { datum },
            bytes_written: BytesWritten { datum },
            failed_writes: [
                FailedWrites { failure_reason: FAILURE_KIND.into(), datum },
                FailedWrites { failure_reason: READONLY_KIND.into(), datum },
                FailedWrites { failure_reason: UNSUPPORTED_KIND.into(), datum },
            ],
            flushes: Flushes { datum },
            failed_flushes: [
                FailedFlushes { failure_reason: FAILURE_KIND.into(), datum },
                FailedFlushes { failure_reason: READONLY_KIND.into(), datum },
                FailedFlushes {
                    failure_reason: UNSUPPORTED_KIND.into(),
                    datum,
                },
            ],
            io_latency: [
                IoLatency {
                    io_kind: READ_KIND.into(),
                    datum: latency_histogram.clone(),
                },
                IoLatency {
                    io_kind: WRITE_KIND.into(),
                    datum: latency_histogram.clone(),
                },
                IoLatency {
                    io_kind: FLUSH_KIND.into(),
                    datum: latency_histogram.clone(),
                },
            ],
            io_size: [
                IoSize {
                    io_kind: READ_KIND.into(),
                    datum: size_histogram.clone(),
                },
                IoSize {
                    io_kind: WRITE_KIND.into(),
                    datum: size_histogram.clone(),
                },
            ],
        };
        let mut sample_buffer = Vec::with_capacity(max_queues.get());
        sample_buffer.resize_with(max_queues.get(), Default::default);

        Arc::new(Self { sample_buffer, stats: Mutex::new(stats) })
    }

    pub(crate) fn producer(self: &Arc<Self>) -> VirtualDiskProducer {
        VirtualDiskProducer(self.clone())
    }

    /// Construct a histogram for tracking I/O latencies.
    ///
    /// This builds a "log-linear" histogram, which has 10 bins for each power
    /// of 10, spaced between 1 microsecond and 10s inclusive.
    fn latency_histogram() -> Histogram<u64> {
        // Safety: This only fails if the bins are not valid.
        Histogram::span_decades(LATENCY_POWERS.0, LATENCY_POWERS.1).unwrap()
    }

    /// Construct a histogram for tracking I/O sizes.
    ///
    /// This creates a power-of-2 histogram for tracking I/O sizes between 512B
    /// and 1GiB.
    fn size_histogram() -> Histogram<u64> {
        let bins: Vec<_> =
            (SIZE_POWERS.0..=SIZE_POWERS.1).map(|p| 1u64 << p).collect();

        // Safety: This only fails if the bins are not valid.
        Histogram::new(&bins).unwrap()
    }

    fn consolidate_one(&self, idx: usize) {
        let mut stats = self.stats.lock().unwrap();
        let mut buf = self.sample_buffer.get(idx).unwrap().lock().unwrap();
        for sample in buf.drain(..) {
            stats.on_completion(sample);
        }
    }
    fn consolidate_all(&self) {
        let mut stats = self.stats.lock().unwrap();
        for buf in self.sample_buffer.iter() {
            let mut buf = buf.lock().unwrap();
            for sample in buf.drain(..) {
                stats.on_completion(sample);
            }
        }
    }
}

/// One completed I/O, buffered until it is consolidated into the disk's
/// totals.
#[derive(Debug)]
struct BlockSample {
    /// The operation that completed.
    op: block::Operation,
    /// The outcome of the operation.
    result: block::Result,
    /// The processing time reported for the operation.
    duration: Duration,
}

/// A [`Producer`] that emits statistics about a virtual disk.
///
/// This is a handle to the shared [`BlockMetrics`]; cloning it yields another
/// handle to the same metrics.
#[derive(Clone, Debug)]
pub struct VirtualDiskProducer(Arc<BlockMetrics>);

impl block::MetricConsumer for BlockMetrics {
    /// Buffer a sample for an I/O that completed on queue `queue_id`.
    ///
    /// Only the processing time is recorded; the time spent queued is
    /// ignored. When the queue's buffer grows past `MAX_BUFFERED_SAMPLES`,
    /// its samples are consolidated into the disk's totals.
    ///
    /// Panics if `queue_id` is beyond the number of queues this was created
    /// with.
    fn request_completed(
        &self,
        queue_id: block::QueueId,
        op: Operation,
        result: block::Result,
        _time_queued: Duration,
        time_processed: Duration,
    ) {
        let idx = usize::from(queue_id);

        // The buffer lock is released at the end of this scope, before any
        // consolidation, which takes the stats lock and then this lock again.
        let over_limit = {
            let mut samples = self
                .sample_buffer
                .get(idx)
                .expect("queue ID should be within maximum")
                .lock()
                .unwrap();
            samples.push(BlockSample { op, result, duration: time_processed });
            samples.len() > MAX_BUFFERED_SAMPLES
        };

        // Do not let an unbounded number of samples accumulate
        if over_limit {
            self.consolidate_one(idx);
        }
    }
}

impl Producer for VirtualDiskProducer {
    /// Emit one sample for every statistic tracked for the disk.
    ///
    /// All buffered completion samples are consolidated first, so the
    /// reported totals include every I/O recorded up to this call. Samples
    /// are emitted for the read, write and flush counters (including the
    /// per-reason failure counters), followed by the latency and size
    /// histograms.
    ///
    /// # Errors
    ///
    /// Returns an error if any sample cannot be constructed.
    fn produce(
        &mut self,
    ) -> Result<Box<dyn Iterator<Item = Sample>>, MetricsError> {
        // Consolidate any buffer samples first
        self.0.consolidate_all();

        // 5 scalar samples (reads, writes, flushes, bytes read / written)
        // 3 scalars broken out by failure kind
        // 1 latency histogram per I/O kind
        // 1 size histogram per I/O kind except flush, which has no size
        const N_SAMPLES: usize =
            5 + 3 * N_FAILURE_KINDS + N_IO_KINDS + (N_IO_KINDS - 1);
        let mut out = Vec::with_capacity(N_SAMPLES);
        let stats = self.0.stats.lock().unwrap();

        // Read statistics.
        out.push(Sample::new(&stats.disk, &stats.reads)?);
        out.push(Sample::new(&stats.disk, &stats.bytes_read)?);
        for failed in stats.failed_reads.iter() {
            out.push(Sample::new(&stats.disk, failed)?);
        }

        // Write statistics.
        out.push(Sample::new(&stats.disk, &stats.writes)?);
        out.push(Sample::new(&stats.disk, &stats.bytes_written)?);
        for failed in stats.failed_writes.iter() {
            out.push(Sample::new(&stats.disk, failed)?);
        }

        // Flushes
        out.push(Sample::new(&stats.disk, &stats.flushes)?);
        for failed in stats.failed_flushes.iter() {
            out.push(Sample::new(&stats.disk, failed)?);
        }

        // Histograms for latency and size.
        for hist in stats.io_latency.iter() {
            out.push(Sample::new(&stats.disk, hist)?);
        }
        for hist in stats.io_size.iter() {
            out.push(Sample::new(&stats.disk, hist)?);
        }

        // Release the stats lock before handing the samples back.
        drop(stats);
        Ok(Box::new(out.into_iter()))
    }
}

#[cfg(test)]
mod test {
    use super::{BlockMetrics, LATENCY_POWERS, SIZE_POWERS};

    /// The latency histogram has the expected number of bins.
    #[test]
    fn test_latency_histogram() {
        let hist = BlockMetrics::latency_histogram();
        println!("{:#?}", hist.iter().map(|bin| bin.range).collect::<Vec<_>>());

        // The math here is a bit silly, but we end up with 9 bins in each
        // "interior" power of 10, plus one more bin on the right and left for
        // the bins from [0, 1us) and [10s, inf)
        let (low, high) = LATENCY_POWERS;
        let n_decades = usize::from(high - low);
        let expected_bins = n_decades * 9 + 1 + 1;
        assert_eq!(hist.n_bins(), expected_bins);
    }

    /// The size histogram has the expected number of bins.
    #[test]
    fn test_size_histogram() {
        let hist = BlockMetrics::size_histogram();
        println!("{:#?}", hist.iter().map(|bin| bin.range).collect::<Vec<_>>());

        // 1 extra left bin for [0, 512), and 1 because the range is inclusive.
        let (low, high) = SIZE_POWERS;
        let expected_bins = usize::from(high - low) + 1 + 1;
        assert_eq!(hist.n_bins(), expected_bins);
    }
}


================================================
FILE: bin/propolis-server/src/lib/stats/virtual_machine.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Copyright 2024 Oxide Computer Company

//! Types for tracking statistics about virtual machine instances.

// Propolis is built in a variety of configurations, including checks and tests
// run on non-illumos machines where kstats are meaningless. This is a big
// hammer, but a large number of the values in this module are not referenced in
// those configurations, and so this is more straightforward than littering the
// code with cfg directives.
#![cfg_attr(any(test, not(target_os = "illumos")), allow(dead_code))]

use chrono::{DateTime, Utc};
use oximeter::{types::Cumulative, FieldType, FieldValue, Sample, Target};
use std::borrow::Cow;
use std::collections::BTreeMap;

use super::kstat_types::{
    hrtime_to_utc, ConvertNamedData, Data, Error, Kstat, NamedData,
};
#[cfg(all(not(test), target_os = "illumos"))]
use super::kstat_types::{KstatList, KstatTarget};

// NOTE: TOML definitions of timeseries are centralized in Omicron, so this file
// lives in that repo, at
// `./omicron/oximeter/oximeter/schema/virtual-machine.toml`.
oximeter::use_timeseries!("virtual-machine.toml");
pub use self::virtual_machine::Reset;
use self::virtual_machine::{
    VcpuUsage, VirtualMachine as VirtualMachineTarget,
};

/// A wrapper around the `oximeter::Target` representing a VM instance.
///
/// This is used to combine the "real" target,
/// `self::virtual_machine::VirtualMachine`, with some additional fields to help
/// collect data via kstats. It's not currently possible to attach fields like
/// this to the code generated by the `oximeter::use_timeseries!()` macro.
///
/// The `oximeter::Target` implementation for this type delegates to the inner
/// `target`, so only that target's fields are published.
#[derive(Clone, Debug)]
pub struct VirtualMachine {
    /// The `oximeter::Target` itself, storing the metric fields for the
    /// timeseries.
    pub target: VirtualMachineTarget,

    /// The number of vCPUs the instance was created with.
    ///
    /// This field is needed because the hypervisor currently creates kstats for
    /// each vCPU, regardless of whether they're activated. There is no way to
    /// tell from userland today which vCPU kstats are "real".
    ///
    /// This field is not published as part of the target field definitions.
    /// We include this value here, and implement `oximeter::Target` manually,
    /// so that this field is not published as a field on the timeseries.
    n_vcpus: u32,

    /// Used to find the right kstats for this VM instance.
    ///
    /// This field is also not published as part of the target, but used to
    /// find the right kstats.
    vm_name: String,
}

impl VirtualMachine {
    /// Build the metrics target for an instance with `n_vcpus` vCPUs.
    ///
    /// The published target fields are copied from `properties` and its
    /// metadata. The VM name used to locate kstats is taken from
    /// `properties.vm_name()`.
    pub fn new(
        n_vcpus: u8,
        properties: &propolis_api_types::instance::InstanceProperties,
    ) -> Self {
        let metadata = &properties.metadata;
        let target = VirtualMachineTarget {
            silo_id: metadata.silo_id,
            project_id: metadata.project_id,
            instance_id: properties.id,
            sled_id: metadata.sled_id,
            sled_model: metadata.sled_model.clone().into(),
            sled_revision: metadata.sled_revision,
            sled_serial: metadata.sled_serial.clone().into(),
        };

        Self {
            target,
            n_vcpus: n_vcpus.into(),
            vm_name: properties.vm_name(),
        }
    }
}

// NOTE: Delegate to the inner target type for this implementation.
//
// Implementing the trait by hand, rather than deriving it, keeps the extra
// `n_vcpus` and `vm_name` fields out of the published target fields.
impl Target for VirtualMachine {
    // Target name, as reported by the inner target.
    fn name(&self) -> &'static str {
        self.target.name()
    }

    // Field names, as reported by the inner target.
    fn field_names(&self) -> &'static [&'static str] {
        self.target.field_names()
    }

    // Field types, as reported by the inner target.
    fn field_types(&self) -> Vec<FieldType> {
        self.target.field_types()
    }

    // Field values, as reported by the inner target.
    fn field_values(&self) -> Vec<FieldValue> {
        self.target.field_values()
    }
}

// Translate a vCPU kstat microstate name into the state name we publish.
//
// The kstats that track time spent in each microstate use kernel-specific
// names. Rather than exposing those in oximeter samples, several of them are
// folded into a smaller set of public state names.
//
// Returns the public state for `ustate`, or `None` if the microstate is not
// one we report.
//
// See https://github.com/illumos/illumos-gate/blob/297b0dea3578abea9526441154d0dfa29697c891/usr/src/uts/intel/io/vmm/vmm_sol_dev.c#L2815
// for a definition of these states.
fn kstat_microstate_to_state_name(ustate: &str) -> Option<&'static str> {
    let state = match ustate {
        "time_run" => OXIMETER_RUN_STATE,
        "time_sched" => OXIMETER_WAITING_STATE,
        "time_init" | "time_idle" => OXIMETER_IDLE_STATE,
        "time_emu_kern" | "time_emu_user" => OXIMETER_EMULATION_STATE,
        _ => return None,
    };
    Some(state)
}

// The public, oximeter-level state names that microstates are mapped onto.
const OXIMETER_EMULATION_STATE: &str = "emulation";
const OXIMETER_RUN_STATE: &str = "run";
const OXIMETER_IDLE_STATE: &str = "idle";
const OXIMETER_WAITING_STATE: &str = "waiting";

// Every public state name, one entry per state.
const OXIMETER_STATES: [&str; 4] = [
    OXIMETER_EMULATION_STATE,
    OXIMETER_RUN_STATE,
    OXIMETER_IDLE_STATE,
    OXIMETER_WAITING_STATE,
];

/// The number of expected vCPU microstates we track.
///
/// This is the number of public state names in `OXIMETER_STATES`, not the
/// number of kernel microstates that map onto them.
///
/// This is used to preallocate data structures for holding samples, and to
/// limit the number of samples in the `KstatSampler`, if it is not pulled
/// quickly enough by `oximeter`.
pub const N_VCPU_MICROSTATES: u32 = OXIMETER_STATES.len() as _;

// The name of the kstat module containing virtual machine kstats.
const VMM_KSTAT_MODULE_NAME: &str = "vmm";

// The name of the kstat with virtual machine metadata (VM name currently).
const VM_KSTAT_NAME: &str = "vm";

// The named kstat holding the virtual machine's name. This is currently the
// UUID assigned by the control plane to the virtual machine instance.
const VM_NAME_KSTAT: &str = "vm_name";

// The prefix of the names of kstats containing vCPU usage data.
const VCPU_KSTAT_PREFIX: &str = "vcpu";

#[cfg(all(not(test), target_os = "illumos"))]
impl KstatTarget for VirtualMachine {
    // Layout of the VMM kstats:
    //
    // - module: vmm
    // - instance: a kernel-assigned integer
    // - name: vm -> generic VM info, vcpuX -> info for each vCPU
    //
    // The kernel-assigned kstat instance is not known here; all we hold is the
    // control plane UUID of the virtual machine instance. The hypervisor is
    // given that UUID as the VM's "name", see
    // https://github.com/oxidecomputer/propolis/blob/759bf4a19990404c135e608afbe0d38b70bfa370/bin/propolis-server/src/lib/vm/mod.rs#L420
    // for the current code which does that.
    //
    // Interest is expressed one kstat at a time, so this method accepts every
    // kstat in the `vmm` module (both `vm` and `vcpuX`, for any instance).
    // Narrowing down to the right instance happens in `to_samples()`.
    fn interested(&self, kstat: &Kstat<'_>) -> bool {
        kstat.ks_module == VMM_KSTAT_MODULE_NAME
    }

    fn to_samples(
        &self,
        kstats: KstatList<'_, '_>,
    ) -> Result<Vec<Sample>, Error> {
        // Step one: translate the control plane UUID into a kstat instance.
        // The `vmm:<instance>:vm:vm_name` kstat carries the UUID as the VM's
        // name, and the `VirtualMachine` target keeps the same value in its
        // `vm_name` field, so the first kstat whose name matches tells us the
        // instance number.
        //
        // Note that if this code is run from within a Propolis zone, there is
        // exactly one `vmm` kstat instance in any case.
        let Some(instance) = kstats.iter().find_map(|(_, kstat, data)| {
            kstat_instance_from_instance_id(kstat, data, &self.vm_name)
        }) else {
            return Err(Error::NoSuchKstat);
        };

        // Step two: select the kstats for this VM that describe one of its
        // vCPUs. Only vCPU usage metrics are produced for now, but other
        // metrics may be chained in the future.
        let vcpu_stats = kstats.iter().filter(|(_, kstat, _)| {
            // A kstat qualifies only if it belongs to our kstat instance, its
            // name is `vcpu` followed by a number, and that number is below
            // the count of vCPUs we know to be active.
            kstat.ks_instance == instance
                && matches!(
                    kstat
                        .ks_name
                        .strip_prefix(VCPU_KSTAT_PREFIX)
                        .and_then(|suffix| suffix.parse::<u32>().ok()),
                    Some(vcpu_id) if vcpu_id < self.n_vcpus
                )
        });
        produce_vcpu_usage(self, vcpu_stats)
    }
}

// Return the kstat instance number of `kstat` if it is the `vmm:<instance>:vm`
// kstat whose `vm_name` value equals `instance_id`.
//
// `None` is returned when the kstat is from another module, has another name,
// does not carry named data, or names a different VM.
fn kstat_instance_from_instance_id(
    kstat: &Kstat<'_>,
    data: &Data<'_>,
    instance_id: &str,
) -> Option<i32> {
    // Only the `vm` kstat of the `vmm` module carries the VM's name.
    let is_vm_kstat = kstat.ks_module == VMM_KSTAT_MODULE_NAME
        && kstat.ks_name == VM_KSTAT_NAME;
    if !is_vm_kstat {
        return None;
    }
    let Data::Named(entries) = data else {
        return None;
    };

    // The kstat is ours if any `vm_name` entry is a string equal to the
    // instance UUID we were given.
    let names_our_vm = entries.iter().any(|entry| {
        entry.name == VM_NAME_KSTAT
            && matches!(
                &entry.value,
                NamedData::String(name) if instance_id == *name
            )
    });
    names_our_vm.then_some(kstat.ks_instance)
}

// Produce `Sample`s for the `VcpuUsage` metric from the relevant kstats.
//
// Each item of `vcpu_stats` is a `(creation time, kstat, data)` tuple for one
// vCPU kstat. For every such kstat, one sample is produced per oximeter-level
// state that at least one of its microstates maps to; the sample's value is
// the sum of the occupancy times of all microstates mapping to that state.
//
// Errors are returned if a kstat does not hold named data, if its snapshot
// time cannot be converted, if it has no `vcpu` entry readable as a `u32`, if
// a microstate value cannot be read as a `u64`, or if building a sample fails.
fn produce_vcpu_usage<'a>(
    vm: &'a VirtualMachine,
    vcpu_stats: impl Iterator<Item = &'a (DateTime<Utc>, Kstat<'a>, Data<'a>)> + 'a,
) -> Result<Vec<Sample>, Error> {
    // At most one sample per (vCPU, oximeter-level state) pair is produced.
    let mut out =
        Vec::with_capacity(vm.n_vcpus as usize * N_VCPU_MICROSTATES as usize);
    for (creation_time, kstat, data) in vcpu_stats {
        let Data::Named(named) = data else {
            return Err(Error::ExpectedNamedKstat);
        };
        let snapshot_time = hrtime_to_utc(kstat.ks_snaptime)?;

        // Find the vCPU ID, from the `vmm:<instance>:vcpuX:vcpu` named kstat.
        let vcpu_id = named
            .iter()
            .find_map(|named| {
                if named.name == VCPU_KSTAT_PREFIX {
                    named.value.as_u32().ok()
                } else {
                    None
                }
            })
            .ok_or(Error::NoSuchKstat)?;

        // We track each vCPU microstate starting with `time_`, and map them
        // into our own definitions of the vCPU states. We need to aggregate all
        // the occupancy times from the microstates that map to the same public
        // state.
        let mut occupancy_by_state = BTreeMap::new();
        for nv in named.iter() {
            // Skip kstats that are not known microstate names.
            let Some(state) = kstat_microstate_to_state_name(nv.name) else {
                continue;
            };

            // Get the current summed state occupancy, or insert one with 0.
            //
            // The state name is a `&'static str`, so it can be borrowed
            // directly as the map key instead of allocating a new `String`
            // for every named value.
            let datum = occupancy_by_state
                .entry(Cow::Borrowed(state))
                .or_insert_with(|| {
                    Cumulative::with_start_time(*creation_time, 0)
                });
            *datum += nv.value.as_u64()?;
        }

        // Now convert the aggregated occupancy times into samples. The map is
        // ordered, so samples for one vCPU are emitted sorted by state name.
        for (state, datum) in occupancy_by_state.into_iter() {
            let metric = VcpuUsage { vcpu_id, state, datum };
            let sample =
                Sample::new_with_timestamp(snapshot_time, vm, &metric)?;
            out.push(sample);
        }
    }
    Ok(out)
}

// Unit tests for the kstat-to-oximeter translation helpers in this file.
#[cfg(test)]
mod test {
    use super::kstat_instance_from_instance_id;
    use super::kstat_microstate_to_state_name;
    use super::produce_vcpu_usage;
    use super::Utc;
    use super::VcpuUsage;
    use super::VirtualMachine;
    use super::VirtualMachineTarget;
    use super::VCPU_KSTAT_PREFIX;
    use super::VMM_KSTAT_MODULE_NAME;
    use super::VM_KSTAT_NAME;
    use super::VM_NAME_KSTAT;
    use crate::stats::kstat_types::Data;
    use crate::stats::kstat_types::Kstat;
    use crate::stats::kstat_types::Named;
    use crate::stats::kstat_types::NamedData;
    use crate::stats::virtual_machine::N_VCPU_MICROSTATES;
    use crate::stats::virtual_machine::OXIMETER_EMULATION_STATE;
    use crate::stats::virtual_machine::OXIMETER_IDLE_STATE;
    use crate::stats::virtual_machine::OXIMETER_RUN_STATE;
    use crate::stats::virtual_machine::OXIMETER_WAITING_STATE;
    use oximeter::types::Cumulative;
    use oximeter::Datum;
    use oximeter::FieldValue;
    use std::collections::BTreeMap;
    use uuid::Uuid;

    // Build a `VirtualMachine` target with fixed, arbitrary IDs and 4 vCPUs.
    // Its `vm_name` is the string form of the instance ID.
    fn test_virtual_machine() -> VirtualMachine {
        const INSTANCE_ID: Uuid =
            uuid::uuid!("96d6ec78-543a-4188-830e-37e2a0eeff16");
        const PROJECT_ID: Uuid =
            uuid::uuid!("7b61df02-0794-4b37-93bc-89f03c7289ca");
        const SILO_ID: Uuid =
            uuid::uuid!("6a4bd4b6-e9aa-44d1-b616-399d48baa173");
        const SLED_ID: Uuid =
            uuid::uuid!("aa144342-94d7-46d3-9eaa-51d84f7574b5");
        const SLED_REVISION: u32 = 1;
        VirtualMachine {
            target: VirtualMachineTarget {
                silo_id: SILO_ID,
                project_id: PROJECT_ID,
                instance_id: INSTANCE_ID,
                sled_id: SLED_ID,
                sled_model: "some-gimlet".into(),
                sled_revision: SLED_REVISION,
                sled_serial: "abcd".into(),
            },
            n_vcpus: 4,
            vm_name: INSTANCE_ID.to_string(),
        }
    }

    // Build a `VcpuUsage` metric for vCPU 0 in the "run" state with a value
    // of 100. Not referenced by the tests in this module.
    fn test_usage() -> VcpuUsage {
        VcpuUsage {
            state: "run".into(),
            vcpu_id: 0,
            datum: Cumulative::new(100),
        }
    }

    // A `vmm:<instance>:vm` kstat yields its instance number only when its
    // `vm_name` value equals the instance ID being looked up.
    #[test]
    fn test_kstat_instance_from_instance_id() {
        let ks = Kstat {
            ks_module: VMM_KSTAT_MODULE_NAME,
            ks_instance: 0,
            ks_name: VM_KSTAT_NAME,
            ks_snaptime: 1,
        };
        const INSTANCE_ID: &str = "db198b43-2dee-4b4b-8a68-24cb4c0d6ec8";
        let data = Data::Named(vec![Named {
            name: VM_NAME_KSTAT,
            value: NamedData::String(INSTANCE_ID),
        }]);

        assert_eq!(
            kstat_instance_from_instance_id(&ks, &data, INSTANCE_ID)
                .expect("Should have matched the instance ID"),
            ks.ks_instance,
        );

        // Same kstat, but naming a different VM: no match expected.
        let data = Data::Named(vec![Named {
            name: VM_NAME_KSTAT,
            value: NamedData::String("something-else"),
        }]);
        assert!(
            kstat_instance_from_instance_id(&ks, &data, INSTANCE_ID).is_none(),
            "Should not have matched an instance ID"
        );
    }

    // Build a `vcpu0` kstat and its named data: the vCPU ID entry plus three
    // microstate occupancy values.
    fn vcpu_state_kstats<'a>() -> (Kstat<'a>, Data<'a>) {
        let ks = Kstat {
            ks_module: VMM_KSTAT_MODULE_NAME,
            ks_instance: 0,
            ks_name: "vcpu0",
            ks_snaptime: 1,
        };
        let data = Data::Named(vec![
            Named { name: VCPU_KSTAT_PREFIX, value: NamedData::UInt32(0) },
            // There are three ustates, but the first two are aggregated.
            Named { name: "time_init", value: NamedData::UInt64(1) },
            Named { name: "time_idle", value: NamedData::UInt64(1) },
            Named { name: "time_run", value: NamedData::UInt64(2) },
        ]);
        (ks, data)
    }

    // Three microstates go in, two samples come out, because `time_init` and
    // `time_idle` are both reported under the "idle" state.
    #[test]
    fn test_produce_vcpu_usage() {
        let (ks, data) = vcpu_state_kstats();
        let kstats = [(Utc::now(), ks, data)];
        let samples =
            produce_vcpu_usage(&test_virtual_machine(), kstats.iter())
                .expect("Should have produced samples");
        assert_eq!(
            samples.len(),
            2,
            "Should have samples for 'run' and 'idle' states"
        );
        // Samples are emitted in state-name order ("idle" before "run"), since
        // `produce_vcpu_usage` aggregates them in a `BTreeMap` keyed by state.
        // Expected values: idle = 1 (init) + 1 (idle) = 2, run = 2.
        for ((sample, state), x) in samples
            .iter()
            .zip([OXIMETER_IDLE_STATE, OXIMETER_RUN_STATE])
            .zip([2, 2])
        {
            let st = sample
                .fields()
                .iter()
                .find_map(|f| {
                    if f.name == "state" {
                        let FieldValue::String(state) = &f.value else {
                            panic!("Expected a string field");
                        };
                        Some(state.clone())
                    } else {
                        None
                    }
                })
                .expect("expected a field with name \"state\"");
            assert_eq!(st, state, "Found an incorrect vCPU state");
            let Datum::CumulativeU64(inner) = sample.measurement.datum() else {
                panic!("Expected a cumulativeu64 datum");
            };
            assert_eq!(inner.value(), x);
        }
    }

    // Sanity check that the mapping from lower-level `kstat` vCPU microstates
    // to the higher-level states we report to `oximeter` do not change.
    #[test]
    fn test_consistent_kstat_to_oximeter_microstate_mapping() {
        // Build our expected mapping from kstat-to-oximeter states.
        //
        // For each oximeter state, we pretend to have observed the kstat-level
        // microstates that go into it some number of times. We then check that
        // the number of actual observed mapped states (for each kstat-level
        // one) matches our expectation.
        //
        // For example, the `time_emu_{kern,user}` states map to the
        // `"emulation"` state. If we observe 1 and 2 of those, respectively, we
        // should have a total of 3 observations of the `"emulation"` state.
        let mut expected_states = BTreeMap::new();
        expected_states.insert(
            OXIMETER_EMULATION_STATE,
            vec![("time_emu_kern", 1usize), ("time_emu_user", 2)],
        );
        expected_states.insert(
            OXIMETER_RUN_STATE,
            vec![("time_run", 4)], // Not equal to sum above
        );
        expected_states.insert(
            OXIMETER_IDLE_STATE,
            vec![("time_init", 5), ("time_idle", 6)],
        );
        expected_states.insert(OXIMETER_WAITING_STATE, vec![("time_sched", 7)]);
        // The per-state totals (3, 4, 11, 7) are all distinct, so a microstate
        // mapped to the wrong oximeter state changes at least one total.
        assert_eq!(
            expected_states.len() as u32,
            N_VCPU_MICROSTATES,
            "Expected set of oximeter states does not match const",
        );

        // "Observe" each kstat-level microstate a certain number of times, and
        // bump our counter of the oximeter state it maps to.
        let mut observed_states: BTreeMap<_, usize> = BTreeMap::new();
        for kstat_states in expected_states.values() {
            for (kstat_state, count) in kstat_states.iter() {
                let oximeter_state = kstat_microstate_to_state_name(
                    kstat_state,
                )
                .unwrap_or_else(|| {
                    panic!(
                        "kstat state '{kstat_state}' did not map to an \
                        oximeter state, which it should have done. Did that \
                        state get mapped to a new oximeter-level state?"
                    )
                });
                *observed_states.entry(oximeter_state).or_default() += count;
            }
        }

        // Check that we've observed all the states correctly.
        assert_eq!(
            observed_states.len(),
            expected_states.len(),
            "Some oximeter-level states were not accounted for. \
            Did the set of oximeter states reported change?",
        );
        for (oximeter_state, count) in observed_states.iter() {
            let kstat_states = expected_states.get(oximeter_state).expect(
                "An unexpected oximeter state was produced. \
                    Did the set of kstat or oximeter microstates \
                    change?",
            );
            let expected_total: usize =
                kstat_states.iter().map(|(_, ct)| ct).sum();
            assert_eq!(
                *count, expected_total,
                "Some oximeter states were not accounted for",
            );
        }
    }
}


================================================
FILE: bin/propolis-server/src/lib/vcpu_tasks.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Tasks for vCPU backing threads and controls for them.

use std::sync::{
    atomic::{AtomicUsize, Ordering},
    Arc,
};

use propolis::{
    bhyve_api,
    exits::{self, SuspendDetail, VmExitKind},
    vcpu::Vcpu,
    VmEntry,
};
use slog::{debug, error, info};
use thiserror::Error;

/// Errors that can occur while setting up vCPU backing threads.
#[derive(Debug, Error)]
pub enum VcpuTaskError {
    /// Spawning the OS thread that backs a vCPU failed; carries the
    /// underlying I/O error.
    #[error("Failed to spawn a vCPU backing thread: {0}")]
    BackingThreadSpawnFailed(std::io::Error),
    /// A list of CPU bindings was supplied, but its length (`bindings`) does
    /// not equal the number of vCPUs in the machine (`vcpus`).
    #[error("CPU bindings did not match vCPUs: {bindings} bindings for {vcpus} vCPUs")]
    CpuBindingMismatch { bindings: usize, vcpus: usize },
}

/// The set of backing threads for a machine's vCPUs, along with the handles
/// used to control them.
pub struct VcpuTasks {
    /// One entry per spawned vCPU thread: the task control handle used to
    /// hold, run, or exit the task, and the thread's join handle.
    tasks: Vec<(propolis::tasks::TaskCtrl, std::thread::JoinHandle<()>)>,
    /// A counter shared with every vCPU thread. It is incremented by
    /// `new_generation`; a vCPU thread that observes a changed value after
    /// being released from a hold discards its pending re-entry state.
    generation: Arc<AtomicUsize>,
}

/// Controls for a collection of vCPU tasks.
///
/// A mock implementation is generated for tests via `mockall`. The method
/// descriptions below reflect what the `VcpuTasks` implementation does.
#[cfg_attr(test, mockall::automock)]
pub(crate) trait VcpuTaskController: Send + Sync + 'static {
    /// Advances the vCPU generation number, which vCPU threads consult when
    /// they are released from a hold.
    fn new_generation(&self);
    /// Asks every vCPU task to hold.
    fn pause_all(&mut self);
    /// Asks every vCPU task to run.
    fn resume_all(&mut self);
    /// Asks every vCPU task to exit, then joins each backing thread.
    fn exit_all(&mut self);
}

impl VcpuTasks {
    /// Spawns one backing thread per vCPU in `machine` and returns the
    /// handles needed to control them.
    ///
    /// Each task is created with `TaskHdl::new_held`, and its thread is named
    /// `vcpu-<id>`.
    ///
    /// # Arguments
    ///
    /// - `machine`: The machine whose vCPUs should be given backing threads.
    /// - `event_handler`: Receives notifications about vCPU events (I/O
    ///   errors, suspends, unhandled exits) raised from the vCPU threads.
    /// - `bind_cpus`: If `Some`, the processor to bind each vCPU thread to,
    ///   in vCPU order. Must contain exactly one entry per vCPU.
    /// - `log`: The parent logger; each thread logs with an added `vcpu` key.
    ///
    /// # Errors
    ///
    /// - `VcpuTaskError::CpuBindingMismatch` if `bind_cpus` is supplied with a
    ///   length different from the number of vCPUs.
    /// - `VcpuTaskError::BackingThreadSpawnFailed` if a thread cannot be
    ///   spawned.
    ///
    /// # Panics
    ///
    /// A spawned vCPU thread panics if binding it to its requested CPU fails.
    pub(crate) fn new(
        machine: &propolis::Machine,
        event_handler: Arc<dyn super::vm::guest_event::VcpuEventHandler>,
        bind_cpus: Option<Vec<pbind::processorid_t>>,
        log: slog::Logger,
    ) -> Result<Self, VcpuTaskError> {
        let generation = Arc::new(AtomicUsize::new(0));

        // We take in an `Option<Vec<..>>` but a `Vec<Option<..>>` is more
        // convenient for spawning below, so convert between the two: either
        // every vCPU gets a binding, or none does.
        let bindings: Vec<Option<pbind::processorid_t>> = match bind_cpus {
            Some(bind_cpus) => {
                if bind_cpus.len() != machine.vcpus.len() {
                    return Err(VcpuTaskError::CpuBindingMismatch {
                        bindings: bind_cpus.len(),
                        vcpus: machine.vcpus.len(),
                    });
                }
                bind_cpus.into_iter().map(Some).collect()
            }
            None => vec![None; machine.vcpus.len()],
        };

        // NOTE(review): if a spawn fails partway through, the threads spawned
        // so far are not explicitly exited or joined here -- confirm that
        // dropping their `TaskCtrl` is sufficient.
        let mut tasks = Vec::new();
        for (vcpu, bind_cpu) in
            machine.vcpus.iter().map(Arc::clone).zip(bindings)
        {
            let (task, ctrl) =
                propolis::tasks::TaskHdl::new_held(Some(vcpu.barrier_fn()));
            let task_log = log.new(slog::o!("vcpu" => vcpu.id));
            let task_event_handler = event_handler.clone();
            let task_gen = generation.clone();
            let thread = std::thread::Builder::new()
                .name(format!("vcpu-{}", vcpu.id))
                .spawn(move || {
                    if let Some(bind_cpu) = bind_cpu {
                        pbind::bind_lwp(bind_cpu)
                            .expect("can bind to specified CPU");
                    }
                    Self::vcpu_loop(
                        vcpu.as_ref(),
                        task,
                        task_event_handler,
                        task_gen,
                        task_log,
                    )
                })
                .map_err(VcpuTaskError::BackingThreadSpawnFailed)?;
            tasks.push((ctrl, thread));
        }

        Ok(Self { tasks, generation })
    }

    /// The body of a vCPU backing thread.
    ///
    /// Repeatedly checks for pending task events, runs the vCPU, and
    /// processes the resulting exit to decide how to re-enter the guest. The
    /// loop ends when an `Event::Exit` is pending.
    ///
    /// # Arguments
    ///
    /// - `vcpu`: The vCPU this thread drives.
    /// - `task`: The task handle through which hold/exit requests arrive.
    /// - `event_handler`: Notified of I/O errors, suspend events, and exits
    ///   that nothing else handled.
    /// - `generation`: The shared generation counter; if it changed while the
    ///   task was held, any pending re-entry state is discarded.
    /// - `log`: The logger for this vCPU.
    fn vcpu_loop(
        vcpu: &Vcpu,
        task: propolis::tasks::TaskHdl,
        event_handler: Arc<dyn super::vm::guest_event::VcpuEventHandler>,
        generation: Arc<AtomicUsize>,
        log: slog::Logger,
    ) {
        info!(log, "Starting vCPU thread");
        let mut entry = VmEntry::Run;
        let mut exit = propolis::exits::VmExit::default();
        let mut local_gen = 0;
        loop {
            use propolis::tasks::Event;

            let mut force_exit_when_consistent = false;
            match task.pending_event() {
                Some(Event::Hold) => {
                    if !exit.kind.is_consistent() {
                        // Before the vCPU task can enter the held state, its
                        // associated in-kernel state must be driven to a point
                        // where it is consistent.
                        force_exit_when_consistent = true;
                    } else {
                        info!(log, "vCPU paused");
                        task.hold();
                        info!(log, "vCPU released from hold");

                        // If the VM was reset while the CPU was paused, clear out
                        // any re-entry reasons from the exit that occurred prior to
                        // the pause.
                        let current_gen = generation.load(Ordering::Acquire);
                        if local_gen != current_gen {
                            entry = VmEntry::Run;
                            local_gen = current_gen;
                        }

                        // This hold might have been satisfied by a request for the
                        // CPU to exit. Check for other pending events before
                        // re-entering the guest.
                        continue;
                    }
                }
                Some(Event::Exit) => break,
                None => {}
            }

            exit = match vcpu.run(&entry, force_exit_when_consistent) {
                Err(e) => {
                    // Report the error and retry with a plain run entry.
                    event_handler.io_error_event(vcpu.id, e);
                    entry = VmEntry::Run;
                    continue;
                }
                Ok(exit) => exit,
            };

            // `process_vmexit` yields the next entry when it handled the exit
            // itself; the fallback below covers exits it returned `None` for.
            entry = vcpu.process_vmexit(&exit).unwrap_or_else(|| {
                match exit.kind {
                    VmExitKind::Inout(pio) => {
                        debug!(&log, "Unhandled pio {:x?}", pio;
                                       "rip" => exit.rip);
                        VmEntry::InoutFulfill(exits::InoutRes::emulate_failed(
                            &pio,
                        ))
                    }
                    VmExitKind::Mmio(mmio) => {
                        debug!(&log, "Unhandled mmio {:x?}", mmio;
                                       "rip" => exit.rip);
                        VmEntry::MmioFulfill(exits::MmioRes::emulate_failed(
                            &mmio,
                        ))
                    }
                    VmExitKind::Rdmsr(msr) => {
                        debug!(&log, "Unhandled rdmsr {:08x}", msr;
                                       "rip" => exit.rip);
                        // Zero RAX and RDX for the unhandled read; failures
                        // to set the registers are ignored.
                        let _ = vcpu.set_reg(
                            bhyve_api::vm_reg_name::VM_REG_GUEST_RAX,
                            0,
                        );
                        let _ = vcpu.set_reg(
                            bhyve_api::vm_reg_name::VM_REG_GUEST_RDX,
                            0,
                        );
                        VmEntry::Run
                    }
                    VmExitKind::Wrmsr(msr, val) => {
                        debug!(&log, "Unhandled wrmsr {:08x} <- {:08x}", msr, val;
                                       "rip" => exit.rip);
                        VmEntry::Run
                    }
                    VmExitKind::Suspended(SuspendDetail { kind, when }) => {
                        use propolis::vcpu::Diagnostics;
                        match kind {
                            exits::Suspend::Halt => {
                                event_handler.suspend_halt_event(when);
                            }
                            exits::Suspend::Reset => {
                                event_handler.suspend_reset_event(when);
                            }
                            exits::Suspend::TripleFault(vcpuid) => {
                                slog::info!(
                                    &log,
                                    "triple fault on vcpu {}",
                                    vcpu.id;
                                    "state" => %Diagnostics::capture(vcpu)
                                );

                                // Only raise the event when the fault is
                                // attributed to this vCPU, or to -1.
                                if vcpuid == -1 || vcpuid == vcpu.id {
                                    event_handler
                                        .suspend_triple_fault_event(vcpu.id, when);
                                }
                            }
                        }

                        // This vCPU will not successfully re-enter the guest
                        // until the state worker does something about the
                        // suspend condition, so hold the task until it does so.
                        // Note that this blocks the task immediately.
                        //
                        // N.B.
                        // This usage assumes that it is safe for the VM
                        // controller to ask the task to hold again (which may
                        // occur if a separate pausing event is serviced in
                        // parallel on the state worker).
                        task.force_hold();
                        VmEntry::Run
                    }
                    VmExitKind::InstEmul(inst) => {
                        let diag = propolis::vcpu::Diagnostics::capture(vcpu);
                        error!(log,
                               "instruction emulation exit on vCPU {}",
                               vcpu.id;
                               "context" => ?inst,
                               "vcpu_state" => %diag);

                        event_handler.unhandled_vm_exit(vcpu.id, exit.kind);
                        VmEntry::Run
                    }
                    VmExitKind::Unknown(code) => {
                        error!(log,
                               "unrecognized exit code on vCPU {}",
                               vcpu.id;
                               "code" => code);

                        event_handler.unhandled_vm_exit(vcpu.id, exit.kind);
                        VmEntry::Run
                    }
                    // Bhyve emits the `Bogus` exit kind when there is no actual
                    // guest exit for user space to handle, but circumstances
                    // nevertheless dictate that the kernel VMM should exit to
                    // user space (e.g. a caller requested that all vCPUs be
                    // forced to exit to user space so their threads can
                    // rendezvous there).
                    //
                    // `process_vmexit` should always successfully handle this
                    // exit, since it never entails any work that could fail to
                    // be completed.
                    VmExitKind::Bogus => {
                        unreachable!(
                            "propolis-lib always handles VmExitKind::Bogus"
                        );
                    }
                    VmExitKind::Debug => {
                        error!(log,
                               "lib returned debug exit from vCPU {}",
                               vcpu.id);

                        event_handler.unhandled_vm_exit(vcpu.id, exit.kind);
                        VmEntry::Run
                    }
                    VmExitKind::VmxError(detail) => {
                        error!(log,
                               "unclassified VMX exit on vCPU {}",
                               vcpu.id;
                               "detail" => ?detail);

                        event_handler.unhandled_vm_exit(vcpu.id, exit.kind);
                        VmEntry::Run
                    }
                    VmExitKind::SvmError(detail) => {
                        error!(log,
                               "unclassified SVM exit on vCPU {}",
                               vcpu.id;
                               "detail" => ?detail);

                        event_handler.unhandled_vm_exit(vcpu.id, exit.kind);
                        VmEntry::Run
                    }
                    VmExitKind::Paging(gpa, fault_type) => {
                        let diag = propolis::vcpu::Diagnostics::capture(vcpu);
                        error!(log,
                               "unhandled paging exit on vCPU {}",
                               vcpu.id;
                               "gpa" => gpa,
                               "fault_type" => fault_type,
                               "vcpu_state" => %diag);

                        event_handler.unhandled_vm_exit(vcpu.id, exit.kind);
                        VmEntry::Run
                    }
                }
            });
        }
        info!(log, "Exiting vCPU thread for CPU {}", vcpu.id);
    }
}

impl VcpuTaskController for VcpuTasks {
    /// Bumps the shared generation counter by one.
    fn new_generation(&self) {
        self.generation.fetch_add(1, Ordering::Release);
    }

    /// Requests a hold on every vCPU task, panicking if any request fails.
    fn pause_all(&mut self) {
        for (ctrl, _thread) in self.tasks.iter_mut() {
            ctrl.hold().unwrap();
        }
    }

    /// Requests that every vCPU task run, panicking if any request fails.
    fn resume_all(&mut self) {
        for (ctrl, _thread) in self.tasks.iter_mut() {
            ctrl.run().unwrap();
        }
    }

    /// Requests that every vCPU task exit, then waits for all of the backing
    /// threads to finish. Panics if a thread cannot be joined. The task list
    /// is empty afterwards.
    fn exit_all(&mut self) {
        // Signal every task first so that they can all wind down in parallel.
        for (ctrl, _thread) in self.tasks.iter_mut() {
            ctrl.exit();
        }

        // Each control handle is kept alive (bound, not discarded) until its
        // thread has been joined.
        for (_ctrl, thread) in self.tasks.drain(..) {
            thread.join().unwrap();
        }
    }
}


================================================
FILE: bin/propolis-server/src/lib/vm/active.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Implements a wrapper around an active VM.

use std::sync::Arc;

use propolis_api_types::instance::{
    InstanceProperties, InstanceStateRequested,
};
use propolis_api_types::instance_spec::SpecKey;
use slog::info;
use uuid::Uuid;

use crate::vm::request_queue::ExternalRequest;

use super::{
    objects::VmObjects, services::VmServices, CrucibleReplaceResultTx,
    InstanceStateRx, VmError,
};

/// The components and services that make up an active Propolis VM.
pub(crate) struct ActiveVm {
    /// The VM's logger.
    pub(super) log: slog::Logger,

    /// The input queue that receives external requests to change the VM's
    /// state. State change, migration-out, and Crucible volume
    /// reconfiguration requests are all pushed onto this queue.
    pub(super) state_driver_queue: Arc<super::state_driver::InputQueue>,

    /// Receives external state updates from the state driver.
    pub(super) external_state_rx: InstanceStateRx,

    /// The wrapped VM's properties.
    pub(super) properties: InstanceProperties,

    /// A reference to the wrapped VM's components. Callers with a reference to
    /// an `ActiveVm` can clone this to get a handle to those components (see
    /// `ActiveVm::objects`).
    pub(super) objects: Arc<VmObjects>,

    /// Services that interact with VM users or the control plane outside the
    /// Propolis API (e.g. the serial console, VNC, and metrics reporting).
    /// Exposed through `ActiveVm::services`.
    pub(super) services: VmServices,

    /// The runtime on which this VM's state driver and any tasks spawned by
    /// the VM's components will run.
    pub(super) tokio_rt: tokio::runtime::Runtime,
}

impl ActiveVm {
    /// Yields a clonable reference to the active VM's components.
    pub(crate) fn objects(&self) -> &Arc<VmObjects> {
        &self.objects
    }

    /// Pushes a state change request to the VM's state change queue.
    ///
    /// The requested state is logged, translated into the corresponding
    /// `ExternalRequest`, and queued.
    ///
    /// # Errors
    ///
    /// Returns the queue's error, converted into a `VmError`, if the request
    /// cannot be queued.
    pub(crate) fn put_state(
        &self,
        requested: InstanceStateRequested,
    ) -> Result<(), VmError> {
        info!(self.log, "requested state via API";
              "state" => ?requested);

        let request = match requested {
            InstanceStateRequested::Run => ExternalRequest::start(),
            InstanceStateRequested::Stop => ExternalRequest::stop(),
            InstanceStateRequested::Reboot => ExternalRequest::reboot(),
        };
        self.state_driver_queue
            .queue_external_request(request)
            .map_err(Into::into)
    }

    /// Pushes a request to migrate out of a VM to the VM's state change queue.
    /// The migration protocol will communicate with the destination over the
    /// provided websocket.
    ///
    /// # Arguments
    ///
    /// - `migration_id`: The ID of the migration being requested.
    /// - `websock`: The websocket connection handed to the migration request.
    ///
    /// # Errors
    ///
    /// Returns the queue's error, converted into a `VmError`, if the request
    /// cannot be queued.
    pub(crate) async fn request_migration_out(
        &self,
        migration_id: Uuid,
        websock: dropshot::WebsocketConnection,
    ) -> Result<(), VmError> {
        self.state_driver_queue
            .queue_external_request(ExternalRequest::migrate_as_source(
                migration_id,
                websock,
            ))
            .map_err(Into::into)
    }

    /// Pushes a request to reconfigure a Crucible volume to the VM's state
    /// change queue.
    ///
    /// # Arguments
    ///
    /// - `backend_id`: The `SpecKey` identifying the Crucible backend whose
    ///   volume should be reconfigured.
    /// - `new_vcr_json`: The new volume construction request to supply to the
    ///   selected backend.
    /// - `result_tx`: The channel to which the state driver should send the
    ///   replacement result after it completes this operation.
    ///
    /// # Errors
    ///
    /// Returns the queue's error, converted into a `VmError`, if the request
    /// cannot be queued.
    pub(crate) fn reconfigure_crucible_volume(
        &self,
        backend_id: SpecKey,
        new_vcr_json: String,
        result_tx: CrucibleReplaceResultTx,
    ) -> Result<(), VmError> {
        self.state_driver_queue
            .queue_external_request(
                ExternalRequest::reconfigure_crucible_volume(
                    backend_id,
                    new_vcr_json,
                    result_tx,
                ),
            )
            .map_err(Into::into)
    }

    /// Yields a reference to this VM's services.
    pub(crate) fn services(&self) -> &VmServices {
        &self.services
    }
}


================================================
FILE: bin/propolis-server/src/lib/vm/ensure.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Tools for handling instance ensure requests.
//!
//! This module handles the first high-level phase of a VM's lifecycle, which
//! creates all of the VM's components and attendant data structures. These are
//! handed off to a `StateDriver` that implements the main VM event loop. See
//! the [`state_driver`] module docs for more details.
//!
//! This module uses distinct structs that each represent a distinct phase of VM
//! initialization. When a server receives a new ensure request, it creates the
//! first of these structures, then hands it off to the procedure described in
//! the ensure request to drive the rest of the initialization process, as in
//! the diagram below:
//!
//! ```text
//!                 +-------------------------+
//!                 |                         |
//!                 |  Initial state (no VM)  |
//!                 |                         |
//!                 +-----------+-------------+
//!                             |
//!                    Receive ensure request
//!                             |
//!                             v
//!                     VmEnsureNotStarted
//!                             |
//!                             |
//!                   +---------v----------+
//!          Yes      |                    |        No
//!           +-------+  Live migration?   +---------+
//!           |       |                    |         |
//!           |       +--------------------+         |
//!           |                                      |
//!     +-----v------+                               |
//!     |Get params  |                               |
//!     |from source |                               |
//!     +-----+------+                               |
//!           |                                      |
//!     +-----v------+                     +---------v-----------+
//!     |Initialize  |                     |Initialize components|
//!     |components  |                     |    from params      |
//!     +-----+------+                     +---------+-----------+
//!           |                                      |
//!           v                                      v
//! VmEnsureObjectsCreated                 VmEnsureObjectsCreated
//!           |                                      |
//!           |                                      |
//!     +-----v------+                               |
//!     |Import state|                               |
//!     |from source |                               |
//!     +-----+------+                               |
//!           |                                      |
//!           |                                      |
//!           |        +------------------+          |
//!           +-------->Launch VM services<----------+
//!                    +--------+---------+
//!                             |
//!                             |
//!                    +--------v---------+
//!                    |Move VM to Active |
//!                    +--------+---------+
//!                             |
//!                             |
//!                             v
//!                      VmEnsureActive<'_>
//! ```
//!
//! When initializing a VM from scratch, the ensure request contains a spec that
//! determines what components the VM should create, and they are created into
//! their default initial states. When migrating in, the VM-ensure structs are
//! handed off to the migration protocol, which fetches a spec from the
//! migration source, uses its contents to create the VM's components, and
//! imports the source VM's device state into those components.
//!
//! Once all components exist and are initialized, this module sets up "VM
//! services" (e.g. the serial console and metrics) that connect this VM to
//! other Oxide APIs and services. It then updates the server's VM state machine
//! and yields a completed "active" VM that can be passed into a state driver
//! run loop.
//!
//! Separating the initialization steps in this manner hides the gory details of
//! initializing a VM (and unwinding initialization) from higher-level
//! procedures like the migration protocol. Each initialization phase has a
//! failure handler that allows a higher-level driver to unwind the entire
//! ensure operation and drive the VM state machine to the correct resting
//! state.
//!
//! [`state_driver`]: crate::vm::state_driver

use std::sync::Arc;

use oximeter::types::ProducerRegistry;
use oximeter_instruments::kstat::KstatSampler;
use propolis::enlightenment::{
    bhyve::BhyveGuestInterface,
    hyperv::{Features as HyperVFeatures, HyperV},
    Enlightenment,
};
use propolis_api_types::instance::{
    InstanceEnsureResponse, InstanceProperties, InstanceState,
};
use propolis_api_types::instance_spec::components::board::{
    GuestHypervisorInterface, HyperVFeatureFlag,
};
use propolis_api_types::migration::InstanceMigrateInitiateResponse;
use slog::{debug, info};

use crate::{
    initializer::{
        build_instance, MachineInitializer, MachineInitializerState,
    },
    migrate::destination::MigrationTargetInfo,
    spec::Spec,
    stats::{create_kstat_sampler, VirtualMachine},
    vm::{
        request_queue::InstanceAutoStart, VMM_BASE_RT_THREADS,
        VMM_MIN_RT_THREADS,
    },
};

use super::{
    objects::{InputVmObjects, VmObjects},
    services::VmServices,
    state_driver::InputQueue,
    state_publisher::{ExternalStateUpdate, StatePublisher},
    EnsureOptions, InstanceEnsureResponseTx, VmError,
};

/// Describes how the components of a newly-ensured VM should be initialized.
pub(crate) enum VmInitializationMethod {
    /// Create the VM's components from the instance spec carried in the
    /// ensure request.
    Spec(Box<Spec>),

    /// Initialize the VM via live migration, using the supplied information
    /// about the migration.
    Migration(MigrationTargetInfo),
}

/// A request to ensure a VM: the properties of the instance to create plus
/// the method by which its components should be initialized.
pub(crate) struct VmEnsureRequest {
    /// The properties of the instance being ensured.
    pub(crate) properties: InstanceProperties,

    /// How the VM's components should be initialized (from an explicit spec
    /// or via migration).
    pub(crate) init: VmInitializationMethod,
}

impl VmEnsureRequest {
    /// Returns `true` if this request asks to initialize the VM via
    /// migration.
    pub(crate) fn is_migration(&self) -> bool {
        self.migration_info().is_some()
    }

    /// Returns the migration target information carried by this request, or
    /// `None` if the request carries an explicit spec instead.
    pub(crate) fn migration_info(&self) -> Option<&MigrationTargetInfo> {
        match self.init {
            VmInitializationMethod::Migration(ref info) => Some(info),
            VmInitializationMethod::Spec(_) => None,
        }
    }

    /// Returns the instance spec carried by this request, or `None` if the
    /// request asks to initialize via migration.
    pub(crate) fn spec(&self) -> Option<&Spec> {
        match self.init {
            VmInitializationMethod::Migration(_) => None,
            VmInitializationMethod::Spec(ref spec) => Some(spec),
        }
    }
}

/// Holds state about an instance ensure request that has not yet produced any
/// VM objects or driven the VM state machine to the `ActiveVm` state.
pub(crate) struct VmEnsureNotStarted<'a> {
    /// Logger used while servicing this request.
    log: &'a slog::Logger,

    /// The VM state machine that this ensure request is driving.
    vm: &'a Arc<super::Vm>,

    /// The request being serviced.
    ensure_request: &'a VmEnsureRequest,

    // VM objects are created on a separate tokio task from the one that drives
    // the instance ensure state machine. This task needs its own copy of the
    // ensure options. `EnsureOptions` is not `Clone`, so take a reference to an
    // `Arc` wrapper around the options to have something that can be cloned and
    // passed to the ensure task.
    ensure_options: &'a Arc<EnsureOptions>,

    /// Channel over which the outcome of the ensure request is reported back
    /// to the requester.
    ensure_response_tx: InstanceEnsureResponseTx,

    /// Publisher used to announce externally-visible state changes.
    state_publisher: &'a mut StatePublisher,
}

impl<'a> VmEnsureNotStarted<'a> {
    /// Bundles up the state needed to begin servicing an ensure request.
    pub(super) fn new(
        log: &'a slog::Logger,
        vm: &'a Arc<super::Vm>,
        ensure_request: &'a VmEnsureRequest,
        ensure_options: &'a Arc<EnsureOptions>,
        ensure_response_tx: InstanceEnsureResponseTx,
        state_publisher: &'a mut StatePublisher,
    ) -> Self {
        Self {
            log,
            vm,
            ensure_request,
            ensure_options,
            ensure_response_tx,
            state_publisher,
        }
    }

    /// Yields the state publisher associated with this ensure request.
    pub(crate) fn state_publisher(&mut self) -> &mut StatePublisher {
        self.state_publisher
    }

    /// Returns the migration target information from the ensure request, if
    /// the request asked to initialize via migration.
    pub(crate) fn migration_info(&self) -> Option<&MigrationTargetInfo> {
        self.ensure_request.migration_info()
    }

    /// Creates a set of VM objects using the instance spec stored in this
    /// ensure request, but does not install them as an active VM.
    ///
    /// # Panics
    ///
    /// Panics if the ensure request does not carry an explicit spec (i.e. it
    /// is a migration request).
    pub(crate) async fn create_objects_from_request(
        self,
    ) -> anyhow::Result<VmEnsureObjectsCreated<'a>> {
        let spec = self
            .ensure_request
            .spec()
            .expect(
                "create_objects_from_request is called with an explicit spec",
            )
            .clone();

        self.create_objects(spec).await
    }

    /// Creates a set of VM objects using the caller-supplied `spec`, but does
    /// not install them as an active VM.
    pub(crate) async fn create_objects_from_spec(
        self,
        spec: Spec,
    ) -> anyhow::Result<VmEnsureObjectsCreated<'a>> {
        self.create_objects(spec).await
    }

    /// Creates a set of VM objects using the supplied instance spec, but does
    /// not install them as an active VM.
    ///
    /// # Errors
    ///
    /// Every failure (including a failure to join the blocking initialization
    /// task) is routed through [`Self::fail`], so the `Failed` state is
    /// published, the VM state machine is told that initialization failed,
    /// and the requester is notified before the error is returned.
    async fn create_objects(
        self,
        spec: Spec,
    ) -> anyhow::Result<VmEnsureObjectsCreated<'a>> {
        debug!(self.log, "creating VM objects");

        let input_queue = Arc::new(InputQueue::new(
            self.log.new(slog::o!("component" => "request_queue")),
            if self.ensure_request.is_migration() {
                InstanceAutoStart::Yes
            } else {
                InstanceAutoStart::No
            },
        ));

        let log_for_init = self.log.clone();
        let properties = self.ensure_request.properties.clone();
        let options = self.ensure_options.clone();
        let queue_for_init = input_queue.clone();

        // Either the block following this succeeds with both a Tokio runtime
        // and VM objects, or entirely fails with no partial state for us to
        // clean up.
        type InitResult =
            anyhow::Result<(tokio::runtime::Runtime, InputVmObjects)>;

        // We need to create a new runtime to host the tasks for this VMM's
        // objects, but that initialization is fallible and results in dropping
        // the fledgling VMM runtime itself. Dropping a Tokio runtime on a
        // worker thread in a Tokio runtime will panic, so do all init in a
        // `spawn_blocking` where this won't be an issue.
        //
        // When the runtime is returned to this thread, it must not be dropped.
        // That means that the path between this result and returning an
        // `Ok(VmEnsureObjectsCreated)` must be infallible.
        let join_result = tokio::task::spawn_blocking(move || -> InitResult {
            // Create the runtime that will host tasks created by
            // VMM components (e.g. block device runtime tasks).
            let vmm_rt = {
                let mut builder = tokio::runtime::Builder::new_multi_thread();
                builder.thread_name("tokio-rt-vmm").worker_threads(usize::max(
                    VMM_MIN_RT_THREADS,
                    VMM_BASE_RT_THREADS + spec.board.cpus as usize,
                ));
                oxide_tokio_rt::build(&mut builder)?
            };

            // If initialization fails, `?` drops `vmm_rt` here, on the
            // blocking thread, where dropping a runtime is permitted.
            let init_result = vmm_rt
                .block_on(async move {
                    initialize_vm_objects(
                        log_for_init,
                        spec,
                        properties,
                        options,
                        queue_for_init,
                    )
                    .await
                })
                .map_err(|e| {
                    anyhow::anyhow!("failed to initialize VM objects: {e}")
                })?;
            Ok((vmm_rt, init_result))
        })
        .await;

        // A join failure is extremely unexpected: it means the init task
        // panicked or was cancelled. If the init itself failed, which is
        // somewhat more reasonable, the join succeeds and carries the error.
        // Either way, fold the outcome into a single result so that every
        // failure goes through `fail` below; returning early here would leave
        // the VM state machine waiting for initialization and the requester
        // without a response.
        let result: InitResult = join_result.unwrap_or_else(|e| {
            Err(anyhow::anyhow!("failed to join VMM runtime init task: {e}"))
        });

        match result {
            Ok((vmm_rt, objects)) => {
                // N.B. Once these `VmObjects` exist, it is no longer safe to
                //      call `vm_init_failed`.
                let objects = Arc::new(VmObjects::new(
                    self.log.clone(),
                    self.vm.clone(),
                    objects,
                ));

                Ok(VmEnsureObjectsCreated {
                    log: self.log,
                    vm: self.vm,
                    vmm_rt,
                    ensure_request: self.ensure_request,
                    ensure_options: self.ensure_options,
                    ensure_response_tx: self.ensure_response_tx,
                    state_publisher: self.state_publisher,
                    vm_objects: objects,
                    input_queue,
                    kernel_vm_paused: false,
                })
            }
            Err(e) => Err(self.fail(e).await),
        }
    }

    /// Unwinds an ensure request that failed before any VM objects were
    /// created.
    ///
    /// Publishes the `Failed` instance state, tells the VM state machine that
    /// initialization failed, and sends an `InitializationFailed` error
    /// (carrying `reason`'s message) to the requester. A closed response
    /// channel is ignored. Returns `reason` so the caller can propagate it.
    pub(crate) async fn fail(self, reason: anyhow::Error) -> anyhow::Error {
        self.state_publisher
            .update(ExternalStateUpdate::Instance(InstanceState::Failed));

        self.vm.vm_init_failed().await;
        let _ = self
            .ensure_response_tx
            .send(Err(VmError::InitializationFailed(reason.to_string())));

        reason
    }
}

/// Represents an instance ensure request that has proceeded far enough to
/// create a set of VM objects, but that has not yet installed those objects as
/// an `ActiveVm` or notified the requestor that its request is complete.
///
/// WARNING: dropping `VmEnsureObjectsCreated` is a panic risk since dropping
/// the contained `tokio::runtime::Runtime` in a worker thread will panic. It
/// is probably a bug to drop `VmEnsureObjectsCreated`, as it is expected users
/// will quickly call [`VmEnsureObjectsCreated::ensure_active`], but if you
/// must, take care in handling the contained `vmm_rt`.
pub(crate) struct VmEnsureObjectsCreated<'a> {
    /// Logger used while servicing this request.
    log: &'a slog::Logger,

    /// The VM state machine that this ensure request is driving.
    vm: &'a Arc<super::Vm>,

    /// The runtime created to host tasks spawned by this VM's components.
    vmm_rt: tokio::runtime::Runtime,

    /// The request being serviced.
    ensure_request: &'a VmEnsureRequest,

    /// Options supplied alongside the ensure request.
    ensure_options: &'a EnsureOptions,

    /// Channel over which the outcome of the ensure request is reported back
    /// to the requester.
    ensure_response_tx: InstanceEnsureResponseTx,

    /// Publisher used to announce externally-visible state changes.
    state_publisher: &'a mut StatePublisher,

    /// The VM objects created for this request.
    vm_objects: Arc<VmObjects>,

    /// The input queue created for this VM; it is handed on to the active VM.
    input_queue: Arc<InputQueue>,

    /// Set once `prepare_for_migration` has paused the kernel VM.
    kernel_vm_paused: bool,
}

impl<'a> VmEnsureObjectsCreated<'a> {
    /// Readies these objects to receive state from a live migration source:
    /// the vCPUs are reset and then the kernel VM is paused, all under the
    /// exclusive VM objects lock. Callers must do this before importing any
    /// state into the objects.
    ///
    /// # Panics
    ///
    /// Panics if called more than once on the same set of objects.
    pub(crate) async fn prepare_for_migration(&mut self) {
        assert!(!self.kernel_vm_paused);

        let objects = self.vm_objects.lock_exclusive().await;
        objects.reset_vcpus();
        objects.pause_kernel_vm();

        // Remember the pause so that a later failure path can undo it.
        self.kernel_vm_paused = true;
    }

    /// Builds the VM's services from these objects, installs an active VM
    /// into the parent VM state machine, and then tells the ensure requester
    /// that its request is complete.
    pub(crate) async fn ensure_active(self) -> VmEnsureActive<'a> {
        let services = VmServices::new(
            self.log,
            &self.vm_objects,
            &self.ensure_request.properties,
            self.ensure_options,
        )
        .await;

        // Grab a handle before the runtime itself is given to the active VM.
        let vmm_rt_hdl = self.vmm_rt.handle().clone();
        self.vm
            .make_active(
                self.log,
                self.input_queue.clone(),
                &self.vm_objects,
                services,
                self.vmm_rt,
            )
            .await;

        let migrate = self.ensure_request.migration_info().map(|info| {
            InstanceMigrateInitiateResponse { migration_id: info.migration_id }
        });

        // A send failure means the client who asked to ensure the VM timed
        // out or disconnected, and is deliberately ignored: now that the VM
        // is active, a new client can recover by reading the current instance
        // state and using the state change API to send commands to the state
        // driver.
        let _ = self
            .ensure_response_tx
            .send(Ok(InstanceEnsureResponse { migrate }));

        VmEnsureActive {
            vm: self.vm,
            vmm_rt_hdl,
            state_publisher: self.state_publisher,
            vm_objects: self.vm_objects,
            input_queue: self.input_queue,
            kernel_vm_paused: self.kernel_vm_paused,
        }
    }
}

/// Describes a set of VM objects that are fully initialized and referred to by
/// the `ActiveVm` in a VM state machine, but for which a state driver loop has
/// not started yet.
pub(crate) struct VmEnsureActive<'a> {
    /// The VM state machine that this ensure request is driving.
    vm: &'a Arc<super::Vm>,

    /// A handle to the runtime that hosts this VM's component tasks.
    vmm_rt_hdl: tokio::runtime::Handle,

    /// Publisher used to announce externally-visible state changes.
    state_publisher: &'a mut StatePublisher,

    /// The VM objects that were installed into the active VM.
    vm_objects: Arc<VmObjects>,

    /// The input queue that was installed into the active VM.
    input_queue: Arc<InputQueue>,

    /// Whether the kernel VM was paused in the previous phase and must be
    /// resumed if this ensure operation fails.
    kernel_vm_paused: bool,
}

/// The pieces of a `VmEnsureActive` that are handed over when the ensure
/// process finishes (see `VmEnsureActive::into_inner`).
pub(super) struct VmEnsureActiveOutput {
    /// The VM objects for the now-active VM.
    pub vm_objects: Arc<VmObjects>,

    /// The input queue for the now-active VM.
    pub input_queue: Arc<InputQueue>,

    /// A handle to the runtime that hosts this VM's component tasks.
    pub vmm_rt_hdl: tokio::runtime::Handle,
}

impl VmEnsureActive<'_> {
    /// Yields a reference to the VM objects for this VM.
    pub(crate) fn vm_objects(&self) -> &Arc<VmObjects> {
        &self.vm_objects
    }

    /// Yields the state publisher associated with this ensure request.
    pub(crate) fn state_publisher(&mut self) -> &mut StatePublisher {
        self.state_publisher
    }

    /// Unwinds an ensure operation that failed after the VM became active.
    ///
    /// Resumes the kernel VM if an earlier phase paused it, publishes the
    /// `Failed` instance state, and moves the VM state machine to rundown.
    pub(crate) async fn fail(mut self) {
        // A caller may have prepared the VM objects for migration in the
        // previous phase. If so, undo that before the objects are torn down.
        if self.kernel_vm_paused {
            self.vm_objects.lock_exclusive().await.resume_kernel_vm();
            self.kernel_vm_paused = false;
        }

        let failed = ExternalStateUpdate::Instance(InstanceState::Failed);
        self.state_publisher.update(failed);

        // VM objects still exist, so the next state is Rundown. The VM moves
        // on to RundownComplete once the objects are finally dropped.
        self.vm.set_rundown().await;
    }

    /// Consumes this struct and yields the VM objects, input queue, and
    /// runtime handle so that they can be used to start a state driver loop.
    pub(super) fn into_inner(self) -> VmEnsureActiveOutput {
        let Self { vm_objects, input_queue, vmm_rt_hdl, .. } = self;
        VmEnsureActiveOutput { vm_objects, input_queue, vmm_rt_hdl }
    }
}

/// Creates every component of a new VM as described by `spec` and returns
/// them bundled as `InputVmObjects`.
///
/// This sets up the guest hypervisor interface, the machine "shell", the
/// chipset and its attached devices, network and storage devices, and the
/// vCPUs and their tasks. VMs with more vCPUs than half of the online host
/// CPUs get their vCPUs bound to specific host CPUs. Finally, the heap is
/// temporarily inflated (see the comment on the balloon below).
///
/// # Arguments
///
/// - `log`: Logger for initialization; child loggers are derived from it.
/// - `spec`: The instance spec describing the components to create. It is
///   stored in the returned objects.
/// - `properties`: The properties of the instance being created.
/// - `options`: Server-supplied options for the ensure operation.
/// - `event_queue`: The queue registered as the chipset and vCPU event
///   handler.
///
/// # Errors
///
/// Returns an error if any initialization step fails, or if the spec asks for
/// more vCPUs than there are online host CPUs.
async fn initialize_vm_objects(
    log: slog::Logger,
    spec: Spec,
    properties: InstanceProperties,
    options: Arc<EnsureOptions>,
    event_queue: Arc<InputQueue>,
) -> anyhow::Result<InputVmObjects> {
    info!(log, "initializing new VM";
              "spec" => #?spec,
              "properties" => #?properties,
              "use_reservoir" => options.use_reservoir,
              "bootrom" => %options.bootrom_path.display());

    let vmm_log = log.new(slog::o!("component" => "vmm"));

    let (guest_hv_interface, guest_hv_lifecycle) =
        match &spec.board.guest_hv_interface {
            GuestHypervisorInterface::Bhyve => {
                let bhyve = Arc::new(BhyveGuestInterface);
                let lifecycle = bhyve.clone();
                (bhyve as Arc<dyn Enlightenment>, lifecycle.as_lifecycle())
            }
            GuestHypervisorInterface::HyperV { features } => {
                let mut hv_features = HyperVFeatures::default();
                for f in features {
                    match f {
                        HyperVFeatureFlag::ReferenceTsc => {
                            hv_features.reference_tsc = true
                        }
                    }
                }

                let hyperv = Arc::new(HyperV::new(&vmm_log, hv_features));
                let lifecycle = hyperv.clone();
                (hyperv as Arc<dyn Enlightenment>, lifecycle.as_lifecycle())
            }
        };

    // Set up the 'shell' instance into which the rest of this routine will
    // add components.
    let machine = build_instance(
        &properties.vm_name(),
        &spec,
        options.use_reservoir,
        guest_hv_interface.clone(),
        vmm_log,
    )?;

    let mut init = MachineInitializer {
        log: log.clone(),
        machine: &machine,
        devices: Default::default(),
        block_backends: Default::default(),
        crucible_backends: Default::default(),
        spec: &spec,
        properties: &properties,
        producer_registry: options.oximeter_registry.clone(),
        state: MachineInitializerState::default(),
        kstat_sampler: initialize_kstat_sampler(
            &log,
            &spec,
            options.oximeter_registry.clone(),
        ),
        stats_vm: VirtualMachine::new(spec.board.cpus, &properties),
    };

    init.initialize_rom(options.bootrom_path.as_path())?;
    let chipset = init.initialize_chipset(
        &(event_queue.clone()
            as Arc<dyn super::guest_event::ChipsetEventHandler>),
    )?;

    init.initialize_rtc(&chipset)?;
    init.initialize_hpet();

    let com1 = Arc::new(init.initialize_uart(&chipset));
    let ps2ctrl = init.initialize_ps2(&chipset);
    init.initialize_qemu_debug_port()?;
    init.initialize_qemu_pvpanic(VirtualMachine::new(
        spec.board.cpus,
        &properties,
    ))?;
    init.initialize_network_devices(&chipset).await?;
    let mut attest_handle =
        init.initialize_vsock(&chipset, options.attest_config).await?;

    #[cfg(feature = "failure-injection")]
    init.initialize_test_devices();

    #[cfg(feature = "falcon")]
    {
        init.initialize_softnpu_ports(&chipset)?;
        init.initialize_9pfs(&chipset);
    }

    let wanted_heap = init
        .initialize_storage_devices(&chipset, options.nexus_client.clone())
        .await?;

    let ramfb =
        init.initialize_fwcfg(spec.board.cpus, &options.bootrom_version)?;

    // If we have a VM RoT, that RoT needs to be able to collect some
    // information about the guest before it can be actually usable. It will do
    // that asynchronously, but have to provide references for initial necessary
    // VM state.
    if let Some(attest_handle) = attest_handle.as_mut() {
        init.prepare_rot_initializer(attest_handle)?;
    }

    init.register_guest_hv_interface(guest_hv_lifecycle);
    init.initialize_cpus().await?;

    let total_cpus = pbind::online_cpus()?;
    let vcpu_count: i32 = machine
        .vcpus
        .len()
        .try_into()
        .map_err(|_| anyhow::anyhow!("more than 2^31 vCPUs"))?;

    // When a VM has I/O-heavy workloads across many cores, vCPUs can end up
    // moving across the host and come with wasted time in juggling cyclics.
    //
    // Nexus can't yet determine CPU binding assignments in a central manner. In
    // lieu of it, knowing that Nexus won't oversubscribe a host we can
    // autonomously follow a CPU pinning strategy as: "if the VM is larger than
    // half of the sled, there can only be one, so bind it to specific CPUs and
    // rely on the OS to sort out the rest". This gets the largest VMs to fixed
    // vCPU->CPU assignments, which also are most likely to benefit.
    let cpu_threshold = total_cpus / 2;
    let bind_cpus = if vcpu_count > cpu_threshold {
        if vcpu_count > total_cpus {
            anyhow::bail!("spec requested more CPUs than are online!");
        }

        // Bind to the upper range of CPUs, fairly arbitrary.
        let first_bind_cpu = total_cpus - vcpu_count;
        let bind_cpus = (first_bind_cpu..total_cpus).collect();

        info!(log, "applying automatic vCPU->CPU binding";
                  "vcpu_count" => vcpu_count,
                  "total_cpus" => total_cpus,
                  "threshold" => cpu_threshold,
                  "vcpu_cpus" => #?bind_cpus);

        Some(bind_cpus)
    } else {
        None
    };

    let vcpu_tasks = Box::new(crate::vcpu_tasks::VcpuTasks::new(
        &machine,
        event_queue.clone() as Arc<dyn super::guest_event::VcpuEventHandler>,
        bind_cpus,
        log.new(slog::o!("component" => "vcpu_tasks")),
    )?);

    let MachineInitializer {
        devices, block_backends, crucible_backends, ..
    } = init;

    // The initializer's borrows of `spec` and `machine` are finished at this
    // point and `spec` is not used again below, so both can be moved into the
    // result without cloning.
    let res = InputVmObjects {
        instance_spec: spec,
        vcpu_tasks,
        machine,
        devices,
        block_backends,
        crucible_backends,
        com1,
        framebuffer: Some(ramfb),
        ps2ctrl,
        attest_handle,
    };

    // Another really terrible hack. As we've found in Propolis#1008, brk()
    // can end up starved by a long stream of page faults. We allocate
    // and deallocate often enough in the hot parts of Propolis at runtime,
    // but the working set is expected to be relatively tiny; we'd brk()
    // once or twice, get up to a reasonable size, and never think twice
    // about it.
    //
    // In practice that later allocation may be blocked for tens of
    // *minutes*, and might be while holding locks for device state (in
    // 1008 we hold a queue state lock, for example). Everything goes off
    // the rails from there, as vCPUs can easily get dragged into the mess,
    // guest NMIs aren't acknowledged because vCPUs can't run, etc.
    //
    // So, the awful hack. Most of the runtime growth, that we can think of,
    // comes from storage backend implementation. We guess at that amount in
    // `initialize_storage_devices`. Now we'll alloc at least that much, free
    // it, and try to avoid brk() when the OS won't be able to get to it.
    //
    // This should be able to be removed when we are confident we can
    // actually brk() at runtime without starving ourselves out.
    //
    // As one last step, include an extra 134 MiB in the balloon. In the happy
    // case, even just 16 MiB was sufficient for Oximeter and regular check-ins
    // with Propolis. In at least one case, though, we saw a Propolis with a
    // heap about 126 MiB larger than the balloon had wanted. We don't know what
    // happened to grow the heap like that, but size against the worst case for
    // now for safety.
    //
    // All in all the worst case balloon puts propolis-server's heap a bit over
    // (around 483 MiB + 30MiB) the RFD 413 expectation of 0.5 GiB for Propolis
    // memory.
    let balloon_size = wanted_heap + 134 * propolis::common::MB;
    info!(log, "inflating balloon";
        "balloon_size" => balloon_size);
    let balloon = vec![0u8; balloon_size];
    // Do a volatile access to the Vec to make sure Rust doesn't optimize it
    // out...
    //
    // SAFETY: `balloon` is a live, initialized allocation of `balloon_size`
    // bytes, and `balloon_size` is nonzero because of the fixed extra added
    // above, so its pointer is valid and aligned for a one-byte read.
    unsafe { std::ptr::read_volatile(balloon.as_ptr()) };
    std::mem::drop(balloon);

    Ok(res)
}

/// Creates the object used to sample kstats and registers it with the
/// producer registry.
///
/// Returns `None` if no producer registry was supplied, if no sampler could
/// be created, or if registering the sampler failed (in which case the
/// failure is logged).
fn initialize_kstat_sampler(
    log: &slog::Logger,
    spec: &Spec,
    producer_registry: Option<ProducerRegistry>,
) -> Option<KstatSampler> {
    let registry = producer_registry?;
    let sampler = create_kstat_sampler(log, spec)?;

    // The registry takes its own copy of the sampler; ours is only handed
    // back to the caller if registration succeeds.
    if let Err(e) = registry.register_producer(sampler.clone()) {
        slog::error!(
            log,
            "Failed to register kstat sampler in producer \
            registry, no kstat-based metrics will be produced";
            "error" => ?e,
        );
        return None;
    }

    Some(sampler)
}


================================================
FILE: bin/propolis-server/src/lib/vm/guest_event.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Types and traits for handling guest-emitted events on the VM state driver.

use std::{collections::VecDeque, time::Duration};

/// An event raised by some component in the instance (e.g. a vCPU or the
/// chipset) that the state worker must handle.
///
/// The vCPU-sourced events carry a time element (duration since VM boot) as
/// emitted by the kernel vmm.  This is used to deduplicate events when all
/// vCPUs running in-kernel are kicked out for the suspend state.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(super) enum GuestEvent {
    /// Fired when the bhyve VM enters its halt state.
    VcpuSuspendHalt(Duration),
    /// Fired when the bhyve VM enters its reset state.
    VcpuSuspendReset(Duration),
    /// Fired when the bhyve VM resets due to a triple fault. The first element
    /// identifies the vCPU that sent this notification.
    VcpuSuspendTripleFault(i32, Duration),
    /// Fired when the chipset signals a halt condition.
    ChipsetHalt,
    /// Fired when the chipset signals a reboot condition.
    ChipsetReset,
}

/// A first-in, first-out queue of pending [`GuestEvent`]s. An event is not
/// added if an equal event is already in the queue.
#[derive(Debug, Default)]
pub(super) struct GuestEventQueue {
    /// Pending events, oldest at the front.
    queue: VecDeque<GuestEvent>,
}

/// A sink for events raised by a VM's vCPU tasks.
pub(crate) trait VcpuEventHandler: Send + Sync {
    /// Reports a suspend-halt event. `when` is presumably the same
    /// duration-since-boot timestamp carried by the corresponding
    /// `GuestEvent` variant (confirm against the implementor).
    fn suspend_halt_event(&self, when: Duration);

    /// Reports a suspend-reset event stamped with `when`.
    fn suspend_reset_event(&self, when: Duration);

    /// Reports a triple-fault suspend event from the vCPU identified by
    /// `vcpu_id`, stamped with `when`.
    fn suspend_triple_fault_event(&self, vcpu_id: i32, when: Duration);

    /// Reports a VM exit on vCPU `vcpu_id` that the vCPU task did not handle.
    fn unhandled_vm_exit(
        &self,
        vcpu_id: i32,
        exit: propolis::exits::VmExitKind,
    );

    /// Reports an I/O error associated with vCPU `vcpu_id`.
    fn io_error_event(&self, vcpu_id: i32, error: std::io::Error);
}

/// A sink for events raised by a VM's chipset.
pub(crate) trait ChipsetEventHandler: Send + Sync {
    /// Called when the chipset signals a halt condition.
    fn chipset_halt(&self);

    /// Called when the chipset signals a reset condition.
    fn chipset_reset(&self);
}

impl GuestEventQueue {
    /// Appends `event` to the back of the queue unless an equal event is
    /// already pending.
    ///
    /// Returns `true` if the event was added and `false` if it was dropped as
    /// a duplicate.
    pub(super) fn enqueue(&mut self, event: GuestEvent) -> bool {
        if self.queue.contains(&event) {
            return false;
        }

        self.queue.push_back(event);
        true
    }

    /// Removes and returns the oldest pending event, or `None` if the queue
    /// is empty.
    pub(super) fn pop_front(&mut self) -> Option<GuestEvent> {
        self.queue.pop_front()
    }
}


================================================
FILE: bin/propolis-server/src/lib/vm/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Implements the [`Vm`] type, which encapsulates a single Propolis virtual
//! machine instance and provides a public interface thereto to the Propolis
//! Dropshot server.
//!
//! The VM state machine looks like this:
//!
//! ```text
//!            [NoVm]
//!              |
//!              |
//!              v
//! +---- WaitingForInit <----+
//! |            |            |
//! |            |            |
//! |            v            |
//! |         Active          |
//! |            |            |
//! |            |            |
//! |            v            |
//! +-------> Rundown         |
//! |            |            |
//! |            |            |
//! |            v            |
//! +---> RundownComplete ----+
//! ```
//!
//! In the happy case where new VMs always start successfully, this state
//! machine transitions as follows:
//!
//! - New state machines start in [`VmState::NoVm`].
//! - A request to create a new VM moves to [`VmState::WaitingForInit`].
//! - Once all of the VM's components are created, the VM moves to
//!   [`VmState::Active`].
//! - When the VM stops, the VM moves to [`VmState::Rundown`].
//! - When all references to the VM's components are dropped, the VM moves to
//!   [`VmState::RundownComplete`]. A request to create a new VM will move back
//!   to `WaitingForInit`.
//!
//! In any state except `NoVm`, the state machine holds enough state to describe
//! the most recent VM known to the state machine, whether it is being created
//! (`WaitingForInit`), running (`Active`), or being torn down (`Rundown` and
//! `RundownComplete`).
//!
//! In the `Active` state, the VM wrapper holds an [`active::ActiveVm`] and
//! allows API-layer callers to obtain references to it. These callers use these
//! references to ask to change a VM's state or change its configuration. An
//! active VM holds a reference to a [`objects::VmObjects`] structure that
//! bundles up all of the Propolis components (kernel VM, devices, and backends)
//! that make up an instance and a spec that describes that instance; API-layer
//! callers may use this structure to read the instance's properties and query
//! component state, but cannot mutate the VM's structure this way.
//!
//! Requests to change a VM's state or configuration (and events from a running
//! guest that might change a VM's state, like an in-guest shutdown or reboot
//! request or a triple fault) are placed in an [input
//! queue](state_driver::InputQueue) that is serviced by a single "state driver"
//! task. When an instance stops, this task moves the state machine to the
//! `Rundown` state, which renders new API-layer callers unable to clone new
//! references to the VM's `VmObjects`. When all outstanding references to the
//! objects are dropped, the VM moves to the `RundownComplete` state, obtains
//! the final instance state from the (joined) state driver task, and publishes
//! that state. At that point the VM may be reinitialized.
//!
//! The VM state machine delegates VM creation to the state driver task. This
//! task can fail to initialize a VM in two ways:
//!
//! 1. It may fail to create all of the VM's component objects (e.g. due to
//!    bad configuration or resource exhaustion).
//! 2. It may successfully create all of the VM's component objects, but then
//!    fail to populate their initial state via live migration from another
//!    instance.
//!
//! In the former case, where no VM objects are ever created, the state driver
//! moves the state machine directly from `WaitingForInit` to `RundownComplete`.
//! In the latter case, the driver moves to `Rundown` and allows `VmObjects`
//! teardown to drive the state machine to `RundownComplete`.

use std::{collections::BTreeMap, net::SocketAddr, path::PathBuf, sync::Arc};

use active::ActiveVm;
use ensure::VmEnsureRequest;
use oximeter::types::ProducerRegistry;
use propolis_api_types::instance::{
    InstanceEnsureResponse, InstanceProperties, InstanceState,
    InstanceStateMonitorResponse,
};
use propolis_api_types::instance_spec::{
    InstanceSpecGetResponse, InstanceSpecStatus, SpecKey,
};
use propolis_api_types::migration::{
    InstanceMigrateStatusResponse, InstanceMigrationStatus, MigrationState,
};
use slog::info;
use state_driver::StateDriverOutput;
use state_publisher::StatePublisher;
use tokio::sync::{oneshot, watch, RwLock, RwLockReadGuard};

use crate::{server::MetricsEndpointConfig, spec::Spec, vnc::VncServer};
use propolis::attestation::server::AttestationServerConfig;

mod active;
pub(crate) mod ensure;
pub(crate) mod guest_event;
pub(crate) mod objects;
mod request_queue;
mod services;
mod state_driver;
pub(crate) mod state_publisher;

/// Maps component keys ([`SpecKey`]s) to lifecycle trait objects that allow
/// components to be started, paused, resumed, and halted.
pub(crate) type DeviceMap =
    BTreeMap<SpecKey, Arc<dyn propolis::common::Lifecycle>>;

/// Pairs of NIC identifiers and the corresponding viona device instance IDs.
///
/// This is a `Vec` of pairs rather than a map because the number of NICs is
/// expected to be small.
pub(crate) type NetworkInterfaceIds = Vec<(uuid::Uuid, KstatInstanceId)>;

/// Maps component keys ([`SpecKey`]s) to block backend trait objects.
pub(crate) type BlockBackendMap =
    BTreeMap<SpecKey, Arc<dyn propolis::block::Backend>>;

/// Maps component keys ([`SpecKey`]s) to Crucible backend objects.
pub(crate) type CrucibleBackendMap =
    BTreeMap<SpecKey, Arc<propolis::block::CrucibleBackend>>;

/// Type alias for the sender side of the channel that carries
/// externally-visible instance state updates.
type InstanceStateTx = watch::Sender<InstanceStateMonitorResponse>;

/// Type alias for the receiver side of the channel that carries
/// externally-visible instance state updates.
type InstanceStateRx = watch::Receiver<InstanceStateMonitorResponse>;

/// Type alias for the results sent by the state driver in response to a request
/// to change a Crucible backend's configuration.
pub(crate) type CrucibleReplaceResult =
    Result<crucible_client_types::ReplaceResult, dropshot::HttpError>;

/// Type alias for the sender side of a channel that carries Crucible backend
/// reconfiguration results.
pub(crate) type CrucibleReplaceResultTx =
    oneshot::Sender<CrucibleReplaceResult>;

/// The type of the PCI device instance ID to which a per-component kstat
/// (kernel statistic) instance ID maps.
type KstatInstanceId = u32;

/// Type alias for the sender side of a channel that carries the results of
/// instance-ensure API calls.
type InstanceEnsureResponseTx =
    oneshot::Sender<Result<InstanceEnsureResponse, VmError>>;

/// The minimum number of threads to spawn in the Tokio runtime that runs the
/// state driver and any other VM-related tasks.
const VMM_MIN_RT_THREADS: usize = 8;

/// The base thread count for a VM's Tokio runtime, before accounting for the
/// VM's vCPUs.
///
/// When creating a new VM, add the VM's vCPU count to this value, then spawn
/// that many threads on its Tokio runtime or [`VMM_MIN_RT_THREADS`], whichever
/// is greater; that is, `max(VMM_MIN_RT_THREADS, VMM_BASE_RT_THREADS + vcpus)`.
const VMM_BASE_RT_THREADS: usize = 4;

/// Errors generated by the VM controller and its subcomponents.
#[derive(Debug, thiserror::Error)]
pub(crate) enum VmError {
    /// The channel on which an operation's result was expected was closed
    /// before a result arrived. `Vm::ensure` returns this if its reply channel
    /// is dropped without an ensure result being sent.
    #[error("VM operation result channel unexpectedly closed")]
    ResultChannelClosed,

    /// A request to create a VM arrived while the state machine was still
    /// initializing a VM (i.e. it was in the `WaitingForInit` state).
    #[error("VM is currently initializing")]
    WaitingToInitialize,

    /// A request to create a VM arrived while a VM was already active.
    #[error("VM already initialized")]
    AlreadyInitialized,

    /// A request to create a VM arrived while the previous VM was still being
    /// torn down (i.e. the state machine was in the `Rundown` state).
    #[error("VM is currently shutting down")]
    RundownInProgress,

    /// VM initialization failed; the payload describes the failure.
    // NOTE(review): this variant is constructed outside the code visible
    // here; confirm what the string contains before relying on its format.
    #[error("VM initialization failed: {0}")]
    InitializationFailed(String),

    /// A requested state change was rejected. Wraps, and is convertible from,
    /// the request queue's denial reason.
    #[error("Forbidden state change")]
    ForbiddenStateChange(#[from] request_queue::RequestDeniedReason),
}

/// The top-level VM wrapper type.
///
/// Wraps the VM state machine described in the module comment in a lock and
/// pairs it with a logger.
pub(crate) struct Vm {
    /// Lock wrapper for the VM state machine's contents.
    ///
    /// Routines that need to read VM properties or obtain a `VmObjects` handle
    /// acquire this lock shared.
    ///
    /// Routines that drive the VM state machine acquire this lock exclusively.
    inner: RwLock<VmInner>,

    /// A logger for this VM.
    log: slog::Logger,
}

/// Holds a VM state machine and state driver task handle.
struct VmInner {
    /// The VM's current state.
    state: VmState,

    /// A handle to the VM's current state driver task, if it has one.
    ///
    /// This is set when a request to create a VM spawns a state driver task,
    /// and is taken (and awaited) when rundown completes so that the task's
    /// final output can be published.
    driver: Option<tokio::task::JoinHandle<StateDriverOutput>>,
}

/// Stores a possibly-absent instance spec with a reason for its absence.
#[derive(Clone, Debug)]
enum MaybeSpec {
    /// The spec is known. The spec is boxed so that this enum stays small
    /// regardless of the size of a `Spec`.
    Present(Box<Spec>),

    /// The spec is not known yet because the VM is initializing via live
    /// migration, and the source's spec is not available yet.
    WaitingForMigrationSource,
}

impl From<MaybeSpec> for InstanceSpecStatus {
    fn from(value: MaybeSpec) -> Self {
        match value {
            MaybeSpec::WaitingForMigrationSource => {
                Self::WaitingForMigrationSource
            }
            MaybeSpec::Present(spec) => Self::Present((*spec).into()),
        }
    }
}

/// Describes a past or future VM and its properties.
///
/// This is the information the state machine retains about a VM in every
/// state except `Active` (where the equivalent data lives in the `ActiveVm`).
struct VmDescription {
    /// Records the VM's last externally-visible state.
    external_state_rx: InstanceStateRx,

    /// The VM's API-level instance properties.
    properties: InstanceProperties,

    /// The runtime on which the VM's state driver is running (or on which it
    /// ran).
    ///
    /// This is `None` while the VM is waiting to initialize, holds the active
    /// VM's runtime during rundown, and is taken out again when rundown
    /// completes so that the runtime can be shut down.
    tokio_rt: Option<tokio::runtime::Runtime>,
}

/// The states in the VM state machine. See the module comment for more details.
#[allow(clippy::large_enum_variant)]
enum VmState {
    /// This state machine has never held a VM.
    // The state-transition routines also store this variant momentarily,
    // while holding the state lock exclusively, as a placeholder that lets
    // them move the previous state out by value.
    NoVm,

    /// A new state driver is attempting to initialize objects for a VM with the
    /// enclosed description.
    WaitingForInit { vm: VmDescription, spec: MaybeSpec },

    /// The VM is active, and callers can obtain a handle to its objects.
    Active(active::ActiveVm),

    /// The previous VM is shutting down, but its objects have not been fully
    /// destroyed yet. The spec is always present here because it is copied
    /// from the active VM's objects when rundown begins.
    Rundown { vm: VmDescription, spec: Box<Spec> },

    /// The previous VM and its objects have been cleaned up. This state is
    /// also entered directly from `WaitingForInit` when initialization fails
    /// before the VM becomes active, in which case the spec may still be
    /// absent.
    RundownComplete { vm: VmDescription, spec: MaybeSpec },
}

impl std::fmt::Display for VmState {
    /// Writes the bare name of the current state, omitting any payload.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Self::NoVm => "NoVm",
            Self::WaitingForInit { .. } => "WaitingForInit",
            Self::Active(_) => "Active",
            Self::Rundown { .. } => "Rundown",
            Self::RundownComplete { .. } => "RundownComplete",
        };
        f.write_str(name)
    }
}

/// Parameters to an instance ensure operation.
pub(super) struct EnsureOptions {
    /// The path to the bootrom to load into the guest.
    pub(super) bootrom_path: PathBuf,

    /// The bootrom version string to expose to the guest. If None, the machine
    /// initializer chooses a default.
    pub(super) bootrom_version: Option<String>,

    /// True if VMs should allocate memory from the kernel VMM reservoir.
    pub(super) use_reservoir: bool,

    /// Configuration used to serve Oximeter metrics from this server.
    pub(super) metrics_config: Option<MetricsEndpointConfig>,

    /// An Oximeter producer registry to pass to components that will emit
    /// Oximeter metrics.
    pub(super) oximeter_registry: Option<ProducerRegistry>,

    /// A Nexus client handle to pass to components that can make upcalls to
    /// Nexus.
    pub(super) nexus_client: Option<nexus_client::Client>,

    /// A reference to the process's VNC server, used to connect the server to
    /// a new VM's framebuffer.
    pub(super) vnc_server: Arc<VncServer>,

    /// The address of this Propolis process, used by the live migration
    /// protocol to transfer serial console connections.
    pub(super) local_server_addr: SocketAddr,

    /// Configuration for the attestation server, if there is any.
    // NOTE(review): presumably `None` means the new VM gets no attestation
    // server; confirm against the code that consumes this option.
    pub(super) attest_config: Option<AttestationServerConfig>,
}

impl Vm {
    /// Creates a new VM wrapper whose state machine starts out holding no VM
    /// and no state driver task. The wrapper logs to a child of `log`.
    pub fn new(log: &slog::Logger) -> Arc<Self> {
        let initial = VmInner { driver: None, state: VmState::NoVm };
        Arc::new(Self {
            inner: RwLock::new(initial),
            log: log.new(slog::o!("component" => "vm_wrapper")),
        })
    }

    /// Takes the state lock shared and, if the state machine is `Active`,
    /// narrows the guard so that it refers to the `ActiveVm`. Returns `None`
    /// (releasing the lock) if there is no active VM.
    pub(super) async fn active_vm(
        &self,
    ) -> Option<RwLockReadGuard<'_, ActiveVm>> {
        let guard = self.inner.read().await;
        RwLockReadGuard::try_map(guard, |inner| match &inner.state {
            VmState::Active(vm) => Some(vm),
            _ => None,
        })
        .ok()
    }

    /// Returns the state, properties, and instance spec for the instance most
    /// recently wrapped by this `Vm`.
    ///
    /// The state lock is held shared for the duration of the call. If the VM
    /// is active, the spec is read from the VM's objects (which requires
    /// taking the objects lock shared); otherwise it comes from the spec
    /// stored in the state machine.
    ///
    /// # Returns
    ///
    /// - `Some` if the VM has been created.
    /// - `None` if no VM has ever been created.
    pub(super) async fn get(&self) -> Option<InstanceSpecGetResponse> {
        let guard = self.inner.read().await;
        match &guard.state {
            // If no VM has ever been created, there's nothing to get.
            VmState::NoVm => None,

            // If the VM is active, pull the required data out of its objects.
            VmState::Active(vm) => {
                let spec =
                    vm.objects().lock_shared().await.instance_spec().clone();

                // Only the instance state is needed, so copy it out of the
                // watch channel instead of cloning the entire monitor
                // response. The channel borrow ends with this statement.
                let state = vm.external_state_rx.borrow().state;
                Some(InstanceSpecGetResponse {
                    properties: vm.properties.clone(),
                    spec: InstanceSpecStatus::Present(spec.into()),
                    state,
                })
            }

            // Before initialization finishes and after rundown completes, the
            // spec may be absent, so report whatever the state machine holds.
            VmState::WaitingForInit { vm, spec }
            | VmState::RundownComplete { vm, spec } => {
                Some(InstanceSpecGetResponse {
                    properties: vm.properties.clone(),
                    state: vm.external_state_rx.borrow().state,
                    spec: spec.clone().into(),
                })
            }

            // A VM in rundown always has a spec.
            VmState::Rundown { vm, spec } => Some(InstanceSpecGetResponse {
                properties: vm.properties.clone(),
                state: vm.external_state_rx.borrow().state,
                spec: InstanceSpecStatus::Present(
                    spec.as_ref().to_owned().into(),
                ),
            }),
        }
    }

    /// Clones the instance state receiver belonging to the VM most recently
    /// wrapped by this `Vm`.
    ///
    /// # Returns
    ///
    /// - `Some` if the VM has been created.
    /// - `None` if no VM has ever been created.
    pub(super) async fn state_watcher(&self) -> Option<InstanceStateRx> {
        let guard = self.inner.read().await;
        let rx = match &guard.state {
            VmState::NoVm => return None,
            VmState::Active(active) => &active.external_state_rx,
            VmState::WaitingForInit { vm, .. }
            | VmState::Rundown { vm, .. }
            | VmState::RundownComplete { vm, .. } => &vm.external_state_rx,
        };
        Some(rx.clone())
    }

    /// Moves this VM from the `WaitingForInit` state to the `Active` state,
    /// creating an `ActiveVm` with the supplied input queue, VM objects, and VM
    /// services.
    ///
    /// # Panics
    ///
    /// Panics if the VM is not in the `WaitingForInit` state.
    async fn make_active(
        self: &Arc<Self>,
        log: &slog::Logger,
        state_driver_queue: Arc<state_driver::InputQueue>,
        objects: &Arc<objects::VmObjects>,
        services: services::VmServices,
        vmm_rt: tokio::runtime::Runtime,
    ) {
        info!(self.log, "installing active VM");
        let mut guard = self.inner.write().await;
        let old = std::mem::replace(&mut guard.state, VmState::NoVm);
        match old {
            VmState::WaitingForInit { vm, .. } => {
                guard.state = VmState::Active(ActiveVm {
                    log: log.clone(),
                    state_driver_queue,
                    external_state_rx: vm.external_state_rx,
                    properties: vm.properties,
                    objects: objects.clone(),
                    services,
                    tokio_rt: vmm_rt,
                });
            }
            state => unreachable!(
                "only a starting VM's state worker calls make_active \
                (current state: {state})"
            ),
        }
    }

    /// Transitions this VM from `WaitingForInit` straight to
    /// `RundownComplete` after an instance initialization failure, carrying
    /// the VM's description and (possibly absent) spec over unchanged.
    ///
    /// The caller must ensure there are no active `VmObjects` that refer to
    /// this VM.
    ///
    /// # Panics
    ///
    /// Panics if the VM is not in the `WaitingForInit` state.
    async fn vm_init_failed(&self) {
        let mut guard = self.inner.write().await;

        // Move the old state out by value, leaving a placeholder that is
        // overwritten below before the lock is released.
        let previous = std::mem::replace(&mut guard.state, VmState::NoVm);
        let (vm, spec) = match previous {
            VmState::WaitingForInit { vm, spec } => (vm, spec),
            state => unreachable!(
                "start failures should only occur before an active VM is \
                installed (current state: {state})"
            ),
        };
        guard.state = VmState::RundownComplete { vm, spec };
    }

    /// Moves this VM from the `Active` state to the `Rundown` state, then
    /// stops the VM's services.
    ///
    /// This routine should only be called by the state driver.
    ///
    /// # Panics
    ///
    /// Panics if the VM is not in the `Active` state.
    async fn set_rundown(&self) {
        info!(self.log, "setting VM rundown");
        // Perform the state transition in an inner scope so that the write
        // guard is released before the services are stopped below.
        let services = {
            let mut guard = self.inner.write().await;
            // Move the old state out by value, leaving a placeholder that is
            // overwritten below before the lock is released.
            let old = std::mem::replace(&mut guard.state, VmState::NoVm);
            let vm = match old {
                VmState::Active(vm) => vm,
                state => panic!(
                    "VM should be active before being run down (current state: \
                    {state})"
                ),
            };

            // Copy the spec out of the VM's objects so that it stays
            // available once the `ActiveVm` has been taken apart.
            let spec = vm.objects().lock_shared().await.instance_spec().clone();
            // Move the pieces that the rundown description needs out of the
            // active VM. The services are moved out separately below and
            // handed to the outer scope.
            let ActiveVm { external_state_rx, properties, tokio_rt, .. } = vm;
            guard.state = VmState::Rundown {
                vm: VmDescription {
                    external_state_rx,
                    properties,
                    tokio_rt: Some(tokio_rt),
                },
                spec: Box::new(spec),
            };
            vm.services
        };

        services.stop(&self.log).await;
    }

    /// Moves this VM from the `Rundown` state to the `RundownComplete` state,
    /// then joins the state driver task, publishes the final instance state
    /// it produced, and shuts down the VM's runtime.
    ///
    /// This routine should only be called when dropping VM objects.
    ///
    /// # Panics
    ///
    /// Panics if the VM is not in the `Rundown` state, if the rundown VM has
    /// no runtime or no state driver task handle, or if the state driver task
    /// panicked.
    async fn complete_rundown(&self) {
        info!(self.log, "completing VM rundown");
        let mut guard = self.inner.write().await;
        // Move the old state out by value, leaving a placeholder that is
        // overwritten below before the lock is released.
        let old = std::mem::replace(&mut guard.state, VmState::NoVm);
        let rt = match old {
            VmState::Rundown { mut vm, spec } => {
                // Take the runtime out of the description so that it can be
                // shut down at the end of this routine.
                let rt = vm.tokio_rt.take().expect("rundown VM has a runtime");
                guard.state = VmState::RundownComplete {
                    vm,
                    spec: MaybeSpec::Present(spec),
                };
                rt
            }
            state => unreachable!(
                "VM rundown completed from invalid prior state {state}"
            ),
        };

        // Join the state driver task (while still holding the state lock
        // exclusively) to obtain the state publisher and the VM's final state.
        let StateDriverOutput { mut state_publisher, final_state } = guard
            .driver
            .take()
            .expect("driver must exist in rundown")
            .await
            .expect("state driver shouldn't panic");

        state_publisher.update(state_publisher::ExternalStateUpdate::Instance(
            final_state,
        ));

        // Shut down the runtime without blocking to wait for tasks to complete
        // (since blocking is illegal in an async context).
        //
        // This must happen after the state driver task has successfully joined
        // (otherwise it might be canceled and will fail to yield the VM's final
        // state).
        rt.shutdown_background();
    }

    /// Attempts to move this VM to the `Active` state by setting up a state
    /// driver task and directing it to initialize a new VM.
    ///
    /// A new VM can only be created if this state machine has never held a VM
    /// or if the previous VM has completely run down. In that case the state
    /// machine moves to `WaitingForInit`, and this routine then waits for the
    /// state driver task to report the outcome of the request.
    ///
    /// # Errors
    ///
    /// - `VmError::WaitingToInitialize` if another VM is being initialized.
    /// - `VmError::AlreadyInitialized` if a VM is already active.
    /// - `VmError::RundownInProgress` if the previous VM is still being torn
    ///   down.
    /// - `VmError::ResultChannelClosed` if the reply channel is closed without
    ///   an ensure result having been sent.
    /// - Any error that the state driver task sends back as the ensure result.
    pub(crate) async fn ensure(
        self: &Arc<Self>,
        log: &slog::Logger,
        ensure_request: VmEnsureRequest,
        options: EnsureOptions,
    ) -> Result<InstanceEnsureResponse, VmError> {
        let log_for_driver =
            log.new(slog::o!("component" => "vm_state_driver"));

        // This routine will create a state driver task that actually
        // initializes the VM. The external instance-ensure API shouldn't return
        // until that task has disposed of the initialization request. Create a
        // channel to allow the state driver task to send back an ensure result
        // at the appropriate moment.
        let (ensure_reply_tx, ensure_rx) = oneshot::channel();

        // The external state receiver needs to exist as soon as this routine
        // returns, so create the appropriate channel here. The sender side of
        // the channel will move to the state driver task.
        let (external_publisher, external_rx) = StatePublisher::new(
            &log_for_driver,
            InstanceStateMonitorResponse {
                gen: 1,
                state: if ensure_request.is_migration() {
                    InstanceState::Migrating
                } else {
                    InstanceState::Creating
                },
                migration: InstanceMigrateStatusResponse {
                    migration_in: ensure_request.migration_info().map(|req| {
                        InstanceMigrationStatus {
                            id: req.migration_id,
                            state: MigrationState::Sync,
                        }
                    }),
                    migration_out: None,
                },
            },
        );

        // Take the lock for writing, since in the common case this call will be
        // creating a new VM and there's no easy way to upgrade from a reader
        // lock to a writer lock.
        {
            let mut guard = self.inner.write().await;

            // List every state explicitly so that adding a new state forces a
            // decision about whether a VM may be created from it.
            match guard.state {
                VmState::WaitingForInit { .. } => {
                    return Err(VmError::WaitingToInitialize)
                }
                VmState::Active(_) => return Err(VmError::AlreadyInitialized),
                VmState::Rundown { .. } => {
                    return Err(VmError::RundownInProgress)
                }
                VmState::NoVm | VmState::RundownComplete { .. } => {}
            };

            let properties = ensure_request.properties.clone();
            let spec = match &ensure_request.init {
                ensure::VmInitializationMethod::Spec(s) => {
                    MaybeSpec::Present(s.clone())
                }
                ensure::VmInitializationMethod::Migration(_) => {
                    MaybeSpec::WaitingForMigrationSource
                }
            };

            // The driver task needs the state lock to move the state machine
            // out of `WaitingForInit`, so it cannot observe the state before
            // it is installed below and this guard is dropped.
            let vm_for_driver = self.clone();
            let base_log = log.clone();
            guard.driver = Some(tokio::spawn(async move {
                state_driver::ensure_vm_and_launch_driver(
                    log_for_driver,
                    base_log,
                    vm_for_driver,
                    external_publisher,
                    ensure_request,
                    ensure_reply_tx,
                    options,
                )
                .await
            }));

            // The receiver is not used again in this routine, so move it into
            // the description rather than cloning it.
            guard.state = VmState::WaitingForInit {
                vm: VmDescription {
                    external_state_rx: external_rx,
                    properties,
                    tokio_rt: None,
                },
                spec,
            };
        }

        // Wait for the state driver task to dispose of this request.
        ensure_rx.await.map_err(|_| VmError::ResultChannelClosed)?
    }
}


================================================
FILE: bin/propolis-server/src/lib/vm/objects.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! A collection of all of the components that make up a Propolis VM instance.

use std::{
    ops::{Deref, DerefMut},
    pin::Pin,
    sync::Arc,
    task::{Context, Poll},
};

use futures::{future::BoxFuture, stream::FuturesUnordered, StreamExt};
use propolis::{
    attestation,
    hw::{ps2::ctrl::PS2Ctrl, qemu::ramfb::RamFb, uart::LpcUart},
    vmm::VmmHdl,
    Machine,
};
use propolis_api_types::instance_spec::SpecKey;
use slog::info;
use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};

use crate::{serial::Serial, spec::Spec, vcpu_tasks::VcpuTaskController};

use super::{BlockBackendMap, CrucibleBackendMap, DeviceMap};

/// A collection of components that make up a Propolis VM instance.
///
/// The components themselves live in a [`VmObjectsLocked`] behind a
/// reader-writer lock.
pub(crate) struct VmObjects {
    /// A reference to the VM state machine that created these objects. Used to
    /// complete rundown when the objects are dropped.
    parent: Arc<super::Vm>,

    /// Synchronizes access to the VM's objects.
    ///
    /// API-layer callers that want to enumerate a VM's devices or read its spec
    /// acquire this lock shared. The state driver acquires this lock exclusive
    /// to mutate the VM.
    inner: RwLock<VmObjectsLocked>,
}

/// A collection of objects that should eventually be wrapped in a lock and
/// stored in a `VmObjects` structure. See [`VmObjectsLocked`].
pub(super) struct InputVmObjects {
    /// The instance spec that describes this collection of objects.
    pub instance_spec: Spec,
    /// The set of tasks that run the VM's vCPUs.
    pub vcpu_tasks: Box<dyn VcpuTaskController>,
    /// The Propolis kernel VMM for the instance.
    pub machine: Machine,
    /// Maps from component names to the trait objects that implement lifecycle
    /// operations for eligible components.
    pub devices: DeviceMap,
    /// Maps from component names to trait objects that implement the block
    /// storage backend trait.
    pub block_backends: BlockBackendMap,
    /// Maps from component names to Crucible backend objects.
    pub crucible_backends: CrucibleBackendMap,
    /// A handle to the serial console connection to the VM's first COM port.
    pub com1: Arc<Serial<LpcUart>>,
    /// A handle to the VM's framebuffer, if it has one.
    pub framebuffer: Option<Arc<RamFb>>,
    /// A handle to the VM's PS/2 controller.
    pub ps2ctrl: Arc<PS2Ctrl>,
    /// A handle to the VM's attestation server, if it has one.
    pub attest_handle: Option<attestation::server::AttestationSock>,
}

/// The collection of objects and state that make up a Propolis instance.
///
/// Instances of this type live behind the lock in a [`VmObjects`] and are
/// reached through the guards that its locking methods return.
pub(crate) struct VmObjectsLocked {
    /// The objects' associated logger.
    log: slog::Logger,

    /// The instance spec that describes this collection of objects.
    instance_spec: Spec,

    /// The set of tasks that run this VM's vCPUs.
    vcpu_tasks: Box<dyn VcpuTaskController>,

    /// The Propolis kernel VMM for this instance.
    machine: Machine,

    /// Maps from component names to the trait objects that implement lifecycle
    /// operations (e.g. pause and resume) for eligible components.
    devices: DeviceMap,

    /// Maps from component names to trait objects that implement the block
    /// storage backend trait.
    block_backends: BlockBackendMap,

    /// Maps from component names to Crucible backend objects.
    crucible_backends: CrucibleBackendMap,

    /// A handle to the serial console connection to the VM's first COM port.
    com1: Arc<Serial<LpcUart>>,

    /// A handle to the VM's framebuffer, if it has one.
    framebuffer: Option<Arc<RamFb>>,

    /// A handle to the VM's PS/2 controller.
    ps2ctrl: Arc<PS2Ctrl>,

    /// A handle to the VM's attestation server, if it has one.
    attest_handle: Option<attestation::server::AttestationSock>,
}

impl VmObjects {
    /// Creates a new VM object container.
    pub(super) fn new(
        log: slog::Logger,
        parent: Arc<super::Vm>,
        input: InputVmObjects,
    ) -> Self {
        let inner = VmObjectsLocked::new(&log, input);
        Self { parent, inner: tokio::sync::RwLock::new(inner) }
    }

    /// Yields a shared lock guard referring to the underlying object
    /// collection.
    pub(crate) async fn lock_shared(&self) -> VmObjectsShared<'_> {
        VmObjectsShared(self.inner.read().await)
    }

    /// Yields an exclusive lock guard referring to the underlying object
    /// collection.
    pub(crate) async fn lock_exclusive(&self) -> VmObjectsExclusive<'_> {
        VmObjectsExclusive(self.inner.write().await)
    }
}

impl VmObjectsLocked {
    /// Associates a collection of VM objects with a logger.
    fn new(log: &slog::Logger, input: InputVmObjects) -> Self {
        Self {
            log: log.clone(),
            instance_spec: input.instance_spec,
            vcpu_tasks: input.vcpu_tasks,
            machine: input.machine,
            devices: input.devices,
            block_backends: input.block_backends,
            crucible_backends: input.crucible_backends,
            com1: input.com1,
            framebuffer: input.framebuffer,
            ps2ctrl: input.ps2ctrl,
            attest_handle: input.attest_handle,
        }
    }

    /// Yields a shared reference to the VM's current instance spec, i.e. the
    /// spec that describes this collection of objects.
    pub(crate) fn instance_spec(&self) -> &Spec {
        &self.instance_spec
    }

    /// Yields a mutable reference to the VM's current instance spec.
    ///
    /// This changes only the stored spec; nothing here reconfigures the VM's
    /// components to match, so callers must keep the two in agreement.
    pub(crate) fn instance_spec_mut(&mut self) -> &mut Spec {
        &mut self.instance_spec
    }

    /// Yields a reference to the VM's current Propolis [`Machine`], the
    /// aggregation of VM resources that backs this instance.
    pub(crate) fn machine(&self) -> &Machine {
        &self.machine
    }

    /// Yields a clonable reference to the VM's current kernel VMM handle,
    /// which is owned by the VM's [`Machine`].
    pub(crate) fn vmm_hdl(&self) -> &Arc<VmmHdl> {
        &self.machine.hdl
    }

    /// Yields an accessor to the VM's memory context, or None if guest memory
    /// is not currently accessible.
    ///
    /// The accessor is obtained from the machine's memory accessor and is
    /// returned as a guard that borrows from `self`.
    pub(crate) fn access_mem(
        &self,
    ) -> Option<propolis::accessors::Guard<'_, propolis::vmm::MemAccessed>>
    {
        self.machine.acc_mem.access()
    }

    /// Looks up the component with the supplied `id` in the device map and,
    /// if it is present, returns a new handle to its lifecycle trait object.
    /// Returns `None` if no such component exists.
    pub(crate) fn device_by_id(
        &self,
        id: &SpecKey,
    ) -> Option<Arc<dyn propolis::common::Lifecycle>> {
        let found = self.devices.get(id);
        found.map(Arc::clone)
    }

    /// Yields a reference to the VM's current Crucible backend map, which maps
    /// component names to Crucible backend objects.
    pub(crate) fn crucible_backends(&self) -> &CrucibleBackendMap {
        &self.crucible_backends
    }

    /// Yields a clonable reference to the serial console for this VM's first
    /// COM port. Callers that need to hold on to the console can clone the
    /// returned `Arc`.
    pub(crate) fn com1(&self) -> &Arc<Serial<LpcUart>> {
        &self.com1
    }

    /// Yields a clonable reference to this VM's framebuffer, or `None` if the
    /// VM has no framebuffer.
    pub(crate) fn framebuffer(&self) -> &Option<Arc<RamFb>> {
        &self.framebuffer
    }

    /// Yields a clonable reference to this VM's PS/2 controller. Callers that
    /// need to hold on to the controller can clone the returned `Arc`.
    pub(crate) fn ps2ctrl(&self) -> &Arc<PS2Ctrl> {
        &self.ps2ctrl
    }

    /// Yields a reference to the map from component names to the lifecycle
    /// trait objects for this VM's components.
    pub(crate) fn device_map(&self) -> &DeviceMap {
        &self.devices
    }

    /// Yields a reference to the map from component names to this VM's block
    /// backend trait objects.
    pub(crate) fn block_backend_map(&self) -> &BlockBackendMap {
        &self.block_backends
    }

    /// Calls `func` once for every entry in this VM's device map, passing the
    /// component's key and its lifecycle trait object. Entries are visited in
    /// the map's iteration order.
    pub(crate) fn for_each_device(
        &self,
        mut func: impl FnMut(&SpecKey, &Arc<dyn propolis::common::Lifecycle>),
    ) {
        self.devices.iter().for_each(|(name, dev)| func(name, dev));
    }

    /// Calls `func` for each entry in this VM's device map, in the map's
    /// iteration order, stopping at the first failure.
    ///
    /// Returns `Ok(())` if every invocation succeeds. Otherwise returns the
    /// first error produced; the remaining entries are not visited.
    pub(crate) fn for_each_device_fallible<E>(
        &self,
        mut func: impl FnMut(
            &SpecKey,
            &Arc<dyn propolis::common::Lifecycle>,
        ) -> std::result::Result<(), E>,
    ) -> std::result::Result<(), E> {
        self.devices.iter().try_for_each(|(name, dev)| func(name, dev))
    }

    /// Pauses the VM at the kernel VMM level, ensuring that in-kernel-emulated
    /// devices and vCPUs are brought to a consistent state.
    ///
    /// When the VM is paused, attempts to run its vCPUs (via `VM_RUN` ioctl)
    /// will fail.  A corresponding `resume_kernel_vm()` call must be made
    /// prior to allowing vCPU tasks to run.
    ///
    /// # Panics
    ///
    /// Panics if the request to pause the kernel VMM fails.
    pub(super) fn pause_kernel_vm(&self) {
        info!(self.log, "pausing kernel VMM resources");
        self.machine.hdl.pause().expect("VM_PAUSE should succeed");
    }

    /// Resumes the VM at the kernel VMM level, undoing a previous
    /// `pause_kernel_vm()` call.
    ///
    /// # Panics
    ///
    /// Panics if the request to resume the kernel VMM fails.
    pub(super) fn resume_kernel_vm(&self) {
        info!(self.log, "resuming kernel VMM resources");
        self.machine.hdl.resume().expect("VM_RESUME should succeed");
    }

    /// Reinitializes the VM by resetting all of its devices and its kernel VMM.
    ///
    /// Every device is sent a reset request first; the machine is
    /// reinitialized only after all of the devices have been reset.
    ///
    /// # Panics
    ///
    /// Panics if reinitializing the machine fails.
    pub(super) fn reset_devices_and_machine(&self) {
        self.for_each_device(|name, dev| {
            info!(self.log, "sending reset request to {}", name);
            dev.reset();
        });

        // State the invariant in the panic message, matching how the kernel
        // VMM pause and resume routines report their failures.
        self.machine
            .reinitialize()
            .expect("machine reinitialization should succeed");
    }

    /// Pauses this VM's vCPUs, its devices, and its kernel VMM, in that order.
    ///
    /// This routine does not return until the device pause step has
    /// completed. Use [`Self::resume`] to undo it.
    pub(crate) async fn pause(&mut self) {
        // Order matters here: the Propolis lifecycle trait's pause function
        // requires that all vCPUs pause before any devices do, and all vCPUs
        // must be paused before the kernel VM can pause.
        self.vcpu_tasks.pause_all();
        self.pause_devices().await;
        self.pause_kernel_vm();
    }

    /// Resumes this VM's kernel VMM, its devices, and its vCPUs, in that
    /// order. This is the inverse of [`Self::pause`].
    pub(crate) fn resume(&mut self) {
        // Order matters here: the kernel VM must resume before any vCPUs can
        // resume, and the Propolis lifecycle trait's resume function requires
        // that all devices resume before any vCPUs do.
        self.resume_kernel_vm();
        self.resume_devices();
        self.resume_vcpus();
    }

    /// Resumes this VM's vCPU tasks, and nothing else: devices and the kernel
    /// VMM are left untouched.
    ///
    /// This is intended for use in VM startup sequences where the state driver
    /// needs fine-grained control over the order in which devices and vCPUs
    /// start. When pausing and resuming a VM that's already been started, use
    /// [`Self::pause`] and [`Self::resume`] instead.
    pub(crate) fn resume_vcpus(&mut self) {
        self.vcpu_tasks.resume_all();
    }

    /// Stops the VM's vCPU tasks and devices.
    ///
    /// The vCPU tasks are told to exit first; the devices are halted
    /// afterwards, and this routine does not return until that step has
    /// completed.
    pub(super) async fn halt(&mut self) {
        self.vcpu_tasks.exit_all();
        self.halt_devices().await;
    }

    /// Resets the VM's kernel vCPU state.
    ///
    /// This first asks the vCPU tasks to begin a new generation and then
    /// resets each kernel vCPU object to its initial state.
    pub(super) fn reset_vcpus(&self) {
        // NOTE(review): presumably the new generation makes the vCPU tasks
        // pick up the freshly reset state -- confirm against `vcpu_tasks`.
        self.vcpu_tasks.new_generation();
        self.reset_vcpu_state();
    }

    /// Hard-resets a VM by pausing, resetting, and resuming all its devices and
    /// vCPUs.
    ///
    /// Unlike [`Self::pause`] and [`Self::resume`], this sequence does not
    /// pause or resume the kernel VMM itself; it only resets it.
    pub(super) async fn reboot(&mut self) {
        // Reboot is implemented as a pause -> reset -> resume transition.
        //
        // First, pause the vCPUs and all devices so no partially-completed
        // work is present.
        self.vcpu_tasks.pause_all();
        self.pause_devices().await;

        // Reset all entities and the VM's bhyve state, then reset the
        // vCPUs. The vCPU reset must come after the bhyve reset.
        self.reset_devices_and_machine();
        self.reset_vcpus();

        // Resume devices so they're ready to do more work, then resume
        // vCPUs.
        self.resume_devices();
        self.resume_vcpus();
    }

    /// Pauses all of a VM's devices.
    async fn pause_devices(&self) {
        // Take care not to wedge the runtime with any device pause
        // implementations which might block.
        tokio::task::block_in_place(|| {
            self.for_each_device(|name, dev| {
                info!(self.log, "sending pause request to {}", name);
                dev.pause();
            });
        });

        struct NamedFuture {
            name: String,
            future: BoxFuture<'static, ()>,
        }

        impl std::future::Future for NamedFuture {
            type Output = String;

            fn poll(
                self: Pin<&mut Self>,
                cx: &mut Context<'_>,
            ) -> Poll<Self::Output> {
                let mut_self = self.get_mut();
                Pin::new(&mut mut_self.future)
                    .poll(cx)
                    .map(|_| mut_self.name.clone())
            }
        }

        info!(self.log, "waiting for devices to pause");
        let mut stream: FuturesUnordered<_> = self
            .devices
            .iter()
            .map(|(name, dev)| {
                info!(self.log, "got paused future from dev {}", name);
                NamedFuture { name: name.to_string(), future: dev.paused() }
            })
            .collect();

        while let Some(name) = stream.next().await {
            info!(self.log, "dev {} completed pause", name);
        }

        info!(self.log, "all devices paused");
    }

    /// Sends a resume request to each of the VM's devices, logging each
    /// device's name as its request is sent.
    fn resume_devices(&self) {
        self.for_each_device(|dev_name, device| {
            info!(self.log, "sending resume request to {}", dev_name);
            device.resume();
        });
    }

    /// Stops all of a VM's devices, stops its block backends and detaches
    /// them from their devices, and finally halts the attestation handle if
    /// one is present.
    ///
    /// The attestation handle is taken out of `self`, so it is halted at most
    /// once.
    async fn halt_devices(&mut self) {
        // Device halt implementations might block, so issue the halt requests
        // from a context that won't wedge the runtime.
        tokio::task::block_in_place(|| {
            self.for_each_device(|dev_name, device| {
                info!(self.log, "sending halt request to {}", dev_name);
                device.halt();
            });
        });

        // Each backend is fully stopped before it is detached.
        for (backend_id, block_backend) in self.block_backends.iter() {
            info!(
                self.log,
                "stopping and detaching block backend {}", backend_id
            );
            block_backend.stop().await;
            block_backend.attachment().detach();
        }

        if let Some(handle) = self.attest_handle.take() {
            handle.halt().await;
        }
    }

    /// Resets a VM's kernel vCPU objects to their initial states.
    ///
    /// Every vCPU is activated and has its reboot state applied. The bootstrap
    /// processor (BSP) is additionally put into the run state and has its
    /// instruction pointer set.
    ///
    /// # Panics
    ///
    /// Panics if any of the underlying vCPU operations fails.
    fn reset_vcpu_state(&self) {
        for cpu in self.machine.vcpus.iter() {
            info!(self.log, "resetting vCPU {}", cpu.id);
            cpu.activate().unwrap();
            cpu.reboot_state().unwrap();

            // Only the BSP needs the extra setup below.
            if !cpu.is_bsp() {
                continue;
            }

            info!(self.log, "Resetting BSP vCPU {}", cpu.id);
            cpu.set_run_state(propolis::bhyve_api::VRS_RUN, None).unwrap();

            // NOTE(review): 0xfff0 is presumably the x86 reset vector offset
            // -- confirm.
            let rip = propolis::bhyve_api::vm_reg_name::VM_REG_GUEST_RIP;
            cpu.set_reg(rip, 0xfff0).unwrap();
        }
    }
}

impl Drop for VmObjects {
    /// Notifies the owning VM that rundown is complete.
    ///
    /// `drop` cannot await, so the notification runs on a spawned task that
    /// holds its own clone of the parent handle.
    fn drop(&mut self) {
        // Completing rundown tells the owning VM that these objects are gone
        // and that a new VM can be created.
        //
        // Doing so here is always safe: the state driver guarantees that any
        // VM objects it creates are not dropped until it has moved the VM to
        // the Rundown state.
        let owner = self.parent.clone();
        let finish_rundown = async move {
            owner.complete_rundown().await;
        };
        tokio::spawn(finish_rundown);
    }
}

/// A shared lock on the contents of a [`VmObjects`].
///
/// Wraps a read guard and dereferences to the guarded [`VmObjectsLocked`], so
/// holders get read-only access for as long as this value is alive.
pub(crate) struct VmObjectsShared<'o>(RwLockReadGuard<'o, VmObjectsLocked>);

/// An exclusive lock on the contents of a [`VmObjects`].
///
/// Wraps a write guard and dereferences (mutably or immutably) to the guarded
/// [`VmObjectsLocked`] for as long as this value is alive.
pub(crate) struct VmObjectsExclusive<'o>(RwLockWriteGuard<'o, VmObjectsLocked>);

impl Deref for VmObjectsShared<'_> {
    type Target = VmObjectsLocked;

    /// Borrows the VM objects protected by the wrapped read guard.
    fn deref(&self) -> &Self::Target {
        // Reborrow through the guard explicitly rather than relying on deref
        // coercion at the return site.
        &*self.0
    }
}

impl Deref for VmObjectsExclusive<'_> {
    type Target = VmObjectsLocked;

    /// Immutably borrows the VM objects protected by the wrapped write guard.
    fn deref(&self) -> &Self::Target {
        // Reborrow through the guard explicitly rather than relying on deref
        // coercion at the return site.
        &*self.0
    }
}

impl DerefMut for VmObjectsExclusive<'_> {
    /// Mutably borrows the VM objects protected by the wrapped write guard.
    fn deref_mut(&mut self) -> &mut Self::Target {
        // Reborrow through the guard explicitly rather than relying on deref
        // coercion at the return site.
        &mut *self.0
    }
}


================================================
FILE: bin/propolis-server/src/lib/vm/request_queue.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Handles requests to change a Propolis server's state or component
//! configuration via the external API.
//!
//! The queue accepts or rejects requests based on a combination of its current
//! state and its knowledge of requests that it has previously queued but that
//! have not yet been processed. The latter knowledge is used to reject requests
//! that will never be fulfilled (because a prior request preempts them) or that
//! may need to be redirected to a migration target.
//!
//! The queue contains no synchronization of its own. Users who want to share a
//! queue between multiple threads must wrap it in a synchronization object.

use std::collections::VecDeque;

use propolis_api_types::instance_spec::SpecKey;
use slog::{info, Logger};
use thiserror::Error;
use uuid::Uuid;

/// Wraps a [`dropshot::WebsocketConnection`] for inclusion in an
/// [`ExternalRequest`].
///
/// The wrapped value is always `Some` when the wrapper is built through the
/// `From` impl; only this module's tests construct it with `None`.
//
// This newtype allows this module's tests (which want to verify queuing
// dispositions and don't care about request contents) to construct a
// `MigrateAsSource` request without having to conjure up a real websocket
// connection.
pub(crate) struct WebsocketConnection(Option<dropshot::WebsocketConnection>);

impl From<dropshot::WebsocketConnection> for WebsocketConnection {
    fn from(value: dropshot::WebsocketConnection) -> Self {
        Self(Some(value))
    }
}

impl WebsocketConnection {
    /// Yields the wrapped [`dropshot::WebsocketConnection`].
    pub(crate) fn into_inner(self) -> dropshot::WebsocketConnection {
        // Unwrapping is safe here because the only way an external consumer can
        // get an instance of this wrapper is to use the From impl, which always
        // wraps a `Some`.
        self.0.unwrap()
    }
}

/// A request to change a VM's runtime state.
///
/// This type implements `Debug` by hand so that the websocket connection in
/// `MigrateAsSource` is left out of the output.
pub(crate) enum StateChangeRequest {
    /// Asks the state worker to start a brand-new VM (i.e. not one initialized
    /// by live migration, which implicitly starts the VM).
    Start,

    /// Asks the state worker to start a migration-source task.
    ///
    /// `migration_id` identifies the migration, and `websock` wraps the
    /// websocket connection to the migration target.
    MigrateAsSource { migration_id: Uuid, websock: WebsocketConnection },

    /// Resets the guest by pausing all devices, resetting them to their
    /// cold-boot states, and resuming the devices. Note that this is not a
    /// graceful reboot and does not coordinate with guest software.
    Reboot,

    /// Halts the VM. Note that this is not a graceful shutdown and does not
    /// coordinate with guest software.
    Stop,
}

impl std::fmt::Debug for StateChangeRequest {
    /// Formats the request by variant name. For `MigrateAsSource`, only the
    /// migration ID is shown; the websocket connection is omitted.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let unit_variant = match self {
            Self::MigrateAsSource { migration_id, .. } => {
                return f
                    .debug_struct("MigrateAsSource")
                    .field("migration_id", migration_id)
                    .finish();
            }
            Self::Start => "Start",
            Self::Reboot => "Reboot",
            Self::Stop => "Stop",
        };

        f.write_str(unit_variant)
    }
}

/// A request to reconfigure a VM's components.
///
/// NOTE: Successfully queuing a component change request does not guarantee
/// that the request will be processed, because it may be preempted by a VM
/// state change. If this happens, the request will fail and notify the
/// submitter using whatever channel is appropriate for the request's type.
///
/// This type implements `Debug` by hand; only the backend ID is included in
/// the output.
pub enum ComponentChangeRequest {
    /// Attempts to update the volume construction request for the supplied
    /// Crucible volume.
    ReconfigureCrucibleVolume {
        /// The ID of the Crucible backend in the VM's Crucible backend map.
        backend_id: SpecKey,

        /// The new volume construction request to supply to the Crucible
        /// upstairs.
        new_vcr_json: String,

        /// The sink for the result of this operation.
        result_tx: super::CrucibleReplaceResultTx,
    },
}

impl std::fmt::Debug for ComponentChangeRequest {
    /// Formats the request by variant name, showing only the backend ID; the
    /// other fields are omitted.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::ReconfigureCrucibleVolume { backend_id, .. } => {
                let mut out = f.debug_struct("ReconfigureCrucibleVolume");
                out.field("backend_id", backend_id);
                out.finish()
            }
        }
    }
}

/// An external request made of a VM controller via the server API. Handled by
/// the controller's state driver thread.
///
/// The two variants are kept on separate queues by [`ExternalRequestQueue`].
#[derive(Debug)]
pub(crate) enum ExternalRequest {
    /// A request to change the VM's runtime state.
    State(StateChangeRequest),

    /// A request to reconfigure one of the VM's components.
    Component(ComponentChangeRequest),
}

impl ExternalRequest {
    /// Constructs a VM start request.
    pub const fn start() -> Self {
        Self::State(StateChangeRequest::Start)
    }

    /// Constructs a VM stop request.
    pub const fn stop() -> Self {
        Self::State(StateChangeRequest::Stop)
    }

    /// Constructs a VM reboot request.
    pub const fn reboot() -> Self {
        Self::State(StateChangeRequest::Reboot)
    }

    /// Constructs a request to migrate a VM to another Propolis instance, using
    /// `ws_conn` as the websocket connection to the migration target.
    pub fn migrate_as_source(
        migration_id: Uuid,
        ws_conn: dropshot::WebsocketConnection,
    ) -> Self {
        Self::State(StateChangeRequest::MigrateAsSource {
            migration_id,
            websock: WebsocketConnection(Some(ws_conn)),
        })
    }

    /// Constructs a request to update a Crucible volume's construction request.
    /// The result of this request will be sent to the supplied `result_tx`.
    pub fn reconfigure_crucible_volume(
        backend_id: SpecKey,
        new_vcr_json: String,
        result_tx: super::CrucibleReplaceResultTx,
    ) -> Self {
        Self::Component(ComponentChangeRequest::ReconfigureCrucibleVolume {
            backend_id,
            new_vcr_json,
            result_tx,
        })
    }

    fn is_stop(&self) -> bool {
        matches!(self, Self::State(StateChangeRequest::Stop))
    }
}

/// A set of reasons why a request to queue an external state transition can
/// fail.
///
/// The `#[error]` string on each variant is the message logged when a request
/// is denied for that reason.
#[derive(Copy, Clone, Debug, Error)]
pub(crate) enum RequestDeniedReason {
    /// The request (migrate out or reboot) needs a VM that has at least begun
    /// starting, but no start has been requested yet.
    #[error("Operation requires an active instance")]
    InstanceNotActive,

    /// A reboot was requested while the VM's start is still pending.
    #[error("Instance is currently starting")]
    StartInProgress,

    /// A migration out was requested while an earlier migration-out request
    /// is still outstanding.
    #[error("Instance is already a migration source")]
    AlreadyMigrationSource,

    /// A reboot was requested while a migration-out request is outstanding.
    #[error("Operation cannot be performed on a migration source")]
    InvalidForMigrationSource,

    /// The request arrived while a stop request is outstanding.
    #[error("Instance is preparing to stop")]
    HaltPending,

    /// The VM has already migrated out.
    #[error("Instance has migrated out and is being torn down")]
    MigratedOut,

    /// The VM has already stopped.
    #[error("Instance has already halted")]
    Halted,

    /// The VM is in the failed state.
    #[error("Instance failed to start or halted due to a failure")]
    InstanceFailed,
}

/// A kind of request that can be popped from the queue and then completed.
///
/// The state driver passes one of these to
/// [`ExternalRequestQueue::notify_request_completed`] so the queue can update
/// its view of the VM.
#[derive(Copy, Clone, Debug)]
pub(super) enum CompletedRequest {
    /// A start attempt finished. On success the queue considers the VM
    /// running; otherwise it considers the VM failed.
    Start { succeeded: bool },

    /// A reboot finished.
    Reboot,

    /// A migration-out attempt finished. On success the queue considers the
    /// VM migrated out; otherwise the queue's state is left unchanged.
    MigrationOut { succeeded: bool },

    /// A stop finished; the queue considers the VM stopped.
    Stop,
}

/// The queue's internal notion of the VM's runtime state.
///
/// `MigratedOut`, `Stopped`, and `Failed` are terminal: once the queue is in
/// one of them, new requests are denied (or, for stop requests, ignored).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum QueueState {
    /// The instance has not started yet and no one has asked it to start.
    NotStarted,

    /// The instance is not running yet, but it's on its way there: either the
    /// state driver is actively trying to start it, or there is a request that,
    /// when processed, will direct the driver to start the instance.
    StartPending,

    /// The instance has successfully started.
    Running,

    /// The instance has stopped due to a migration out.
    MigratedOut,

    /// The instance has shut down.
    Stopped,

    /// The instance failed to start.
    Failed,
}

impl QueueState {
    /// Returns the [`RequestDeniedReason`] to use if `self` is a state in
    /// which every new change request must be turned away, or `None` if
    /// requests still need to be considered individually.
    fn deny_reason(&self) -> Option<RequestDeniedReason> {
        match self {
            // The VM may still accept requests in these states.
            Self::NotStarted | Self::StartPending | Self::Running => None,

            // Terminal states: nothing further can be done with this VM.
            Self::MigratedOut => Some(RequestDeniedReason::MigratedOut),
            Self::Stopped => Some(RequestDeniedReason::Halted),
            Self::Failed => Some(RequestDeniedReason::InstanceFailed),
        }
    }
}

/// A queue of external requests awaiting processing by the state driver.
///
/// State change and component change requests are held in separate queues.
/// Alongside them, the queue tracks the VM's effective state and which kinds
/// of state change are still outstanding, and uses that information to decide
/// whether to accept, ignore, or deny each new request.
#[derive(Debug)]
pub(super) struct ExternalRequestQueue {
    /// The queue of unprocessed state change requests.
    state_queue: VecDeque<StateChangeRequest>,

    /// The queue of unprocessed component change requests.
    component_queue: VecDeque<ComponentChangeRequest>,

    /// The "effective" (for purposes of deciding how to dispose of requests)
    /// state of the instance associated with this queue.
    state: QueueState,

    /// True if this queue has enqueued a reboot request that has not been
    /// completed by the state driver.
    awaiting_reboot: bool,

    /// True if this queue has enqueued a request to migrate out that has not
    /// been completed by the state driver.
    awaiting_migration_out: bool,

    /// True if this queue has enqueued a stop request that has not been
    /// completed by the state driver.
    awaiting_stop: bool,

    /// The queue's logger.
    log: Logger,
}

/// Indicates whether this queue's creator will start the relevant instance
/// without waiting for a Start request from the queue.
pub(super) enum InstanceAutoStart {
    /// The creator starts the instance itself, so a new queue begins with a
    /// start already pending.
    Yes,

    /// The instance starts only in response to a queued start request, so a
    /// new queue begins in the not-started state.
    No,
}

impl ExternalRequestQueue {
    /// Creates a new queue that logs to the supplied logger.
    pub fn new(log: Logger, auto_start: InstanceAutoStart) -> Self {
        let instance_state = match auto_start {
            InstanceAutoStart::Yes => QueueState::StartPending,
            InstanceAutoStart::No => QueueState::NotStarted,
        };

        Self {
            state_queue: Default::default(),
            component_queue: Default::default(),

            state: instance_state,
            awaiting_reboot: false,
            awaiting_migration_out: false,
            awaiting_stop: false,
            log,
        }
    }

    /// Removes and returns the next request, or `None` if the queue is empty.
    ///
    /// State change requests take priority: a component change request is
    /// returned only when no state change request is waiting, even if the
    /// component change request was queued earlier.
    pub fn pop_front(&mut self) -> Option<ExternalRequest> {
        self.state_queue.pop_front().map(ExternalRequest::State).or_else(|| {
            self.component_queue.pop_front().map(ExternalRequest::Component)
        })
    }

    /// Returns `true` if neither the state change queue nor the component
    /// change queue holds a request.
    #[cfg(test)]
    pub fn is_empty(&self) -> bool {
        let no_state_changes = self.state_queue.is_empty();
        let no_component_changes = self.component_queue.is_empty();
        no_state_changes && no_component_changes
    }

    /// Attempts to place the supplied `request` on the queue.
    ///
    /// Returns `Ok` if the request was accepted. This includes requests that
    /// were ignored because they are redundant; those are not added to the
    /// queue. Returns an `Err` containing a [`RequestDeniedReason`] if the
    /// request was rejected.
    ///
    /// The disposition of every request is logged.
    pub fn try_queue(
        &mut self,
        request: ExternalRequest,
    ) -> Result<(), RequestDeniedReason> {
        match self.should_queue(&request) {
            Err(reason) => {
                info!(
                    &self.log,
                    "denied external request";
                    "request" => ?request,
                    "reason" => %reason
                );
                return Err(reason);
            }
            Ok(false) => {
                info!(
                    &self.log,
                    "ignored external request";
                    "request" => ?request
                );

                return Ok(());
            }
            Ok(true) => {
                info!(
                    &self.log,
                    "enqueued external request";
                    "request" => ?request
                );
            }
        }

        match request {
            ExternalRequest::State(change) => {
                // Record that this kind of state change is now outstanding
                // before handing the request to the state change queue. The
                // assertions check invariants that `should_queue` is expected
                // to have enforced already.
                match &change {
                    StateChangeRequest::Start => {
                        assert_eq!(self.state, QueueState::NotStarted);
                        self.state = QueueState::StartPending;
                    }
                    StateChangeRequest::MigrateAsSource { .. } => {
                        assert!(!self.awaiting_migration_out);
                        self.awaiting_migration_out = true;
                    }
                    StateChangeRequest::Reboot => {
                        assert!(!self.awaiting_reboot);
                        self.awaiting_reboot = true;
                    }
                    StateChangeRequest::Stop => {
                        assert!(!self.awaiting_stop);
                        self.awaiting_stop = true;
                    }
                }

                self.state_queue.push_back(change);
            }

            // Component changes need no extra bookkeeping.
            ExternalRequest::Component(change) => {
                self.component_queue.push_back(change);
            }
        }

        Ok(())
    }

    /// Decides how the supplied `request` should be disposed of, without
    /// modifying the queue. Returns:
    ///
    /// - `Ok(true)` if the request should be enqueued,
    /// - `Ok(false)` if the request should be ignored (it is redundant, so
    ///   the caller reports success without queuing anything), and
    /// - `Err(reason)` if the request must be denied.
    fn should_queue(
        &self,
        request: &ExternalRequest,
    ) -> Result<bool, RequestDeniedReason> {
        // If the queue is in a terminal state, deny the request straightaway
        // (unless it's a stop request, which can be ignored for idempotency).
        if let Some(reason) = self.state.deny_reason() {
            return if request.is_stop() { Ok(false) } else { Err(reason) };
        }

        // The instance hasn't stopped yet, so consider this request in light
        // of its current state and the other as-yet unprocessed requests from
        // the queue.
        //
        // In general, try to make state change requests idempotent by
        // ignoring new requests when a request of the appropriate kind is
        // already on the queue, and deny requests to reach a state that is
        // precluded by an earlier state change request.
        match request {
            // Interpret start requests as requests to reach the Running
            // state.
            ExternalRequest::State(StateChangeRequest::Start) => {
                if self.awaiting_stop {
                    Err(RequestDeniedReason::HaltPending)
                } else if self.state != QueueState::NotStarted {
                    Ok(false)
                } else {
                    Ok(true)
                }
            }

            // Only allow one attempt to migrate out at a time (if it works
            // the VM can't migrate out again), and only allow migration out
            // after an instance begins to run.
            ExternalRequest::State(StateChangeRequest::MigrateAsSource {
                ..
            }) => {
                if self.awaiting_migration_out {
                    Err(RequestDeniedReason::AlreadyMigrationSource)
                } else if self.awaiting_stop {
                    Err(RequestDeniedReason::HaltPending)
                } else if self.state == QueueState::NotStarted {
                    Err(RequestDeniedReason::InstanceNotActive)
                } else {
                    Ok(true)
                }
            }

            // Treat reboot requests as a request to take a VM that has
            // already started, reset its state, and resume the VM. If the VM
            // migrates out first, this request needs to be directed to the
            // target, so reject it here to allow the caller to wait for the
            // migration to resolve.
            ExternalRequest::State(StateChangeRequest::Reboot) => {
                if self.awaiting_migration_out {
                    Err(RequestDeniedReason::InvalidForMigrationSource)
                } else if self.awaiting_stop {
                    Err(RequestDeniedReason::HaltPending)
                } else if self.state == QueueState::NotStarted {
                    Err(RequestDeniedReason::InstanceNotActive)
                } else if self.state == QueueState::StartPending {
                    Err(RequestDeniedReason::StartInProgress)
                } else if self.awaiting_reboot {
                    Ok(false)
                } else {
                    Ok(true)
                }
            }

            // Always queue requests to stop a VM unless one is already
            // present.
            //
            // Note that if the VM migrates out before this request is
            // processed, then the "logical" VM is still running (in another
            // Propolis). The client is responsible for tracking any
            // outstanding migrations and directing its stop requests
            // accordingly.
            ExternalRequest::State(StateChangeRequest::Stop) => {
                Ok(!self.awaiting_stop)
            }

            // Always enqueue component change requests, even if the VM has a
            // pending request to stop or migrate out. This allows the state
            // driver to process these requests during a state change, which
            // may be necessary to complete that state change. If the change
            // request is canceled by a later state transition, the queue can
            // use the request data to notify the requestor.
            ExternalRequest::Component(
                ComponentChangeRequest::ReconfigureCrucibleVolume { .. },
            ) => Ok(true),
        }
    }

    /// Tells the queue that the caller has finished processing a request it
    /// previously dequeued, so that the queue can update the state it uses to
    /// dispose of future requests.
    ///
    /// # Panics
    ///
    /// Panics if the completion is inconsistent with the queue's bookkeeping
    /// (for example, a reboot completes when no reboot was outstanding).
    pub(super) fn notify_request_completed(&mut self, req: CompletedRequest) {
        info!(
            &self.log,
            "queue notified of request completion";
            "request" => ?req
        );

        match req {
            CompletedRequest::Start { succeeded } => {
                assert_eq!(self.state, QueueState::StartPending);
                self.state = if succeeded {
                    QueueState::Running
                } else {
                    QueueState::Failed
                };
            }
            CompletedRequest::Reboot => {
                assert_eq!(self.state, QueueState::Running);
                assert!(self.awaiting_reboot);
                self.awaiting_reboot = false;
            }
            CompletedRequest::MigrationOut { succeeded } => {
                assert_eq!(self.state, QueueState::Running);
                assert!(self.awaiting_migration_out);
                self.awaiting_migration_out = false;

                // A failed migration leaves the VM running here, so the state
                // only changes on success.
                if succeeded {
                    self.state = QueueState::MigratedOut;
                }
            }
            CompletedRequest::Stop => {
                assert!(self.awaiting_stop);
                self.awaiting_stop = false;
                self.state = QueueState::Stopped;
            }
        }
    }

    /// Notifies this queue that the instance has stopped. This routine is meant
    /// to be used in cases where an instance stops for reasons other than an
    /// external request (e.g., a guest-requested chipset-driven shutdown).
    ///
    /// Only the queue's state changes; the flags that track outstanding
    /// requests and the queued requests themselves are left untouched.
    pub(super) fn notify_stopped(&mut self) {
        info!(&self.log, "queue notified that VM has stopped");
        self.state = QueueState::Stopped;
    }
}

// A queue can be dropped while requests are still outstanding: an event from
// the guest may shut the VM down before the queue has been drained. When that
// happens, anyone waiting on a specific queued request must be told that the
// VM is gone.
impl Drop for ExternalRequestQueue {
    fn drop(&mut self) {
        // The state change queue needs no special handling:
        //
        // - Start, reboot, and stop requests are handled asynchronously
        //   (calls to change the instance's state return as soon as they're
        //   queued), so nobody is waiting on them.
        // - Migrate-out requests carry a connection to the migration target;
        //   dropping that connection tells the target the source is gone.
        //
        // Component change requests are different: drain them and tell each
        // requestor that its request has been canceled.
        for canceled in self.component_queue.drain(..) {
            match canceled {
                // Crucible VCR change requestors wait for their requests to be
                // retired.
                ComponentChangeRequest::ReconfigureCrucibleVolume {
                    result_tx,
                    ..
                } => {
                    let message =
                        "VM destroyed before request could be handled"
                            .to_string();
                    let gone =
                        dropshot::HttpError::for_client_error_with_status(
                            Some(message),
                            dropshot::ClientErrorStatusCode::GONE,
                        );

                    // The send result is deliberately discarded.
                    let _ = result_tx.send(Err(gone));
                }
            }
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    use proptest::prelude::*;
    use uuid::Uuid;

    /// Returns a root logger backed by `slog::Discard`, for tests that need
    /// to hand a logger to the queue.
    fn test_logger() -> slog::Logger {
        let drain = slog::Discard;
        slog::Logger::root(drain, slog::o!())
    }

    /// Builds a request to migrate out with a fresh random migration ID and
    /// no underlying websocket connection.
    fn make_migrate_as_source_request() -> ExternalRequest {
        let migration_id = Uuid::new_v4();
        let websock = WebsocketConnection(None);
        let request =
            StateChangeRequest::MigrateAsSource { migration_id, websock };

        ExternalRequest::State(request)
    }

    /// Builds a Crucible volume reconfiguration request with a random backend
    /// ID and an empty VCR. The receiving half of the result channel is
    /// dropped when this function returns, so nothing ever observes the
    /// request's result.
    fn make_reconfigure_crucible_request() -> ExternalRequest {
        let (result_tx, _result_rx) = tokio::sync::oneshot::channel();
        let change = ComponentChangeRequest::ReconfigureCrucibleVolume {
            backend_id: SpecKey::Uuid(Uuid::new_v4()),
            new_vcr_json: String::new(),
            result_tx,
        };

        ExternalRequest::Component(change)
    }

    /// Test-only assertions on the kind of a dequeued request. Each helper is
    /// `#[track_caller]` so that a failure is reported at the call site in
    /// the test rather than inside the helper.
    impl ExternalRequest {
        /// Panics unless this is a request to start the VM.
        #[track_caller]
        fn assert_start(&self) {
            let is_start =
                matches!(self, Self::State(StateChangeRequest::Start));
            assert!(is_start, "expected start request, got {self:?}");
        }

        /// Panics unless `is_stop` reports that this is a stop request.
        #[track_caller]
        fn assert_stop(&self) {
            let is_stop = self.is_stop();
            assert!(is_stop, "expected stop request, got {self:?}");
        }

        /// Panics unless this is a request to reboot the VM.
        #[track_caller]
        fn assert_reboot(&self) {
            let is_reboot =
                matches!(self, Self::State(StateChangeRequest::Reboot));
            assert!(is_reboot, "expected reboot request, got {self:?}");
        }

        /// Panics unless this is a request to migrate out of this VM.
        #[track_caller]
        fn assert_migrate_as_source(&self) {
            let is_migrate_out = matches!(
                self,
                Self::State(StateChangeRequest::MigrateAsSource { .. })
            );
            assert!(
                is_migrate_out,
                "expected migrate as source request, got {self:?}"
            );
        }

        /// Panics unless this is a request to reconfigure a Crucible volume.
        #[track_caller]
        fn assert_reconfigure_crucible(&self) {
            let is_reconfigure = matches!(
                self,
                Self::Component(
                    ComponentChangeRequest::ReconfigureCrucibleVolume { .. }
                )
            );
            assert!(
                is_reconfigure,
                "expected Crucible reconfiguration request, got {self:?}"
            );
        }
    }

    #[test]
    fn start_requests_become_idempotent_after_first_request() {
        let log = test_logger();
        let mut q = ExternalRequestQueue::new(log, InstanceAutoStart::No);

        // Two back-to-back start requests are both accepted, but the second
        // is accepted only for idempotency: exactly one start request ends up
        // on the queue.
        for _ in 0..2 {
            assert!(q.try_queue(ExternalRequest::start()).is_ok());
        }
        q.pop_front().unwrap().assert_start();
        assert!(q.is_empty());

        // Once the instance is reported to have started, further start
        // requests are still accepted and still not queued.
        let started = CompletedRequest::Start { succeeded: true };
        q.notify_request_completed(started);

        assert!(q.try_queue(ExternalRequest::start()).is_ok());
        assert!(q.is_empty());
    }

    #[test]
    fn migrate_as_source_is_not_idempotent() {
        // Simulate a running instance.
        let mut queue =
            ExternalRequestQueue::new(test_logger(), InstanceAutoStart::Yes);

        queue.notify_request_completed(CompletedRequest::Start {
            succeeded: true,
        });

        // Requests to migrate out should be allowed.
        assert!(queue.try_queue(make_migrate_as_source_request()).is_ok());

        // Once the request is queued, other requests to migrate out are
        // disallowed until the queued request is disposed of.
        //
        // This differs from the migration-in case in that requests to migrate
        // in are issued by the sled agent as part of a saga (where idempotency
        // is assumed), but requests to migrate out are issued by the target
        // Propolis (which does not assume idempotency and issues only one
        // request per migration attempt).
        assert!(queue.try_queue(make_migrate_as_source_request()).is_err());

        // If migration fails, the instance resumes running, and then another
        // request to migrate out should be allowed.
        queue.pop_front().unwrap().assert_migrate_as_source();
        queue.notify_request_completed(CompletedRequest::MigrationOut {
            succeeded: false,
        });

        assert!(queue.try_queue(make_migrate_as_source_request()).is_ok());

        // A successful migration stops the instance, which forecloses on future
        // requests to migrate out.
        //
        // As with the first attempt, verify that the request being completed
        // really is the queued migration request instead of silently
        // discarding whatever was at the front of the queue.
        queue.pop_front().unwrap().assert_migrate_as_source();
        queue.notify_request_completed(CompletedRequest::MigrationOut {
            succeeded: true,
        });

        assert!(queue.try_queue(make_migrate_as_source_request()).is_err());
    }

    #[test]
    fn stop_requests_are_idempotent() {
        let log = test_logger();
        let mut q = ExternalRequestQueue::new(log, InstanceAutoStart::Yes);

        let started = CompletedRequest::Start { succeeded: true };
        q.notify_request_completed(started);

        // Both stop requests are accepted, but only one of them is actually
        // placed on the queue.
        for _ in 0..2 {
            assert!(q.try_queue(ExternalRequest::stop()).is_ok());
        }
        q.pop_front().unwrap().assert_stop();
        assert!(q.is_empty());
    }

    #[test]
    fn stop_requests_ignored_after_vm_failure() {
        let log = test_logger();
        let mut q = ExternalRequestQueue::new(log, InstanceAutoStart::Yes);

        // Report that the VM failed to start.
        let start_failed = CompletedRequest::Start { succeeded: false };
        q.notify_request_completed(start_failed);

        // A stop request is still accepted, but nothing is queued for it.
        assert!(q.try_queue(ExternalRequest::stop()).is_ok());
        assert!(q.is_empty());
    }

    #[test]
    fn reboot_requests_are_idempotent_except_when_stopping() {
        let log = test_logger();
        let mut q = ExternalRequestQueue::new(log, InstanceAutoStart::Yes);
        let started = CompletedRequest::Start { succeeded: true };
        q.notify_request_completed(started);

        // With the instance running, every reboot request is accepted, but
        // only the first one is queued; the rest are dropped for idempotency.
        assert!(q.is_empty());
        (0..5).for_each(|_| {
            assert!(q.try_queue(ExternalRequest::reboot()).is_ok());
        });
        q.pop_front().unwrap().assert_reboot();
        assert!(q.is_empty());

        // After the reboot completes, a new reboot request is queued again.
        q.notify_request_completed(CompletedRequest::Reboot);
        assert!(q.try_queue(ExternalRequest::reboot()).is_ok());
        q.pop_front().unwrap().assert_reboot();
        q.notify_request_completed(CompletedRequest::Reboot);

        // Queue a reboot followed by a stop. From that point on, new reboot
        // requests must be rejected, both while the reboot is outstanding and
        // after it has finished.
        assert!(q.try_queue(ExternalRequest::reboot()).is_ok());
        assert!(!q.is_empty());
        assert!(q.try_queue(ExternalRequest::stop()).is_ok());
        assert!(q.try_queue(ExternalRequest::reboot()).is_err());

        q.pop_front().unwrap().assert_reboot();
        q.notify_request_completed(CompletedRequest::Reboot);
        assert!(q.try_queue(ExternalRequest::reboot()).is_err());
    }

    #[test]
    fn mutation_disallowed_after_stopped() {
        let log = test_logger();
        let mut q = ExternalRequestQueue::new(log, InstanceAutoStart::Yes);
        let started = CompletedRequest::Start { succeeded: true };
        q.notify_request_completed(started);

        // Stop the VM, then check that a component change is refused.
        assert!(q.try_queue(ExternalRequest::stop()).is_ok());
        q.notify_request_completed(CompletedRequest::Stop);

        let reconfigure = make_reconfigure_crucible_request();
        assert!(q.try_queue(reconfigure).is_err());
    }

    #[tokio::test]
    async fn vcr_requests_canceled_when_queue_drops() {
        let log = test_logger();
        let mut q = ExternalRequestQueue::new(log, InstanceAutoStart::Yes);

        let started = CompletedRequest::Start { succeeded: true };
        q.notify_request_completed(started);

        // Unlike `make_reconfigure_crucible_request`, keep the receiving half
        // of the result channel so the cancellation can be observed.
        let (result_tx, result_rx) = tokio::sync::oneshot::channel();
        let change = ComponentChangeRequest::ReconfigureCrucibleVolume {
            backend_id: SpecKey::Uuid(Uuid::new_v4()),
            new_vcr_json: String::new(),
            result_tx,
        };

        assert!(q.try_queue(ExternalRequest::Component(change)).is_ok());

        // Dropping the queue with the request still on it must complete the
        // request with a "gone" error.
        drop(q);
        let err = result_rx.await.unwrap().unwrap_err();
        assert_eq!(err.status_code, dropshot::ClientErrorStatusCode::GONE);
    }

    /// A helper for generating requests as part of a property testing strategy.
    /// `proptest` requires values that are the output of a `Strategy` to be
    /// `Clone`, which `ExternalRequest` is not. To get around this, create a
    /// strategy that returns variants of this enum and have a `From` impl that
    /// then creates requests of the appropriate kind.
    ///
    /// The `will_succeed` fields do not affect the request that gets created;
    /// the tests use them to decide which outcome to report back to the queue
    /// when they complete the request.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    enum RequestKind {
        /// A request to start the VM.
        Start { will_succeed: bool },
        /// A request to stop the VM.
        Stop,
        /// A request to reboot the VM.
        Reboot,
        /// A request to migrate out of the VM.
        Migrate { will_succeed: bool },
        /// A request to reconfigure a Crucible volume.
        ReconfigureCrucible,
    }

    /// Converts a generated request kind into a concrete request. The
    /// `will_succeed` payloads are ignored here.
    impl From<RequestKind> for ExternalRequest {
        fn from(value: RequestKind) -> Self {
            match value {
                RequestKind::Start { .. } => Self::start(),
                RequestKind::Stop => Self::stop(),
                RequestKind::Reboot => Self::reboot(),
                RequestKind::Migrate { .. } => make_migrate_as_source_request(),
                RequestKind::ReconfigureCrucible => {
                    make_reconfigure_crucible_request()
                }
            }
        }
    }

    fn request_strategy() -> impl Strategy<Value = RequestKind> {
        prop_oneof![
            Just(RequestKind::Start { will_succeed: true }),
            Just(RequestKind::Start { will_succeed: false }),
            Just(RequestKind::Stop),
            Just(RequestKind::Reboot),
            Just(RequestKind::Migrate { will_succeed: true }),
            Just(RequestKind::Migrate { will_succeed: false }),
            Just(RequestKind::ReconfigureCrucible),
        ]
    }

    proptest! {
        // Tests the behavior of the request queue in circumstances where start
        // requests are queued, but never actually acknowledged.
        //
        // Nothing is ever dequeued or completed in this test, so the flags
        // below track only what has been *requested* so far.
        #[test]
        fn request_queuing_before_start_acknowledged(
            reqs in prop::collection::vec(request_strategy(), 0..100)
        ) {
            let mut queue =
                ExternalRequestQueue::new(test_logger(), InstanceAutoStart::No);

            // True once a start request has been accepted.
            let mut started = false;

            // True once a stop request has been submitted.
            let mut stop_requested = false;

            // True once a request to migrate out has been accepted.
            let mut migrating_out = false;
            for req in reqs {
                let result = queue.try_queue(req.into());
                match req {
                    // Start requests are accepted until a stop is requested.
                    RequestKind::Start { .. } => {
                        if !stop_requested {
                            assert!(result.is_ok());
                            started = true;
                        } else {
                            assert!(result.is_err());
                        }
                    }

                    // Stop requests are never denied.
                    RequestKind::Stop => {
                        assert!(result.is_ok());
                        stop_requested = true;
                    }

                    // The start is never acknowledged, so reboot requests
                    // are always denied.
                    RequestKind::Reboot => {
                        assert!(result.is_err());
                    }

                    // A single migration out is accepted once a start has
                    // been requested, provided no stop has been requested.
                    RequestKind::Migrate { .. } => {
                        if started && !stop_requested && !migrating_out {
                            assert!(result.is_ok());
                            migrating_out = true;
                        } else {
                            assert!(result.is_err());
                        }
                    }

                    // Nothing completes in this test, so the VM never halts
                    // and component changes are always accepted.
                    RequestKind::ReconfigureCrucible => {
                        assert!(result.is_ok());
                    }
                }
            }
        }

        // Tests the behavior of the request queue in circumstances where every
        // request made of the state driver completes immediately.
        #[test]
        fn request_queuing_with_immediate_dequeueing(
            reqs in prop::collection::vec(request_strategy(), 0..100)
        ) {
            let mut queue =
                ExternalRequestQueue::new(test_logger(), InstanceAutoStart::No);

            // True once a start request has been queued.
            let mut start_requested = false;

            // True once the VM reaches a terminal state (stopped, failed,
            // migrated out).
            let mut halted = false;
            for req in reqs {
                let result = queue.try_queue(req.into());
                match req {
                    // Start requests always succeed (though they may be
                    // ignored and not queued) on a non-halted VM.
                    RequestKind::Start { will_succeed } => {
                        if !halted {
                            assert!(result.is_ok());

                            // This request is only enqueued if it is the first
                            // request to start.
                            if !start_requested {
                                start_requested = true;
                                queue.pop_front().unwrap().assert_start();
                                let completed = CompletedRequest::Start {
                                    succeeded: will_succeed
                                };

                                queue.notify_request_completed(completed);

                                // Telling the queue that a start attempt failed
                                // moves the queue to a terminal state.
                                if !will_succeed {
                                    halted = true;
                                }
                            } else {
                                assert!(queue.is_empty());
                            }
                        } else {
                            assert!(result.is_err());
                            assert!(queue.is_empty());
                        }
                    }

                    // Stop requests always succeed (they are never denied), but
                    // they are ignored for VMs that have already halted.
                    RequestKind::Stop => {
                        assert!(result.is_ok());
                        if !halted {
                            queue.pop_front().unwrap().assert_stop();
                            queue.notify_request_completed(
                                CompletedRequest::Stop
                            );

                            halted = true;
                        } else {
                            assert!(queue.is_empty());
                        }
                    }

                    // Reboot requests are always enqueued if the VM is active.
                    // They are ignored if there's a pending migration out, but
                    // in this test there is never a *pending* migration out,
                    // since all requests are dequeued and processed
                    // immediately.
                    RequestKind::Reboot => {
                        if start_requested && !halted {
                            assert!(result.is_ok());
                            queue.pop_front().unwrap().assert_reboot();
                            queue.notify_request_completed(
                                CompletedRequest::Reboot
                            );
                        } else {
                            assert!(result.is_err());
                            assert!(queue.is_empty());
                        }
                    }

                    // Migration requests have the same disposition as reboot
                    // requests.
                    RequestKind::Migrate { will_succeed } => {
                        if start_requested && !halted {
                            assert!(result.is_ok());
                            queue
                                .pop_front()
                                .unwrap()
                                .assert_migrate_as_source();


                            let completed = CompletedRequest::MigrationOut {
                                succeeded: will_succeed
                            };

                            // A successful migration out is a terminal
                            // state; a failed one leaves the VM active.
                            queue.notify_request_completed(completed);
                            if will_succeed {
                                halted = true;
                            }
                        } else {
                            assert!(result.is_err());
                            assert!(queue.is_empty());
                        }
                    }

                    // Crucible reconfiguration requests are always queued for
                    // unhalted VMs.
                    RequestKind::ReconfigureCrucible => {
                        if !halted {
                            assert!(result.is_ok());
                            queue
                                .pop_front()
                                .unwrap()
                                .assert_reconfigure_crucible();
                        } else {
                            assert!(result.is_err());
                            assert!(queue.is_empty());
                        }
                    }
                }
            }
        }
    }

    /// An operation that can be performed during a [`QueueDequeueTest`].
    #[derive(Clone, Copy, Debug)]
    enum QueueOp {
        /// Submit a request of the given kind to the queue.
        Enqueue(RequestKind),
        /// Pop the next request (if any) from the queue and complete it.
        Dequeue,
    }

    fn queue_op_strategy() -> impl Strategy<Value = QueueOp> {
        prop_oneof![
            request_strategy().prop_map(QueueOp::Enqueue),
            Just(QueueOp::Dequeue)
        ]
    }

    /// A helper that queues and dequeues requests in a proptest-generated
    /// order and that sends fake completion notifications back to the request
    /// queue.
    ///
    /// The helper keeps its own model of what the queue should do (the flags
    /// and expected-request queues below) and asserts that the real queue
    /// agrees with that model after every operation.
    struct QueueDequeueTest {
        /// The external request queue under test.
        queue: ExternalRequestQueue,

        /// The set of state change requests that the helper expects to see from
        /// the external queue.
        expected_state: VecDeque<RequestKind>,

        /// The set of component change requests that the helper expects to see
        /// from the external queue.
        expected_component: VecDeque<RequestKind>,

        /// True if the helper has queued a request to start its fake VM.
        start_requested: bool,

        /// True if the helper has successfully started its fake VM.
        started: bool,

        /// True if the helper has queued a request to stop its fake VM.
        stop_requested: bool,

        /// True if the helper has an outstanding request to reboot its fake VM.
        reboot_requested: bool,

        /// True if the helper has an outstanding request to migrate its fake
        /// VM.
        migrate_out_requested: bool,

        /// True if the fake VM is halted (for any reason).
        halted: bool,
    }

    impl QueueDequeueTest {
        /// Creates a helper around a fresh queue (created with
        /// `InstanceAutoStart::No`) with nothing requested and nothing
        /// expected.
        fn new() -> Self {
            Self {
                queue: ExternalRequestQueue::new(
                    test_logger(),
                    InstanceAutoStart::No,
                ),
                expected_state: Default::default(),
                expected_component: Default::default(),
                start_requested: false,
                started: false,
                stop_requested: false,
                reboot_requested: false,
                migrate_out_requested: false,
                halted: false,
            }
        }

        /// Applies `ops` in order. The run ends early as soon as a dequeue
        /// leaves the fake VM halted; any remaining operations are skipped.
        fn run(&mut self, ops: Vec<QueueOp>) {
            for op in ops {
                match op {
                    QueueOp::Enqueue(request) => self.queue_request(request),
                    QueueOp::Dequeue => {
                        self.dequeue_request();
                        if self.halted {
                            return;
                        }
                    }
                }
            }
        }

        /// Submits the supplied `request` to the external request queue,
        /// determines the expected result of that submission based on the
        /// helper's current flags, and asserts that the result matches the
        /// helper's expectation. If the helper expects the request to be
        /// queued, it pushes an entry to its internal expected-change queues.
        fn queue_request(&mut self, request: RequestKind) {
            let result = self.queue.try_queue(request.into());
            match request {
                // Start requests are denied once the VM has halted or a stop
                // has been requested. Otherwise they are accepted, but only
                // the first one is expected to be queued.
                RequestKind::Start { .. } => {
                    if self.halted || self.stop_requested {
                        assert!(result.is_err());
                        return;
                    }

                    assert!(result.is_ok());
                    if !self.start_requested {
                        self.start_requested = true;
                        self.expected_state.push_back(request);
                    }
                }
                // Stop requests are always accepted, but are only expected to
                // be queued the first time, and only if the VM has not halted.
                RequestKind::Stop => {
                    assert!(result.is_ok());
                    if self.halted || self.stop_requested {
                        return;
                    }

                    self.stop_requested = true;
                    self.expected_state.push_back(request);
                }
                // Reboot requests require a VM that has actually started and
                // has no stop or migration out pending. Repeated requests are
                // accepted but collapse into one outstanding reboot.
                RequestKind::Reboot => {
                    if !self.started
                        || self.halted
                        || self.stop_requested
                        || self.migrate_out_requested
                    {
                        assert!(result.is_err());
                        return;
                    }

                    assert!(result.is_ok());
                    if !self.reboot_requested {
                        self.reboot_requested = true;
                        self.expected_state.push_back(request);
                    }
                }
                // Unlike reboots, migrations out are accepted as soon as a
                // start has been requested (not only once the VM has
                // started), but at most one may be outstanding at a time.
                RequestKind::Migrate { .. } => {
                    if (!self.started && !self.start_requested)
                        || self.halted
                        || self.stop_requested
                        || self.migrate_out_requested
                    {
                        assert!(result.is_err());
                        return;
                    }

                    assert!(result.is_ok());
                    self.expected_state.push_back(request);
                    self.migrate_out_requested = true;
                }
                // Component changes are accepted unless the VM has halted.
                RequestKind::ReconfigureCrucible => {
                    if self.halted {
                        assert!(result.is_err());
                        return;
                    }

                    assert!(result.is_ok());
                    self.expected_component.push_back(request);
                }
            }
        }

        /// Pops a request from the helper's external queue and verifies that it
        /// matches the first request on the helper's expected-change queue. If
        /// the requests do match, sends a completion notification to the
        /// external queue.
        ///
        /// Expected state change requests are consumed before expected
        /// component change requests, so the queue under test is expected to
        /// hand out state changes first.
        fn dequeue_request(&mut self) {
            let (dequeued, expected) = match (
                self.queue.pop_front(),
                self.expected_state
                    .pop_front()
                    .or_else(|| self.expected_component.pop_front()),
            ) {
                (None, None) => return,
                (Some(d), None) => {
                    panic!("dequeued request {d:?} but expected nothing")
                }
                (None, Some(e)) => {
                    panic!("expected request {e:?} but dequeued nothing")
                }
                (Some(d), Some(e)) => (d, e),
            };

            // Complete the request with the outcome recorded in the expected
            // entry and update the model's flags to match.
            match (dequeued, expected) {
                (
                    ExternalRequest::State(StateChangeRequest::Start),
                    RequestKind::Start { will_succeed },
                ) => {
                    self.queue.notify_request_completed(
                        CompletedRequest::Start { succeeded: will_succeed },
                    );
                    if will_succeed {
                        self.started = true;
                    } else {
                        self.halted = true;
                    }
                }
                (
                    ExternalRequest::State(StateChangeRequest::Stop),
                    RequestKind::Stop,
                ) => {
                    self.queue.notify_request_completed(CompletedRequest::Stop);
                    self.halted = true;
                }
                (
                    ExternalRequest::State(StateChangeRequest::Reboot),
                    RequestKind::Reboot,
                ) => {
                    self.queue
                        .notify_request_completed(CompletedRequest::Reboot);
                    self.reboot_requested = false;
                }
                (
                    ExternalRequest::State(
                        StateChangeRequest::MigrateAsSource { .. },
                    ),
                    RequestKind::Migrate { will_succeed },
                ) => {
                    self.queue.notify_request_completed(
                        CompletedRequest::MigrationOut {
                            succeeded: will_succeed,
                        },
                    );
                    self.migrate_out_requested = false;
                    if will_succeed {
                        self.halted = true;
                    }
                }
                // Component changes have no completion notification.
                (
                    ExternalRequest::Component(
                        ComponentChangeRequest::ReconfigureCrucibleVolume {
                            ..
                        },
                    ),
                    RequestKind::ReconfigureCrucible,
                ) => {}
                // Any other pairing means the queue handed out a request
                // other than the one the model expected next.
                (d, e) => panic!(
                    "dequeued request {d:?} but expected to dequeue {e:?}\n\
                    remaining queue: {:#?}\n\
                    remaining expected (state): {:#?}\n\
                    remaining expected (components): {:#?}",
                    self.queue, self.expected_state, self.expected_component
                ),
            }
        }
    }

    proptest! {
        // Drives a `QueueDequeueTest` with an arbitrary interleaving of
        // enqueue and dequeue operations; the helper asserts at each step
        // that the queue behaves as its model predicts.
        #[test]
        fn request_queue_dequeue(
            ops in prop::collection::vec(queue_op_strategy(), 0..100)
        ) {
            QueueDequeueTest::new().run(ops);
        }
    }
}


================================================
FILE: bin/propolis-server/src/lib/vm/services.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Services visible to consumers outside this Propolis that depend on
//! functionality supplied by an extant VM.

use std::sync::Arc;

use oximeter::types::ProducerRegistry;
use propolis_api_types::instance::InstanceProperties;
use slog::{error, info, Logger};

use crate::{
    serial::SerialTaskControlMessage,
    server::MetricsEndpointConfig,
    spec::Spec,
    stats::{ServerStats, VirtualMachine},
    vnc::VncServer,
};

use super::objects::{VmObjects, VmObjectsShared};

/// Information used to serve Oximeter metrics.
///
/// The `Default` value has neither a server nor a stats object; it is what a
/// VM gets when no metrics configuration is supplied.
#[derive(Default)]
pub(crate) struct OximeterState {
    /// The Oximeter server to which Oximeter clients connect to query for
    /// metrics.
    ///
    /// `None` if no server was created, or once the VM's services have been
    /// stopped (stopping takes the server out of this field to close it).
    server: Option<oximeter_producer::Server>,

    /// The statistics object used by the API layer to record its metrics.
    pub stats: Option<crate::stats::ServerStats>,
}

/// A collection of services visible to consumers outside this Propolis that
/// depend on the functionality supplied by an extant VM.
pub(crate) struct VmServices {
    /// A VM's serial console handler task.
    ///
    /// This is `Some` from construction until the services are stopped, at
    /// which point the task is taken out of the mutex and awaited.
    pub serial_task: tokio::sync::Mutex<Option<crate::serial::SerialTask>>,

    /// A VM's Oximeter state.
    ///
    /// This mostly contains the actual producer server, though the
    /// "server-level stats" are also included here.
    pub oximeter: tokio::sync::Mutex<OximeterState>,

    /// A reference to the VM's host process's VNC server.
    pub vnc_server: Arc<VncServer>,
}

impl VmServices {
    /// Starts a new set of VM services using the supplied VM objects and server
    /// configuration.
    pub(super) async fn new(
        log: &slog::Logger,
        vm_objects: &VmObjects,
        vm_properties: &InstanceProperties,
        ensure_options: &super::EnsureOptions,
    ) -> Self {
        let vm_objects = vm_objects.lock_shared().await;
        let oximeter_state = if let Some(cfg) = &ensure_options.metrics_config {
            let registry = ensure_options.oximeter_registry.as_ref().expect(
                "should have a producer registry if metrics are configured",
            );
            register_oximeter_producer(
                log,
                cfg,
                registry,
                vm_objects.instance_spec(),
                vm_properties,
            )
            .await
        } else {
            OximeterState::default()
        };

        let vnc_server = ensure_options.vnc_server.clone();
        if let Some(ramfb) = vm_objects.framebuffer() {
            vnc_server.attach(vm_objects.ps2ctrl().clone(), ramfb.clone());
        }

        let serial_task = start_serial_task(log, &vm_objects).await;

        Self {
            serial_task: tokio::sync::Mutex::new(Some(serial_task)),
            oximeter: tokio::sync::Mutex::new(oximeter_state),
            vnc_server,
        }
    }

    /// Directs all the services in this service block to stop.
    pub(super) async fn stop(&self, log: &Logger) {
        self.vnc_server.stop().await;

        if let Some(serial_task) = self.serial_task.lock().await.take() {
            let _ = serial_task
                .control_ch
                .send(SerialTaskControlMessage::Stopping)
                .await;
            let _ = serial_task.task.await;
        }

        let mut oximeter_state = self.oximeter.lock().await;
        if let Some(server) = oximeter_state.server.take() {
            if let Err(e) = server.close().await {
                error!(log, "failed to close oximeter producer server";
                       "error" => ?e);
            }
        }

        let _ = oximeter_state.stats.take();
    }
}

/// Creates an Oximeter producer and registers it with Oximeter, which will call
/// back into the server to gather the producer's metrics.
///
/// Failures are logged rather than returned: if the producer server cannot be
/// constructed, the returned state's `server` is `None`, and if the server
/// stats cannot be registered with `registry`, its `stats` is `None`.
///
/// On success the returned [`OximeterState`] retains both the producer server
/// and the handle to the registered stats so that they can be released when
/// the VM's services are stopped.
async fn register_oximeter_producer(
    log: &slog::Logger,
    cfg: &MetricsEndpointConfig,
    registry: &ProducerRegistry,
    spec: &Spec,
    vm_properties: &InstanceProperties,
) -> OximeterState {
    let mut oximeter_state = OximeterState::default();
    let virtual_machine = VirtualMachine::new(spec.board.cpus, vm_properties);

    // Create the server itself.
    //
    // The server manages all details of the registration with Nexus, so we
    // don't need our own task for that or way to shut it down.
    oximeter_state.server = match crate::stats::start_oximeter_server(
        virtual_machine.target.instance_id,
        cfg,
        log,
        registry,
    ) {
        Ok(server) => {
            info!(log, "created metric producer server");
            Some(server)
        }
        Err(err) => {
            error!(
                log,
                "failed to construct metric producer server, \
                no metrics will be available for this instance.";
                "error" => ?err,
            );
            None
        }
    };

    // Assign our own metrics production for this VM instance to the
    // registry, letting the server actually return them to oximeter when
    // polled.
    //
    // The registry gets a clone; the original handle is stored in the
    // returned state only if registration succeeds, so that a `Some` value
    // always refers to stats that are actually being produced.
    let stats = ServerStats::new(virtual_machine);
    match registry.register_producer(stats.clone()) {
        Ok(_) => {
            oximeter_state.stats = Some(stats);
        }
        Err(e) => {
            error!(
                log,
                "failed to register our server metrics with \
                the ProducerRegistry, no server stats will \
                be produced";
                "error" => ?e,
            );
        }
    }

    oximeter_state
}

/// Launches a serial console handler task.
async fn start_serial_task(
    log: &slog::Logger,
    vm_objects: &VmObjectsShared<'_>,
) -> crate::serial::SerialTask {
    let (websocks_ch, websocks_recv) = tokio::sync::mpsc::channel(1);
    let (control_ch, control_recv) = tokio::sync::mpsc::channel(1);

    let serial = vm_objects.com1().clone();
    serial.set_task_control_sender(control_ch.clone()).await;
    let err_log = log.new(slog::o!("component" => "serial task"));
    let task = tokio::spawn(async move {
        if let Err(e) = crate::serial::instance_serial_task(
            websocks_recv,
            control_recv,
            serial,
            err_log.clone(),
        )
        .await
        {
            error!(err_log, "Failure in serial task: {}", e);
        }
    });

    crate::serial::SerialTask { task, control_ch, websocks_ch }
}


================================================
FILE: bin/propolis-server/src/lib/vm/state_driver.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Structures and tasks that handle VM state and configuration change requests.
//!
//! This module handles the second high-level phase of a VM's lifecycle: once a
//! VM's components and services exist, it enters an event loop that changes the
//! VM's state in response to external API requests and signals arriving from
//! within the VM. See the [`ensure`] module for more information about the
//! initialization phase.
//!
//! This module's main struct is the [`StateDriver`], which holds references to
//! an active VM's components, the VM's event queues, and the sender side of a
//! channel that publishes instance state updates. External API requests are
//! routed to the driver's event queue and handled by the driver task. This
//! model ensures that only one task handles VM events and updates VM state; the
//! idea is to minimize the number of different tasks and threads one has to
//! consider when reasoning about concurrency in the VM state machine.
//!
//! On a migration in, the state driver implicitly starts the VM before entering
//! the main event loop:
//!
//! ```text
//!                   +-----------------------+
//!                   | VM components created |
//!                   +-----------+-----------+
//!                               |
//!                               |
//!            Yes +--------------v--------------+ No
//!              +-+ Initialized via migration?  +-+
//!              | +-----------------------------+ |
//!              |                                 |
//!       +------v--------+                        |
//!       | Auto-start VM |                        |
//!       +------+--------+                        |
//!              |                                 |
//! +------------v------------+                    |
//! | Start devices and vCPUs |                    |
//! +------------+------------+                    |
//!              |                                 |
//!              |                                 |
//!              |    +-----------------------+    |
//!              +----> Enter main event loop <----+
//!                   +-----------------------+
//! ```
//!
//! Once in the main event loop, a VM generally remains active until it receives
//! a signal telling it to do something else:
//!
//! ```text
//! +-----------------+   +-----------------+  error during startup
//! | Not yet started |   | Not yet started |       +--------+
//! | (migrating in)  |   |   (Creating)    +-------> Failed |
//! +-------+---------+   +--------+--------+       +--------+
//!         |                      |
//!         |                      | Successful start request
//!         +-----------+          |
//!                    +v----------v-----------+ API/chipset request
//!          +---------+        Running        +------+
//!          |         +---^-------+--------^--+   +--v--------+
//!          |             |       |        +------+ Rebooting |
//!          |             |       |               +-----------+
//! +--------v------+      |       |
//! | Migrating out +------+       | API/chipset request
//! +--------+------+              |
//!          |                +----v-----+
//!          |                | Stopping |
//!          |                +----+-----+
//!          |                     |
//!          |                     |            +-----------------+
//!          |                +----v-----+      |    Destroyed    |
//!          +----------------> Stopped  +------> (after rundown) |
//!                           +----------+      +-----------------+
//! ```
//!
//! The state driver's [`InputQueue`] receives events that can push a running VM
//! out of its steady "running" state. These can come either from the external
//! API or from events happening in the guest (e.g. a vCPU asserting a pin on
//! the virtual chipset that should reset or halt the VM). The policy that
//! determines what API requests can be accepted in which states is implemented
//! in the [`request_queue`] module.
//!
//! The "stopped" and "failed" states are terminal states. When the state driver
//! reaches one of these states, it exits the event loop, returning its final
//! state to the wrapper function that launched the driver. The wrapper task is
//! responsible for running down the VM objects and structures and resetting the
//! server so that it can start another VM.
//!
//! [`ensure`]: crate::vm::ensure

use std::{
    sync::{Arc, Mutex},
    time::Duration,
};

use anyhow::Context;
use dropshot::HttpError;
use propolis_api_types::instance::InstanceState;
use propolis_api_types::instance_spec::{
    components::backends::CrucibleStorageBackend, SpecKey,
};
use propolis_api_types::migration::MigrationState;
use slog::{error, info};
use tokio::sync::Notify;
use uuid::Uuid;

use crate::{
    migrate::{
        destination::DestinationProtocol, source::SourceProtocol, MigrateRole,
    },
    spec::StorageBackend,
    vm::{state_publisher::ExternalStateUpdate, BlockBackendMap},
};

use super::{
    ensure::{
        VmEnsureActive, VmEnsureActiveOutput, VmEnsureNotStarted,
        VmEnsureRequest,
    },
    guest_event::{self, GuestEvent},
    objects::VmObjects,
    request_queue::{
        self, CompletedRequest, ComponentChangeRequest, ExternalRequest,
        InstanceAutoStart, StateChangeRequest,
    },
    state_publisher::{MigrationStateUpdate, StatePublisher},
    InstanceEnsureResponseTx,
};

/// Tells the state driver what to do after handling an event.
#[derive(Debug, PartialEq, Eq)]
enum HandleEventOutcome {
    /// Keep running the event loop and wait for the next event.
    Continue,
    /// Leave the event loop, returning `final_state` as the state the driver
    /// reports to whoever launched it.
    Exit { final_state: InstanceState },
}

/// A reason for starting a VM.
#[derive(Debug, PartialEq, Eq)]
pub(super) enum VmStartReason {
    /// The VM was initialized via migration in and is being started
    /// automatically by the state driver.
    MigratedIn,
    /// The VM is being started from scratch in response to a start request.
    ExplicitRequest,
}

/// The outcome of a request to start a VM.
enum VmStartOutcome {
    Succeeded,
    Failed,
    Aborted,
}

impl VmStartOutcome {
    /// Maps this outcome to the terminal VM state it implies, if any.
    ///
    /// A successful start yields `None`, meaning the state driver should keep
    /// running the VM. A failed start yields `Some(InstanceState::Failed)` and
    /// an aborted start yields `Some(InstanceState::Destroyed)`; in both cases
    /// the driver should return that state immediately so the VM can be torn
    /// down.
    fn final_vm_state(&self) -> Option<InstanceState> {
        let terminal = match self {
            Self::Succeeded => return None,
            Self::Failed => InstanceState::Failed,
            Self::Aborted => InstanceState::Destroyed,
        };

        Some(terminal)
    }
}

/// A kind of event the state driver can handle.
#[derive(Debug)]
enum InputQueueEvent {
    /// A request taken from the external request sub-queue.
    ExternalRequest(ExternalRequest),
    /// An event taken from the guest event sub-queue.
    GuestEvent(GuestEvent),
}

/// The lock-guarded parts of a state driver's input queue.
struct InputQueueInner {
    /// State change requests from the external API.
    external_requests: request_queue::ExternalRequestQueue,

    /// State change requests from the VM's components. These take precedence
    /// over external state change requests.
    guest_events: super::guest_event::GuestEventQueue,
}

impl InputQueueInner {
    /// Builds the pair of sub-queues: an external request queue constructed
    /// from `log` and `auto_start`, and a default guest event queue.
    fn new(log: slog::Logger, auto_start: InstanceAutoStart) -> Self {
        let external_requests =
            request_queue::ExternalRequestQueue::new(log, auto_start);
        let guest_events = super::guest_event::GuestEventQueue::default();

        Self { external_requests, guest_events }
    }
}

/// A queue for external state change requests and guest-driven state changes.
pub(super) struct InputQueue {
    /// Contains the input queue's sub-queues, one for external state change
    /// requests and one for events emitted by the VM.
    inner: Mutex<InputQueueInner>,

    /// Notifies the state driver that a new event is present on the queue.
    ///
    /// Notifiers must use [`Notify::notify_one`] when signaling this `Notify`
    /// to guarantee the state driver does not miss incoming messages. See the
    /// comments in [`InputQueue::wait_for_next_event`].
    notify: Notify,
}

impl InputQueue {
    /// Creates a new state driver input queue.
    pub(super) fn new(
        log: slog::Logger,
        auto_start: InstanceAutoStart,
    ) -> Self {
        let inner = InputQueueInner::new(log, auto_start);
        Self { inner: Mutex::new(inner), notify: Notify::new() }
    }

    /// Removes and returns the highest-priority pending event, or `None` if
    /// both sub-queues are empty. Guest events are always drained before
    /// external requests are considered.
    ///
    /// The queue lock is acquired and released entirely within this
    /// synchronous call, so no guard is ever held across an await point.
    fn pop_next_event(&self) -> Option<InputQueueEvent> {
        let mut queues = self.inner.lock().unwrap();
        if let Some(guest_event) = queues.guest_events.pop_front() {
            return Some(InputQueueEvent::GuestEvent(guest_event));
        }

        queues
            .external_requests
            .pop_front()
            .map(InputQueueEvent::ExternalRequest)
    }

    /// Waits for a new event to arrive on one of the queue's sub-queues and
    /// returns it for processing.
    ///
    /// The sub-queues, listed here in priority order, are:
    ///
    /// - Guest events: These are signals raised from the VM's vCPUs and
    ///   devices (e.g. a request to reboot or halt the VM arising from a vCPU
    ///   asserting a virtual chipset signal).
    /// - External requests: These are state change requests received via the
    ///   server API. See [`super::request_queue`] for more details about how
    ///   these requests are queued.
    ///
    /// # Synchronization
    ///
    /// This routine assumes that it is only ever called by one task (the state
    /// driver). If multiple tasks call this routine simultaneously, they may
    /// miss wakeups and fail to return when new events are pushed to the
    /// queue.
    async fn wait_for_next_event(&self) -> InputQueueEvent {
        loop {
            if let Some(event) = self.pop_next_event() {
                return event;
            }

            // `Notified::enable` is not needed here: (1) only the state driver
            // calls `wait_for_next_event` on a given input queue, and (2)
            // every signaler uses `notify_one`, which stores a permit when no
            // one is waiting. A notification sent after the queue lock is
            // released but before `notified()` is called therefore satisfies
            // the wait below immediately.
            self.notify.notified().await;
        }
    }

    /// Notifies the external request queue that the state driver has completed
    /// a request from that queue.
    fn notify_request_completed(&self, state: CompletedRequest) {
        let mut queues = self.inner.lock().unwrap();
        queues.external_requests.notify_request_completed(state);
    }

    /// Notifies the external request queue that the instance has stopped. This
    /// is used to stop the queue when the instance stops without a request from
    /// the API (e.g. because the guest requested a chipset-driven shutdown).
    fn notify_stopped(&self) {
        let mut queues = self.inner.lock().unwrap();
        queues.external_requests.notify_stopped();
    }

    /// Submits an external state change request to the queue.
    ///
    /// If the request queue accepts the request, the state driver is signaled
    /// (while the queue lock is still held) and `Ok(())` is returned.
    /// Otherwise the queue's denial reason is returned and no signal is sent.
    pub(super) fn queue_external_request(
        &self,
        request: ExternalRequest,
    ) -> Result<(), request_queue::RequestDeniedReason> {
        let mut queues = self.inner.lock().unwrap();
        match queues.external_requests.try_queue(request) {
            Ok(()) => {
                self.notify.notify_one();
                Ok(())
            }
            Err(reason) => Err(reason),
        }
    }
}

/// Routes vCPU-originated events into the guest event sub-queue.
///
/// Each suspend handler offers an event to the guest event queue and signals
/// the state driver only if `enqueue` returns `true`. The signal is sent while
/// the queue lock is still held.
impl guest_event::VcpuEventHandler for InputQueue {
    fn suspend_halt_event(&self, when: Duration) {
        let event = guest_event::GuestEvent::VcpuSuspendHalt(when);
        let mut queues = self.inner.lock().unwrap();
        let queued = queues.guest_events.enqueue(event);
        if queued {
            self.notify.notify_one();
        }
    }

    fn suspend_reset_event(&self, when: Duration) {
        let event = guest_event::GuestEvent::VcpuSuspendReset(when);
        let mut queues = self.inner.lock().unwrap();
        let queued = queues.guest_events.enqueue(event);
        if queued {
            self.notify.notify_one();
        }
    }

    fn suspend_triple_fault_event(&self, vcpu_id: i32, when: Duration) {
        let event =
            guest_event::GuestEvent::VcpuSuspendTripleFault(vcpu_id, when);
        let mut queues = self.inner.lock().unwrap();
        let queued = queues.guest_events.enqueue(event);
        if queued {
            self.notify.notify_one();
        }
    }

    /// Unhandled VM exits are not queued; they panic immediately.
    fn unhandled_vm_exit(
        &self,
        vcpu_id: i32,
        exit: propolis::exits::VmExitKind,
    ) {
        panic!("vCPU {vcpu_id}: Unhandled VM exit: {exit:?}");
    }

    /// vCPU I/O errors are not queued; they panic immediately.
    fn io_error_event(&self, vcpu_id: i32, error: std::io::Error) {
        panic!("vCPU {vcpu_id}: Unhandled vCPU error: {error}");
    }
}

/// Routes chipset-originated halt and reset signals into the guest event
/// sub-queue, signaling the state driver only if `enqueue` returns `true`.
impl guest_event::ChipsetEventHandler for InputQueue {
    fn chipset_halt(&self) {
        let event = guest_event::GuestEvent::ChipsetHalt;
        let mut queues = self.inner.lock().unwrap();
        let queued = queues.guest_events.enqueue(event);
        if queued {
            self.notify.notify_one();
        }
    }

    fn chipset_reset(&self) {
        let event = guest_event::GuestEvent::ChipsetReset;
        let mut queues = self.inner.lock().unwrap();
        let queued = queues.guest_events.enqueue(event);
        if queued {
            self.notify.notify_one();
        }
    }
}

/// The context for a VM state driver task's main loop.
///
/// A driver is built by `ensure_vm_and_launch_driver` once a VM has been
/// activated; it starts out with `paused` set to `false` and a default
/// `migration_src_state`.
struct StateDriver {
    /// The state driver's associated logger.
    log: slog::Logger,

    /// The VM objects this driver is managing.
    objects: Arc<VmObjects>,

    /// The input queue this driver gets events from.
    input_queue: Arc<InputQueue>,

    /// The channel to which this driver publishes external instance state
    /// changes. Ownership of this publisher is handed back to the caller in
    /// the [`StateDriverOutput`] returned when the driver exits.
    external_state: StatePublisher,

    /// True if the VM is paused.
    paused: bool,

    /// State persisted from previous attempts to migrate out of this VM.
    migration_src_state: crate::migrate::source::PersistentState,
}

/// Contains a state driver's terminal state and the channel it used to publish
/// state updates to the rest of the server. The driver's owner can use these to
/// publish the VM's terminal state after running down all of its objects and
/// services.
///
/// This is also what `ensure_vm_and_launch_driver` returns when the VM could
/// not be activated at all; in that case `final_state` is
/// `InstanceState::Failed`.
pub(super) struct StateDriverOutput {
    /// The channel this driver used to publish external instance state changes.
    pub state_publisher: StatePublisher,

    /// The terminal state of this instance. When the instance completes
    /// rundown, the parent VM publishes this state to the associated channel.
    pub final_state: InstanceState,
}

/// Processes an instance ensure request and, if it yields an active VM, runs a
/// [`StateDriver`] for that VM until the driver exits.
///
/// Activation uses `base_log`; the driver itself (and the activation failure
/// message) uses `log`. The driver runs as a task spawned on the runtime
/// handle produced by activation. When the driver finishes, the parent `vm` is
/// put into rundown before the driver's output is returned.
///
/// This routine does not return a `Result`: if activation fails, the error is
/// logged and the returned [`StateDriverOutput`] carries
/// `InstanceState::Failed`.
///
/// # Panics
///
/// Panics if the spawned state driver task cannot be joined.
pub(super) async fn ensure_vm_and_launch_driver(
    log: slog::Logger,
    base_log: slog::Logger,
    vm: Arc<super::Vm>,
    mut state_publisher: StatePublisher,
    ensure_request: VmEnsureRequest,
    ensure_result_tx: InstanceEnsureResponseTx,
    ensure_options: super::EnsureOptions,
) -> StateDriverOutput {
    let ensure_options = Arc::new(ensure_options);

    // Activate the VM and immediately unpack the pieces the driver needs. On
    // failure, hand the publisher straight back with a Failed final state.
    let VmEnsureActiveOutput { vm_objects, input_queue, vmm_rt_hdl } =
        match ensure_active_vm(
            &base_log,
            &vm,
            &mut state_publisher,
            &ensure_request,
            ensure_result_tx,
            &ensure_options,
        )
        .await
        {
            Ok(active) => active.into_inner(),
            Err(e) => {
                error!(log, "failed to activate new VM"; "error" => #%e);
                return StateDriverOutput {
                    state_publisher,
                    final_state: InstanceState::Failed,
                };
            }
        };

    let driver = StateDriver {
        log,
        objects: vm_objects,
        input_queue,
        external_state: state_publisher,
        paused: false,
        migration_src_state: Default::default(),
    };

    // Run the VM until it exits, then set rundown on the parent VM so that no
    // new external callers can access its objects or services.
    let driver_task = vmm_rt_hdl.spawn(async move {
        let migrated_in = ensure_request.is_migration();
        let driver_output = driver.run(migrated_in).await;
        vm.set_rundown().await;
        driver_output
    });

    match driver_task.await {
        Ok(driver_output) => driver_output,
        Err(e) => panic!("failed to join state driver task: {e}"),
    }
}

/// Turns `ensure_request` into an activated VM whose objects can be moved into
/// a new `StateDriver`.
///
/// A request without migration information creates the VM's objects directly
/// from the request. A request with migration information initiates the
/// destination side of the migration protocol and lets that protocol finish
/// activation.
///
/// # Errors
///
/// Returns an error, annotated with the step that failed, if the migration
/// protocol handler cannot be created, if the migration protocol fails, or if
/// the VM's objects cannot be created.
async fn ensure_active_vm<'a>(
    log: &'a slog::Logger,
    vm: &'a Arc<super::Vm>,
    state_publisher: &'a mut StatePublisher,
    ensure_request: &'a VmEnsureRequest,
    ensure_result_tx: InstanceEnsureResponseTx,
    ensure_options: &'a Arc<super::EnsureOptions>,
) -> anyhow::Result<VmEnsureActive<'a>> {
    let ensure = VmEnsureNotStarted::new(
        log,
        vm,
        ensure_request,
        ensure_options,
        ensure_result_tx,
        state_publisher,
    );

    match ensure_request.migration_info() {
        None => {
            let created = ensure
                .create_objects_from_request()
                .await
                .context("creating VM objects for new instance")?;

            Ok(created.ensure_active().await)
        }
        Some(migrate_request) => {
            let initiated = crate::migrate::destination::initiate(
                log,
                migrate_request,
                ensure_options.local_server_addr,
            )
            .await;

            let migration = match initiated {
                Ok(mig) => mig,
                Err(e) => {
                    let failure = ensure.fail(e.into()).await;
                    return Err(
                        failure.context("creating migration protocol handler")
                    );
                }
            };

            // Delegate the rest of the activation process to the migration
            // protocol. If the migration fails, the callee is responsible for
            // dispatching failure messages to any API clients who are awaiting
            // the results of their instance ensure calls.
            migration
                .run(ensure)
                .await
                .context("running live migration protocol")
        }
    }
}

impl StateDriver {
    /// Runs this state driver to completion and returns its terminal state
    /// along with its state publisher.
    ///
    /// If `migrated_in` is true, the driver first starts the VM itself. Should
    /// that start attempt already determine the VM's final state, the event
    /// loop is skipped entirely; in every other case the final state is
    /// whatever the event loop returns.
    pub(super) async fn run(mut self, migrated_in: bool) -> StateDriverOutput {
        info!(self.log, "state driver launched");

        // A migration target is started automatically. A start attempt that
        // fails or is aborted fixes the final state without any events being
        // processed.
        let state_from_start = if migrated_in {
            let outcome = self.start_vm(VmStartReason::MigratedIn).await;
            outcome.final_vm_state()
        } else {
            None
        };

        let final_state = match state_from_start {
            Some(state) => state,
            None => self.event_loop().await,
        };

        StateDriverOutput { state_publisher: self.external_state, final_state }
    }

    /// Runs the state driver's main event loop.
    ///
    /// Events are taken from the input queue one at a time and dispatched to
    /// the matching handler. The loop ends, returning the VM's final state,
    /// the first time a handler asks the driver to exit.
    async fn event_loop(&mut self) -> InstanceState {
        info!(self.log, "state driver entered main loop");
        loop {
            let event = self.input_queue.wait_for_next_event().await;
            info!(self.log, "state driver handling event"; "event" => ?event);

            let outcome = match event {
                InputQueueEvent::GuestEvent(guest_event) => {
                    self.handle_guest_event(guest_event).await
                }
                InputQueueEvent::ExternalRequest(request) => {
                    self.handle_external_request(request).await
                }
            };

            info!(self.log, "state driver handled event"; "outcome" => ?outcome);
            if let HandleEventOutcome::Exit { final_state } = outcome {
                info!(self.log, "state driver exiting";
                      "final_state" => ?final_state);

                return final_state;
            }
        }
    }

    /// Starts the driver's VM by sending start commands to its devices and
    /// vCPUs.
    ///
    /// While block backends are starting, this routine also services the
    /// input queue: a stop request aborts the start, and requests to
    /// reconfigure a Crucible volume are handled inline.
    ///
    /// Returns [`VmStartOutcome::Succeeded`] once all block backends have
    /// started and the vCPUs have been resumed, [`VmStartOutcome::Failed`] if
    /// a device or block backend fails to start, and
    /// [`VmStartOutcome::Aborted`] if a stop request arrives first.
    ///
    /// # Panics
    ///
    /// Panics if a guest event, or an external start, reboot, or
    /// migrate-as-source request, is dequeued before startup completes.
    async fn start_vm(
        &mut self,
        start_reason: VmStartReason,
    ) -> VmStartOutcome {
        info!(self.log, "starting instance"; "reason" => ?start_reason);

        // Tell listeners that the VM's components are now starting up and not
        // merely being created (but keep the VM in the Migrating state if it's
        // being started pursuant to a migration in).
        if let VmStartReason::ExplicitRequest = start_reason {
            self.external_state
                .update(ExternalStateUpdate::Instance(InstanceState::Starting));
        }

        // The start sequence is arranged so that calls to block backends can be
        // interleaved with processing of requests from the external request
        // queue. This allows Nexus to reconfigure Crucible backends while they
        // are being activated, which can be necessary if the VM's original
        // specification specifies a Crucible downstairs server that is offline
        // or unavailable. (Downstairs instances can disappear at any time,
        // e.g. due to sled failure, so these configurations aren't necessarily
        // client errors.)
        //
        // Before getting into any of that, handle the synchronous portions of
        // VM startup. First, ensure that the kernel VM and all its associated
        // devices are in the correct initial states.
        let objects = self.objects.lock_shared().await;
        match start_reason {
            // If this VM is a migration target, migration will have properly
            // initialized the vCPUs, but will have left the kernel VM paused.
            // Resume it here before asking any in-kernel components to start.
            VmStartReason::MigratedIn => objects.resume_kernel_vm(),

            // If this VM is starting from scratch, its kernel VM is active, but
            // its vCPUs have not been initialized yet.
            VmStartReason::ExplicitRequest => objects.reset_vcpus(),
        }

        // Send synchronous start commands to all devices.
        for (name, dev) in objects.device_map() {
            info!(self.log, "sending start request to {}", name);
            let res = dev.start();
            if let Err(e) = res {
                error!(
                    self.log, "device start() returned an error";
                    "device" => %name,
                    "error" => %e
                );

                return VmStartOutcome::Failed;
            }
        }

        // Next, prepare to start block backends. This is done by capturing the
        // current block backend set and creating a future that issues all the
        // start requests.
        //
        // For this to work, the set of block backends to be started must not
        // change while the VM is starting. This is guaranteed because all such
        // requests to hotplug a block backend will be dispatched to the VM's
        // request queue; if any such requests are seen below, they can simply
        // be buffered and handled after the rest of the VM has started.
        //
        // The helper below starts the backends one at a time, in the map's
        // iteration order, and stops at the first backend that fails.
        async fn start_block_backends(
            log: slog::Logger,
            backends: BlockBackendMap,
        ) -> anyhow::Result<()> {
            for (name, backend) in backends {
                info!(log, "starting block backend {}", name);
                let res = backend.start().await;
                if let Err(e) = &res {
                    error!(
                        log,
                        "block backend start() returned an error";
                        "backend" => %name,
                        "error" => %e
                    );

                    return res;
                }
            }

            Ok(())
        }

        // The future is created here but does no work until it is polled by
        // the `select!` below. It is pinned so that it can be polled by
        // mutable reference across multiple loop iterations.
        let block_backends = objects.block_backend_map().clone();
        let block_backend_fut =
            start_block_backends(self.log.clone(), block_backends);
        tokio::pin!(block_backend_fut);

        // Drop the VM object lock before proceeding to allow other API calls
        // that simply read the VM to make progress. Again, note that the set of
        // objects being started still can't change, not because the lock is
        // held, but because the only entity that can change them is the current
        // task, which can decide whether and how to buffer incoming requests.
        drop(objects);

        // Wait for the block backends to finish starting while servicing
        // external requests that arrive in the meantime. Each pass through the
        // loop below either resolves the startup future (which ends the start
        // attempt) or handles exactly one dequeued event.
        enum Selection {
            BackendFuture(anyhow::Result<()>),
            Event(InputQueueEvent),
        }
        loop {
            let selection = tokio::select! {
                // If the VM successfully starts, return immediately and let
                // the caller process any events that may happen to be on the
                // queue.
                biased;

                res = &mut block_backend_fut => {
                    Selection::BackendFuture(res)
                }

                event = self.input_queue.wait_for_next_event() => {
                    Selection::Event(event)
                }
            };

            let req: ExternalRequest = match selection {
                // All backends started: release the vCPUs, publish the
                // Running state, and tell the request queue that the start
                // request succeeded.
                Selection::BackendFuture(Ok(())) => {
                    let objects = &self.objects;
                    objects.lock_exclusive().await.resume_vcpus();
                    self.external_state.update(ExternalStateUpdate::Instance(
                        InstanceState::Running,
                    ));

                    self.input_queue.notify_request_completed(
                        CompletedRequest::Start { succeeded: true },
                    );

                    info!(&self.log, "VM successfully started");
                    return VmStartOutcome::Succeeded;
                }

                // A backend failed to start: tell the request queue that the
                // start request failed and let the caller tear the VM down.
                Selection::BackendFuture(Err(e)) => {
                    info!(&self.log, "VM startup failed: {e}");
                    self.input_queue.notify_request_completed(
                        CompletedRequest::Start { succeeded: false },
                    );

                    return VmStartOutcome::Failed;
                }

                // The VM's vCPUs only start when the block backend startup
                // future resolves and is selected above. If control reached
                // that point, that branch wasn't selected, so the vCPUs should
                // still be paused, which means the dequeued event should not be
                // a guest event.
                Selection::Event(InputQueueEvent::GuestEvent(_)) => {
                    unreachable!("can't get guest events before vCPUs start")
                }

                Selection::Event(InputQueueEvent::ExternalRequest(req)) => req,
            };

            match req {
                ExternalRequest::State(StateChangeRequest::Stop) => {
                    info!(
                        &self.log,
                        "got request to stop while still starting"
                    );

                    // Don't send any pause/halt notifications here, since
                    // (depending on what async work was in flight when this
                    // notification was received) there may be a
                    // partially-started component that is not prepared to be
                    // paused and halted. Instead, simply move the VM to
                    // Stopped, return an "aborted" status, and let the caller
                    // arrange to drop all the VM's components. (Note that no
                    // vCPUs have started yet, so no guest work is in flight at
                    // this point.)
                    self.external_state.update(ExternalStateUpdate::Instance(
                        InstanceState::Stopped,
                    ));

                    self.input_queue.notify_stopped();
                    return VmStartOutcome::Aborted;
                }
                ExternalRequest::Component(
                    ComponentChangeRequest::ReconfigureCrucibleVolume {
                        backend_id,
                        new_vcr_json,
                        result_tx,
                    },
                ) => {
                    // The API caller who requested this operation can hang up
                    // and drop the receiver. This isn't fatal; just keep
                    // starting the VM if it happens.
                    let _ = result_tx.send(
                        self.reconfigure_crucible_volume(
                            &backend_id,
                            new_vcr_json,
                        )
                        .await,
                    );
                }
                // The request queue is expected to reject (or at least silently
                // ignore) requests to migrate or reboot an instance that hasn't
                // reported that it's fully started. Similarly, requests to
                // start a VM that's already starting are expected to be ignored
                // for idempotency.
                r @ ExternalRequest::State(StateChangeRequest::Start)
                | r @ ExternalRequest::State(
                    StateChangeRequest::MigrateAsSource { .. },
                )
                | r @ ExternalRequest::State(StateChangeRequest::Reboot) => {
                    unreachable!(
                        "external request {r:?} shouldn't be queued while \
                        starting"
                    );
                }
            }
        }
    }

    /// Reacts to an event raised by the guest.
    ///
    /// Halt-type events halt the VM, publish the `Stopped` state, tell the
    /// input queue that the VM has stopped, and ask the driver to exit with a
    /// final state of `Destroyed`. Reset-type events (including triple faults)
    /// reboot the VM and let the driver continue.
    async fn handle_guest_event(
        &mut self,
        event: GuestEvent,
    ) -> HandleEventOutcome {
        // Log why the event is being acted on and classify it; all halts
        // (and all resets) are handled identically after logging.
        let halt_requested = match event {
            GuestEvent::VcpuSuspendHalt(_when) => {
                info!(self.log, "Halting due to VM suspend event",);
                true
            }
            GuestEvent::ChipsetHalt => {
                info!(self.log, "Halting due to chipset-driven halt");
                true
            }
            GuestEvent::VcpuSuspendReset(_when) => {
                info!(self.log, "Resetting due to VM suspend event");
                false
            }
            GuestEvent::VcpuSuspendTripleFault(vcpu_id, _when) => {
                info!(
                    self.log,
                    "Resetting due to triple fault on vCPU {}", vcpu_id
                );
                false
            }
            GuestEvent::ChipsetReset => {
                info!(self.log, "Resetting due to chipset-driven reset");
                false
            }
        };

        if !halt_requested {
            self.do_reboot().await;
            return HandleEventOutcome::Continue;
        }

        self.do_halt().await;
        let stopped = ExternalStateUpdate::Instance(InstanceState::Stopped);
        self.external_state.update(stopped);
        self.input_queue.notify_stopped();

        HandleEventOutcome::Exit { final_state: InstanceState::Destroyed }
    }

    /// Services one request taken from the external request queue and reports
    /// whether the driver should keep running or exit.
    ///
    /// - `Start`: starts the VM; exits only if the start attempt produced a
    ///   terminal VM state.
    /// - `MigrateAsSource`: attempts a migration out; on success the VM is
    ///   halted and the driver exits with `Destroyed`, otherwise the driver
    ///   continues.
    /// - `Reboot`: reboots the VM and reports completion to the queue.
    /// - `Stop`: halts the VM, publishes `Stopped`, reports completion to the
    ///   queue, and exits with `Destroyed`.
    /// - `ReconfigureCrucibleVolume`: performs the replacement and sends the
    ///   result back to the requester.
    async fn handle_external_request(
        &mut self,
        request: ExternalRequest,
    ) -> HandleEventOutcome {
        match request {
            ExternalRequest::State(change) => match change {
                StateChangeRequest::Start => {
                    // A terminal state from this start attempt is handed to
                    // the driver along with an instruction to exit.
                    let terminal = self
                        .start_vm(VmStartReason::ExplicitRequest)
                        .await
                        .final_vm_state();

                    if let Some(final_state) = terminal {
                        HandleEventOutcome::Exit { final_state }
                    } else {
                        HandleEventOutcome::Continue
                    }
                }
                StateChangeRequest::MigrateAsSource {
                    migration_id,
                    websock,
                } => {
                    let migrated_out = self
                        .migrate_as_source(migration_id, websock.into_inner())
                        .await
                        .is_ok();

                    if !migrated_out {
                        return HandleEventOutcome::Continue;
                    }

                    self.do_halt().await;
                    HandleEventOutcome::Exit {
                        final_state: InstanceState::Destroyed,
                    }
                }
                StateChangeRequest::Reboot => {
                    self.do_reboot().await;
                    self.input_queue
                        .notify_request_completed(CompletedRequest::Reboot);

                    HandleEventOutcome::Continue
                }
                StateChangeRequest::Stop => {
                    self.do_halt().await;

                    let stopped =
                        ExternalStateUpdate::Instance(InstanceState::Stopped);
                    self.external_state.update(stopped);

                    self.input_queue
                        .notify_request_completed(CompletedRequest::Stop);

                    HandleEventOutcome::Exit {
                        final_state: InstanceState::Destroyed,
                    }
                }
            },
            ExternalRequest::Component(
                ComponentChangeRequest::ReconfigureCrucibleVolume {
                    backend_id,
                    new_vcr_json,
                    result_tx,
                },
            ) => {
                let outcome = self
                    .reconfigure_crucible_volume(&backend_id, new_vcr_json)
                    .await;

                // A send failure is deliberately ignored here.
                let _ = result_tx.send(outcome);
                HandleEventOutcome::Continue
            }
        }
    }

    /// Reboots the VM, publishing `Rebooting` before the reboot and `Running`
    /// once it has finished.
    async fn do_reboot(&mut self) {
        info!(self.log, "resetting instance");

        let rebooting = ExternalStateUpdate::Instance(InstanceState::Rebooting);
        self.external_state.update(rebooting);

        // The exclusive lock is only held for the duration of this statement.
        self.objects.lock_exclusive().await.reboot().await;

        // Let other consumers know the reboot finished and the instance is
        // Running again.
        let running = ExternalStateUpdate::Instance(InstanceState::Running);
        self.external_state.update(running);
    }

    /// Publishes `Stopping`, then pauses (if not already paused) and halts the
    /// VM's objects under the exclusive objects lock.
    ///
    /// This routine does not publish a post-halt state itself.
    async fn do_halt(&mut self) {
        info!(self.log, "stopping instance");

        let stopping = ExternalStateUpdate::Instance(InstanceState::Stopping);
        self.external_state.update(stopping);

        let mut objects = self.objects.lock_exclusive().await;

        // Entities expect a pause before a halt. The VM may already be paused
        // when it is being torn down after a successful migration out, in
        // which case it must not be paused a second time.
        if !self.paused {
            objects.pause().await;
            self.paused = true;
        }

        objects.halt().await;
    }

    /// Attempts to migrate this VM out over `websock`, acting as the migration
    /// source.
    ///
    /// Returns `Ok(())` if the migration ran to completion. In that case the
    /// VM objects are left paused (`self.paused` is set) and the input queue
    /// is told that the migration-out request succeeded.
    ///
    /// Returns `Err(())` if the migration could not be initiated or failed
    /// while running. In both cases the input queue is told that the
    /// migration-out request failed. If initiation failed, the migration's
    /// state is published as `Error`; if the run failed, the instance state is
    /// published as `Running` again.
    async fn migrate_as_source(
        &mut self,
        migration_id: Uuid,
        websock: dropshot::WebsocketConnection,
    ) -> Result<(), ()> {
        let conn = tokio_tungstenite::WebSocketStream::from_raw_socket(
            websock.into_inner(),
            tokio_tungstenite::tungstenite::protocol::Role::Server,
            None,
        )
        .await;

        let migration = match crate::migrate::source::initiate(
            &self.log,
            migration_id,
            conn,
            &self.objects,
            &self.migration_src_state,
        )
        .await
        {
            Ok(migration) => migration,
            Err(_) => {
                // The request has already been taken off the input queue, so
                // the queue must hear that it finished (unsuccessfully) even
                // though the migration never started running. This mirrors
                // the failure path after `run` below.
                self.input_queue.notify_request_completed(
                    CompletedRequest::MigrationOut { succeeded: false },
                );

                self.external_state.update(ExternalStateUpdate::Migration(
                    MigrationStateUpdate {
                        id: migration_id,
                        state: MigrationState::Error,
                        role: MigrateRole::Source,
                    },
                ));

                return Err(());
            }
        };

        // Publish that migration is in progress before actually launching the
        // migration task.
        self.external_state.update(ExternalStateUpdate::Complete(
            InstanceState::Migrating,
            MigrationStateUpdate {
                state: MigrationState::Sync,
                id: migration_id,
                role: MigrateRole::Source,
            },
        ));

        match migration
            .run(
                &self.objects,
                &mut self.external_state,
                &mut self.migration_src_state,
            )
            .await
        {
            Ok(()) => {
                info!(self.log, "migration out succeeded, queuing stop");
                // On a successful migration out, the protocol promises to leave
                // the VM objects in a paused state, so don't pause them again.
                self.paused = true;
                self.input_queue.notify_request_completed(
                    CompletedRequest::MigrationOut { succeeded: true },
                );

                Ok(())
            }
            Err(e) => {
                info!(self.log, "migration out failed, resuming";
                      "error" => ?e);

                self.input_queue.notify_request_completed(
                    CompletedRequest::MigrationOut { succeeded: false },
                );

                self.external_state.update(ExternalStateUpdate::Instance(
                    InstanceState::Running,
                ));

                Err(())
            }
        }
    }

    /// Replaces the volume construction request (VCR) of the Crucible backend
    /// identified by `backend_id` with `new_vcr_json`.
    ///
    /// On success the disk's backend spec is updated to carry the new VCR and
    /// the backend's replacement result is returned.
    ///
    /// # Errors
    ///
    /// Returns a "not found" HTTP error if there is no Crucible backend with
    /// this ID, if no disk in the instance spec refers to it, or if the disk's
    /// backend spec is not a Crucible backend. Returns a "bad request" HTTP
    /// error if the backend rejects the replacement.
    async fn reconfigure_crucible_volume(
        &self,
        backend_id: &SpecKey,
        new_vcr_json: String,
    ) -> super::CrucibleReplaceResult {
        info!(self.log, "request to replace Crucible VCR";
              "backend_id" => %backend_id);

        let mut objects = self.objects.lock_exclusive().await;

        // Take an owned handle to the backend so the objects can be borrowed
        // mutably for the spec update below.
        let backend = match objects.crucible_backends().get(backend_id) {
            Some(found) => found.clone(),
            None => {
                let msg = format!("No crucible backend for id {backend_id}");
                return Err(dropshot::HttpError::for_not_found(
                    Some(msg.clone()),
                    msg,
                ));
            }
        };

        let located = objects.instance_spec_mut().disks.iter_mut().find(
            |(_id, device)| device.device_spec.backend_id() == backend_id,
        );
        let Some((disk_id, disk)) = located else {
            let msg = format!("no disk in spec with backend ID {backend_id}");
            return Err(HttpError::for_not_found(Some(msg.clone()), msg));
        };

        let StorageBackend::Crucible(CrucibleStorageBackend {
            request_json: old_vcr_json,
            readonly,
        }) = &disk.backend_spec
        else {
            let msg = format!(
                "disk {} has backend {backend_id} but its kind is {}",
                disk_id,
                disk.backend_spec.kind()
            );
            return Err(HttpError::for_not_found(Some(msg.clone()), msg));
        };

        let replace_result = match backend
            .vcr_replace(old_vcr_json.as_str(), &new_vcr_json)
            .await
        {
            Ok(result) => result,
            Err(e) => {
                let msg = e.to_string();
                return Err(dropshot::HttpError::for_bad_request(
                    Some(msg.clone()),
                    msg,
                ));
            }
        };

        // Record the new VCR in the spec only after the backend accepted it.
        disk.backend_spec = StorageBackend::Crucible(CrucibleStorageBackend {
            readonly: *readonly,
            request_json: new_vcr_json,
        });

        info!(self.log, "replaced Crucible VCR"; "backend_id" => %backend_id);

        Ok(replace_result)
    }
}


================================================
FILE: bin/propolis-server/src/lib/vm/state_publisher.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Helper types for publishing instance states as made visible through the
//! external API.

use propolis_api_types::instance::{
    InstanceState, InstanceStateMonitorResponse,
};
use propolis_api_types::migration::{
    InstanceMigrateStatusResponse, InstanceMigrationStatus,
};
use slog::info;
use uuid::Uuid;

use crate::migrate::MigrateRole;

use super::{InstanceStateRx, InstanceStateTx};

/// An update to an instance's migration's state.
///
/// The `role` field selects which half of the published migration status
/// (inbound or outbound) the update is applied to.
pub(crate) struct MigrationStateUpdate {
    /// The migration's new state.
    pub state: propolis_api_types::migration::MigrationState,

    /// The migration's ID.
    pub id: Uuid,

    /// The role this VM was playing in the migration of interest.
    pub role: MigrateRole,
}

impl MigrationStateUpdate {
    /// Produces the migration status that results from applying this update
    /// to `old`.
    ///
    /// Only the entry matching this update's role is replaced: `migration_in`
    /// for a destination, `migration_out` for a source. The other entry is
    /// carried over from `old` unchanged.
    fn apply_to(
        self,
        old: InstanceMigrateStatusResponse,
    ) -> InstanceMigrateStatusResponse {
        let Self { state, id, role } = self;
        let InstanceMigrateStatusResponse { migration_in, migration_out } = old;
        let updated = Some(InstanceMigrationStatus { id, state });

        match role {
            MigrateRole::Source => InstanceMigrateStatusResponse {
                migration_in,
                migration_out: updated,
            },
            MigrateRole::Destination => InstanceMigrateStatusResponse {
                migration_in: updated,
                migration_out,
            },
        }
    }
}

/// A kind of state update to publish.
///
/// Whatever part of the published state a variant does not mention is carried
/// over from the previously published value.
pub(crate) enum ExternalStateUpdate {
    /// Update the instance state (but not any migration state).
    Instance(InstanceState),

    /// Update migration state (but not the instance's state).
    Migration(MigrationStateUpdate),

    /// Update both instance and migration state.
    Complete(InstanceState, MigrationStateUpdate),
}

/// A channel to which to publish externally-visible instance state updates.
pub(crate) struct StatePublisher {
    /// Sending half of the watch channel that holds the latest published
    /// state.
    tx: InstanceStateTx,

    /// Logger used to record each published state.
    log: slog::Logger,
}

impl StatePublisher {
    /// Creates a publisher whose channel initially holds `initial_state`, and
    /// returns it together with the channel's receiving half.
    pub(super) fn new(
        log: &slog::Logger,
        initial_state: InstanceStateMonitorResponse,
    ) -> (Self, InstanceStateRx) {
        let (tx, rx) = tokio::sync::watch::channel(initial_state);
        let publisher = Self { tx, log: log.clone() };
        (publisher, rx)
    }

    /// Updates an instance's externally-visible state and publishes that state
    /// with a successor generation number.
    ///
    /// Any part of the state that `update` does not specify is copied from
    /// the most recently published value.
    pub(crate) fn update(&mut self, update: ExternalStateUpdate) {
        let (new_instance, new_migration) = match update {
            ExternalStateUpdate::Instance(i) => (Some(i), None),
            ExternalStateUpdate::Migration(m) => (None, Some(m)),
            ExternalStateUpdate::Complete(i, m) => (Some(i), Some(m)),
        };

        // Snapshot the previous value; the borrow on the channel ends with
        // this statement.
        let previous = self.tx.borrow().clone();

        let next = InstanceStateMonitorResponse {
            gen: previous.gen + 1,
            state: new_instance.unwrap_or(previous.state),
            migration: match new_migration {
                Some(m) => m.apply_to(previous.migration),
                None => previous.migration,
            },
        };

        info!(self.log, "publishing new instance state";
              "gen" => next.gen,
              "state" => ?next.state,
              "migration" => ?next.migration);

        // The result of the send is deliberately ignored.
        let _ = self.tx.send(next);
    }
}


================================================
FILE: bin/propolis-server/src/lib/vnc.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::BTreeSet;
use std::io;
use std::net::SocketAddr;
use std::sync::{Arc, Mutex, MutexGuard};
use std::time::{Duration, Instant};

use propolis::hw::ps2::ctrl::PS2Ctrl;
use propolis::hw::qemu::ramfb::{FrameSnap, RamFb};

use futures::StreamExt;
use rfb::encodings::{EncodingType, RawEncoding};
use rfb::proto::{
    ClientMessage, FramebufferUpdate, FramebufferUpdateRequest, Position,
    ProtoVersion, ProtocolError, Rectangle, Resolution, SecurityType,
    SecurityTypes,
};
use rgb_frame::{FourCC, Frame, Spec};
use slog::{error, trace, Logger};
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio::net::TcpListener;
use tokio::sync::{oneshot, Notify};
use tokio::task::JoinHandle;
use tokio::time::sleep;
use tokio_util::codec::FramedRead;

/// Arbitrary maximum valid resolution
const MAX_RES: Resolution = Resolution { width: 1920, height: 1200 };
/// Resolution used when no valid framebuffer spec is available: it is offered
/// to a connecting client in that case and is the size of generated blank
/// frames.
const UNINIT_RES: Resolution = Resolution { width: 800, height: 600 };
/// Pixel format used when no valid framebuffer spec is available; also the
/// initial output format of a new client.
const UNINIT_FOURCC: FourCC = FourCC::XR24;
/// Name passed to the RFB handshake as the server's name.
const SERVER_NAME: &str = "propolis-vnc";
/// Frame interval (in us) for 10fps
const FRAME_US_10FPS: usize = 1000000 / 10;

/// The emulated devices the VNC server is attached to.
struct Devices {
    /// PS/2 controller that receives key events from the VNC client.
    keyboard: Arc<PS2Ctrl>,
    /// Framebuffer device whose spec and contents are read for the client.
    display: Arc<RamFb>,
}

/// Origin of the frame most recently prepared for a client.
#[derive(Copy, Clone, Eq, PartialEq)]
enum FrameKind {
    /// The frame was read from the display device.
    Valid,
    /// The frame is a blank filler generated because no valid frame could be
    /// read.
    Generated,
}

/// Server-wide state guarded by `VncServer::state`.
#[derive(Default)]
struct State {
    /// Attached devices, if any. Set by `attach` and cleared by `stop`.
    devices: Option<Devices>,
    /// Set once `stop` has been called; new connections are then refused.
    is_stopped: bool,
}

/// Per-connection state tracked while serving a single VNC client.
struct ClientState {
    /// Most recent frame prepared for the client, along with how it was
    /// obtained.
    last_snap: Option<(FrameSnap, FrameKind)>,
    /// Outstanding framebuffer update request from the client, if any.
    fbu_req: Option<FramebufferUpdateRequest>,
    /// Encodings most recently announced by the client via SetEncodings.
    encodings: BTreeSet<EncodingType>,
    /// Pixel format that frames are converted to before being sent.
    output_fourcc: FourCC,
}
impl Default for ClientState {
    /// State for a freshly connected client: no frame, no pending update
    /// request, no announced encodings, and the uninitialized-display pixel
    /// format.
    fn default() -> Self {
        Self {
            output_fourcc: UNINIT_FOURCC,
            encodings: BTreeSet::default(),
            fbu_req: None,
            last_snap: None,
        }
    }
}

/// Bookkeeping for the currently active client connection.
#[derive(Default)]
pub struct Client {
    /// Channel used to tell the active client's task to hang up.
    hup: Option<oneshot::Sender<()>>,
    /// Identifier of the active client; `None` when no client is installed.
    id: Option<String>,
}

/// VNC server that serves the attached display and keyboard to one client at
/// a time.
pub struct VncServer {
    /// Attached devices and the stopped flag.
    state: Mutex<State>,
    /// The currently active client, if any.
    client: Mutex<Client>,
    /// Signalled when the active client has been torn down.
    notify: Notify,
    /// Minimum frame interval (in us)
    frame_int_us: usize,
    /// Logger for connection and protocol events.
    log: Logger,
}

/// Errors that can be reported when a client connects to the VNC server.
#[derive(thiserror::Error, Debug)]
pub enum ConnectError {
    /// A FourCC value was not valid.
    #[error("Invalid FourCC {0}")]
    InvalidFourCC(u32),
    /// The RFB handshake with the client failed.
    #[error("VNC initialization error {0:?}")]
    InitError(#[from] rfb::server::InitError),
    /// The server has been stopped and no longer accepts clients.
    #[error("VNC server is stopped")]
    ServerStopped,
}

/// Alias trait to cut down on verbosity
pub trait Connection: AsyncRead + AsyncWrite + Unpin + Send + 'static {}

// Websocket-wrapped transports.
impl<T: AsyncRead + AsyncWrite + Unpin + Send + 'static> Connection
    for rfb::tungstenite::BinaryWs<T>
{
}
// Raw TCP sockets.
impl Connection for tokio::net::TcpStream {}
// Type-erased connections, so callers can pass `Box<dyn Connection>`.
impl Connection for Box<dyn Connection> {}

impl VncServer {
    /// Creates a server with no attached devices, no active client, and a
    /// 10fps minimum frame interval.
    pub fn new(log: Logger) -> Arc<Self> {
        let server = Self {
            state: Mutex::default(),
            client: Mutex::default(),
            notify: Notify::new(),
            frame_int_us: FRAME_US_10FPS,
            log,
        };
        Arc::new(server)
    }
    /// Attaches the keyboard controller and framebuffer device that this
    /// server will serve to clients, replacing any previous attachment.
    pub fn attach(&self, ps2: Arc<PS2Ctrl>, fb: Arc<RamFb>) {
        let devices = Devices { keyboard: ps2, display: fb };
        self.state.lock().unwrap().devices = Some(devices);
    }
    pub async fn connect(
        self: &Arc<Self>,
        mut conn: impl Connection,
        client_id: String,
    ) -> Result<(), ConnectError> {
        let (resolution, fourcc) = {
            let state = self.state.lock().unwrap();
            if state.is_stopped {
                return Err(ConnectError::ServerStopped);
            }
            match state.devices.as_ref().map(|devs| devs.display.read_spec()) {
                Some(Ok(spec)) if spec_valid(&spec) => (
                    Resolution {
                        width: spec.width.get() as u16,
                        height: spec.height.get() as u16,
                    },
                    spec.fourcc,
                ),
                _ => (UNINIT_RES, UNINIT_FOURCC),
            }
        };

        let _client_init = rfb::server::initialize(
            &mut conn,
            rfb::server::InitParams {
                version: ProtoVersion::Rfb38,
                // vncviewer won't work without offering VncAuth, even though it
                // doesn't ask to use it.
                sec_types: SecurityTypes(vec![
                    SecurityType::None,
                    SecurityType::VncAuthentication,
                ]),
                name: SERVER_NAME.to_string(),
                resolution,
                format: fourcc.into(),
            },
        )
        .await?;

        let hup_recv = self.replace_client(client_id).await;

        let this = self.clone();
        tokio::spawn(async move {
            if let Err(e) = this.run(conn, hup_recv).await {
                error!(this.log, "VNC error, hanging up: {:?}", e);
            }
            this.hup_client();
        });

        Ok(())
    }

    async fn replace_client(&self, new_id: String) -> oneshot::Receiver<()> {
        let mut client = self.wait_client_gone().await;

        let (send, recv) = oneshot::channel();
        client.id = Some(new_id);
        client.hup = Some(send);

        recv
    }
    fn hup_client(&self) {
        let mut client = self.client.lock().unwrap();
        client.hup.take();
        client.id.take();
        self.notify.notify_one();
    }
    /// Tells any active client to hang up and waits until no client is
    /// installed.
    ///
    /// Returns the lock guard on the client record, still held, so the caller
    /// can install a new client (or keep others out) without a gap.
    async fn wait_client_gone(&self) -> MutexGuard<'_, Client> {
        loop {
            {
                let mut client = self.client.lock().unwrap();
                // tell any existing client to hang up
                if let Some(hup) = client.hup.take() {
                    let _ = hup.send(());
                }
                // and once it is gone, go on to install ourself as active
                if client.id.is_none() {
                    return client;
                }
                // Release the lock before waiting so the client's task can
                // take it in `hup_client`.
                drop(client);
            }

            // Woken by `hup_client` once the active client has been cleared.
            self.notify.notified().await;
        }
    }

    /// Serves a single client until it disconnects, is told to hang up, or a
    /// protocol error occurs.
    ///
    /// Returns `Ok(())` on a hang-up request or client disconnect, and the
    /// protocol error otherwise.
    async fn run(
        &self,
        conn: impl Connection,
        mut close_recv: oneshot::Receiver<()>,
    ) -> Result<(), ProtocolError> {
        let mut decoder =
            FramedRead::new(conn, rfb::proto::ClientMessageDecoder::default());
        let mut cstate: ClientState = Default::default();
        loop {
            tokio::select! {
                // Poll the branches in the order written, so a hang-up
                // request is always checked first.
                biased;

                // Fires on an explicit hang-up and also when the sender is
                // dropped.
                _ = &mut close_recv => {
                    return Ok(());
                },
                msg = decoder.next() => {
                    let msg = match msg {
                        Some(Err(e)) => {
                            return Err(e);
                        }
                        None => {
                            // Client disconnect
                            return Ok(());
                        }
                        Some(Ok(m)) => m,
                    };
                    self.handle_msg(decoder.get_mut(), msg, &mut cstate).await;
                }
                // Completes once a frame is ready to send for an outstanding
                // update request.
                _ = self.wait_for_next_frame(&mut cstate) => {
                    self.send_fbu(decoder.get_mut(), &mut cstate).await?;
                }
            }
        }
    }

    async fn handle_msg(
        &self,
        _conn: &mut impl Connection,
        msg: ClientMessage,
        cstate: &mut ClientState,
    ) {
        match msg {
            ClientMessage::KeyEvent(ke) => {
                let state = self.state.lock().unwrap();
                trace!(self.log, "VNC key event: {:?}", ke);
                if let Some(devs) = state.devices.as_ref() {
                    devs.keyboard.key_event(ke);
                }
            }
            ClientMessage::PointerEvent(pe) => {
                trace!(self.log, "VNC pointer event: {:?}", pe);
                // TODO: wire to tablet device
            }
            ClientMessage::ClientCutText(_) => {
                trace!(self.log, "Ignoring VNC CutText request");
            }
            ClientMessage::FramebufferUpdateRequest(req) => {
                cstate.fbu_req = Some(req);
            }
            ClientMessage::SetPixelFormat(pf) => match (&pf).try_into() {
                Ok(fourcc) => {
                    cstate.output_fourcc = fourcc;
                    // Convert any existing frame to the new format
                    if let Some((snap, _kind)) = cstate.last_snap.as_mut() {
                        snap.frame.convert(fourcc);
                    }
                }
                Err(e) => {
                    slog::warn!(
                        self.log,
                        "Unhandled SetPixelFormat({:?}): {e}",
                        pf
                    );
                }
            },
            ClientMessage::SetEncodings { encodings, unknown } => {
                cstate.encodings = encodings.into_iter().collect();
                slog::trace!(self.log, "SetEncodings({:?})", cstate.encodings);
                if !unknown.is_empty() {
                    slog::debug!(
                        self.log,
                        "Unrecognized SetEncodings values: {:?}",
                        unknown
                    );
                }
            }
        }
    }
    /// Sends the frame held in `cstate` to the client as one raw-encoded,
    /// full-frame rectangle, then marks the client's update request as
    /// fulfilled.
    ///
    /// # Panics
    ///
    /// Panics if `cstate` holds no frame.
    async fn send_fbu(
        &self,
        conn: &mut impl Connection,
        cstate: &mut ClientState,
    ) -> Result<(), ProtocolError> {
        // Build the update in its own scope so nothing borrowed from `cstate`
        // is held across the awaits below.
        let fbu = {
            let (snap, _kind) = cstate.last_snap.as_ref().unwrap();
            let frame = &snap.frame;

            let dimensions = Resolution {
                width: frame.spec().width.get() as u16,
                height: frame.spec().height.get() as u16,
            };
            let pixels = frame.bytes().to_vec();

            FramebufferUpdate(vec![Rectangle {
                position: Position { x: 0, y: 0 },
                dimensions,
                data: Box::new(RawEncoding::new(pixels)),
            }])
        };

        fbu.write_to(conn).await?;
        conn.flush().await?;

        // The outstanding request has now been answered.
        cstate.fbu_req = None;

        Ok(())
    }

    /// Refreshes the frame held in `cstate`.
    ///
    /// If a valid frame can be read from the display, it is converted to the
    /// client's output format and stored. Otherwise a generated blank frame is
    /// stored, unless one is already in place.
    ///
    /// Returns `true` if `cstate.last_snap` was replaced, and `false` if an
    /// existing generated frame was kept.
    fn update_frame(&self, cstate: &mut ClientState) -> bool {
        let state = self.state.lock().unwrap();

        let fresh = state
            .devices
            .as_ref()
            .and_then(|devs| devs.display.read_framebuffer(spec_valid));

        if let Some(mut snap) = fresh {
            snap.frame.convert(cstate.output_fourcc);
            cstate.last_snap = Some((snap, FrameKind::Valid));
            return true;
        }

        // No valid frame is available; keep an existing generated frame.
        if matches!(cstate.last_snap, Some((_, FrameKind::Generated))) {
            return false;
        }

        // Otherwise put a blank frame in place.
        let filler = blank_frame(cstate.output_fourcc);
        cstate.last_snap = Some((filler, FrameKind::Generated));
        true
    }
    /// Completes when a frame is ready to be sent in response to the client's
    /// outstanding update request.
    ///
    /// Never completes if the client has no outstanding request. Otherwise
    /// the held frame is refreshed no more often than the minimum frame
    /// interval allows.
    async fn wait_for_next_frame(&self, cstate: &mut ClientState) {
        if cstate.fbu_req.is_none() {
            // If an update has not been requested, we will wait indefinitely
            futures::future::pending::<()>().await;
        }

        loop {
            // Either return with a frame ready, or compute how long to sleep
            // before checking again.
            let wait_len_us = match cstate
                .last_snap
                .as_ref()
                .map(|(frame, kind)| (kind, frame.when.elapsed()))
            {
                None | Some((FrameKind::Generated, _)) => {
                    // If there is no previous frame, or the existing frame is a
                    // generated blank, do not delay in attempting an update.
                    if self.update_frame(cstate) {
                        return;
                    }
                    // If the update resulted in no change, wait the default
                    // interval to check again
                    self.frame_int_us as u64
                }
                Some((FrameKind::Valid, age)) => {
                    let since_last = age.as_micros() as usize;
                    if since_last >= self.frame_int_us {
                        // The held frame is old enough: refresh it and report
                        // a frame as ready regardless of the refresh result.
                        self.update_frame(cstate);
                        return;
                    }
                    // Sleep for the remainder of the frame interval.
                    (self.frame_int_us - since_last) as u64
                }
            };
            sleep(Duration::from_micros(wait_len_us)).await
        }
    }

    pub async fn stop(&self) {
        {
            let mut state = self.state.lock().unwrap();
            state.is_stopped = true;
            state.devices = None;
        }

        let _client = self.wait_client_gone().await;
    }
}

/// TCP socket listener for VNC client connections
pub struct TcpSock {
    /// Handle to the spawned listener task.
    join_hdl: JoinHandle<()>,
    /// Channel used to tell the listener task to stop.
    hup_send: oneshot::Sender<()>,
}
impl TcpSock {
    /// Binds a TCP listener on `addr` and spawns a task that hands accepted
    /// connections to `vnc`.
    ///
    /// # Errors
    ///
    /// Returns the I/O error if binding the listener fails.
    pub async fn new(
        vnc: Arc<VncServer>,
        addr: SocketAddr,
        log: Logger,
    ) -> io::Result<Self> {
        let listener = TcpListener::bind(addr).await?;
        let (hup_send, hup_recv) = oneshot::channel::<()>();
        let join_hdl = tokio::spawn(Self::run(listener, vnc, hup_recv, log));
        Ok(Self { join_hdl, hup_send })
    }

    /// Signals the listener task to stop and waits for it to finish.
    pub async fn halt(self) {
        // Failures of either step are deliberately ignored.
        let _ = self.hup_send.send(());
        let _ = self.join_hdl.await;
    }

    /// Accept loop: offers each accepted socket to the VNC server until told
    /// to hang up. Accept and connection errors are logged and the loop
    /// continues.
    async fn run(
        listener: TcpListener,
        vnc: Arc<VncServer>,
        mut hup_recv: oneshot::Receiver<()>,
        log: Logger,
    ) {
        loop {
            tokio::select! {
                // Check for a hang-up before accepting another connection.
                biased;

                _ = &mut hup_recv => {
                    return;
                },
                accepted = listener.accept() => match accepted {
                    Err(e) => {
                        error!(&log, "VNC TCP listener error: {:?}", e);
                    }
                    Ok((sock, addr)) => {
                        let conn: Box<dyn Connection + 'static> =
                            Box::new(sock);
                        let outcome =
                            vnc.connect(conn, addr.to_string()).await;
                        if let Err(e) = outcome {
                            error!(&log, "Error during VNC connection: {:?}", e);
                        }
                    }
                },
            };
        }
    }
}

/// Generate a black "filler" frame of default size/format
///
/// The frame has the uninitialized-display resolution and the requested
/// `fourcc`, and is timestamped with the current time.
fn blank_frame(fourcc: FourCC) -> FrameSnap {
    // A new frame's buffer is all zeroes, which is black in every FourCC
    // format currently supported, so no fill step is needed.
    let width = UNINIT_RES.width as usize;
    let height = UNINIT_RES.height as usize;
    let frame = Frame::new(Spec::new(width, height, fourcc));

    FrameSnap { frame, when: Instant::now() }
}

/// Check that Spec derived from the framebuffer config is:
/// - Of an appropriate size (not larger than 1920x1200 in either dimension)
///
/// `MAX_RES` is the maximum *valid* resolution, so both bounds are inclusive:
/// a framebuffer of exactly 1920x1200 is accepted.
fn spec_valid(spec: &Spec) -> bool {
    spec.width.get() <= MAX_RES.width as usize
        && spec.height.get() <= MAX_RES.height as usize
}


================================================
FILE: bin/propolis-server/src/main.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::fmt;
use std::net::{IpAddr, Ipv6Addr, SocketAddr};
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::Arc;

use omicron_common::address::Ipv6Subnet;
use propolis::attestation::server::AttestationServerConfig;
use propolis::usdt::register_probes;
use propolis_server::{
    config,
    server::{self, MetricsEndpointConfig},
    vnc,
};

use anyhow::{anyhow, Context};
use clap::Parser;
use dropshot::{
    ClientSpecifiesVersionInHeader, CompressionConfig, ConfigDropshot,
    HandlerTaskMode, VersionPolicy,
};
use slog::{info, Logger};

/// Threads to spawn for tokio runtime handling the API (dropshot, etc)
const API_RT_THREADS: usize = 4;

/// Configuration for metric registration.
#[derive(Clone, Debug, PartialEq)]
enum MetricRegistration {
    /// Written as `disable`.
    Disable,
    /// Written as `dns`.
    Dns,
    /// Written as the socket address itself.
    WithAddr(SocketAddr),
}

impl fmt::Display for MetricRegistration {
    /// Writes the same spelling that the `FromStr` implementation accepts,
    /// honoring any formatter options such as width and alignment.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Disable => fmt::Display::fmt("disable", f),
            Self::Dns => fmt::Display::fmt("dns", f),
            Self::WithAddr(addr) => fmt::Display::fmt(addr, f),
        }
    }
}

impl FromStr for MetricRegistration {
    type Err = anyhow::Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s.eq_ignore_ascii_case("disable") {
            Ok(Self::Disable)
        } else if s.eq_ignore_ascii_case("dns") {
            Ok(Self::Dns)
        } else {
            let Ok(addr) = s.parse() else {
                anyhow::bail!(
                    "Metric registration must be 'disable', \
                    'dns', or an explicit socket address \
                    written as `IP:port`",
                );
            };
            Ok(Self::WithAddr(addr))
        }
    }
}

/// Parse a `slog::Level` from its textual name for use as a clap value
/// parser, reporting any parse failure as "Invalid log level".
fn parse_log_level(s: &str) -> anyhow::Result<slog::Level> {
    match s.parse::<slog::Level>() {
        Ok(level) => Ok(level),
        Err(_) => Err(anyhow::anyhow!("Invalid log level")),
    }
}

// Command-line interface of `propolis-server`.
//
// NOTE: the `///` doc comments inside this type are consumed by clap's derive
// macro and surface as `--help` text, so editing them changes user-visible
// output. Maintainer-only notes below are therefore written as `//` comments.
#[derive(Debug, Parser)]
#[clap(about, version)]
/// An HTTP server providing access to Propolis
enum Args {
    /// Runs the Propolis server.
    Run {
        // Positional: path to the guest bootrom image (forwarded to
        // `run_server`).
        #[clap(action)]
        bootrom_path: PathBuf,

        // Positional: address the dropshot API server binds to. Its IP is
        // also used as the metrics listen address and, when IPv6, to derive
        // the attestation server configuration (see `main`).
        #[clap(name = "PROPOLIS_IP:PORT", action)]
        propolis_addr: SocketAddr,

        // Optional version string for the bootrom (forwarded to
        // `run_server`).
        #[clap(long, action)]
        bootrom_version: Option<String>,

        /// Method for registering as an Oximeter metric producer.
        ///
        /// The following values are supported:
        ///
        /// disable - Do not register or attempt to produce metrics.
        ///
        /// dns - Register at an address inferred from Oxide internal DNS.
        /// This is only available if the Propolis is listening on a
        /// non-localhost IPv6 address.
        ///
        /// IP:port - Register with the explicitly-provided socket address.
        #[clap(long, default_value_t = MetricRegistration::Disable)]
        metric_addr: MetricRegistration,

        // Optional positional: when absent, no VNC TCP listener is started.
        /// IP:Port for raw TCP access to VNC console
        #[clap(name = "VNC_IP:PORT", action)]
        vnc_addr: Option<SocketAddr>,

        // Parsed with `parse_log_level`; only the main (terminal/bunyan)
        // drain is filtered by this level (see `build_logger`).
        /// Logging level for the server
        #[clap(long, default_value_t = slog::Level::Info, value_parser = parse_log_level)]
        log_level: slog::Level,
    },
}

/// Run the Propolis API server until it exits.
///
/// Steps, in order:
/// 1. verify that the device API versions match what this build expects,
/// 2. build the shared dropshot endpoint context,
/// 3. create the tokio runtime used for API processing,
/// 4. optionally start the VNC TCP listener,
/// 5. start the dropshot server and block until it finishes,
/// 6. halt the VNC listener (if any) and report the server's result.
///
/// # Errors
///
/// Returns an error if the API version check fails, the runtime cannot be
/// built, the VNC socket cannot be created, the dropshot server fails to
/// start, or the server exits with an error.
fn run_server(
    bootrom_path: PathBuf,
    bootrom_version: Option<String>,
    config_dropshot: dropshot::ConfigDropshot,
    config_metrics: Option<MetricsEndpointConfig>,
    vnc_addr: Option<SocketAddr>,
    attest_config: Option<AttestationServerConfig>,
    log: slog::Logger,
) -> anyhow::Result<()> {
    use propolis::api_version;

    // Check that devices conform to expected API version
    if let Err(e) = api_version::check() {
        use api_version::{Error, VersionCheckError};
        // A missing device node gets an extra, more direct log line before
        // the error is propagated below.
        if let VersionCheckError { component: _, path, err: Error::Io(ioe) } =
            &e
        {
            if ioe.kind() == std::io::ErrorKind::NotFound {
                slog::error!(log, "Failed to open {path}");
            }
        }

        // Any version-check failure is fatal.
        Err(e).context("API version checks")?;
    }

    // If this is a development image being run outside of an Omicron zone,
    // enable the display (in logs, panic messages, and the like) of diagnostic
    // data that may have originated in the guest.
    #[cfg(not(feature = "omicron-build"))]
    propolis::common::DISPLAY_GUEST_DATA
        .store(true, std::sync::atomic::Ordering::SeqCst);

    let use_reservoir = config::reservoir_decide(&log);

    let context = server::DropshotEndpointContext::new(
        bootrom_path,
        bootrom_version,
        use_reservoir,
        log.new(slog::o!()),
        config_metrics,
        attest_config,
    );

    // Spawn the runtime for handling API processing
    // If/when a VM instance is created, a separate runtime for handling device
    // emulation and other VM-related work will be spawned.
    let api_runtime = {
        let mut builder = tokio::runtime::Builder::new_multi_thread();
        builder.worker_threads(API_RT_THREADS).thread_name("tokio-rt-api");
        oxide_tokio_rt::build(&mut builder)?
    };
    // Enter the runtime context for the remainder of this function; the guard
    // is held until return.
    let _guard = api_runtime.enter();

    // Start TCP listener for VNC, if requested
    let tcp_vnc = match vnc_addr {
        Some(addr) => Some(api_runtime.block_on(async {
            vnc::TcpSock::new(context.vnc_server.clone(), addr, log.clone())
                .await
        })?),
        None => None,
    };

    info!(log, "Starting server...");

    // `context` is moved into the server here, which is why the VNC listener
    // above had to clone `context.vnc_server` first.
    let server = dropshot::ServerBuilder::new(
        server::api(),
        Arc::new(context),
        log.clone(),
    )
    .config(config_dropshot)
    .version_policy(VersionPolicy::Dynamic(Box::new(
        ClientSpecifiesVersionInHeader::new(
            omicron_common::api::VERSION_HEADER,
            propolis_server_api::latest_version(),
        ),
    )))
    .build_starter()
    .map_err(|error| anyhow!("Failed to start server: {error}"))?
    .start();

    // Block this thread until the server finishes.
    let result = api_runtime.block_on(server);

    // Clean up any VNC TCP socket
    if let Some(vnc) = tcp_vnc {
        api_runtime.block_on(async { vnc.halt().await });
    }

    // The server's result is reported only after the VNC cleanup above.
    result.map_err(|e| anyhow!("Server exited with an error: {e}"))
}

fn build_logger(level: slog::Level) -> slog::Logger {
    use slog::Drain;

    let main_drain = if atty::is(atty::Stream::Stdout) {
        let decorator = slog_term::TermDecorator::new().build();
        let drain = slog_term::FullFormat::new(decorator).build().fuse();
        slog_async::Async::new(drain)
            .overflow_strategy(slog_async::OverflowStrategy::Block)
            .build_no_guard()
    } else {
        let drain =
            slog_bunyan::with_name("propolis-server", std::io::stdout())
                .build()
                .fuse();
        slog_async::Async::new(drain)
            .overflow_strategy(slog_async::OverflowStrategy::Block)
            .build_no_guard()
    };

    let (dtrace_drain, probe_reg) = slog_dtrace::Dtrace::new();

    let filtered_main = slog::LevelFilter::new(main_drain, level);

    let log = slog::Logger::root(
        slog::Duplicate::new(filtered_main.fuse(), dtrace_drain.fuse()).fuse(),
        slog::o!(),
    );

    if let slog_dtrace::ProbeRegistration::Failed(err) = probe_reg {
        slog::error!(&log, "Error registering slog-dtrace probes: {:?}", err);
    }

    log
}

/// Report whether `listen_addr` may be used with DNS-based metric
/// registration: it must be an IPv6 address other than the loopback address
/// (`::1`). Every IPv4 address is rejected.
fn is_valid_listen_addr_for_dns(listen_addr: IpAddr) -> bool {
    match listen_addr {
        IpAddr::V6(v6) => v6 != Ipv6Addr::LOCALHOST,
        IpAddr::V4(_) => false,
    }
}

/// Translate the requested metric registration mode into a metrics endpoint
/// configuration.
///
/// - `Disable` logs a notice and yields `Ok(None)`.
/// - `Dns` yields a configuration without a registration address, but only
///   if `listen_addr` is a non-localhost IPv6 address; otherwise an error is
///   returned.
/// - `WithAddr` yields a configuration registering at the given address.
fn build_metric_configuration(
    log: &Logger,
    metric_addr: MetricRegistration,
    listen_addr: IpAddr,
) -> anyhow::Result<Option<MetricsEndpointConfig>> {
    let registration_addr = match metric_addr {
        MetricRegistration::Disable => {
            info!(
                log,
                "metric registration is disabled, no metric \
                data will be produced by this server",
            );
            return Ok(None);
        }
        MetricRegistration::Dns => {
            anyhow::ensure!(
                is_valid_listen_addr_for_dns(listen_addr),
                "Metric registration can only use DNS \
                if the Propolis server is provided a \
                non-localhost IPv6 address"
            );
            None
        }
        MetricRegistration::WithAddr(addr) => Some(addr),
    };

    Ok(Some(MetricsEndpointConfig { listen_addr, registration_addr }))
}

/// Entry point: parse the command line, assemble the dropshot, logging,
/// metrics and attestation configuration, then hand off to `run_server`.
fn main() -> anyhow::Result<()> {
    // Register USDT probes first; a failure here is treated as fatal.
    register_probes().unwrap();

    // Refuse to run a production-flavored build that also carries
    // development features, unless it was explicitly built for PHD.
    #[cfg(all(
        feature = "omicron-build",
        any(feature = "failure-injection", feature = "falcon")
    ))]
    if option_env!("PHD_BUILD") != Some("true") {
        panic!(
            "`omicron-build` is enabled alongside development features, \
            this build is NOT SUITABLE for production. Set PHD_BUILD=true in \
            the environment and rebuild propolis-server if you really need \
            this to work."
        );
    }

    // `Run` is the only subcommand, so the parsed arguments can be
    // destructured directly.
    let Args::Run {
        bootrom_path,
        bootrom_version,
        propolis_addr,
        metric_addr,
        vnc_addr,
        log_level,
    } = Args::parse();

    let listen_ip = propolis_addr.ip();

    let config_dropshot = ConfigDropshot {
        bind_address: propolis_addr,
        default_request_body_max_bytes: 1024 * 1024, // 1M for ISO bytes
        default_handler_task_mode: HandlerTaskMode::Detached,
        log_headers: vec![],
        compression: CompressionConfig::None,
    };

    let log = build_logger(log_level);

    let metric_config =
        build_metric_configuration(&log, metric_addr, listen_ip)?;

    // Attestation is only configured when listening on IPv6: the sled
    // address is derived from the sled subnet containing the listen address.
    let attest_config = if let IpAddr::V6(ipv6_addr) = listen_ip {
        let sled_subnet = Ipv6Subnet::<
            { omicron_common::address::SLED_PREFIX },
        >::new(ipv6_addr);
        let sa_addr = omicron_common::address::get_sled_address(sled_subnet);

        Some(AttestationServerConfig::new(sa_addr))
    } else {
        None
    };

    run_server(
        bootrom_path,
        bootrom_version,
        config_dropshot,
        metric_config,
        vnc_addr,
        attest_config,
        log,
    )
}


================================================
FILE: bin/propolis-server/src/proptest-regressions/vm/request_queue.txt
================================================
# Seeds for failure cases proptest has generated in the past. It is
# automatically read and these particular cases re-run before any
# novel cases are generated.
#
# It is recommended to check this file in to source control so that
# everyone who runs the test benefits from these saved cases.
cc 467749978aea2988f7790844904751ed5f0797f700949e702db74ae430a659e0 # shrinks to reqs = [Migrate, Stop]
cc 03ba07e9b5a99141bddd9b878bff86845a6da9eb1aa015b3afc7b3ebfed7a6d1 # shrinks to reqs = [Start, Stop, Migrate]
cc 67a067444d475068e86b43528884319ff178d6d9038a3d9223c32789f871baa3 # shrinks to reqs = [Start, Migrate]
cc b3df4b82bdb87e3533f4bd47f0a3ee8be21893c0afc15b472281b2a79006aadf # shrinks to reqs = [Migrate]
cc 3430b43ba860946e5feb7b3b0246623708efb1465dd4fe7a604ddf479d4dc3ae # shrinks to reqs = [Start { will_succeed: true }, Migrate { will_succeed: false }, Reboot]
cc 2e8b284223a88421aaed16749309839818c16efda4bc4d8d930a35cbdce018cd # shrinks to ops = [Enqueue(ReconfigureCrucible), Enqueue(Start { will_succeed: true }), Dequeue]


================================================
FILE: bin/propolis-standalone/Cargo.toml
================================================
[package]
name = "propolis-standalone"
version = "0.1.0"
license = "MPL-2.0"
edition = "2021"
rust-version = "1.73"

[[bin]]
name = "propolis-standalone"
path = "src/main.rs"
test = false
doctest = false

[dependencies]
anyhow.workspace = true
atty.workspace = true
bhyve_api.workspace = true
clap = { workspace = true, features = ["derive", "env"] }
cpuid_utils.workspace = true
ctrlc.workspace = true
fatfs.workspace = true
futures.workspace = true
libc.workspace = true
toml.workspace = true
tokio = { workspace = true, features = ["io-util", "rt-multi-thread"] }
serde = { workspace = true, features = ["derive"] }
oxide-tokio-rt.workspace = true
propolis.workspace = true
propolis_types.workspace = true
crucible-client-types = { workspace = true, optional = true }
cpuid_profile_config.workspace = true
erased-serde.workspace = true
serde_json.workspace = true
slog.workspace = true
slog-async.workspace = true
slog-dtrace.workspace = true
slog-bunyan.workspace = true
slog-term.workspace = true
strum = { workspace = true, features = ["derive"] }
tar.workspace = true
uuid.workspace = true
pbind.workspace = true

[features]
default = []
crucible = ["propolis/crucible-full", "propolis/oximeter", "crucible-client-types"]


================================================
FILE: bin/propolis-standalone/README.md
================================================
# Propolis Standalone

Server frontend aside, we also provide a standalone binary for quick
prototyping, `propolis-standalone`. It uses a static TOML configuration file.

## Running

```
# pfexec propolis-standalone <config_file>
```

Example configuration:
```toml
[main]
name = "testvm"
cpus = 4
bootrom = "/path/to/bootrom/OVMF_CODE.fd"
memory = 1024

# Exit propolis-standalone process with <code> if instance halts (default: 0)
# exit_on_halt = <code>

# Exit propolis-standalone process with <code> if instance reboots (default: unset)
# exit_on_reboot = <code>

# Override boot order (via communication to OVMF bootrom)
# boot_order = ["net0", "block0"]

[block_dev.alpine_iso]
type = "file"
path = "/path/to/alpine-extended-3.12.0-x86_64.iso"

[dev.block0]
driver = "pci-virtio-block"
block_dev = "alpine_iso"
pci-path = "0.4.0"

[dev.net0]
driver = "pci-virtio-viona"
vnic = "vnic_name"
pci-path = "0.5.0"
```

Propolis will not destroy the VM instance on exit.  If one exists with the
specified name on start-up, it will be destroyed and created fresh.

Propolis will create a unix domain socket, available at "./ttya",
which acts as a serial port. One such tool for accessing this serial port is
[sercons](https://github.com/jclulow/vmware-sercons), though others (such as
`screen`) would also work.

## Quickstart to Alpine

In the aforementioned config files, there are three major components
that need to be supplied: The guest firmware (bootrom) image, the ISO, and the
VNIC.

Since this is a configuration file, you can supply whatever you'd like, but here
are some options to get up-and-running quickly:

### Guest bootrom

The current recommended and tested guest bootrom is available
[here](https://buildomat.eng.oxide.computer/public/file/oxidecomputer/edk2/image_debug/bf64f45b1a58e69d126a3c6ca1e4512c88668132/OVMF_CODE.fd).

Other UEFI firmware images built from the [Open Virtual Machine Firmware
project](https://github.com/tianocore/tianocore.github.io/wiki/OVMF) may also
work, but these aren't regularly tested and your mileage may vary.

### ISO

Although there are many options for ISOs, an easy option that
should work is the [Alpine Linux distribution](https://alpinelinux.org/downloads/).

These distributions are lightweight, and they have variants
custom-built for virtual machines.

A straightforward option to start with is the "virtual" `x86_64` image.

The "extended" variant contains more useful tools, but will require a
modification of the kernel arguments when booting to see the console on the
serial port.  From Grub, this can be accomplished by pressing "e" (to edit),
adding "console=ttyS0" to the line starting with "/boot/vmlinuz-lts", and
pressing "Control + x" to boot with these parameters.

### VNIC

To see your current network interfaces, you can use the following:

```bash
$ dladm show-link
```

To create a vnic, you can use one of your physical devices
(like "e1000g0", if you have an ethernet connection) as a link
for a VNIC. This can be done as follows:

```bash
NIC_NAME="vnic_prop0"
NIC_MAC="02:08:20:ac:e9:16"
NIC_LINK="e1000g0"

if ! dladm show-vnic $NIC_NAME 2> /dev/null; then
  dladm create-vnic -t -l $NIC_LINK -m $NIC_MAC $NIC_NAME
fi
```

### Running a VM

After you've got the bootrom, an ISO, a VNIC, and a configuration file that
points to them, you're ready to create and run your VM. To do so, make sure
you've done the following:
- build `propolis-standalone`
- start `propolis-standalone`, passing it a valid config
- it will wait to start the VM until you connect to the serial console socket
  (with something like [sercons](https://github.com/jclulow/vmware-sercons))
- login to the VM as root (no password)
- optionally, run `setup-alpine` to configure the VM (including setting a root
  password)

## Using Crucible storage

`propolis-standalone` supports defining crucible-backed storage devices in the
TOML config. It is somewhat inconvenient to do this without scripting, because
`generation` must monotonically increase with each successive connection to the
Downstairs datastore. So if you use this, you need to somehow monotonically bump
up that number in the TOML file before re-launching the VM, unless you're also
creating a new Downstairs region from scratch.

All the crucible configuration options are crucible-specific, so future changes
to crucible may result in changes to the config options here as well. Consult
the [oxidecomputer/crucible](https://github.com/oxidecomputer/crucible) codebase
if you need low level details on what certain options actually do.

Here's an example config. Read the comments for parameter-specific details:

```toml
[block_dev.some_datastore]
type = "crucible"

# === REQUIRED OPTIONS ===
# these MUST match the region configuration downstairs
block_size = 512
blocks_per_extent = 262144
extent_count = 32

# Array of the SocketAddrs of the Downstairs instances. There must be three
# of these, or propolis-standalone will panic.
targets = [
  "127.0.0.1:3810",
  "127.0.0.1:3820",
  "127.0.0.1:3830",
]

# Generation number used when connecting to Downstairs. This must
# monotonically increase with each successive connection to the Downstairs,
# which means that you need to bump this number every time you restart
# your VM. Kind of annoying, maybe we can get a better way to pass it in.
# Anyway, if you don't want to read-modify-write this value, a hack you
# could do is set this to the current number of seconds since the epoch.
# This'll always work, except for if the system time goes backwards, which
# it can definitely do! So, you know. Be careful.
generation = 1
# === END REQUIRED OPTIONS ===


# === OPTIONAL OPTIONS ===
# This should be a UUID. It can be anything, really. When unset, defaults
# to a random UUIDv4
# upstairs_id = "e4396bd0-ede1-48d7-ac14-3d2094dfba5b"

# When true, some random amount of IO requests will synthetically "fail".
# This is useful when testing IO behavior under Bad Conditions.
# Defaults to false.
# lossy = false

# The Upstairs (propolis-side) component of crucible currently dispatches
# flushes at a regular interval to act as IO barriers. By default this happens
# once every 5 seconds, but you can adjust it with this option.
# flush_timeout = <number>

# Base64'd encryption key used to encrypt data at rest. Keys are 256 bits.
# Note that the region must have already been created with encryption
# enabled for this to work. That may change later though.
# encryption_key = ""

# These three values are pem files for TLS encryption of data between
# propolis and the downstairs.
# cert_pem = ""
# key_pem = ""
# root_cert_pem = ""

# Specifies the SocketAddr of the Upstairs crucible control interface. When
# omitted, the control interface won't be started. The control interface is an
# HTTP server that exposes commands to take snapshots, simulate faults, and
# retrieve runtime debug information.
# control_addr = ""

# When true, the device will be read-only. Defaults to false
# read_only = false
# === END OPTIONAL OPTIONS ===
```
## Configuring `cpuid`

Rather than using the built-in `cpuid` data masking offered by the bhyve kernel
VMM, propolis-standalone can load a set of leaf data to be used by the instance.
An example of such configuration data is as follows:

```toml
[main]
# ... other main config bits
cpuid_profile = "NAME"

[cpuid.NAME]
vendor = "amd"
"0" = [0x10, 0x68747541, 0x444d4163, 0x69746e65]
"1" = [0x830f10, 0x10800, 0xf6d83203, 0x178bfbff]
"5" = [0x0, 0x0, 0x0, 0x0]
"6" = [0x4, 0x0, 0x0, 0x0]
"7" = [0x0, 0x0, 0x0, 0x0]
"7-0" = [0x0, 0x201401a9, 0x0, 0x0]
"d" = [0x0, 0x0, 0x0, 0x0]
"d-0" = [0x7, 0x340, 0x340, 0x0]
"d-1" = [0x1, 0x0, 0x0, 0x0]
"d-2" = [0x100, 0x240, 0x0, 0x0]
"80000000" = [0x80000020, 0x68747541, 0x444d4163, 0x69746e65]
"80000001" = [0x830f10, 0x40000000, 0x444031fb, 0x25d3fbff]
"80000002" = [0x20444d41, 0x43595045, 0x38323720, 0x36312032]
"80000003" = [0x726f432d, 0x72502065, 0x7365636f, 0x20726f73]
"80000004" = [0x20202020, 0x20202020, 0x20202020, 0x202020]
"80000005" = [0xff40ff40, 0xff40ff40, 0x20080140, 0x20080140]
"80000006" = [0x48006400, 0x68006400, 0x2006140, 0x2009140]
"80000007" = [0x0, 0x0, 0x0, 0x100]
"80000008" = [0x3030, 0x7, 0x0, 0x10000]
"8000000a" = [0x1, 0x8000, 0x0, 0x13bcff]
"80000019" = [0xf040f040, 0x0, 0x0, 0x0]
"8000001a" = [0x6, 0x0, 0x0, 0x0]
"8000001b" = [0x3ff, 0x0, 0x0, 0x0]
"8000001d" = [0x0, 0x0, 0x0, 0x0]
"8000001d-0" = [0x121, 0x1c0003f, 0x3f, 0x0]
"8000001d-1" = [0x122, 0x1c0003f, 0x3f, 0x0]
"8000001d-2" = [0x143, 0x1c0003f, 0x3ff, 0x2]
"8000001d-3" = [0x163, 0x3c0003f, 0x3fff, 0x1]
"8000001f" = [0x1000f, 0x16f, 0x1fd, 0x1]
```

If `cpuid_profile` is specified under the `main` section, a corresponding
`cpuid` section with a matching name is expected to be defined elsewhere in the
file.  The `vendor` field under that section controls fallback behavior when a
vCPU queries a non-existent leaf, and other CPU-specific behavior.  After that,
the leafs and their register data are listed.  Leafs which require an `ecx`
match (with `eax` as the function, and `ecx` as the index) are specified with a
hyphen separating the function and index.  Leafs without an index (just a single
hex number) will match only against `eax`, and at a lower priority than the
function/index leafs which match `eax` and `ecx`.  The data for leafs is
expected to be a 4-item array of 32-bit integers corresponding to `eax`, `ebx`,
`ecx`, and `edx`, in that order.

Certain fields in `cpuid` data depend on aspects specific to the host (such as
vCPU count) or the vCPU they are associated with (such as APIC ID).  Propolis
will "specialize" the data provided in the `cpuid` profile with logic appropriate
for the specific leafs involved.

## Configuring Cloud-Init

Propolis is able to assemble a disk image formatted in the
[NoCloud](https://cloudinit.readthedocs.io/en/latest/reference/datasources/nocloud.html)
fashion to be consumed by `cloud-init` inside the guest.  An example of such configuration is as follows:
```toml
# ... other configuration bits

# Define a disk device to bear the cloud-init data
[dev.cloudinit]
driver = "pci-virtio-block"
pci-path = "0.16.0"
block_dev = "cloudinit_be"

# Define the backend to that disk as the cloudinit type
[block_dev.cloudinit_be]
type = "cloudinit"

# Data from this cloudinit section will be used to populate the above block_dev
[cloudinit]
user-data = '''
#cloud-config
users:
- default
- name: test
  sudo: 'ALL=(ALL) NOPASSWD:ALL'
  lock_passwd: false
  hashed_passwd: '$6$rounds=4096$MBW/3OrwWLifnv30$QM.oCQ3pzV7X4EToX9IyZmplvaTgpZ6YJ50MhQrwlryj1soqBW5zvraVttYwfyWdxigHpZHTjY9kT.029UOEn1'
'''
# Instead of specifying string data like above, a path to a file can be used too:
# user-data-path = "path/to/file"

# Instance metadata is configured the same way:
# meta-data = "..."
# or
# meta-data-path = "path/to/file"

# Same with network configuration:
# network-config = "..."
# or
# network-config-path = "path/to/file"
```


================================================
FILE: bin/propolis-standalone/src/cidata.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::io::{Cursor, Write};
use std::sync::Arc;

use anyhow::{bail, Context};
use fatfs::{FileSystem, FormatVolumeOptions, FsOptions};
use propolis::block;

use crate::config::Config;

const SECTOR_SZ: usize = 512;
const VOLUME_LABEL: [u8; 11] = *b"cidata     ";

pub(crate) fn build_cidata_be(
    config: &Config,
) -> anyhow::Result<Arc<block::InMemoryBackend>> {
    let cidata = &config
        .cloudinit
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("missing [cloudinit] config section"))?;

    let fields = [
        ("user-data", &cidata.user_data, &cidata.user_data_path),
        ("meta-data", &cidata.meta_data, &cidata.meta_data_path),
        ("network-config", &cidata.network_config, &cidata.network_config_path),
    ];
    let all_data = fields
        .into_iter()
        .map(|(name, str_data, path_data)| {
            Ok((
                name,
                match (str_data, path_data) {
                    (None, None) => vec![],
                    (None, Some(path)) => std::fs::read(path).context(
                        format!("unable to read {name} from {path}"),
                    )?,
                    (Some(data), None) => data.clone().into(),
                    (Some(_), Some(_)) => {
                        bail!("cannot provide path and string for {name}");
                    }
                },
            ))
        })
        .collect::<Result<Vec<_>, _>>()?;

    let file_sectors: usize =
        all_data.iter().map(|(_, data)| data.len().div_ceil(SECTOR_SZ)).sum();
    // vfat can hold more data than this, but we don't expect to ever need that
    // for cloud-init purposes.
    if file_sectors > 512 {
        bail!("too much vfat data: {file_sectors} > 512 sectors");
    }

    // Copying the match already done for this in Omicron:
    //
    // if we're storing < 341 KiB of clusters, the overhead is 37. With a limit
    // of 512 sectors (error check above), we can assume an overhead of 37.
    // Additionally, fatfs refuses to format a disk that is smaller than 42
    // sectors.
    let sectors = 42.max(file_sectors + 37);

    // Some tools also require that the number of sectors is a multiple of the
    // sectors-per-track. fatfs uses a default of 32 which won't evenly divide
    // sectors as we compute above generally. To fix that we simply set it to
    // match the number of sectors to make it trivially true.
    let sectors_per_track = sectors.try_into().unwrap();

    let mut disk = Cursor::new(vec![0; sectors * SECTOR_SZ]);
    fatfs::format_volume(
        &mut disk,
        FormatVolumeOptions::new()
            .bytes_per_cluster(512)
            .sectors_per_track(sectors_per_track)
            .fat_type(fatfs::FatType::Fat12)
            .volume_label(VOLUME_LABEL),
    )
    .context("error formatting FAT volume")?;

    let fs = FileSystem::new(&mut disk, FsOptions::new())?;
    let root_dir = fs.root_dir();
    for (name, data) in all_data.iter() {
        if *name == "network-config" && data.is_empty() {
            // Skip creating an empty network interfaces if nothing is provided.
            // It is not required, unlike the other files
        }
        root_dir.create_file(name)?.write_all(data)?;
    }
    drop(root_dir);
    drop(fs);

    block::InMemoryBackend::create(
        disk.into_inner(),
        block::BackendOpts {
            block_size: Some(SECTOR_SZ as u32),
            read_only: Some(true),
            ..Default::default()
        },
        std::num::NonZeroUsize::new(8).unwrap(),
    )
    .context("could not create block backend")
}


================================================
FILE: bin/propolis-standalone/src/config.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::BTreeMap;
use std::num::NonZeroUsize;
use std::os::unix::fs::FileTypeExt;
use std::str::FromStr;
use std::sync::Arc;

use anyhow::Context;
use cpuid_utils::CpuidSet;
use propolis::vsock::proxy::VsockPortMapping;
use propolis_types::CpuidIdent;
use propolis_types::CpuidValues;
use propolis_types::CpuidVendor;
use serde::{Deserialize, Serialize};

use cpuid_profile_config::*;
use propolis::block;
use propolis::hw::pci::Bdf;

use crate::cidata::build_cidata_be;

/// Top-level `propolis-standalone` configuration, as deserialized from the
/// TOML config file.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Config {
    /// The `[main]` section: instance-wide settings.
    pub main: Main,

    /// Devices, keyed by name, from the `[dev.<name>]` sections. Empty when
    /// no such section is present.
    #[serde(default, rename = "dev")]
    pub devices: BTreeMap<String, Device>,

    /// Block device backends, keyed by name, from the `[block_dev.<name>]`
    /// sections. Empty when no such section is present.
    #[serde(default, rename = "block_dev")]
    pub block_devs: BTreeMap<String, BlockDevice>,

    /// CPUID profiles, keyed by name, from the `[cpuid.<name>]` sections.
    /// Empty when no such section is present.
    #[serde(default, rename = "cpuid")]
    pub cpuid_profiles: BTreeMap<String, CpuidProfile>,

    /// The optional `[cloudinit]` section.
    pub cloudinit: Option<CloudInit>,
}
impl Config {
    /// Look up the CPUID profile selected by `main.cpuid_profile`.
    ///
    /// Returns `None` when no profile name is configured, and also when the
    /// configured name has no matching entry in `cpuid_profiles`.
    pub fn cpuid_profile(&self) -> Option<&CpuidProfile> {
        let selected = self.main.cpuid_profile.as_ref()?;
        self.cpuid_profiles.get(selected)
    }
}

/// Contents of the `[main]` config section.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Main {
    /// Name of the VM instance.
    pub name: String,
    /// Number of vCPUs.
    pub cpus: u8,
    /// Path to the guest bootrom (firmware) image.
    pub bootrom: String,
    /// Optional version string for the bootrom.
    pub bootrom_version: Option<String>,
    /// Guest memory size.
    // NOTE(review): units are not established here (presumably MiB) — confirm
    // against the consumer of this field.
    pub memory: usize,
    /// Whether to use the memory reservoir.
    // NOTE(review): behavior when unset is decided by the consumer — confirm.
    pub use_reservoir: Option<bool>,
    /// Name of the `[cpuid.<name>]` profile to apply, if any (see
    /// `Config::cpuid_profile`).
    pub cpuid_profile: Option<String>,
    /// How vCPUs should be bound to physical processors, if at all. If not
    /// provided, vCPUs are not bound (equivalent to setting `any`).
    pub cpu_binding: Option<BindingStrategy>,
    /// Process exitcode to emit if/when instance halts
    ///
    /// Default: 0
    #[serde(default)]
    pub exit_on_halt: u8,
    /// Process exitcode to emit if/when instance reboots
    ///
    /// Default: None, does not exit on reboot
    #[serde(default)]
    pub exit_on_reboot: Option<u8>,

    /// Request bootrom override boot order using the devices specified
    pub boot_order: Option<Vec<String>>,
}

/// Strategy for binding vCPUs to physical processors.
///
/// Variants are (de)serialized in kebab-case: `any`, `upper-half`.
#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub enum BindingStrategy {
    /// vCPUs are not bound to any particular physical processor.
    Any,
    /// vCPUs are bound to the highest-numbered processors in the system, with
    /// the first vCPU bound to CPU `last - N_vCPU + 1` and the last vCPU bound
    /// to the last CPU.
    ///
    /// An example given a system with 10 CPUs running a VM with 6 vCPUs:
    /// ```text
    /// host CPU number:   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
    /// guest vCPU number: |   |   |   |   | 0 | 1 | 2 | 3 | 4 | 5 |
    /// ```
    UpperHalf,
}

/// A hard-coded device, either enabled by default or accessible locally
/// on a machine.
///
/// Corresponds to one `[dev.<name>]` config section.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Device {
    /// Name of the driver to instantiate, e.g. `"pci-virtio-block"`.
    pub driver: String,

    /// All remaining keys of the section (flattened), left untyped so each
    /// driver can interpret its own options (e.g. `block_dev`, `pci-path`).
    #[serde(flatten, default)]
    pub options: BTreeMap<String, toml::Value>,
}

/// Options common to all block device backends. Each is optional and is
/// copied field-for-field into `block::BackendOpts` by `block_backend`.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct BlockOpts {
    /// Block size for the backend, if overridden.
    pub block_size: Option<u32>,
    /// Whether the backend is read-only, if specified.
    pub read_only: Option<bool>,
    /// Whether flushes should be skipped, if specified.
    pub skip_flush: Option<bool>,
}

/// A block device backend definition, corresponding to one
/// `[block_dev.<name>]` config section.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct BlockDevice {
    /// Backend type, read from the `type` key (e.g. `"file"`). Defaults to
    /// the empty string when the key is absent.
    #[serde(default, rename = "type")]
    pub bdtype: String,

    /// Backend-independent options, read from the same section (flattened).
    #[serde(flatten)]
    pub block_opts: BlockOpts,

    /// Keys of the section kept as untyped values (flattened), to be
    /// interpreted according to `bdtype`.
    #[serde(flatten, default)]
    pub options: BTreeMap<String, toml::Value>,
}

/// Contents of the `[cloudinit]` config section.
///
/// Keys are kebab-case in the config file (`user-data`, `user-data-path`,
/// ...). Each item may be given inline or as a path, but not both.
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub struct CloudInit {
    /// Inline contents of the `user-data` file.
    pub user_data: Option<String>,
    /// Inline contents of the `meta-data` file.
    pub meta_data: Option<String>,
    /// Inline contents of the `network-config` file.
    pub network_config: Option<String>,

    // allow path-style contents as well
    /// Path to a file providing the `user-data` contents.
    pub user_data_path: Option<String>,
    /// Path to a file providing the `meta-data` contents.
    pub meta_data_path: Option<String>,
    /// Path to a file providing the `network-config` contents.
    pub network_config_path: Option<String>,
}

/// Options specific to the `"file"` block backend type, deserialized from a
/// block device's untyped options.
#[derive(Deserialize)]
struct FileConfig {
    /// Path of the backing file.
    path: String,
    /// Optional (non-zero) worker count for the backend.
    workers: Option<NonZeroUsize>,
}
/// Options for an in-memory asynchronous block backend.
// NOTE(review): the backend type string that selects this config is not
// visible here — confirm in `block_backend`.
#[derive(Deserialize)]
struct MemAsyncConfig {
    /// Size of the backing memory (presumably in bytes — TODO confirm).
    size: u64,
    /// Optional worker count for the backend.
    workers: Option<usize>,
}

/// Optional tuning parameters for a viona network device, as they appear in
/// the device's config section.
#[derive(Deserialize)]
pub struct VionaDeviceParams {
    /// Overrides `DeviceParams::copy_data` when set.
    tx_copy_data: Option<bool>,
    /// Overrides `DeviceParams::header_pad` when set.
    tx_header_pad: Option<u16>,
}
impl VionaDeviceParams {
    /// Extract viona device parameters from a device's untyped options.
    ///
    /// Returns `Ok(None)` when neither `tx_copy_data` nor `tx_header_pad` is
    /// present. Otherwise returns parameters in which every unspecified
    /// value is taken from `DeviceParams::default()`. Deserialization
    /// failures are returned as errors.
    pub fn from_opts(
        opts: &BTreeMap<String, toml::Value>,
    ) -> Result<Option<propolis::hw::virtio::viona::DeviceParams>, anyhow::Error>
    {
        use propolis::hw::virtio::viona::DeviceParams;

        let parsed: Self = opt_deser(opts)?;
        let params = match (parsed.tx_copy_data, parsed.tx_header_pad) {
            (None, None) => None,
            (copy_data, header_pad) => {
                let fallback = DeviceParams::default();
                Some(DeviceParams {
                    copy_data: copy_data.unwrap_or(fallback.copy_data),
                    header_pad: header_pad.unwrap_or(fallback.header_pad),
                })
            }
        };
        Ok(params)
    }
}

/// Options for a vsock device, deserialized from a device's free-form
/// option table.
#[derive(Deserialize)]
pub struct VsockDevice {
    pub guest_cid: u64,
    pub port_mappings: Vec<VsockPortMapping>,
}

impl VsockDevice {
    /// Deserialize the vsock settings out of a device's option table.
    ///
    /// # Errors
    ///
    /// Fails if the options cannot be deserialized into this struct.
    pub fn from_opts(
        opts: &BTreeMap<String, toml::Value>,
    ) -> Result<VsockDevice, anyhow::Error> {
        opt_deser::<Self>(opts)
    }
}

// Try to turn unmatched flattened options into a config struct
fn opt_deser<'de, T: Deserialize<'de>>(
    value: &BTreeMap<String, toml::Value>,
) -> Result<T, anyhow::Error> {
    let map = toml::map::Map::from_iter(value.clone());
    let config = map.try_into::<T>()?;
    Ok(config)
}

/// Worker count used by the `file` and `mem-async` block backends when the
/// configuration does not specify `workers`.
const DEFAULT_WORKER_COUNT: usize = 8;
/// Largest `workers` value accepted for a `file` block backend; a larger
/// request falls back to `DEFAULT_WORKER_COUNT` with a warning.
const MAX_FILE_WORKERS: usize = 32;

/// Build the block backend referenced by a device's `block_dev` option.
///
/// The named entry is looked up in `config.block_devs` and a backend is
/// created according to its `type` (`file`, `crucible`, `crucible-mem`,
/// `mem-async` or `cloudinit`).
///
/// Returns the backend together with the name under which it was configured.
///
/// # Panics
///
/// Configuration problems are fatal here. This function panics if
/// `block_dev` is missing or not a string, if no block device of that name
/// is configured, if the type is unrecognized, if the type-specific options
/// do not deserialize, if the `file` path cannot be inspected, if a
/// `mem-async` worker count of zero is requested, or if backend creation
/// fails.
pub fn block_backend(
    config: &Config,
    dev: &Device,
    log: &slog::Logger,
) -> (Arc<dyn block::Backend>, String) {
    let backend_name = dev
        .options
        .get("block_dev")
        .unwrap_or_else(|| {
            panic!(
                "device with driver \"{}\" is missing the \"block_dev\" option",
                dev.driver
            )
        })
        .as_str()
        .unwrap_or_else(|| {
            panic!(
                "\"block_dev\" option of device with driver \"{}\" must be a string",
                dev.driver
            )
        });
    let Some(be) = config.block_devs.get(backend_name) else {
        panic!("No configured block device named \"{backend_name}\"");
    };
    let opts = block::BackendOpts {
        block_size: be.block_opts.block_size,
        read_only: be.block_opts.read_only,
        skip_flush: be.block_opts.skip_flush,
    };

    let be = match &be.bdtype as &str {
        "file" => {
            let parsed: FileConfig = opt_deser(&be.options).unwrap();

            // Check if raw device is being used and gripe if it isn't
            let meta = std::fs::metadata(&parsed.path)
                .with_context(|| {
                    format!(
                        "opening {} for block device \"{backend_name}\"",
                        parsed.path,
                    )
                })
                .expect("file device path is valid");

            if meta.file_type().is_block_device() {
                slog::warn!(log, "Block backend using standard device rather than raw";
                    "path" => &parsed.path);
            }

            let default_workers =
                NonZeroUsize::new(DEFAULT_WORKER_COUNT).unwrap();
            let workers: NonZeroUsize = match parsed.workers {
                Some(workers) if workers.get() <= MAX_FILE_WORKERS => workers,
                Some(_) => {
                    // An out-of-range request is not fatal: fall back to
                    // the default and tell the operator about it.
                    slog::warn!(
                        log,
                        "workers must be between 1 and {}. \
                        Using default value of {}.",
                        MAX_FILE_WORKERS,
                        DEFAULT_WORKER_COUNT,
                    );
                    default_workers
                }
                None => default_workers,
            };
            block::FileBackend::create(&parsed.path, opts, workers, log.clone())
                .unwrap()
        }
        "crucible" => create_crucible_backend(be, opts, log),
        "crucible-mem" => create_crucible_mem_backend(be, opts, log),
        "mem-async" => {
            let parsed: MemAsyncConfig = opt_deser(&be.options).unwrap();

            // `workers` is a plain integer for this backend, so zero has to
            // be rejected here rather than during deserialization.
            let workers = NonZeroUsize::new(
                parsed.workers.unwrap_or(DEFAULT_WORKER_COUNT),
            )
            .expect("mem-async workers must be non-zero");

            block::MemAsyncBackend::create(parsed.size, opts, workers).unwrap()
        }
        "cloudinit" => build_cidata_be(config).unwrap(),
        _ => {
            panic!("unrecognized block dev type {}!", be.bdtype);
        }
    };
    (be, backend_name.into())
}

/// Read and parse the TOML configuration file at `path`.
///
/// # Errors
///
/// Fails if the file cannot be read, is not valid UTF-8, or does not
/// deserialize into a `Config`.
pub fn parse(path: &str) -> anyhow::Result<Config> {
    let raw =
        std::fs::read(path).context("Failed to read given config.toml")?;
    let text =
        std::str::from_utf8(&raw).context("config should be valid utf-8")?;
    let config = toml::from_str::<Config>(text)?;
    Ok(config)
}

/// Parse a PCI address written as three dot-separated decimal numbers.
///
/// Every field must parse as an integer in `0..=255` and there must be
/// exactly three of them; otherwise `None` is returned. The final validity
/// decision is delegated to `Bdf::new`.
pub fn parse_bdf(v: &str) -> Option<Bdf> {
    // Parsing straight into `u8` rejects both non-numeric fields and values
    // above 255; the first bad field makes the whole collection `None`.
    let fields = v
        .split('.')
        .map(|field| field.parse::<u8>().ok())
        .collect::<Option<Vec<u8>>>()?;

    match fields.as_slice() {
        &[bus, dev, func] => Bdf::new(bus, dev, func),
        _ => None,
    }
}

/// Build the CPUID set described by the configured CPUID profile, if any.
///
/// Returns `Ok(None)` when the configuration names no CPUID profile.
///
/// # Errors
///
/// Fails if the profile cannot be converted into CPUID entries, if an entry
/// cannot be inserted into the set, or if two entries describe the same
/// leaf/subleaf.
pub fn parse_cpuid(config: &Config) -> anyhow::Result<Option<CpuidSet>> {
    let Some(profile) = config.cpuid_profile() else {
        return Ok(None);
    };

    let vendor = match profile.vendor {
        CpuVendor::Amd => CpuidVendor::Amd,
        CpuVendor::Intel => CpuidVendor::Intel,
    };
    let mut set = CpuidSet::new(vendor);
    let entries: Vec<CpuidEntry> = profile.try_into()?;
    for entry in entries {
        let conflict = set.insert(
            CpuidIdent { leaf: entry.func, subleaf: entry.idx },
            CpuidValues::from(entry.values),
        )?;

        // A previous value for the same leaf/subleaf means the profile
        // defines that entry more than once.
        if conflict.is_some() {
            anyhow::bail!(
                "conflicting entry at func:{:#?} idx:{:#?}",
                entry.func,
                entry.idx
            )
        }
    }
    Ok(Some(set))
}

/// Create a Crucible block backend from the options of a `crucible` block
/// device entry.
///
/// A `VolumeConstructionRequest::Region` is assembled from the parsed
/// options and handed to `block::CrucibleBackend::create`, which is driven
/// to completion on the current tokio runtime.
///
/// # Panics
///
/// Panics if `block_size` is not set, if the options do not deserialize, if
/// `upstairs_id`, `targets` or `control_addr` are malformed, or if backend
/// creation fails.
#[cfg(feature = "crucible")]
fn create_crucible_backend(
    be: &BlockDevice,
    opts: block::BackendOpts,
    log: &slog::Logger,
) -> Arc<dyn block::Backend> {
    use slog::info;
    use std::net::SocketAddr;
    use uuid::Uuid;

    #[derive(Deserialize)]
    struct CrucibleConfig {
        blocks_per_extent: u64,
        extent_count: u32,
        upstairs_id: Option<String>,
        targets: [String; 3],

        // This needs to increase monotonically with each successive connection
        // to the downstairs. As a hack, you can set it to the current system
        // time, and this will usually give us a newer generation than the last
        // connection. NEVER do this in prod EVER.
        generation: u64,

        lossy: Option<bool>,
        flush_timeout: Option<f32>,
        encryption_key: Option<String>,
        cert_pem: Option<String>,
        key_pem: Option<String>,
        root_cert_pem: Option<String>,
        control_addr: Option<String>,
    }

    info!(
        log,
        "Building a crucible VolumeConstructionRequest from options {:?}",
        be.options
    );

    // The block size is deliberately not defaulted: guessing what the
    // downstairs uses is unsafe, even though 512 is a common value.
    let block_size =
        u64::from(opts.block_size.expect("block_size is provided"));
    let read_only = opts.read_only.unwrap_or(false);

    let CrucibleConfig {
        blocks_per_extent,
        extent_count,
        upstairs_id,
        targets,
        generation,
        lossy,
        flush_timeout,
        encryption_key,
        cert_pem,
        key_pem,
        root_cert_pem,
        control_addr,
    } = opt_deser(&be.options).unwrap();

    // Use the configured UUID when there is one, otherwise make one up.
    // That is acceptable for a testing tool such as propolis-standalone,
    // but not something to do in production.
    let id = match upstairs_id {
        Some(val) => Uuid::parse_str(&val).expect("upstairs_id is valid uuid"),
        None => Uuid::new_v4(),
    };

    let target: Vec<SocketAddr> = targets
        .iter()
        .map(|addr| addr.parse::<SocketAddr>())
        .collect::<Result<_, _>>()
        .expect("targets contains valid socket addresses");

    let control = match control_addr {
        Some(val) => Some(
            val.parse::<SocketAddr>()
                .expect("control_addr is valid socket addr"),
        ),
        None => None,
    };

    let req = crucible_client_types::VolumeConstructionRequest::Region {
        block_size,
        blocks_per_extent,
        extent_count,
        opts: crucible_client_types::CrucibleOpts {
            id,
            target,
            lossy: lossy.unwrap_or(false),
            flush_timeout,
            key: encryption_key,
            cert_pem,
            key_pem,
            root_cert_pem,
            control,
            read_only,
        },
        generation,
    };
    info!(log, "Creating Crucible disk from request {:?}", req);
    // QUESTION: is producer_registry: None correct here?
    let creation = async move {
        block::CrucibleBackend::create(req, opts, None, None, log.clone())
            .await
            .unwrap()
    };
    tokio::runtime::Handle::current().block_on(creation)
}

/// Create an in-memory Crucible block backend of the configured `size`.
///
/// Creation is driven to completion on the current tokio runtime.
///
/// # Panics
///
/// Panics if the options do not deserialize or if backend creation fails.
#[cfg(feature = "crucible")]
fn create_crucible_mem_backend(
    be: &BlockDevice,
    opts: block::BackendOpts,
    log: &slog::Logger,
) -> Arc<dyn block::Backend> {
    #[derive(Deserialize)]
    struct CrucibleMemConfig {
        size: u64,
    }
    let CrucibleMemConfig { size } = opt_deser(&be.options).unwrap();

    let creation = async move {
        block::CrucibleBackend::create_mem(size, opts, log.clone())
            .await
            .unwrap()
    };
    tokio::runtime::Handle::current().block_on(creation)
}

/// Stand-in used when the `crucible` feature is disabled.
///
/// Always panics, telling the user to rebuild with the feature enabled.
#[cfg(not(feature = "crucible"))]
fn create_crucible_backend(
    _be: &BlockDevice,
    _opts: block::BackendOpts,
    _log: &slog::Logger,
) -> Arc<dyn block::Backend> {
    panic!(
        "Rebuild propolis-standalone with 'crucible' feature enabled in \
           order to use the crucible block backend"
    );
}

/// Stand-in used when the `crucible` feature is disabled.
///
/// Always panics, telling the user to rebuild with the feature enabled.
#[cfg(not(feature = "crucible"))]
fn create_crucible_mem_backend(
    _be: &BlockDevice,
    _opts: block::BackendOpts,
    _log: &slog::Logger,
) -> Arc<dyn block::Backend> {
    panic!(
        "Rebuild propolis-standalone with 'crucible' feature enabled in \
           order to use the crucible-mem block backend"
    );
}


================================================
FILE: bin/propolis-standalone/src/main.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::{BTreeMap, VecDeque};
use std::fmt;
use std::fs::File;
use std::io::{Error, ErrorKind, Result};
use std::path::Path;
use std::process::ExitCode;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Condvar, Mutex, MutexGuard};
use std::time::{SystemTime, UNIX_EPOCH};

use anyhow::Context;
use clap::Parser;
use futures::future::BoxFuture;
use propolis::hw::qemu::pvpanic::QemuPvpanic;
use propolis::vsock::GuestCid;
use propolis_types::{CpuidIdent, CpuidValues, CpuidVendor};
use slog::{o, Drain};
use strum::IntoEnumIterator;
use tokio::runtime;

use propolis::chardev::{BlockingSource, Sink, Source, UDSock};
use propolis::common::{GB, MB};
use propolis::firmware::smbios;
use propolis::hw::chipset::{i440fx, Chipset};
use propolis::hw::ps2::ctrl::PS2Ctrl;
use propolis::hw::qemu::fwcfg;
use propolis::hw::uart::LpcUart;
use propolis::hw::{ibmpc, qemu};
use propolis::intr_pins::FuncPin;
use propolis::usdt::register_probes;
use propolis::vcpu::Vcpu;
use propolis::vmm::{Builder, Machine};
use propolis::*;

mod cidata;
mod config;
mod snapshot;

// Mask covering the low 12 bits of a value.
// NOTE(review): presumably the offset-within-page mask for 4 KiB pages; its
// users are outside this part of the file -- confirm.
const PAGE_OFFSET: u64 = 0xfff;
// Arbitrary ROM limit for now (0x20_0000 bytes = 2 MiB)
const MAX_ROM_SIZE: usize = 0x20_0000;

// NOTE(review): judging by the names these size the tokio runtime's thread
// pool; the code using them is outside this part of the file -- confirm.
const MIN_RT_THREADS: usize = 8;
const BASE_RT_THREADS: usize = 4;

/// Events consumed by the instance state driver.
///
/// Each event carries a priority (see [`InstEvent::priority`]) which is used
/// to decide whether a queued event is still relevant once another event is
/// already being processed.
#[derive(Copy, Clone, Debug)]
enum InstEvent {
    Halt,
    ReqHalt,

    Reset,
    TripleFault,

    ReqSave,

    ReqStart,
}
impl InstEvent {
    /// Relative importance of this event; larger values win.
    fn priority(&self) -> u8 {
        use InstEvent::*;

        match *self {
            ReqStart => 0,
            ReqSave => 1,
            Reset | TripleFault => 2,
            Halt | ReqHalt => 3,
        }
    }
    /// Whether this event makes `comp` irrelevant, which is the case when
    /// `comp` has the same or a lower priority.
    fn supersedes(&self, comp: &Self) -> bool {
        comp.priority() <= self.priority()
    }
}
impl From<propolis::exits::Suspend> for InstEvent {
    fn from(value: propolis::exits::Suspend) -> Self {
        match value {
            exits::Suspend::Halt => Self::Halt,
            exits::Suspend::Reset => Self::Reset,
            exits::Suspend::TripleFault(_) => Self::TripleFault,
        }
    }
}

/// Describes where an instance event came from. The payload is only ever
/// read through the `Debug` impl, when the state driver logs the event.
#[derive(Clone, Debug)]
// Silence the lint about detail fields being unused, since rustc ignores the
// derived Debug impl which does read those bits.
#[allow(dead_code)]
enum EventCtx {
    // NOTE(review): presumably the id of the vCPU which raised the event.
    Vcpu(i32),
    Pin(String),
    User(String),
    /// Free-form description, e.g. a VM entry error reported by a vCPU task.
    Other(String),
}

/// Mutex-protected contents of an [`EventQueue`].
#[derive(Default)]
struct EQInner {
    events: VecDeque<(InstEvent, EventCtx)>,
}
/// FIFO of pending instance events, paired with a condition variable so a
/// consumer can sleep until something has been queued.
#[derive(Default)]
struct EventQueue {
    inner: Mutex<EQInner>,
    cv: Condvar,
}
impl EventQueue {
    /// Create an empty, shareable queue.
    fn new() -> Arc<Self> {
        Arc::default()
    }
    /// Append an event (with its context) and wake one waiter.
    fn push(&self, ev: InstEvent, ctx: EventCtx) {
        let mut queue = self.inner.lock().unwrap();
        queue.events.push_back((ev, ctx));
        self.cv.notify_one();
    }
    /// Pop the oldest queued event which is not superseded by `cur`.
    ///
    /// Events which `cur` supersedes are removed from the queue and dropped
    /// along the way. With `cur == None` the oldest event is returned as-is.
    /// Returns `None` once the queue has been exhausted.
    fn pop_superseding(
        &self,
        cur: Option<&InstEvent>,
    ) -> Option<(InstEvent, EventCtx)> {
        let mut queue = self.inner.lock().unwrap();
        while let Some(candidate) = queue.events.pop_front() {
            let superseded = matches!(
                cur,
                Some(cur_ev) if cur_ev.supersedes(&candidate.0)
            );
            if !superseded {
                return Some(candidate);
            }
            // The candidate is irrelevant given the current event: discard
            // it and keep looking.
        }
        None
    }
    /// Block the calling thread until the queue is non-empty.
    fn wait(&self) {
        let _held = self
            .cv
            .wait_while(self.inner.lock().unwrap(), |q| q.events.is_empty());
    }
}

#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum State {
    /// Initial state.
    Initialize,
    /// The instance is actively running.
    Run,
    /// The instance is in a paused state such that it may
    /// later be booted or maintained.
    Quiesce,
    /// The instance state is being exported
    Save,
    /// The instance is no longer running
    Halt,
    /// The instance is rebooting, and should transition back
    /// to the "Run" state.
    Reset,
    /// Terminal state in which the instance is torn down.
    Destroy,
}
impl State {
    fn next(&self, ev: InstEvent) -> (Self, Option<InstEvent>) {
        match self {
            State::Initialize => {
                if matches!(ev, InstEvent::ReqStart) {
                    (State::Run, None)
                } else {
                    // All other events require a quiesce first
                    (State::Quiesce, Some(ev))
                }
            }
            State::Run => {
                if matches!(ev, InstEvent::ReqStart) {
                    // Discard any duplicate start requests when running
                    (State::Run, None)
                } else {
                    // All other events require a quiesce first
                    (State::Quiesce, Some(ev))
                }
            }
            State::Quiesce => match ev {
                InstEvent::Halt | InstEvent::ReqHalt => (State::Halt, Some(ev)),
                InstEvent::Reset | InstEvent::TripleFault => {
                    (State::Reset, Some(ev))
                }
                InstEvent::ReqSave => (State::Save, Some(ev)),
                InstEvent::ReqStart => {
                    // Reaching quiesce with a "start" event would be odd
                    panic!("unexpected ReqStart");
                }
            },
            State::Save => (State::Halt, Some(ev)),
            State::Halt => (State::Destroy, None),
            State::Reset => match ev {
                InstEvent::Halt | InstEvent::ReqHalt => (State::Halt, Some(ev)),
                InstEvent::Reset | InstEvent::TripleFault => (State::Run, None),
                _ => (State::Run, Some(ev)),
            },
            State::Destroy => (State::Destroy, None),
        }
    }
}

/// Registry of the devices and block backends belonging to an instance,
/// keyed by name.
#[derive(Default)]
struct Inventory {
    devs: BTreeMap<String, Arc<dyn propolis::common::Lifecycle>>,
    block: BTreeMap<String, Arc<dyn propolis::block::Backend>>,
}
impl Inventory {
    /// Register a device under its type name. A device already registered
    /// under that name is replaced.
    fn register<D: propolis::common::Lifecycle>(&mut self, dev: &Arc<D>) {
        let key: String = dev.type_name().into();
        let entry = dev.clone() as Arc<dyn propolis::common::Lifecycle>;
        self.devs.insert(key, entry);
    }
    /// Register one of possibly several devices of the same type, keyed as
    /// `<type name>-<name>`.
    fn register_instance<D: propolis::common::Lifecycle>(
        &mut self,
        dev: &Arc<D>,
        name: &str,
    ) {
        let key = format!("{}-{name}", dev.type_name());
        let entry = dev.clone() as Arc<dyn propolis::common::Lifecycle>;
        self.devs.insert(key, entry);
    }
    /// Register a block backend under `name`.
    fn register_block(
        &mut self,
        be: &Arc<dyn propolis::block::Backend>,
        name: String,
    ) {
        let entry = be.clone();
        self.block.insert(name, entry);
    }
    /// Detach every block backend and release all held references.
    fn destroy(&mut self) {
        // Detach all block backends from their devices
        self.block.values().for_each(|be| {
            be.attachment().detach();
        });

        // Drop all refs in the hopes that things can clean up after themselves
        self.devs.clear();
        self.block.clear();
    }
}

/// Mutable instance state, protected by the mutex in `InstInner`.
struct InstState {
    /// The underlying machine; taken (and dropped) when the instance is
    /// destroyed.
    machine: Option<propolis::Machine>,
    /// Registered devices and block backends.
    inventory: Inventory,
    /// Current lifecycle state, updated by the state loop.
    state: State,
    /// Control handles for the vCPU threads, used to run, hold and exit
    /// them. Drained when the instance halts.
    vcpu_tasks: Vec<propolis::tasks::TaskCtrl>,
    /// Exit code to report from `wait_destroyed`, if one has been decided.
    exit_code: Option<u8>,
}

/// Shared core of an `Instance`, referenced by the vCPU threads and the
/// state-driver thread.
struct InstInner {
    state: Mutex<InstState>,
    /// Incremented each time the instance is reset, so vCPU tasks can notice
    /// that a reinitialization happened while they were held.
    boot_gen: AtomicUsize,
    /// Queue of events feeding the state loop.
    eq: Arc<EventQueue>,
    /// Notified once the instance has reached `State::Destroy`.
    cv: Condvar,
    config: config::Config,
    /// Socket which is shut down when the instance is destroyed.
    com1_sock: Arc<UDSock>,
}

/// Handle to a running instance; a thin wrapper around the shared
/// `InstInner`.
struct Instance(Arc<InstInner>);
impl Instance {
    /// Create an instance around `machine` and spawn its worker threads.
    ///
    /// One thread is spawned per vCPU, each starting out held, and one
    /// "state loop" thread is spawned to drive the instance through its
    /// lifecycle. If the configuration requests `UpperHalf` CPU binding, each
    /// vCPU thread binds itself to its assigned host CPU before entering the
    /// vCPU loop.
    ///
    /// `from_restore` is forwarded to the state loop, which uses it to skip
    /// the initial vCPU setup and to resume the VM before first running it.
    ///
    /// Must be called from within a tokio runtime, since the current runtime
    /// handle is captured for the state-loop thread.
    ///
    /// # Panics
    ///
    /// Panics if the host CPU count cannot be determined, if `UpperHalf`
    /// binding is requested for more vCPUs than there are online host CPUs,
    /// or if a thread cannot be spawned.
    fn new(
        machine: propolis::Machine,
        config: config::Config,
        from_restore: bool,
        log: slog::Logger,
        com1_sock: Arc<UDSock>,
    ) -> Self {
        let this = Self(Arc::new(InstInner {
            state: Mutex::new(InstState {
                machine: Some(machine),
                inventory: Inventory::default(),
                state: State::Initialize,
                vcpu_tasks: Vec::new(),
                exit_code: None,
            }),
            boot_gen: AtomicUsize::new(0),
            eq: EventQueue::new(),
            cv: Condvar::new(),
            config,
            com1_sock,
        }));

        // Some gymnastics required for the split borrow through the MutexGuard
        let mut state_guard = this.0.state.lock().unwrap();
        let state = &mut *state_guard;
        let machine = state.machine.as_ref().unwrap();

        let bind_cpus = match this.0.config.main.cpu_binding {
            Some(config::BindingStrategy::UpperHalf) => {
                let total_cpus =
                    pbind::online_cpus().expect("can get processor count");
                let vcpu_count: i32 = machine
                    .vcpus
                    .len()
                    .try_into()
                    .expect("vCPU count <= MAXCPU < i32::MAX");

                // Without this check the subtraction below would go
                // negative and negative CPU ids would be handed to the
                // binding call. Fail early with a clear message instead.
                assert!(
                    vcpu_count <= total_cpus,
                    "cannot bind {vcpu_count} vCPUs to the upper CPUs of a \
                     host with only {total_cpus} online CPUs",
                );

                let first_bound_cpu = total_cpus - vcpu_count;
                let bind_cpus =
                    (first_bound_cpu..total_cpus).map(Some).collect();
                slog::info!(
                    &log,
                    "Explicit CPU binding requested";
                    "last_cpu" => total_cpus,
                    "first_cpu" => first_bound_cpu,
                    "vcpu_count" => vcpu_count,
                );
                bind_cpus
            }
            Some(config::BindingStrategy::Any) | None => {
                vec![None; machine.vcpus.len()]
            }
        };

        for (vcpu, bind_cpu) in
            machine.vcpus.iter().map(Arc::clone).zip(bind_cpus)
        {
            // Tasks start out held; the state loop releases them when the
            // instance enters the Run state.
            let (task, ctrl) =
                propolis::tasks::TaskHdl::new_held(Some(vcpu.barrier_fn()));

            let inner = this.0.clone();
            let task_log = log.new(slog::o!("vcpu" => vcpu.id));
            let _ = std::thread::Builder::new()
                .name(format!("vcpu-{}", vcpu.id))
                .spawn(move || {
                    if let Some(bind_cpu) = bind_cpu {
                        pbind::bind_lwp(bind_cpu).expect("can bind vcpu");
                    }
                    Instance::vcpu_loop(inner, vcpu.as_ref(), &task, task_log)
                })
                .unwrap();
            state.vcpu_tasks.push(ctrl);
        }
        drop(state_guard);

        let rt_hdl = runtime::Handle::current();
        let inner = this.0.clone();
        let state_log = log.clone();
        let _ = std::thread::Builder::new()
            .name("state loop".to_string())
            .spawn(move || {
                // Make sure the instance state driver has access to tokio
                let _rt_guard = rt_hdl.enter();
                Instance::state_loop(inner, from_restore, state_log)
            })
            .unwrap();

        this
    }

    /// Drive every registered device, and where needed every block backend,
    /// into `state`.
    ///
    /// * Devices are started (`first_boot`) or resumed for `Run`, paused for
    ///   `Quiesce`, halted for `Halt` and reset for `Reset`.
    /// * For `Quiesce`, this additionally blocks until every device reports
    ///   that its pause has completed.
    /// * Block backends are started on the first `Run`, and stopped and
    ///   detached on `Halt`.
    ///
    /// # Panics
    ///
    /// Panics if a device fails to start, if a block backend fails to start,
    /// or if called with any other state while devices are registered.
    fn device_state_transition(
        state: State,
        guard: &MutexGuard<InstState>,
        first_boot: bool,
        _log: &slog::Logger,
    ) {
        let inventory = &guard.inventory;

        for (name, device) in &inventory.devs {
            match state {
                State::Run if first_boot => {
                    if device.start().is_err() {
                        panic!("device {name} failed to start");
                    }
                }
                State::Run => {
                    device.resume();
                }
                State::Quiesce => device.pause(),
                State::Halt => device.halt(),
                State::Reset => device.reset(),
                _ => panic!("invalid device state transition {state:?}"),
            }
        }

        if state == State::Quiesce {
            let mut pending: futures::stream::FuturesUnordered<
                BoxFuture<'static, ()>,
            > = inventory.devs.values().map(|dev| dev.paused()).collect();

            // Wait for all of the pause futures to complete
            tokio::runtime::Handle::current().block_on(async move {
                use futures::stream::StreamExt;
                while pending.next().await.is_some() {}
            });
        }

        // Drive block backends through their necessary states too
        if state == State::Run && first_boot {
            tokio::runtime::Handle::current().block_on(async {
                for be in inventory.block.values() {
                    be.start().await.expect("blockdev start succeeds");
                }
            });
        } else if state == State::Halt {
            tokio::runtime::Handle::current().block_on(async {
                for be in inventory.block.values() {
                    be.stop().await;
                    be.attachment().detach();
                }
            });
        }
    }

    /// Body of the state-driver thread.
    ///
    /// Repeatedly takes events off the instance event queue and walks the
    /// instance through the transitions computed by `State::next`, performing
    /// the work each new state requires (running/holding vCPUs, pausing and
    /// resuming devices, saving, resetting, ...). Returns once the instance
    /// has reached `State::Destroy`, after notifying anyone blocked in
    /// `wait_destroyed`.
    ///
    /// `from_restore` indicates that the instance was restored from a
    /// snapshot: the initial vCPU setup is skipped and the VM is resumed
    /// before it is run for the first time.
    ///
    /// The state mutex is held throughout, except while waiting for events.
    fn state_loop(
        inner: Arc<InstInner>,
        from_restore: bool,
        log: slog::Logger,
    ) {
        let mut guard = inner.state.lock().unwrap();
        let mut cur_ev = None;

        if !from_restore {
            // Initialize vCPUs to standard x86 state, unless this instance is
            // being restored from a snapshot, in which case the snapshot state
            // will be injected prior to start-up.
            let machine = guard.machine.as_ref().unwrap();
            machine.vcpu_x86_setup().unwrap();
        }

        // If instance was restored from previously-saved state, the kernel VMM
        // portion will be paused so it could be consistently loaded.  Issue the
        // necessary resume before attempting to run.
        let mut needs_resume = from_restore;

        assert!(matches!(guard.state, State::Initialize));
        loop {
            // Pick up a queued event if it outranks the one (if any) which is
            // currently being processed.
            if let Some((next_ev, ctx)) =
                inner.eq.pop_superseding(cur_ev.as_ref())
            {
                slog::info!(&log, "Instance event {:?} ({:?})", next_ev, ctx);
                cur_ev = Some(next_ev);
            }

            if cur_ev.is_none() {
                // Nothing to do: release the state lock while sleeping so
                // other threads can access the instance state.
                drop(guard);
                inner.eq.wait();
                guard = inner.state.lock().unwrap();
                continue;
            }

            let (next_state, resid_ev) = guard.state.next(cur_ev.unwrap());
            if guard.state == next_state {
                // The event does not change the state (e.g. a duplicate
                // start request while running). It must still be consumed:
                // leaving `cur_ev` in place would make this loop spin on the
                // same event forever while holding the state lock.
                cur_ev = resid_ev;
                continue;
            }

            slog::info!(
                &log,
                "State transition {:?} -> {:?}",
                guard.state,
                next_state
            );
            match next_state {
                State::Initialize => {
                    panic!("initialize state should not be visited again")
                }
                State::Run => {
                    // start device emulation and vCPUs
                    Self::device_state_transition(
                        State::Run,
                        &guard,
                        inner.boot_gen.load(Ordering::Acquire) == 0,
                        &log,
                    );
                    if needs_resume {
                        let machine = guard.machine.as_ref().unwrap();
                        machine
                            .hdl
                            .resume()
                            .expect("restored instance can resume running");
                        needs_resume = false;
                    }

                    // TODO: bail if any vCPU tasks have exited already
                    for vcpu_task in guard.vcpu_tasks.iter_mut() {
                        let _ = vcpu_task.run();
                    }
                }
                State::Quiesce => {
                    // Stop device emulation and vCPUs. Note that the device
                    // lifecycle trait requires vCPUs to be paused before any
                    // devices pause.
                    for vcpu_task in guard.vcpu_tasks.iter_mut() {
                        let _ = vcpu_task.hold();
                    }
                    Self::device_state_transition(
                        State::Quiesce,
                        &guard,
                        false,
                        &log,
                    );
                    let machine = guard.machine.as_ref().unwrap();
                    machine.hdl.pause().expect("pause should complete");
                }
                State::Save => {
                    // A failed snapshot is logged but does not stop the
                    // state machine from proceeding.
                    let guard = &mut *guard;
                    let save_res = snapshot::save(guard, &inner.config, &log);
                    if let Err(err) = save_res {
                        slog::error!(log, "Snapshot error {:?}", err);
                    }
                }
                State::Halt => {
                    Self::device_state_transition(
                        State::Halt,
                        &guard,
                        false,
                        &log,
                    );
                    for mut vcpu_ctrl in guard.vcpu_tasks.drain(..) {
                        vcpu_ctrl.exit();
                    }
                    if guard.exit_code.is_none() {
                        guard.exit_code = Some(inner.config.main.exit_on_halt);
                    }
                }
                State::Reset => {
                    if let (None, Some(code)) =
                        (guard.exit_code, inner.config.main.exit_on_reboot)
                    {
                        // Emit the configured exit-on-reboot code if one is
                        // configured and no existing code would already
                        // supersede it.
                        guard.exit_code = Some(code);
                        guard.state = State::Halt;
                        cur_ev = Some(InstEvent::ReqHalt);
                        continue;
                    }
                    Self::device_state_transition(
                        State::Reset,
                        &guard,
                        false,
                        &log,
                    );
                    let machine = guard.machine.as_ref().unwrap();
                    machine.reinitialize().unwrap();
                    machine.vcpu_x86_setup().unwrap();
                    // Let held vCPU tasks know that a reset took place.
                    inner.boot_gen.fetch_add(1, Ordering::Release);
                    machine.hdl.resume().expect("resume should complete");
                }
                State::Destroy => {
                    // Drop the machine
                    let _ = guard.machine.take().unwrap();

                    // Abort any pending serial connection
                    inner.com1_sock.shutdown();

                    // Clean up the inventory as well
                    guard.inventory.destroy();

                    // Communicate that destruction is complete
                    slog::info!(&log, "Instance destroyed");
                    guard.state = State::Destroy;
                    inner.cv.notify_all();
                    return;
                }
            }
            guard.state = next_state;
            cur_ev = resid_ev;
        }
    }

    /// Block until the instance has reached `State::Destroy`, then return
    /// the process exit code recorded for it (0 if none was recorded).
    ///
    /// The recorded exit code is taken, so a later call would report 0.
    fn wait_destroyed(&self) -> ExitCode {
        let inner = &self.0;
        let mut destroyed = inner
            .cv
            .wait_while(inner.state.lock().unwrap(), |st| {
                !matches!(st.state, State::Destroy)
            })
            .unwrap();
        let code = destroyed.exit_code.take().unwrap_or(0);
        ExitCode::from(code)
    }

    fn vcpu_loop(
        inner: Arc<InstInner>,
        vcpu: &Vcpu,
        task: &propolis::tasks::TaskHdl,
        log: slog::Logger,
    ) {
        use propolis::exits::{SuspendDetail, VmExitKind};
        use propolis::tasks::Event;

        let mut entry = VmEntry::Run;
        let mut exit = VmExit::default();
        let mut local_gen = 0;
        loop {
            let mut exit_when_consistent = false;
            match task.pending_event() {
                Some(Event::Hold) => {
                    if !exit.kind.is_consistent() {
                        // Before the vCPU task can enter the held state, its
                        // associated in-kernel state must be driven to a point
                        // where it is consistent.
                        exit_when_consistent = true;
                    } else {
                        task.hold();

                        // Check if the instance was reinitialized while task was held.
                        let cur_gen = inner.boot_gen.load(Ordering::Acquire);
                        if local_gen != cur_gen {
                            // Reset occurred, discard any existing entry details.
                            entry = VmEntry::Run;
                            local_gen = cur_gen;
                        }
                        continue;
                    }
                }
                Some(Event::Exit) => {
                    return;
                }
                None => {}
            }

            exit = match vcpu.run(&entry, exit_when_consistent) {
                Err(e) => {
                    slog::error!(&log, "VM entry error {:?}", e);

                    inner.eq.push(
                        InstEvent::Halt,
                        EventCtx::Other(format!(
                            "error {:?} on vcpu {}",
                            e.raw_os_error().unwrap_or(0),
                            vcpu.id
                        )),
                    );
                    task.force_hold();

                    entry = VmEntry::Run;
                    continue;
                }
                Ok(exit) => exit,
            };

            entry = vcpu.process_vmexit(&exit).unwrap_or_else(|| {
                match exit.kind {
                    VmExitKind::Inout(pio) => {
                        slog::error!(
                            &log,
                            "Unhandled pio {:x?}", pio; "rip" => exit.rip
                        );
                        VmEntry::InoutFulfill(exits::InoutRes::emulate_failed(
                            &pio,
                        ))
                    }
                    VmExitKind::Mmio(mmio) => {
                        slog::error!(
                            &log,
                            "Unhandled mmio {:x?}", mmio; "rip" => exit.rip
                        );
                        VmEntry::MmioFulfill(exits::MmioRes::emulate_failed(
                            &mmio,
                        ))
                    }
                    VmExitKind::Rdmsr(msr) => {
                        slog::error!(
                            &log,
                            "Unhandled rdmsr {:#08x}", msr; "rip" => exit.rip
                        );
                        let _ = vcpu.set_reg(
                            bhyve_api::vm_reg_name::VM_REG_GUEST_RAX,
                            0,
                        );
                        let _ = vcpu.set_reg(
                            bhyve_api::vm_reg_name::VM_REG_GUEST_RDX,
                            0,
                        );
                        VmEntry::Run
                    }
                    VmExitKind::Wrmsr(msr, val) => {
                        slog::error!(
                            &log,
                            "Unhandled wrmsr {:#08x} <- {:#08x}", msr, val;
                            "rip" => #%exit.rip
                        );
                        VmEntry::Run
                    }
                    VmExitKind::Suspended(SuspendDetail {
                        kind,
                        when: _when,
                    }) => {
                        match kind {
                            exits::Suspend::Halt | exits::Suspend::Reset => {
                                inner
                                    .eq
                                    .push(kind.into(), EventCtx::Vcpu(vcpu.id));
                            }
                            exits::Suspend::TripleFault(vcpuid) => {
                                if vcpuid == -1 || vcpuid == vcpu.id {
                                    inner.eq.push(
                                        kind.into(),
                                        EventCtx::Vcpu(vcpu.id),
                                    );
                                }
                            }
                        }
                        task.force_hold();

                        // The next entry is unimportant as we have queued a
                        // significant event and halted this vCPU task with the
                        // expectation that it will be acted upon soon.
                        VmEntry::Run
                    }
                    _ => {
                        slog::error!(
                            &log,
                            "Unhandled exit @rip:{:08x} {:?}",
                            exit.rip,
                            exit.kind
                        );
                        todo!()
                    }
                }
            });
        }
    }

    fn generate_pins(&self) -> (Arc<FuncPin>, Arc<FuncPin>) {
        let power_eq = self.0.eq.clone();
        let power_pin =
            propolis::intr_pins::FuncPin::new(Box::new(move |rising| {
                if rising {
                    power_eq.push(
                        InstEvent::Halt,
                        EventCtx::Pin("power pin".to_string()),
                    );
                }
            }));
        let reset_eq = self.0.eq.clone();
        let reset_pin =
            propolis::intr_pins::FuncPin::new(Box::new(move |rising| {
                if rising {
                    reset_eq.push(
                        InstEvent::Reset,
                        EventCtx::Pin("reset pin".to_string()),
                    );
                }
            }));
        (Arc::new(power_pin), Arc::new(reset_pin))
    }

    /// Acquire the instance state lock.
    ///
    /// Returns `None` if the state no longer holds a machine, so callers only
    /// ever receive a guard for a "live" instance.
    ///
    /// Panics if the state mutex is poisoned.
    fn lock(&self) -> Option<MutexGuard<'_, InstState>> {
        let state = self.0.state.lock().unwrap();
        // Only hand the guard out while the machine is still present
        state.machine.is_some().then_some(state)
    }
    /// Return another handle to the instance's shared event queue.
    fn eq(&self) -> Arc<EventQueue> {
        Arc::clone(&self.0.eq)
    }
}

/// Create the VM and lay out its guest-physical address space.
///
/// The layout consists of:
/// - `lowmem` bytes of memory starting at address 0
/// - a `MAX_ROM_SIZE` ROM region ending at the 4 GiB boundary
/// - 32-bit MMIO ("dev32") and PCI config ("pcicfg") windows
/// - `highmem` bytes of memory starting at 4 GiB (omitted when zero)
/// - a 64-bit MMIO window ("dev64") covering the remainder of the address
///   space up to `vmm::MAX_PHYSMEM`
///
/// Any error from the builder is propagated to the caller.
fn build_machine(
    name: &str,
    max_cpu: u8,
    lowmem: usize,
    highmem: usize,
    use_reservoir: bool,
) -> Result<propolis::Machine> {
    let create_opts = propolis::vmm::CreateOpts {
        force: true,
        use_reservoir,
        ..Default::default()
    };

    // Everything below the 4 GiB boundary has a fixed position
    let builder = Builder::new(name, create_opts)?
        .max_cpus(max_cpu)?
        .add_mem_region(0, lowmem, "lowmem")?
        .add_rom_region(0x1_0000_0000 - MAX_ROM_SIZE, MAX_ROM_SIZE, "bootrom")?
        .add_mmio_region(0xc000_0000, 0x2000_0000, "dev32")?
        .add_mmio_region(0xe000_0000, 0x1000_0000, "pcicfg")?;

    // High memory is optional, and sits directly above the 4 GiB boundary
    let highmem_start = 0x1_0000_0000;
    let builder = if highmem > 0 {
        builder.add_mem_region(highmem_start, highmem, "highmem")?
    } else {
        builder
    };

    // The 64-bit device window takes whatever remains above high memory
    let dev64_start = highmem_start + highmem;
    let dev64_len = vmm::MAX_PHYSMEM - dev64_start;
    builder.add_mmio_region(dev64_start, dev64_len, "dev64")?.finalize()
}

/// Open the bootrom image at `path` and report its length in bytes.
///
/// The length must be a multiple of the page size (no bits of `PAGE_OFFSET`
/// set); otherwise an `InvalidData` error is returned.  Errors from opening
/// the file or querying its metadata are propagated unchanged.
fn open_bootrom(path: &str) -> Result<(File, usize)> {
    let fp = File::open(path)?;
    let len = fp.metadata()?.len();

    if len & PAGE_OFFSET == 0 {
        return Ok((fp, len as usize));
    }

    let msg = format!(
        "rom {} length {:x} not aligned to {:x}",
        path,
        len,
        PAGE_OFFSET + 1
    );
    Err(Error::new(ErrorKind::InvalidData, msg))
}

/// Construct the root logger.
///
/// Output goes to stdout: in a compact terminal format when stdout is a tty,
/// or as bunyan records named "propolis-standalone" otherwise.  That output is
/// filtered by `level`.  Every record is additionally duplicated to a
/// `slog_dtrace` drain, which is not subject to the `level` filter.
///
/// A failure to register the dtrace probes is logged but is not fatal.
fn build_log(level: slog::Level) -> slog::Logger {
    let stdout_is_tty = atty::is(atty::Stream::Stdout);

    // Both arms yield the same async drain type, differing only in format
    let main_drain = if stdout_is_tty {
        let term_drain = slog_term::CompactFormat::new(
            slog_term::TermDecorator::new().build(),
        )
        .build()
        .fuse();
        slog_async::Async::new(term_drain)
            .overflow_strategy(slog_async::OverflowStrategy::Block)
            .build_no_guard()
    } else {
        let bunyan_drain =
            slog_bunyan::with_name("propolis-standalone", std::io::stdout())
                .build()
                .fuse();
        slog_async::Async::new(bunyan_drain)
            .overflow_strategy(slog_async::OverflowStrategy::Block)
            .build_no_guard()
    };

    let (dtrace_drain, probe_reg) = slog_dtrace::Dtrace::new();

    // Only the stdout path is level-filtered
    let filtered_main = slog::LevelFilter::new(main_drain, level).fuse();
    let root_drain =
        slog::Duplicate::new(filtered_main, dtrace_drain.fuse()).fuse();
    let log = slog::Logger::root(root_drain, o!());

    if let slog_dtrace::ProbeRegistration::Failed(err) = probe_reg {
        slog::error!(&log, "Error registering slog-dtrace probes: {:?}", err);
    }

    log
}

/// Copy `len` bytes from the start of `fp` into the tail end of the guest
/// memory region named `region_name`.
///
/// Returns an `InvalidData` error if the region is smaller than `len` ("rom
/// too long") or if fewer than `len` bytes were read ("short read").  Errors
/// from looking up the region or from the read itself are propagated.
///
/// Panics if guest memory cannot be accessed.
fn populate_rom(
    machine: &Machine,
    region_name: &str,
    fp: &File,
    len: usize,
) -> std::io::Result<()> {
    let mem = machine.acc_mem.access().unwrap();
    let mapping = mem.direct_writable_region_by_name(region_name)?;

    // The image is placed so that it ends exactly at the end of the region
    let Some(offset) = mapping.len().checked_sub(len) else {
        return Err(Error::new(ErrorKind::InvalidData, "rom too long"));
    };

    let submapping = mapping.subregion(offset, len).unwrap();
    let nread = submapping.pread(fp, len, 0)?;
    if nread == len {
        Ok(())
    } else {
        // TODO: Handle short read
        Err(Error::new(ErrorKind::InvalidData, "short read"))
    }
}

/// Inputs consumed by `generate_smbios` when building the SMBIOS tables.
struct SmbiosParams {
    /// Total guest memory size, reported in the type 16 and type 17 tables.
    memory_size: usize,
    /// Size of the bootrom image, used to derive the type 0 ROM size field.
    rom_size: usize,
    /// Version string reported as the BIOS version in the type 0 table.
    rom_version: String,
    /// Number of vCPUs, reported as the core, enabled-core and thread counts.
    num_cpus: u8,
    /// CPUID leaf 1 values to report; the host's values are queried when
    /// `None`.
    cpuid_ident: Option<CpuidValues>,
    /// CPUID leaves 0x8000_0002 through 0x8000_0004 (brand string) to report;
    /// the host's values are queried, when available, if `None`.
    cpuid_procname: Option<[CpuidValues; 3]>,
}
/// Build the SMBIOS tables (types 0, 1, 4, 16, 17 and 32) describing the
/// guest, returning the committed entry point and structure table bytes.
///
/// Processor identification comes from `params.cpuid_ident` and
/// `params.cpuid_procname` when provided, falling back to querying the host.
/// The processor vendor is always taken from the host's CPUID leaf 0.
///
/// # Panics
///
/// Panics if `params.rom_version` cannot be converted into an SMBIOS string,
/// or if adding any of the tables fails.
fn generate_smbios(params: SmbiosParams) -> anyhow::Result<smbios::TableBytes> {
    use smbios::table::{type0, type1, type16, type4};
    let bios_version = params
        .rom_version
        .try_into()
        .expect("bootrom version string doesn't contain NUL bytes");

    // The ROM size field is a single byte holding (size / 64KiB) - 1.  Avoid
    // underflow for images smaller than 64KiB, and clamp at 0xFF (which
    // denotes "16MiB or greater") rather than silently truncating.
    let rom_size_units = (params.rom_size / (64 * 1024)).saturating_sub(1);
    let bios_rom_size = u8::try_from(rom_size_units).unwrap_or(u8::MAX);

    let smb_type0 = smbios::table::Type0 {
        vendor: "Oxide".try_into().unwrap(),
        bios_version,
        bios_release_date: "Bureaucracy 41, 3186 YOLD".try_into().unwrap(),
        bios_rom_size,
        bios_characteristics: type0::BiosCharacteristics::UNSUPPORTED,
        bios_ext_characteristics: type0::BiosExtCharacteristics::ACPI
            | type0::BiosExtCharacteristics::UEFI
            | type0::BiosExtCharacteristics::IS_VM,
        ..Default::default()
    };

    let smb_type1 = smbios::table::Type1 {
        manufacturer: "Oxide".try_into().unwrap(),
        product_name: "OxVM".try_into().unwrap(),
        wake_up_type: type1::WakeUpType::PowerSwitch,
        ..Default::default()
    };

    let cpuid_vendor = cpuid_utils::host::query(CpuidIdent::leaf(0));
    let cpuid_ident = params
        .cpuid_ident
        .unwrap_or_else(|| cpuid_utils::host::query(CpuidIdent::leaf(1)));
    let family = match cpuid_ident.eax & 0xf00 {
        // If family ID is 0xf, extended family is added to it
        0xf00 => ((cpuid_ident.eax >> 20) & 0xff) + 0xf,
        // ... otherwise base family ID is used
        base => base >> 8,
    };

    let vendor = CpuidVendor::try_from(cpuid_vendor);
    let proc_manufacturer = match vendor {
        Ok(CpuidVendor::Intel) => "Intel",
        Ok(CpuidVendor::Amd) => "Advanced Micro Devices, Inc.",
        _ => "",
    }
    .try_into()
    .unwrap();
    let proc_family = match (vendor, family) {
        // Zen
        (Ok(CpuidVendor::Amd), family) if family >= 0x17 => 0x6b,
        //unknown
        _ => 0x2,
    };
    // Processor ID is leaf 1 EAX in the low half and EDX in the high half
    let proc_id =
        u64::from(cpuid_ident.eax) | (u64::from(cpuid_ident.edx) << 32);
    // Without an explicit brand string, consult the host, but only if it
    // reports support for the extended leaves which hold it.
    let procname_entries = params.cpuid_procname.or_else(|| {
        if cpuid_utils::host::query(CpuidIdent::leaf(0x8000_0000)).eax
            >= 0x8000_0004
        {
            Some([
                cpuid_utils::host::query(CpuidIdent::leaf(0x8000_0002)),
                cpuid_utils::host::query(CpuidIdent::leaf(0x8000_0003)),
                cpuid_utils::host::query(CpuidIdent::leaf(0x8000_0004)),
            ])
        } else {
            None
        }
    });
    // An absent or unparseable brand string is reported as empty
    let proc_version = procname_entries
        .and_then(|e| cpuid::parse_brand_string(e).ok())
        .unwrap_or_default();

    let smb_type4 = smbios::table::Type4 {
        proc_type: type4::ProcType::Central,
        proc_family,
        proc_manufacturer,
        proc_id,
        proc_version: proc_version.as_str().try_into().unwrap_or_default(),
        status: type4::ProcStatus::Enabled,
        // unknown
        proc_upgrade: 0x2,
        // make core and thread counts equal for now
        core_count: params.num_cpus,
        core_enabled: params.num_cpus,
        thread_count: params.num_cpus,
        proc_characteristics: type4::Characteristics::IS_64_BIT
            | type4::Characteristics::MULTI_CORE,
        ..Default::default()
    };

    let mut smb_type16 = smbios::table::Type16 {
        location: type16::Location::SystemBoard,
        array_use: type16::ArrayUse::System,
        error_correction: type16::ErrorCorrection::Unknown,
        num_mem_devices: 1,
        ..Default::default()
    };
    smb_type16.set_max_capacity(params.memory_size);
    // The type 17 table refers back to the type 16 table by this handle
    let phys_mem_array_handle = 0x1600.into();

    let mut smb_type17 = smbios::table::Type17 {
        phys_mem_array_handle,
        // Unknown
        form_factor: 0x2,
        // Unknown
        memory_type: 0x2,
        ..Default::default()
    };
    smb_type17.set_size(Some(params.memory_size));

    let smb_type32 = smbios::table::Type32::default();

    let mut smb_tables = smbios::Tables::new(0x7f00.into());
    smb_tables.add(0x0000.into(), &smb_type0).unwrap();
    smb_tables.add(0x0100.into(), &smb_type1).unwrap();
    smb_tables.add(0x0300.into(), &smb_type4).unwrap();
    smb_tables.add(phys_mem_array_handle, &smb_type16).unwrap();
    smb_tables.add(0x1700.into(), &smb_type17).unwrap();
    smb_tables.add(0x3200.into(), &smb_type32).unwrap();

    Ok(smb_tables.commit())
}

/// Produce the fw_cfg entry holding an E820 table for the guest.
///
/// Every mapping in the machine's physical memory map is added to the table:
/// DRAM mappings as memory, all other mapping types as reserved.
///
/// Fails only if a mapping's address or length cannot be converted for the
/// table.
fn generate_e820(
    machine: &Machine,
    log: &slog::Logger,
) -> anyhow::Result<fwcfg::Entry> {
    use propolis::vmm::MapType;

    slog::info!(log, "Generating E820 map for guest address space",);

    let mut table = fwcfg::formats::E820Table::new();
    for (start, size, map_type) in machine.map_physmem.mappings() {
        let start = start.try_into().context("usize should fit into u64")?;
        let size = size.try_into().context("usize should fit into u64")?;

        if matches!(map_type, MapType::Dram) {
            table.add_mem(start, size);
        } else {
            table.add_reserved(start, size);
        }
    }

    Ok(table.finish())
}

fn generate_bootorder(
    config: &config::Config,
    log: &slog::Logger,
) -> anyhow::Result<Option<fwcfg::Entry>> {
    let Some(names) = config.main.boot_order.as_ref() else {
        return Ok(None);
    };

    slog::info!(
        log,
        "Bootorder declared as {:?}",
        config.main.boot_order.as_ref()
    );

    let mut order = fwcfg::formats::BootOrder::new();
    for name in names.iter() {
        let dev = config
            .devices
            .get(name)
            .ok_or(anyhow::anyhow!("Could not find device: {name}"))?;

        let get_pci_path = || {
            dev.options
                .get("pci-path")
                .and_then(|v| v.as_str())
                .and_then(config::parse_bdf)
                .expect("PCI device has valid BDF")
        };

        match dev.driver.as_str() {
            "pci-virtio-block" => {
                order.add_disk(get_pci_path().location);
            }
            "pci-nvme" => {
                order.add_nvme(get_pci_path().location, 0);
            }
            driver if driver.starts_with("pci-") => {
                order.add_pci(get_pci_path().location, "device");
            }
            dev => {
                anyhow::bail!(
                    "Boot device '{name}' is unsupported device type: {dev}"
                );
            }
        }
    }
    Ok(Some(order.finish()))
}

/// Create a VM per `config` and populate it with its emulated devices.
///
/// In order, this:
/// - splits the configured memory into low (up to 3 GiB) and high portions,
///   and creates the machine
/// - binds the `./ttya` unix socket which is connected to COM1
/// - loads the bootrom into the "bootrom" region
/// - attaches the i440fx chipset, RTC contents, HPET, UARTs, PS/2 controller
///   and QEMU debug port
/// - creates each device listed in the config
/// - populates fw_cfg (CPU count, ramfb, SMBIOS, boot order, E820)
/// - applies the CPUID profile and default capabilities to every vCPU
///
/// `from_restore` is passed through to `Instance::new`.
///
/// Returns the instance along with the COM1 socket.
///
/// Note that a number of configuration problems (such as a missing or
/// malformed `pci-path` on a PCI device) result in a panic rather than an
/// error.
fn setup_instance(
    config: config::Config,
    from_restore: bool,
    log: &slog::Logger,
) -> anyhow::Result<(Instance, Arc<UDSock>)> {
    let vm_name = &config.main.name;
    let cpus = config.main.cpus;

    // Memory up to 3 GiB is placed in lowmem, with any remainder in highmem
    let memsize: usize = config.main.memory * MB;
    let lowmem = memsize.min(3 * GB);
    let highmem = memsize.saturating_sub(3 * GB);

    let use_reservoir = config.main.use_reservoir.unwrap_or(false);
    if use_reservoir {
        // Do a quick check of the reservoir size if asked to use it
        //
        // The actual VM create can TOCTOU race, but we can at least raise the
        // issue nicely if things are way off.
        let ctl = propolis::bhyve_api::VmmCtlFd::open()?;
        let resv_info = ctl.reservoir_query()?;
        if resv_info.vrq_free_sz < memsize {
            slog::warn!(
                log,
                "Reservoir lacks free capacity ({}MiB < {}MiB)",
                resv_info.vrq_free_sz / MB,
                memsize / MB
            );
        }
    }

    slog::info!(log, "Creating VM with {} vCPUs, {} lowmem, {} highmem",
        cpus, lowmem, highmem;);
    let machine = build_machine(vm_name, cpus, lowmem, highmem, use_reservoir)
        .context("Failed to create VM Machine")?;
    let com1_sock =
        UDSock::bind(Path::new("./ttya")).context("Cannot open UD socket")?;
    let inst = Instance::new(
        machine,
        config.clone(),
        from_restore,
        log.clone(),
        com1_sock.clone(),
    );
    slog::info!(log, "VM created"; "name" => vm_name);

    let (romfp, rom_len) =
        open_bootrom(&config.main.bootrom).context("Cannot open bootrom")?;

    // Get necessary access to innards, now that it is nestled in `Instance`
    let mut inst_guard = inst.lock().unwrap();
    // Split borrows require this dance
    let guard = &mut *inst_guard;
    let machine = guard.machine.as_ref().unwrap();
    let hdl = machine.hdl.clone();

    populate_rom(machine, "bootrom", &romfp, rom_len)?;
    drop(romfp);

    // Add vCPUs to inventory, since they count as devices
    for vcpu in machine.vcpus.iter() {
        guard.inventory.register_instance(vcpu, &vcpu.id.to_string())
    }

    // Chipset: host bridge, LPC bridge and power management
    let (power_pin, reset_pin) = inst.generate_pins();
    let pci_topo =
        propolis::hw::pci::topology::Builder::new().finish(machine)?.topology;

    let chipset_hb = i440fx::I440FxHostBridge::create(
        pci_topo,
        i440fx::Opts {
            power_pin: Some(power_pin),
            reset_pin: Some(reset_pin),
            ..Default::default()
        },
    );
    let chipset_lpc = i440fx::Piix3Lpc::create(machine.hdl.clone());
    let chipset_pm = i440fx::Piix3PM::create(
        machine.hdl.clone(),
        chipset_hb.power_pin(),
        log.new(slog::o!("device" => "piix3pm")),
    );

    // Attach a PCI device at `bdf`, with its legacy interrupt routed via the
    // LPC bridge
    let chipset_pci_attach = |bdf, pcidev| {
        chipset_hb.pci_attach(bdf, pcidev, chipset_lpc.route_lintr(bdf));
    };

    chipset_pci_attach(i440fx::DEFAULT_HB_BDF, chipset_hb.clone());
    chipset_pci_attach(i440fx::DEFAULT_LPC_BDF, chipset_lpc.clone());
    chipset_pci_attach(i440fx::DEFAULT_PM_BDF, chipset_pm.clone());

    chipset_hb.attach(machine);
    chipset_lpc.attach(&machine.bus_pio);
    chipset_pm.attach(&machine.bus_pio);

    guard.inventory.register(&chipset_hb);
    guard.inventory.register(&chipset_lpc);
    guard.inventory.register(&chipset_pm);

    // RTC: populate time and CMOS
    let rtc = chipset_lpc.rtc.as_ref();
    rtc.memsize_to_nvram(lowmem as u32, highmem as u64)?;
    rtc.set_time(
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("system time precedes UNIX epoch"),
    )?;

    // HPET
    let hpet = propolis::hw::bhyve::BhyveHpet::create(hdl.clone());
    guard.inventory.register(&hpet);

    // UARTs
    let com1 = LpcUart::new(chipset_lpc.irq_pin(ibmpc::IRQ_COM1).unwrap());
    let com2 = LpcUart::new(chipset_lpc.irq_pin(ibmpc::IRQ_COM2).unwrap());
    let com3 = LpcUart::new(chipset_lpc.irq_pin(ibmpc::IRQ_COM3).unwrap());
    let com4 = LpcUart::new(chipset_lpc.irq_pin(ibmpc::IRQ_COM4).unwrap());

    // COM1 is wired to the unix socket as both sink and source
    com1_sock.spawn(
        Arc::clone(&com1) as Arc<dyn Sink>,
        Arc::clone(&com1) as Arc<dyn Source>,
    );
    com1.set_autodiscard(false);

    // XXX: plumb up com2-4, but until then, just auto-discard
    com2.set_autodiscard(true);
    com3.set_autodiscard(true);
    com4.set_autodiscard(true);

    let pio = &machine.bus_pio;
    LpcUart::attach(&com1, pio, ibmpc::PORT_COM1);
    LpcUart::attach(&com2, pio, ibmpc::PORT_COM2);
    LpcUart::attach(&com3, pio, ibmpc::PORT_COM3);
    LpcUart::attach(&com4, pio, ibmpc::PORT_COM4);
    guard.inventory.register_instance(&com1, "com1");
    guard.inventory.register_instance(&com2, "com2");
    guard.inventory.register_instance(&com3, "com3");
    guard.inventory.register_instance(&com4, "com4");

    // PS/2
    let ps2_ctrl = PS2Ctrl::create();
    ps2_ctrl.attach(
        pio,
        chipset_lpc.irq_pin(ibmpc::IRQ_PS2_PRI).unwrap(),
        chipset_lpc.irq_pin(ibmpc::IRQ_PS2_AUX).unwrap(),
        chipset_hb.reset_pin(),
    );
    guard.inventory.register(&ps2_ctrl);

    // QEMU debug port, with its output written to `debug.out`
    let debug_file = std::fs::File::create("debug.out")?;
    let debug_out = chardev::BlockingFileOutput::new(debug_file);
    let debug_device = hw::qemu::debug::QemuDebugPort::create(pio);
    debug_out.attach(Arc::clone(&debug_device) as Arc<dyn BlockingSource>);
    guard.inventory.register(&debug_device);

    // Devices declared in the config
    for (name, dev) in config.devices.iter() {
        let driver = &dev.driver as &str;
        slog::debug!(log, "creating device"; "name" => ?name, "driver" => %driver);

        // Only `pci-` drivers are expected to carry a `pci-path` option
        let bdf = if driver.starts_with("pci-") {
            config::parse_bdf(
                dev.options.get("pci-path").unwrap().as_str().unwrap(),
            )
        } else {
            None
        };
        // Creation is wrapped in a closure so that any error can be annotated
        // with the device name and driver below
        let mut create_device = || -> anyhow::Result<()> {
            match driver {
                "pci-virtio-block" => {
                    let (backend, name) =
                        config::block_backend(&config, dev, log);
                    let bdf = bdf.unwrap();

                    let vioblk = hw::virtio::PciVirtioBlock::new(0x100);

                    guard
                        .inventory
                        .register_instance(&vioblk, &bdf.to_string());
                    guard.inventory.register_block(&backend, name);

                    block::attach(&vioblk.block_attach, backend.attachment())
                        .unwrap();
                    chipset_pci_attach(bdf, vioblk);
                }
                "pci-virtio-viona" => {
                    let vnic_name =
                        dev.options.get("vnic").unwrap().as_str().unwrap();
                    let bdf = bdf.unwrap();

                    let viona_params =
                        config::VionaDeviceParams::from_opts(&dev.options)
                            .expect("viona params are valid");

                    // The viona_params here (currently just copy_data and
                    // header_pad) require `viona::ApiVersion::V3`, below
                    // Propolis' minimum of V6, so we can always set them.
                    let viona = hw::virtio::PciVirtioViona::new(
                        vnic_name,
                        &hdl,
                        viona_params,
                    )?;
                    guard.inventory.register_instance(&viona, &bdf.to_string());
                    chipset_pci_attach(bdf, viona);
                }
                "pci-nvme" => {
                    let (backend, name) =
                        config::block_backend(&config, dev, log);
                    let bdf = bdf.unwrap();

                    let dev_serial = dev
                        .options
                        .get("block_dev")
                        .unwrap()
                        .as_str()
                        .unwrap()
                        .to_string();
                    let log =
                        log.new(slog::o!("dev" => format!("nvme-{}", name)));
                    // Limit data transfers to 1MiB (2^8 * 4k) in size
                    let mdts = Some(8);

                    // The serial number is the `block_dev` option value,
                    // truncated to 20 bytes and zero-padded
                    let mut serial_number = [0u8; 20];
                    let sz = dev_serial.len().min(20);
                    serial_number[..sz]
                        .clone_from_slice(&dev_serial.as_bytes()[..sz]);

                    let nvme =
                        hw::nvme::PciNvme::create(&serial_number, mdts, log);

                    guard.inventory.register_instance(&nvme, &bdf.to_string());
                    guard.inventory.register_block(&backend, name);

                    block::attach(&nvme.block_attach, backend.attachment())
                        .unwrap();
                    chipset_pci_attach(bdf, nvme);
                }
                qemu::pvpanic::DEVICE_NAME => {
                    // The device is only created if its ISA interface is
                    // explicitly enabled
                    let enable_isa = dev
                        .options
                        .get("enable_isa")
                        .and_then(|opt| opt.as_bool())
                        .unwrap_or(false);
                    if enable_isa {
                        let pvpanic = QemuPvpanic::create(
                            log.new(slog::o!("dev" => "pvpanic")),
                        );
                        pvpanic.attach_pio(pio);
                        guard.inventory.register(&pvpanic);
                    }
                }
                "pci-virtio-socket" => {
                    let config = config::VsockDevice::from_opts(&dev.options)?;
                    let bdf = bdf.unwrap();
                    let guest_cid = GuestCid::try_from(config.guest_cid)
                        .context("guest cid")?;
                    let vsock = hw::virtio::PciVirtioSock::new(
                        512,
                        guest_cid,
                        log.new(slog::o!("dev" => "vsock")),
                        config.port_mappings,
                    );
                    guard.inventory.register(&vsock);
                    chipset_pci_attach(bdf, vsock);
                }
                _ => {
                    slog::error!(log, "unrecognized driver {driver}"; "name" => name);
                    return Err(Error::new(
                        ErrorKind::Other,
                        "Unrecognized driver",
                    )
                    .into());
                }
            };
            Ok(())
        };
        create_device().with_context(|| {
            format!("Failed to create {driver} device '{name}'")
        })?;
    }

    // fw_cfg: CPU count and ramfb
    let fwcfg = fwcfg::FwCfg::new();
    fwcfg
        .insert_legacy(
            fwcfg::LegacyId::SmpCpuCount,
            fwcfg::Entry::fixed_u32(u32::from(cpus)),
        )
        .unwrap();

    let ramfb =
        hw::qemu::ramfb::RamFb::create(log.new(slog::o!("dev" => "ramfb")));
    ramfb.attach(&machine.acc_mem);

    fwcfg
        .insert_named(
            hw::qemu::ramfb::RamFb::FWCFG_ENTRY_NAME,
            fwcfg::Entry::RamFb,
        )
        .unwrap();
    fwcfg.attach_ramfb(Some(ramfb.clone()));

    let cpuid_profile = config::parse_cpuid(&config)?;

    // If a CPUID profile was configured, take the SMBIOS processor
    // identification from it
    let cpuid_ident = cpuid_profile
        .as_ref()
        .and_then(|p| p.get(CpuidIdent::leaf(1)))
        .cloned();
    // The brand string is only used if all three of its leaves are present
    let cpuid_procname = cpuid_profile.as_ref().and_then(|p| {
        match (
            p.get(CpuidIdent::leaf(0x8000_0002)),
            p.get(CpuidIdent::leaf(0x8000_0003)),
            p.get(CpuidIdent::leaf(0x8000_0004)),
        ) {
            (Some(a), Some(b), Some(c)) => Some([*a, *b, *c]),
            _ => None,
        }
    });

    // generate SMBIOS data and expose via fw_cfg
    let smbios::TableBytes { entry_point, structure_table } =
        generate_smbios(SmbiosParams {
            memory_size: memsize,
            rom_size: rom_len,
            rom_version: config
                .main
                .bootrom_version
                .clone()
                .unwrap_or_else(|| "v0.0.1-alpha 1".to_string()),
            num_cpus: cpus,
            cpuid_ident,
            cpuid_procname,
        })
        .unwrap();
    fwcfg
        .insert_named(
            "etc/smbios/smbios-tables",
            fwcfg::Entry::Bytes(structure_table),
        )
        .unwrap();
    fwcfg
        .insert_named(
            "etc/smbios/smbios-anchor",
            fwcfg::Entry::Bytes(entry_point),
        )
        .unwrap();

    // It is "safe" to generate bootorder (if requested) now, given that PCI
    // device configuration has been validated by preceding logic
    if let Some(boot_config) = generate_bootorder(&config, log)
        .context("Failed to generate boot order")?
    {
        fwcfg.insert_named("bootorder", boot_config).unwrap();
    }
    let e820_entry = generate_e820(machine, log).expect("can build E820 table");
    fwcfg.insert_named("etc/e820", e820_entry).unwrap();

    fwcfg.attach(pio, &machine.acc_mem);

    guard.inventory.register(&fwcfg);
    guard.inventory.register(&ramfb);

    // Apply CPUID data and default capabilities to each vCPU
    for vcpu in machine.vcpus.iter() {
        let vcpu_profile = if let Some(profile) = cpuid_profile.as_ref() {
            propolis::cpuid::Specializer::new()
                .with_vcpu_count(
                    std::num::NonZeroU8::new(config.main.cpus).unwrap(),
                    true,
                )
                .with_vcpuid(vcpu.id)
                .with_cache_topo()
                .clear_cpu_topo(cpuid::TopoKind::iter())
                .with_cpu_topo(cpuid::TopoKind::supported())
                .execute(profile.clone())
                .context("failed to specialize cpuid profile")?
        } else {
            // An empty set will instruct the kernel to use the legacy
            // fallback behavior
            cpuid_utils::CpuidSet::new_host()
        };
        vcpu.set_cpuid(vcpu_profile)?;
        vcpu.set_default_capabs()?;
    }
    drop(inst_guard);

    Ok((inst, com1_sock))
}

/// Verify that the bhyve and viona API versions meet expectations.
///
/// A version which is too low is reported loudly but tolerated, leaving it to
/// the user to decide whether to carry on, since propolis-standalone lives in
/// the Thunderdome.  An IO error encountered while performing the check is
/// fatal, and is returned to the caller.
fn api_version_checks(log: &slog::Logger) -> std::io::Result<()> {
    use api_version::{Error, VersionCheckError};

    let VersionCheckError { component, path, err } = match api_version::check()
    {
        Ok(_) => return Ok(()),
        Err(failure) => failure,
    };

    match err {
        Error::Io(e) => {
            if e.kind() == ErrorKind::NotFound {
                slog::error!(log, "Failed to open {}", path);
            }

            // IO errors _are_ fatal
            Err(e)
        }
        source @ Error::TooLow { .. } => {
            // Make noise about version mismatch, but soldier on and let the
            // user decide if they want to quit
            slog::error!(log, "{component}: {source}");
            Ok(())
        }
    }
}

// NOTE(review): the `///` comments on this struct and its fields are
// presumably picked up by the clap derive as user-visible help text, so
// rewording them would change CLI output -- confirm before editing.
#[derive(clap::Parser)]
/// Propolis command-line frontend for running a VM.
struct Args {
    /// Either the VM config file or a previously captured snapshot image.
    #[clap(value_name = "CONFIG|SNAPSHOT", action)]
    target: String,

    /// Take a snapshot on Ctrl-C before exiting.
    #[clap(short, long, action)]
    snapshot: bool,

    /// Restore previously captured snapshot.
    #[clap(short, long, action)]
    restore: bool,

    /// Maximum log level filter.
    // May also be provided through the `PROPOLIS_LOG` environment variable.
    #[clap(
        long,
        env = "PROPOLIS_LOG",
        default_value_t = LogFilter(slog::Level::Info),
        ignore_case(true),
    )]
    log_level: LogFilter,
}

/// Entry point: parse arguments, build (or restore) the VM, start it once a
/// client connects to the `ttya` socket, and then block until the instance
/// has been destroyed.
///
/// Ctrl-C requests either a snapshot save or a halt of the instance,
/// depending on whether `--snapshot` was given.  Only the first Ctrl-C is
/// acted upon.
fn main() -> anyhow::Result<ExitCode> {
    let args = Args::parse();
    let Args { target, snapshot, restore, log_level: LogFilter(log_level) } =
        args;

    // Ensure proper setup of USDT probes
    register_probes().context("Failed to setup USDT probes")?;

    let log = build_log(log_level);

    // Check that vmm and viona device version match what we expect
    api_version_checks(&log).context("API version checks")?;

    propolis::common::DISPLAY_GUEST_DATA
        .store(true, std::sync::atomic::Ordering::SeqCst);

    // Load/parse the config first, since it's required to size the tokio runtime
    // used to run the instance.
    let config = match restore {
        true => snapshot::restore_config(&target),
        false => config::parse(&target),
    }?;

    // Create tokio runtime, we don't use the tokio::main macro
    // since we'll block in main when we call `Instance::wait_for_state`
    let rt_threads = std::cmp::max(
        MIN_RT_THREADS,
        BASE_RT_THREADS + usize::from(config.main.cpus),
    );
    let rt = {
        let mut rt_builder = tokio::runtime::Builder::new_multi_thread();
        rt_builder.worker_threads(rt_threads).thread_name("vmm-tokio");
        oxide_tokio_rt::build(&mut rt_builder)?
    };
    let _rt_guard = rt.enter();

    // Create the VM afresh or restore it from a snapshot
    let (inst, com1_sock) = match restore {
        true => snapshot::restore(&target, config, &log),
        false => setup_instance(config, false, &log),
    }?;

    // Register a Ctrl-C handler so we can snapshot before exiting if needed
    let ctrlc_eq = inst.eq();
    let signal_log = log.clone();
    let mut ctrlc_fired = false;
    ctrlc::set_handler(move || {
        // Ignore everything after the first Ctrl-C
        if std::mem::replace(&mut ctrlc_fired, true) {
            return;
        }

        let request = if snapshot {
            InstEvent::ReqSave
        } else {
            slog::info!(signal_log, "Destroying instance...");
            InstEvent::ReqHalt
        };
        ctrlc_eq.push(request, EventCtx::User("Ctrl+C".to_string()));
    })
    .context("Failed to register Ctrl-C signal handler.")?;

    // Wait until someone connects to ttya
    slog::info!(log, "Waiting for a connection to ttya");
    let connected = com1_sock.wait_for_connect();
    if connected {
        // Let the VM start and we're off to the races
        slog::info!(log, "Starting instance...");
        let start_ctx = EventCtx::User("UDS connection".to_string());
        inst.eq().push(InstEvent::ReqStart, start_ctx);
    }

    // wait for instance to be destroyed
    let exit_code = inst.wait_destroyed();
    Ok(exit_code)
}

/// Newtype over `slog::Level` which implements `clap::ValueEnum`, allowing a
/// log level to be accepted as a command-line argument value.
#[derive(Clone, Debug)]
struct LogFilter(slog::Level);

impl clap::ValueEnum for LogFilter {
    /// Every selectable level, ordered from `Critical` through `Trace`.
    fn value_variants<'a>() -> &'a [Self] {
        static VARIANTS: [LogFilter; 6] = [
            LogFilter(slog::Level::Critical),
            LogFilter(slog::Level::Error),
            LogFilter(slog::Level::Warning),
            LogFilter(slog::Level::Info),
            LogFilter(slog::Level::Debug),
            LogFilter(slog::Level::Trace),
        ];
        &VARIANTS
    }

    /// The command-line spelling of a level is whatever `slog::Level::as_str`
    /// reports for it.
    fn to_possible_value(&self) -> Option<clap::builder::PossibleValue> {
        let label = self.0.as_str();
        Some(clap::builder::PossibleValue::new(label))
    }
}

impl fmt::Display for LogFilter {
    /// Defer to the `Display` output of the wrapped level.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(&self.0, f)
    }
}


================================================
FILE: bin/propolis-standalone/src/snapshot.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Routines and types for saving and restoring a snapshot of a VM.
//!
//! The snapshot format is a tar file structured as follows:
//! - `config.toml`: VM configuration data
//! - `global.json`: Global state data for the instance
//! - `devices/*.json`: Exported state for each device
//! - `memory/<start>-<end>.bin`: Raw memory covering guest-physical address
//!   range [start, end), with those addresses formatted in hex.

use std::convert::TryInto;
use std::fs::File;
use std::io::{self, Read, Seek, SeekFrom, Write};
use std::os::unix::io::{AsRawFd, RawFd};
use std::path::Path;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};

use bhyve_api::{
    vdi_field_entry_v1, vdi_time_info_v1, ApiVersion, VAI_BOOT_HRTIME,
    VDC_VMM_ARCH, VDC_VMM_TIME,
};
use propolis::{
    chardev::UDSock,
    common::{GuestAddr, GuestRegion, GB},
    migrate::{
        MigrateCtx, Migrator, PayloadOffer, PayloadOffers, PayloadOutputs,
    },
    vmm::VmmHdl,
};

use anyhow::Context;
use serde::{Deserialize, Serialize};
use slog::{debug, info, warn};

use super::config::Config;
use super::Instance;

/// Serialized state for a single device, stored in the snapshot archive as
/// one JSON file under the devices directory.
#[derive(Deserialize, Serialize)]
struct SnapshotDevice {
    /// Name under which the device is registered in the instance inventory;
    /// used on restore to locate the device which should import the state.
    pub instance_name: String,
    /// Exported payload(s) for the device: a single entry for devices using
    /// `Migrator::Single`, potentially several for `Migrator::Multi`.
    pub payload: Vec<SnapshotDevicePayload>,
}
/// One exported payload belonging to a [SnapshotDevice].
#[derive(Deserialize, Serialize)]
struct SnapshotDevicePayload {
    /// Payload kind, as reported by the device when it was exported
    pub kind: String,
    /// Payload version, as reported by the device when it was exported
    pub version: u32,
    /// The payload itself, serialized as a JSON string
    pub data: String,
}

// Names of the entries which make up a snapshot archive (see the module-level
// documentation for the overall layout).

/// Directory holding one JSON file of exported state per device
const DEVICE_DIR: &str = "devices";
/// Directory holding the raw guest memory files
const MEMORY_DIR: &str = "memory";
/// Archive entry holding the VM configuration
const CONFIG_NAME: &str = "config.toml";
/// Archive entry holding the global (non-device) VM state
const GLOBAL_NAME: &str = "global.json";

/// Save a snapshot of the current state of the given instance to disk.
pub(crate) fn save(
    guard: &mut super::InstState,
    config: &Config,
    log: &slog::Logger,
) -> anyhow::Result<()> {
    let now = SystemTime::now().duration_since(UNIX_EPOCH)?;
    let snapshot = format!("{}-{}.tar", config.main.name, now.as_millis());

    info!(log, "saving snapshot of VM to {}", snapshot);

    let file =
        File::create(&snapshot).context("Failed to create snapshot file")?;
    let mut builder = TarBuilder::new(file);
    let mut header = tar::Header::new_gnu();
    header.set_mode(0o444);
    header.set_mtime(now.as_secs());

    info!(log, "Serializing VM config");
    {
        let config_bytes = toml::to_string(config)?.into_bytes();
        header.set_size(config_bytes.len() as u64);
        builder.append_data(&mut header, CONFIG_NAME, &config_bytes[..])?;
    }

    // Being called from the Quiesce state, all of the device pause work should
    // be done for us already.
    let machine = guard.machine.as_ref().unwrap();
    let hdl = machine.hdl.clone();
    let memctx = machine.acc_mem.access().unwrap();
    let migratectx = MigrateCtx { mem: &memctx };

    info!(log, "Serializing global VM state");
    {
        let global_config =
            export_global(&hdl).context("Failed to export global VM state")?;
        let global_bytes = serde_json::to_vec(&global_config)?;
        header.set_size(global_bytes.len() as u64);
        builder.append_data(&mut header, "global.json", &global_bytes[..])?;
    }

    // Add directories for devices and memory
    {
        let mut dir_header = tar::Header::new_gnu();
        dir_header.set_entry_type(tar::EntryType::Directory);
        dir_header.set_mode(0o555);
        dir_header.set_mtime(now.as_secs());
        dir_header.set_size(0);

        builder.append_dir(&mut dir_header, format!("{DEVICE_DIR}/"))?;
        builder.append_dir(&mut dir_header, format!("{MEMORY_DIR}/"))?;
    }

    info!(log, "Serializing VM device state");
    for (name, dev) in guard.inventory.devs.iter() {
        let device_data = match dev.migrate() {
            Migrator::NonMigratable => {
                anyhow::bail!(
                    "Can't snapshot instance with non-migratable device ({name})"
                );
            }
            Migrator::Empty => continue,
            Migrator::Single(mech) => {
                let output = mech.export(&migratectx)?;
                SnapshotDevice {
                    instance_name: name.to_owned(),
                    payload: vec![SnapshotDevicePayload {
                        kind: output.kind.to_owned(),
                        version: output.version,
                        data: serde_json::to_string(&output.payload)?,
                    }],
                }
            }
            Migrator::Multi(mech) => {
                let mut outputs = PayloadOutputs::new();
                mech.export(&mut outputs, &migratectx)?;

                let mut payloads = Vec::new();
                for part in outputs {
                    payloads.push(SnapshotDevicePayload {
                        kind: part.kind.to_owned(),
                        version: part.version,
                        data: serde_json::to_string(&part.payload)?,
                    });
                }
                SnapshotDevice {
                    instance_name: name.to_owned(),
                    payload: payloads,
                }
            }
        };

        let device_bytes = serde_json::to_vec(&device_data)?;
        header.set_size(device_bytes.len() as u64);
        builder.append_data(
            &mut header,
            format!("{DEVICE_DIR}/{name}.json"),
            &device_bytes[..],
        )?;
    }

    // TODO(luqmana) clean this up. make mem_bounds do the lo/hi calc? or just
    // use config values?
    let mem_bounds = memctx
        .mem_bounds()
        .ok_or_else(|| anyhow::anyhow!("Failed to get VM RAM bounds"))?;
    let len: usize =
        (mem_bounds.end().0 - mem_bounds.start().0 + 1).try_into()?;
    let (lo, hi) = if len > 3 * GB {
        (3 * GB, Some(len.saturating_sub(4 * GB)))
    } else {
        (len, None)
    };
    info!(log, "Low RAM: {}, High RAM: {:?}", lo, hi);

    info!(log, "Writing low memory...");
    {
        let lo_mapping = memctx
            .direct_readable_region(&GuestRegion(GuestAddr(0), lo))
            .ok_or_else(|| anyhow::anyhow!("Failed to get lowmem region"))?;
        header.set_size(lo_mapping.len() as u64);
        let off = builder.append_space(
            &mut header,
            format!("{}/{:08x}-{:08x}.bin", MEMORY_DIR, 0, lo),
        )?;
        lo_mapping.pwrite(&builder.rawfd(), lo_mapping.len(), off as i64)?;
    }

    if let Some(hi) = hi {
        info!(log, "Writing high memory...");
        let start = 0x1_0000_0000;
        let hi_mapping = memctx
            .direct_readable_region(&GuestRegion(GuestAddr(start), hi))
            .ok_or_else(|| anyhow::anyhow!("Failed to get himem region"))?;

        header.set_size(hi_mapping.len() as u64);
        let end = start + hi as u64;
        let off = builder.append_space(
            &mut header,
            format!("{MEMORY_DIR}/{start:08x}-{end:08x}.bin"),
        )?;
        hi_mapping.pwrite(&builder.rawfd(), hi_mapping.len(), off as i64)?;
    }

    builder.inner_builder().finish()?;
    builder.into_file()?.flush()?;

    info!(log, "Snapshot saved to {}", snapshot);

    Ok(())
}

fn parse_mem_name(name: &str) -> anyhow::Result<(usize, usize)> {
    if let Some(addrs) = name.strip_suffix(".bin") {
        let mut fields = addrs.split('-');
        if let (Some(start), Some(end), None) =
            (fields.next(), fields.next(), fields.next())
        {
            let start = usize::from_str_radix(start, 16)?;
            let end = usize::from_str_radix(end, 16)?;
            if start >= end {
                anyhow::bail!("bad memory bounds {start} {end}");
            }
            return Ok((start, end));
        }
        anyhow::bail!("could not parse bounds of memory file {name}");
    } else {
        anyhow::bail!("memory file '{name}'does not end with .bin")
    }
}

/// Extract the VM config stored in the snapshot file at `path` and parse it.
///
/// Fails if the file cannot be opened, if it holds no [CONFIG_NAME] entry, or
/// if the contents of that entry cannot be read or parsed.
pub(crate) fn restore_config(path: impl AsRef<Path>) -> anyhow::Result<Config> {
    let snapshot_file =
        File::open(&path).context("Failed to open snapshot file")?;
    let mut archive = TarArchive::new(snapshot_file);

    let mut raw_config = String::new();
    archive.named_entry(CONFIG_NAME)?.read_to_string(&mut raw_config)?;

    toml::from_str(&raw_config).context("could not parse config")
}

/// Create an instance from a previously saved snapshot.
///
/// `path` is the snapshot (tar) file and `config` the VM configuration which
/// the caller has already extracted from it (see [restore_config]).  On
/// success, the newly created instance and its COM1 socket are returned, with
/// global state, guest memory, and device state loaded from the snapshot.
///
/// The instance lock is held for the duration of the restore.
///
/// # Errors
///
/// Fails if the snapshot cannot be opened or read, if the instance cannot be
/// created, or if any piece of state fails to parse or import.
pub(crate) fn restore(
    path: impl AsRef<Path>,
    config: Config,
    log: &slog::Logger,
) -> anyhow::Result<(Instance, Arc<UDSock>)> {
    info!(log, "restoring snapshot of VM from {}", path.as_ref().display());

    let file = File::open(&path).context("Failed to open snapshot file")?;
    let mut archive = TarArchive::new(file);

    // We have enough to create the instance so let's do that first
    // NOTE(review): the `true` argument presumably tells `setup_instance` that
    // the instance is being restored from a snapshot -- confirm against its
    // definition.
    let (inst, com1_sock) = super::setup_instance(config, true, log)
        .context("Failed to create Instance with config in snapshot")?;

    let guard = inst.lock().unwrap();
    let machine = guard.machine.as_ref().unwrap();
    let hdl = machine.hdl.clone();
    let memctx = machine.acc_mem.access().unwrap();

    // Set the kernel VMM state to paused, so that devices can be consistently
    // loaded without timers and such attempting to fire.
    hdl.pause()?;

    // Ensure vCPUs are in the active state
    for vcpu in machine.vcpus.iter() {
        vcpu.activate().context("Failed to activate vCPU")?;
    }

    // Mimic state transitions propolis-server would go through for a live migration
    // XXX put instance in migrate-source state

    // Load and apply the global (non-device) VM state
    let global: VmGlobalState = {
        let mut json_bytes = Vec::new();
        let mut global_ent = archive.named_entry(GLOBAL_NAME)?;
        global_ent.read_to_end(&mut json_bytes)?;
        serde_json::from_slice(json_bytes.as_slice())
            .context("could not parse global data")?
    };
    import_global(&hdl, &global).context("failed to import global VM state")?;

    // Read device state data
    //
    // Only entries which are direct children of the device directory are
    // considered.
    //
    // NOTE(review): the `.ok()?` calls below mean that an entry which cannot
    // be read (or whose path cannot be decoded) is skipped rather than
    // reported, so a damaged archive could result in device state being
    // silently left out of the restore.  Only JSON parse failures are
    // surfaced as errors here.
    let device_data = archive
        .entries()?
        .filter_map(|ent| {
            let mut ent = ent.ok()?;
            let path = ent.path().ok()?.into_owned();

            let mut parts = path.components();
            match (parts.next(), parts.next(), parts.next()) {
                (Some(dir), Some(_name), None)
                    if (dir.as_ref() as &Path) == Path::new(DEVICE_DIR) =>
                {
                    let mut dev_bytes = Vec::with_capacity(ent.size() as usize);
                    ent.read_to_end(&mut dev_bytes).ok()?;

                    Some(dev_bytes)
                }
                _ => None,
            }
        })
        .map(|data| serde_json::from_slice(&data[..]))
        .collect::<Result<Vec<SnapshotDevice>, serde_json::Error>>()?;

    // Locate and import RAM data
    //
    // This pass over the archive only records where each memory file lives
    // (its address range, parsed from the name, and the offset of its data in
    // the snapshot file).  The contents are loaded afterwards.
    let mem_segs = archive
        .entries()?
        .filter_map(|ent| {
            let ent = ent.ok()?;
            let path = ent.path().ok()?;

            let mut parts = path.components();
            match (parts.next(), parts.next(), parts.next()) {
                (Some(dir), Some(name), None)
                    if (dir.as_ref() as &Path) == Path::new(MEMORY_DIR) =>
                {
                    Some((name.as_os_str().to_str()?.to_owned(), ent))
                }
                _ => None,
            }
        })
        .map(|(name, entry)| {
            let (start, end) = parse_mem_name(&name)?;
            let file_off = entry.raw_file_position();
            Ok((start, end, file_off))
        })
        .collect::<Result<Vec<_>, anyhow::Error>>()?;

    // With the offsets in hand, read each region from the underlying file
    // directly into the corresponding guest memory mapping.
    let fp = archive.as_file();
    for (start, end, file_off) in mem_segs {
        debug!(log, "Loading guest memory region {start:08x}-{end:08x}");
        let region = GuestRegion(GuestAddr(start as u64), end - start);
        let mapping =
            memctx.direct_writable_region(&region).ok_or_else(|| {
                anyhow::anyhow!("could not map region {region:?}")
            })?;
        mapping.pread(fp, mapping.len(), file_off as i64)?;
    }

    // Finally, let's restore the device state
    let migratectx = MigrateCtx { mem: &memctx };
    for snap_dev in device_data {
        let name = &snap_dev.instance_name;
        let dev = guard.inventory.devs.get(name).ok_or_else(|| {
            anyhow::anyhow!("unknown device in snapshot {name}")
        })?;

        match dev.migrate() {
            Migrator::NonMigratable => anyhow::bail!(
                "can't restore snapshot with non-migratable device ({name})"
            ),
            Migrator::Empty => {
                // There really shouldn't be a payload for this
                warn!(
                    log,
                    "unexpected device state for device {} in snapshot", name
                );
            }
            Migrator::Single(mech) => {
                // Such a device must be offered exactly one payload
                if snap_dev.payload.len() != 1 {
                    anyhow::bail!(
                        "Unexpected payload count {}",
                        snap_dev.payload.len()
                    );
                }
                let payload = &snap_dev.payload[0];
                let mut deser_data =
                    serde_json::Deserializer::from_str(&payload.data);

                let offer = PayloadOffer {
                    kind: &payload.kind,
                    version: payload.version,
                    payload: Box::new(<dyn erased_serde::Deserializer>::erase(
                        &mut deser_data,
                    )),
                };
                debug!(log, "Importing data into device {name}");
                mech.import(offer, &migratectx).with_context(|| {
                    format!("Failed to restore device state for {name}")
                })?;
            }
            Migrator::Multi(mech) => {
                // The deserializers are collected into their own Vec up front
                // so that the offers built below can hold mutable borrows of
                // them.
                let mut payload_desers: Vec<
                    serde_json::Deserializer<serde_json::de::StrRead>,
                > = Vec::with_capacity(snap_dev.payload.len());
                let mut metadata: Vec<(&str, u32)> =
                    Vec::with_capacity(snap_dev.payload.len());
                for payload in snap_dev.payload.iter() {
                    payload_desers.push(serde_json::Deserializer::from_str(
                        &payload.data,
                    ));
                    metadata.push((&payload.kind, payload.version));
                }
                let offer_iter = metadata
                    .iter()
                    .zip(payload_desers.iter_mut())
                    .map(|(meta, deser)| PayloadOffer {
                        kind: meta.0,
                        version: meta.1,
                        payload: Box::new(
                            <dyn erased_serde::Deserializer>::erase(deser),
                        ),
                    });

                let mut offer = PayloadOffers::new(offer_iter);
                debug!(log, "Importing data into device {name}");
                mech.import(&mut offer, &migratectx).with_context(|| {
                    format!("Failed to restore device state for {name}",)
                })?;

                // Payloads still on offer after the import are treated as an
                // error.
                let remain = offer.remaining().count();
                if remain > 0 {
                    return Err(anyhow::anyhow!(
                        "Device {name} had {remain} remaining payload(s)"
                    ));
                }
            }
        }
    }

    // Release the memory accessor and instance lock before handing the
    // instance back to the caller.
    drop(memctx);
    drop(guard);
    Ok((inst, com1_sock))
}

/// Global (non-device) VM state which is stored in a snapshot, serialized as
/// JSON.
#[derive(serde::Serialize, serde::Deserialize)]
pub struct VmGlobalState {
    // Just using the raw boot_hrtime leaves room for all sorts of failures,
    // especially if a saved state file is used after a subsequent reboot of the
    // host.  These problems can be addressed later.
    /// Boot hrtime of the VM, as read from the kernel VMM when exported
    pub boot_hrtime: i64,
    // Fixing up the guest TSC is left as an exercise for later
}

/// Read the global VM state out of the kernel VMM.
///
/// For API versions newer than V11 the boot hrtime is taken from the
/// `VDC_VMM_TIME` data; otherwise it is looked up among the `VDC_VMM_ARCH`
/// field entries.
///
/// # Panics
///
/// Panics if, on the older interface, no `VAI_BOOT_HRTIME` entry is present.
fn export_global(hdl: &VmmHdl) -> io::Result<VmGlobalState> {
    let boot_hrtime = if hdl.api_version()? > ApiVersion::V11 {
        let time_info =
            hdl.data_op(VDC_VMM_TIME, 1).read::<vdi_time_info_v1>()?;
        time_info.vt_boot_hrtime
    } else {
        let fields: Vec<bhyve_api::vdi_field_entry_v1> =
            hdl.data_op(VDC_VMM_ARCH, 1).read_all()?;
        let boot_field = fields
            .iter()
            .find(|field| field.vfe_ident == VAI_BOOT_HRTIME)
            .expect("VAI_BOOT_HRTIME should be present");
        boot_field.vfe_value as i64
    };

    Ok(VmGlobalState { boot_hrtime })
}
/// Load previously exported global VM state into the kernel VMM.
///
/// For API versions newer than V11, the current `VDC_VMM_TIME` data is read,
/// updated with the saved boot hrtime, and written back.  Otherwise the boot
/// hrtime is written as a single `VDC_VMM_ARCH` field entry.
fn import_global(hdl: &VmmHdl, state: &VmGlobalState) -> io::Result<()> {
    let boot_hrtime = state.boot_hrtime;

    if hdl.api_version()? > ApiVersion::V11 {
        let mut time_info =
            hdl.data_op(VDC_VMM_TIME, 1).read::<vdi_time_info_v1>()?;
        time_info.vt_boot_hrtime = boot_hrtime;
        hdl.data_op(VDC_VMM_TIME, 1).write(&time_info)?;
    } else {
        let boot_field =
            vdi_field_entry_v1::new(VAI_BOOT_HRTIME, boot_hrtime as u64);
        hdl.data_op(VDC_VMM_ARCH, 1).write(&boot_field)?;
    }

    Ok(())
}

/// Thin wrapper over [tar::Builder] adding a few conveniences used when
/// writing snapshots.
struct TarBuilder(tar::Builder<File>);
impl TarBuilder {
    /// Begin building an archive in `fp`.
    fn new(fp: File) -> Self {
        TarBuilder(tar::Builder::new(fp))
    }

    /// Append an entry at `path` whose contents are read from `data`.
    fn append_data(
        &mut self,
        header: &mut tar::Header,
        path: impl AsRef<Path>,
        data: impl Read,
    ) -> io::Result<()> {
        self.0.append_data(header, path, data)
    }

    /// Append an entry at `path` without writing its contents, skipping the
    /// file position past where those contents belong instead.
    ///
    /// The size of the reserved area is taken from `header` (rounded up to the
    /// tar block size).  The returned value is the file offset at which the
    /// reserved area starts, where the caller is expected to place the data.
    fn append_space(
        &mut self,
        header: &mut tar::Header,
        path: impl AsRef<Path>,
    ) -> io::Result<u64> {
        const TAR_BLOCK: u64 = 512;

        let size = header.size()?;
        // Emit just the header: no data accompanies it
        self.0.append_data(header, path, io::empty())?;

        let file = self.0.get_mut();
        let data_start = file.stream_position()?;

        // data region should be in multiple of tar block size
        let padding = (TAR_BLOCK - size % TAR_BLOCK) % TAR_BLOCK;
        let reserved = size + padding;
        file.seek(SeekFrom::Current(reserved as i64))?;

        Ok(data_start)
    }

    /// Append a directory entry at `path`.
    ///
    /// Panics if `header` does not describe a directory.
    fn append_dir(
        &mut self,
        header: &mut tar::Header,
        path: impl AsRef<Path>,
    ) -> io::Result<()> {
        assert!(header.entry_type().is_dir());
        self.0.append_data(header, path, io::empty())
    }

    /// Raw file descriptor of the file being written.
    fn rawfd(&mut self) -> RawFd {
        let file = self.0.get_mut();
        file.as_raw_fd()
    }

    /// Access the wrapped [tar::Builder].
    fn inner_builder(&mut self) -> &mut tar::Builder<File> {
        &mut self.0
    }

    /// Unwrap the builder, yielding the file it was writing to.
    fn into_file(self) -> io::Result<File> {
        let TarBuilder(inner) = self;
        inner.into_inner()
    }
}

/// Either a bare [File], or a [tar::Archive] layered on top of one.
///
/// The contents are wrapped in an `Option` so that they can be moved out when
/// switching from one form to the other.
enum TarInner {
    Raw(Option<File>),
    Tar(Option<tar::Archive<File>>),
}
impl TarInner {
    /// Borrow the archive, if currently in archive form.
    ///
    /// Panics if in archive form but the archive has been taken.
    fn as_tar(&mut self) -> Option<&mut tar::Archive<File>> {
        if let TarInner::Tar(arc) = self {
            Some(arc.as_mut().unwrap())
        } else {
            None
        }
    }

    /// Borrow the file, if currently in raw-file form.
    ///
    /// Panics if in raw-file form but the file has been taken.
    fn as_file(&mut self) -> Option<&mut File> {
        if let TarInner::Raw(fp) = self {
            Some(fp.as_mut().unwrap())
        } else {
            None
        }
    }
}

/// Wrapper for reading a tar file which can be iterated over repeatedly, and
/// which can hand back the underlying [File] when direct access is needed.
struct TarArchive(TarInner);
impl TarArchive {
    /// Wrap `fp`, which is expected to be positioned at the start of the
    /// archive.
    fn new(fp: File) -> Self {
        TarArchive(TarInner::Raw(Some(fp)))
    }

    /// Build a fresh [tar::Archive] over the underlying file.
    ///
    /// If an archive was already in place, the file is first rewound to its
    /// start.  A file still in raw form is used at its current position.
    fn reset_tar(&mut self) -> io::Result<&mut tar::Archive<File>> {
        let fp = match &mut self.0 {
            TarInner::Raw(fp) => fp.take().unwrap(),
            TarInner::Tar(arc) => {
                let mut fp = arc.take().unwrap().into_inner();
                fp.seek(SeekFrom::Start(0))?;
                fp
            }
        };
        self.0 = TarInner::Tar(Some(tar::Archive::new(fp)));
        Ok(self.0.as_tar().unwrap())
    }

    /// Switch to raw-file form (discarding any archive wrapper) and borrow the
    /// file.  The file position is left wherever it happens to be.
    fn as_file(&mut self) -> &mut File {
        let fp = match &mut self.0 {
            TarInner::Raw(fp) => fp.take().unwrap(),
            TarInner::Tar(arc) => arc.take().unwrap().into_inner(),
        };
        self.0 = TarInner::Raw(Some(fp));
        self.0.as_file().unwrap()
    }

    /// Find the first entry in the archive whose path is exactly `name`.
    ///
    /// Entries which cannot be read, or whose path cannot be determined, are
    /// passed over.  A `NotFound` error is returned if nothing matches.
    fn named_entry(&mut self, name: &str) -> io::Result<tar::Entry<'_, File>> {
        let wanted = Path::new(name);

        let found =
            self.reset_tar()?.entries_with_seek()?.find_map(|candidate| {
                let candidate = candidate.ok()?;
                let is_match =
                    matches!(candidate.path(), Ok(path) if path == wanted);
                if is_match {
                    Some(candidate)
                } else {
                    None
                }
            });

        match found {
            Some(entry) => Ok(entry),
            None => Err(io::Error::new(
                io::ErrorKind::NotFound,
                format!("file '{name}' not found in archive"),
            )),
        }
    }

    /// Iterate over all entries in the archive, starting from the beginning.
    fn entries(&mut self) -> io::Result<tar::Entries<'_, File>> {
        self.reset_tar()?.entries_with_seek()
    }
}


================================================
FILE: bin/propolis-utils/Cargo.toml
================================================
[package]
name = "propolis-utils"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[[bin]]
name = "cpuid-gen"
test = false
doctest = false

[[bin]]
name = "rsrvrctl"
test = false
doctest = false

[dependencies]
anyhow.workspace = true
clap = { workspace = true, features = ["derive"] }
cpuid_utils = { workspace = true, features = ["instance-spec"] }
serde = { workspace = true, features = ["derive"] }
propolis = { workspace = true, default-features = false }
propolis_api_types.workspace = true
bhyve_api = { workspace = true }
libc = { workspace = true }
serde_json.workspace = true


================================================
FILE: bin/propolis-utils/README.md
================================================
# Propolis Utilities

This is a collection of assorted utilities which may be useful for certain
development activities, but are otherwise not meant for general consumption.

## Utilities

- `cpuid-gen`: Generate a `cpuid` profile using the legacy emulated output from
  the local host CPU, as filtered by the kernel VMM logic.
- `rsrvrctl`: Manipulate the kernel VMM memory reservoir in the same manner
  as the utility shipped by the OS.


================================================
FILE: bin/propolis-utils/src/bin/cpuid-gen.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::str::FromStr;

use clap::{Parser, ValueEnum};
use cpuid_utils::CpuidSet;

/// Print each cpuid entry on its own line as plain text: the queried leaf
/// (and subleaf, where there is one), followed by the four register values in
/// hex.
fn print_text(results: &CpuidSet) {
    for (key, value) in results.iter() {
        let header = if let Some(subleaf) = key.subleaf {
            format!("eax:{:x} ecx:{:x}", key.leaf, subleaf)
        } else {
            // Pad with tabs where the subleaf would otherwise appear
            format!("eax:{:x}\t\t", key.leaf)
        };

        println!(
            "{} ->\t{:x} {:x} {:x} {:x}",
            header, value.eax, value.ebx, value.ecx, value.edx
        );
    }
}
/// Print the cpuid entries as a TOML `[cpuid]` table.
///
/// Each key is the leaf in hex (joined to the subleaf with `-`, where there is
/// one), and each value an array of the four register values.
fn print_toml(results: &CpuidSet) {
    println!("[cpuid]");
    for (key, value) in results.iter() {
        let key_name = if let Some(subleaf) = key.subleaf {
            format!("{:x}-{:x}", key.leaf, subleaf)
        } else {
            format!("{:x}", key.leaf)
        };
        println!(
            "\"{}\" = [0x{:x}, 0x{:x}, 0x{:x}, 0x{:x}]",
            key_name, value.eax, value.ebx, value.ecx, value.edx
        );
    }
}

/// Convert the cpuid entries to their instance spec representation and print
/// that as pretty-printed JSON.
///
/// Panics if the conversion result cannot be serialized.
fn print_json(results: CpuidSet) {
    let spec_cpuid = results.into_instance_spec_cpuid();
    let rendered = serde_json::to_string_pretty(&spec_cpuid).unwrap();
    println!("{}", rendered);
}

// Output formats selectable on the command line (plain text by default).
//
// NOTE: this is deliberately a plain comment rather than a doc comment, on the
// assumption that clap may pick doc comments up as help text -- confirm
// before converting it.
#[derive(Default, Clone, Copy, Debug, ValueEnum)]
enum OutputFormat {
    /// Print a human-readable plain-text representation.
    #[default]
    Text,

    /// Print TOML suitable for use in a propolis-standalone config file.
    Toml,

    /// Print JSON suitable for inclusion in a propolis-server instance spec.
    Json,
}

impl FromStr for OutputFormat {
    type Err = &'static str;

    /// Parse one of the exact (lowercase) names `text`, `toml`, or `json`.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "text" => Ok(Self::Text),
            "toml" => Ok(Self::Toml),
            "json" => Ok(Self::Json),
            _ => Err("invalid output format, must be text, toml, or json"),
        }
    }
}

// Command-line options for `cpuid-gen`.
//
// NOTE: the doc comments on the fields below are used by clap as the help text
// shown to the user, so edits to them are user-visible.
#[derive(clap::Parser)]
struct Opts {
    /// Elide all-zero entries from results
    #[clap(short)]
    zero_elide: bool,

    /// Emit output in the specified format
    #[clap(short, long, value_enum, default_value = "text")]
    format: OutputFormat,

    /// Query CPU directly, rather than via bhyve masking
    #[clap(short)]
    raw_query: bool,
}

fn main() -> anyhow::Result<()> {
    let opts = Opts::parse();

    let source = if opts.raw_query {
        cpuid_utils::host::CpuidSource::HostCpu
    } else {
        cpuid_utils::host::CpuidSource::BhyveDefault
    };

    let mut results = cpuid_utils::host::query_complete(source)?;
    if opts.zero_elide {
        results.retain(|_id, val| !val.all_zero());
    }

    match opts.format {
        OutputFormat::Text => print_text(&results),
        OutputFormat::Toml => print_toml(&results),
        OutputFormat::Json => print_json(results),
    }

    Ok(())
}


================================================
FILE: bin/propolis-utils/src/bin/rsrvrctl.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use bhyve_api::{ReservoirError, VmmCtlFd};
use clap::Parser;

// Top-level command-line options for `rsrvrctl`: just the subcommand to run.
//
// NOTE: kept as a plain comment, since clap uses doc comments on parser types
// as user-visible help text.
#[derive(clap::Parser, Debug)]
struct Opts {
    #[clap(subcommand)]
    cmd: Command,
}

// Operations which can be performed against the reservoir.
//
// For the resizing operations, sizes are given in MiB.  When no chunk size is
// provided, `main` passes a chunk size of 0 along.
//
// NOTE: the doc comments below are used by clap as user-visible help text.
#[derive(clap::Subcommand, Debug)]
enum Command {
    /// Add to reservoir capacity
    Add {
        /// Size to add (MiB)
        sz: usize,
        /// Chunk size (MiB)
        chunk: Option<usize>,
    },
    /// Remove from reservoir capacity
    Remove {
        /// Size to remove (MiB)
        sz: usize,
        /// Chunk size (MiB)
        chunk: Option<usize>,
    },
    /// Set reservoir capacity
    Set {
        /// Target size (MiB)
        sz: usize,
        /// Chunk size (MiB)
        chunk: Option<usize>,
    },
    /// Query current reservoir information
    Query,
}

/// Number of bytes in one MiB
const MB: usize = 1024 * 1024;

/// Resize the reservoir to `sz_bytes`, working in chunks of `chunk_mb` MiB.
///
/// Whenever the resize is interrupted, the size reached so far is printed and
/// the operation is retried, until it either completes or fails.
///
/// # Errors
///
/// Returns an `InvalidInput` error if `chunk_mb` is too large to be expressed
/// in bytes, or the underlying I/O error if the resize itself fails.
fn do_resize(
    ctl: &VmmCtlFd,
    sz_bytes: usize,
    chunk_mb: usize,
) -> std::io::Result<()> {
    // Convert once, up front, rejecting values which would overflow instead
    // of letting them wrap around to some unrelated chunk size.
    let chunk_bytes = chunk_mb.checked_mul(MB).ok_or_else(|| {
        std::io::Error::new(
            std::io::ErrorKind::InvalidInput,
            format!("chunk size of {chunk_mb} MiB is too large"),
        )
    })?;

    loop {
        match ctl.reservoir_resize(sz_bytes, chunk_bytes) {
            Err(ReservoirError::Interrupted(sz)) => {
                println!("Reservoir size: {}MiB", sz / MB);
            }
            Err(ReservoirError::Io(e)) => return Err(e),
            Ok(_) => return Ok(()),
        }
    }
}

/// Parse the command line and carry out the requested reservoir operation.
///
/// # Errors
///
/// Fails if the VMM control device cannot be opened, if a requested size is
/// too large to be expressed in bytes, or if the query/resize itself fails.
fn main() -> anyhow::Result<()> {
    let opts = Opts::parse();

    let ctl = VmmCtlFd::open()?;

    // Total reservoir size is the sum of its free and allocated portions
    let size_total =
        |q: bhyve_api::vmm_resv_query| q.vrq_free_sz + q.vrq_alloc_sz;

    // Sizes arrive from the user in MiB.  Convert them with an overflow check,
    // since a wrapped value would silently resize to the wrong target.
    let mib_to_bytes = |mib: usize| {
        mib.checked_mul(MB).ok_or_else(|| {
            anyhow::anyhow!("size of {mib} MiB is too large")
        })
    };

    match opts.cmd {
        Command::Add { sz, chunk } => {
            let delta = mib_to_bytes(sz)?;
            let cur_sz = ctl.reservoir_query().map(size_total)?;

            do_resize(&ctl, cur_sz.saturating_add(delta), chunk.unwrap_or(0))?;
        }
        Command::Remove { sz, chunk } => {
            let delta = mib_to_bytes(sz)?;
            let cur_sz = ctl.reservoir_query().map(size_total)?;

            do_resize(&ctl, cur_sz.saturating_sub(delta), chunk.unwrap_or(0))?;
        }
        Command::Set { sz, chunk } => {
            do_resize(&ctl, mib_to_bytes(sz)?, chunk.unwrap_or(0))?;
        }
        Command::Query => {
            let sz = ctl.reservoir_query()?;
            println!(
                "Free KiB:\t\t\t{}\n\
                Allocated KiB:\t\t\t{}\n\
                Transient Allocated KiB:\t{}\n\
                Size limit KiB:\t\t\t{}",
                sz.vrq_free_sz / 1024,
                sz.vrq_alloc_sz / 1024,
                sz.vrq_alloc_transient_sz / 1024,
                sz.vrq_limit / 1024
            );
        }
    }

    Ok(())
}


================================================
FILE: crates/bhyve-api/Cargo.toml
================================================
[package]
name = "bhyve_api"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
doctest = false

[dependencies]
bhyve_api_sys.workspace = true
libc.workspace = true
strum = { workspace = true, features = ["derive"] }


================================================
FILE: crates/bhyve-api/README.md
================================================
# bhyve-api

This crate exposes the interfaces from the bhyve kernel VMM.  Since those
interfaces are Private and subject to change, it means this crate must be kept
in sync with changes, and Propolis binaries built against a given version of
the crate (and implied interface version) will only run on systems with that
exact OS version.


================================================
FILE: crates/bhyve-api/header-check/Cargo.toml
================================================
[package]
name = "bhyve_api-hdrchk"
version = "0.0.0"
license = "MPL-2.0"
build = "build.rs"
publish = false

[dependencies]
bhyve_api_sys = { path = "../sys" }
libc = "0.2"
strum = "0.25"

[build-dependencies]
cc = "1"
ctest2 = "0.4.7"
# Build-time conditions depend on the max API version defined in the crate
bhyve_api_sys = { path = "../sys" }

[[test]]
name = "main"
path = "test/main.rs"
harness = false


================================================
FILE: crates/bhyve-api/header-check/README.md
================================================
# bhyve-api Header Check

In order to facilitate accurate reproduction of the bhyve kernel interfaces
from their respective headers, this crate uses `ctest2` in the same manner as
[rust-libc](https://github.com/rust-lang/libc/).

## Usage

Building and executing the test requires a set of illumos-gate sources
corresponding to the version of interfaces authored in bhyve-api.  Due to a
tangle in those headers, building and executing the test must itself be done on
a moderately recent illumos system, although that may change in the future.

From the `header-check` directory, execute `cargo test` with the `GATE_SRC`
environment variable pointing towards the aforementioned illumos-gate source
tree:

```
$ GATE_SRC=/path/to/my/illumos-gate cargo test
    Finished test [unoptimized + debuginfo] target(s) in 0.02s
     Running test/main.rs (target/debug/deps/main-2be1b9ed23245f3a)
RUNNING ALL TESTS
PASSED 578 tests
```


================================================
FILE: crates/bhyve-api/header-check/build.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![deny(warnings)]

use std::convert::TryFrom;
use std::env;
use std::path::PathBuf;
use std::str::FromStr;
use std::sync::atomic::{AtomicU32, Ordering};

extern crate bhyve_api_sys;
use bhyve_api_sys::VMM_CURRENT_INTERFACE_VERSION;

/// Interface version which the illumos-gate sources under test are assumed to
/// implement.
///
/// Starts out as `VMM_CURRENT_INTERFACE_VERSION`; `main` overwrites it when a
/// valid `API_VERSION` environment variable is supplied.
static CHECK_VERSION: AtomicU32 = AtomicU32::new(VMM_CURRENT_INTERFACE_VERSION);

/// Returns `true` when the API version of the source being checked is strictly
/// greater than `ver`.
fn ver_gt(ver: u32) -> bool {
    let checked = CHECK_VERSION.load(Ordering::Relaxed);
    ver < checked
}
/// Returns `true` when the API version of the source being checked is strictly
/// less than `ver`.
fn ver_lt(ver: u32) -> bool {
    let checked = CHECK_VERSION.load(Ordering::Relaxed);
    ver > checked
}
/// Returns `true` when the API version of the source being checked is exactly
/// `ver`.
fn ver_eq(ver: u32) -> bool {
    let checked = CHECK_VERSION.load(Ordering::Relaxed);
    ver == checked
}

/// Configure the ctest2 generator which compares the definitions in
/// `../sys/src/lib.rs` against the vmm headers found in an illumos-gate tree.
///
/// Environment variables consulted:
/// - `GATE_SRC` (required): path to the illumos-gate sources.  The process
///   exits with status 1 if it is missing.
/// - `API_VERSION` (optional): interface version the sources are assumed to
///   implement.  It must parse as a `u32` and must not exceed
///   `VMM_CURRENT_INTERFACE_VERSION`; otherwise the process exits with
///   status 1.
fn main() {
    let mut cfg = ctest2::TestGenerator::new();

    // We cannot proceed without a path to the source
    let gate_dir = match env::var("GATE_SRC").map(PathBuf::try_from) {
        Ok(Ok(dir)) => dir,
        _ => {
            eprintln!("Must specify path to illumos-gate sources with GATE_SRC env var");
            std::process::exit(1);
        }
    };

    // Allow the user to specify a target interface version to check against.
    match env::var("API_VERSION").ok().map(|v| u32::from_str(&v)) {
        Some(Ok(ver)) => {
            if ver > VMM_CURRENT_INTERFACE_VERSION {
                eprintln!(
                    "API_VERSION {} cannot be > \
                    VMM_CURRENT_INTERFACE_VERSION ({})",
                    ver, VMM_CURRENT_INTERFACE_VERSION
                );
                std::process::exit(1);
            }
            CHECK_VERSION.store(ver, Ordering::Relaxed);
        }
        Some(Err(e)) => {
            eprintln!("Invalid API_VERSION {:?}", e);
            std::process::exit(1);
        }
        // Variable not set (or not readable): keep the default version.
        _ => {}
    }

    let include_paths = [
        // For #include_next to work, these need to be first
        "usr/src/compat/bhyve",
        "usr/src/compat/bhyve/amd64",
        "usr/src/contrib/bhyve",
        "usr/src/contrib/bhyve/amd64",
        "usr/src/head",
        "usr/src/uts/intel",
        "usr/src/uts/common",
    ];
    cfg.include("/usr/include");
    for p in include_paths {
        cfg.include(gate_dir.join(p));
    }

    cfg.header("sys/types.h");
    cfg.header("sys/vmm.h");
    cfg.header("sys/vmm_dev.h");
    cfg.header("sys/vmm_data.h");

    // Constants for which the generated comparison is skipped.
    cfg.skip_const(move |name| match name {
        _n if _n.starts_with("SEG_") => true,

        // defined for crate consumer convenience
        "VMM_PATH_PREFIX" => true,
        "VMM_CTL_PATH" => true,

        // This was recently hidden from userspace.
        // We expose our own copy for now for us as a constraint.
        "VM_MAXCPU" => true,

        // Do not bother checking the version definition if we are
        // assuming the source is from a different version.
        "VMM_CURRENT_INTERFACE_VERSION"
            if !ver_eq(VMM_CURRENT_INTERFACE_VERSION) =>
        {
            true
        }

        // API V11 saw the removal of several time-related VMM_ARCH defines
        "VAI_TSC_BOOT_OFFSET" | "VAI_BOOT_HRTIME" | "VAI_TSC_FREQ"
            if ver_gt(10) =>
        {
            true
        }
        // API V11 saw the addition of the VMM_TIME data class
        "VDC_VMM_TIME" if ver_lt(11) => true,

        // API V16 saw the removal of the force-suspend flag for VM_REINIT
        "VM_REINIT_F_FORCE_SUSPEND" if ver_gt(15) => true,

        _ => false,
    });

    // Structs for which the generated comparison is skipped.
    cfg.skip_struct(|name| match name {
        // Skip over the vmexit/vmentry structs due to unions being a mess
        "vm_exit" => true,
        "vm_exit_payload" => true,
        "vm_entry" => true,
        "vm_entry_payload" => true,

        // Skip anonymous types from vm_exit
        "vm_rwmsr" => true,
        "vm_exit_vmx" => true,
        "vm_exit_svm" => true,
        "vm_exit_msr" => true,
        "vm_exit_suspend" => true,
        "vm_inst_emul" => true,
        "vm_paging" => true,

        // In API V12, the RTC data struct was revised to v2, with the old v1
        // definition being removed.
        "vdi_rtc_v1" if ver_gt(11) => true,
        "vdi_rtc_v2" if ver_lt(12) => true,

        // VMM_TIME struct added in API V11
        "vdi_time_info_v1" if ver_lt(11) => true,

        _ => false,
    });

    // (struct, field) pairs whose field *type* is not compared.
    cfg.skip_field_type(|ty, field| match (ty, field) {
        // Defined as an `int` in the crate, instead of an enum
        ("vm_isa_irq_trigger", "trigger") => true,
        ("vm_capability", "captype") => true,

        // Strictness between `u8` and `char` is excessive
        ("vm_create_req", "name") => true,
        ("vm_destroy_req", "name") => true,
        ("vm_memseg", "name") => true,

        _ => false,
    });

    // Emit the generated test source as `main.rs`; `test/main.rs` includes it
    // from `OUT_DIR`.
    cfg.generate("../sys/src/lib.rs", "main.rs");
}


================================================
FILE: crates/bhyve-api/header-check/test/main.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

extern crate bhyve_api_sys;
extern crate libc;

use bhyve_api_sys::*;
use libc::*;

// Pull in the `main.rs` test program which `build.rs` generates (via ctest2)
// into `OUT_DIR`.
include!(concat!(env!("OUT_DIR"), "/main.rs"));


================================================
FILE: crates/bhyve-api/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::fs::{File, OpenOptions};
use std::io::{Error, ErrorKind, Result};
use std::mem::{size_of, size_of_val};
use std::os::fd::*;
use std::os::unix::fs::OpenOptionsExt;
use std::path::PathBuf;
use std::sync::atomic::{AtomicI64, Ordering};
use std::time::Duration;

pub use bhyve_api_sys::*;

/// Directory under which [`VmmFd::open`] looks up a vmm instance by name.
pub const VMM_PATH_PREFIX: &str = "/dev/vmm";
/// Path of the vmm control device opened by [`VmmCtlFd::open`].
pub const VMM_CTL_PATH: &str = "/dev/vmmctl";

/// Handle to the vmm control device (see [`VMM_CTL_PATH`]), wrapping the open
/// [`File`].
pub struct VmmCtlFd(File);
impl VmmCtlFd {
    pub fn open() -> Result<Self> {
        let ctl = OpenOptions::new()
            .write(true)
            .custom_flags(libc::O_EXCL)
            .open(VMM_CTL_PATH)?;
        Ok(Self(ctl))
    }

    /// Issue an ioctl against the open vmmctl handle, passing `data` as the
    /// ioctl argument.
    ///
    /// # Safety
    ///
    /// The caller must supply a `data` pointer which is adequate for whatever
    /// copyin/copyout the kernel performs while processing `cmd`.
    pub unsafe fn ioctl<T>(&self, cmd: i32, data: *mut T) -> Result<i32> {
        let fd = self.as_raw_fd();
        ioctl(fd, cmd, data.cast::<libc::c_void>())
    }
    /// Issue an ioctl whose argument is a plain integer rather than a pointer.
    ///
    /// # Errors
    ///
    /// Any `cmd` not accepted by `ioctl_usize_safe` is rejected with
    /// [`ErrorKind::InvalidInput`] before an ioctl is attempted.  Otherwise
    /// the result of the ioctl itself is returned.
    pub fn ioctl_usize(&self, cmd: i32, data: usize) -> Result<i32> {
        if Self::ioctl_usize_safe(cmd) {
            // Safety: Only vmm ioctls which do not treat the data argument as
            // a pointer for copyin/copyout pass the filter above, so those
            // dangers can be dismissed.  The caller is assumed to be cognizant
            // of other potential side effects.
            unsafe { ioctl(self.as_raw_fd(), cmd, data as *mut libc::c_void) }
        } else {
            Err(Error::new(ErrorKind::InvalidInput, "unsafe cmd provided"))
        }
    }

    /// Query the API version exposed by the kernel VMM.
    ///
    /// The lookup goes through `cache_api_version`, so the kernel is only
    /// queried when no result has been cached yet.
    pub fn api_version(&self) -> Result<u32> {
        cache_api_version(|| self.query_api_version())
    }

    /// Perform the actual (uncached) query of the API version.
    ///
    /// # Panics
    ///
    /// Panics if the `VMM_INTERFACE_VERSION` ioctl succeeds but does not
    /// return a positive version number.
    fn query_api_version(&self) -> Result<u32> {
        self.ioctl_usize(ioctls::VMM_INTERFACE_VERSION, 0).map(|vers| {
            // We expect and demand a positive version number from the
            // VMM_INTERFACE_VERSION interface.
            assert!(vers > 0);
            vers as u32
        })
    }

    /// Request that the VMM memory reservoir be resized to `target_bytes`.
    ///
    /// Resizing may require moving a large amount of memory between the OS
    /// kernel and the reservoir, so `chunk_bytes` can be used to limit the
    /// size of the in-kernel requests used to fulfill the resize.
    ///
    /// # Errors
    ///
    /// - [`ReservoirError::Interrupted`] if the ioctl fails with
    ///   [`ErrorKind::Interrupted`]; it carries the `vrt_result_sz` field of
    ///   the request as found after the call.
    /// - [`ReservoirError::Io`] for every other ioctl failure.
    pub fn reservoir_resize(
        &self,
        target_bytes: usize,
        chunk_bytes: usize,
    ) -> std::result::Result<(), ReservoirError> {
        let mut req = vmm_resv_target {
            vrt_target_sz: target_bytes,
            vrt_chunk_sz: chunk_bytes,
            ..Default::default()
        };

        // Safety: We are using the appropriate struct for this ioctl
        let outcome =
            unsafe { self.ioctl(ioctls::VMM_RESV_SET_TARGET, &mut req) };

        outcome.map(|_| ()).map_err(|err| {
            if err.kind() == ErrorKind::Interrupted {
                ReservoirError::Interrupted(req.vrt_result_sz)
            } else {
                ReservoirError::Io(err)
            }
        })
    }

    /// Query VMM memory reservoir capacity and usage.
    ///
    /// Returns the `vmm_resv_query` struct as filled in by the
    /// `VMM_RESV_QUERY` ioctl.
    pub fn reservoir_query(&self) -> Result<vmm_resv_query> {
        let mut query = vmm_resv_query::default();

        // Safety: We are using the appropriate struct for this ioctl
        let outcome = unsafe { self.ioctl(ioctls::VMM_RESV_QUERY, &mut query) };

        outcome.map(|_| query)
    }

    /// Destroy the VM instance identified by `name`.
    ///
    /// # Errors
    ///
    /// Fails if a `vm_destroy_req` cannot be built from `name`, or if the
    /// `VMM_DESTROY_VM` ioctl itself fails.
    pub fn vm_destroy(&self, name: &[u8]) -> Result<()> {
        let mut req = vm_destroy_req::new(name)?;
        // Safety: presumably `vm_destroy_req` is the struct expected by
        // VMM_DESTROY_VM (its name is skipped as a `vm_destroy_req` field in
        // the header check) -- TODO confirm against the kernel header.
        let outcome = unsafe { self.ioctl(ioctls::VMM_DESTROY_VM, &mut req) };
        outcome.map(|_| ())
    }

    /// Report whether `cmd` is one of the vmmctl ioctls known to function
    /// without any copyin/copyout.
    ///
    /// Only `VMM_INTERFACE_VERSION` currently qualifies.
    const fn ioctl_usize_safe(cmd: i32) -> bool {
        cmd == ioctls::VMM_INTERFACE_VERSION
    }
}

/// Expose the raw descriptor of the wrapped control-device [`File`].
impl AsRawFd for VmmCtlFd {
    fn as_raw_fd(&self) -> RawFd {
        AsRawFd::as_raw_fd(&self.0)
    }
}

/// Failure modes of [`VmmCtlFd::reservoir_resize`].
///
/// Derives [`Debug`] (like [`VmmDataError`]) so that results carrying this
/// error can be used with `unwrap`/`expect` and `{:?}` formatting.
#[derive(Debug)]
pub enum ReservoirError {
    /// Resizing operation was interrupted, but if a non-zero chunk size was
    /// specified, one or more chunk-sized adjustments to the reservoir size may
    /// have completed.
    ///
    /// In that case, the resulting size of the reservoir is returned.
    Interrupted(usize),
    /// An IO error (other than interruption) occurred
    Io(Error),
}

/// Flatten a [`ReservoirError`] into a plain IO [`Error`].
///
/// The reservoir size carried by `Interrupted` is discarded; the result is an
/// [`ErrorKind::Interrupted`] error with the message `"interrupted"`.  An `Io`
/// error is passed through untouched.
impl From<ReservoirError> for Error {
    fn from(val: ReservoirError) -> Self {
        match val {
            ReservoirError::Interrupted(_) => {
                Error::new(ErrorKind::Interrupted, "interrupted")
            }
            ReservoirError::Io(e) => e,
        }
    }
}

/// Handle to a vmm instance, wrapping the open [`File`] for its device node
/// under [`VMM_PATH_PREFIX`].
pub struct VmmFd(File);
impl VmmFd {
    pub fn open(name: &str) -> Result<Self> {
        let mut vmpath = PathBuf::from(VMM_PATH_PREFIX);
        vmpath.push(name);

        let fp = OpenOptions::new().write(true).read(true).open(vmpath)?;
        Ok(Self(fp))
    }

    /// Create new instance from raw `File` resource
    ///
    /// No validation is performed: `fp` is wrapped as-is.
    ///
    /// # Safety
    ///
    /// Caller is expected to provide a `File` resource which is a valid vmm
    /// resource.  (Or alternatively, is not to make any vmm-related ioctls, if
    /// this instance was created for unit-testing purposes.)
    pub unsafe fn new_raw(fp: File) -> Self {
        Self(fp)
    }

    /// Issue an ioctl against the open vmm instance, passing `data` as the
    /// ioctl argument.
    ///
    /// # Safety
    ///
    /// The caller must supply a `data` pointer which is adequate for whatever
    /// copyin/copyout the kernel performs while processing `cmd`.
    pub unsafe fn ioctl<T>(&self, cmd: i32, data: *mut T) -> Result<i32> {
        let fd = self.as_raw_fd();
        ioctl(fd, cmd, data.cast::<libc::c_void>())
    }
    /// Issue an ioctl whose argument is a plain integer rather than a pointer.
    ///
    /// # Errors
    ///
    /// Any `cmd` not accepted by `ioctl_usize_safe` is rejected with
    /// [`ErrorKind::InvalidInput`] before an ioctl is attempted.  Otherwise
    /// the result of the ioctl itself is returned.
    pub fn ioctl_usize(&self, cmd: i32, data: usize) -> Result<i32> {
        if Self::ioctl_usize_safe(cmd) {
            // Safety: Only vmm ioctls which do not treat the data argument as
            // a pointer for copyin/copyout pass the filter above, so those
            // dangers can be dismissed.  The caller is assumed to be cognizant
            // of other potential side effects.
            unsafe { ioctl(self.as_raw_fd(), cmd, data as *mut libc::c_void) }
        } else {
            Err(Error::new(ErrorKind::InvalidInput, "unsafe cmd provided"))
        }
    }

    /// Query the API version exposed by the kernel VMM.
    ///
    /// The lookup goes through `cache_api_version`.  When a query is needed,
    /// it is first attempted against this instance; if that fails with
    /// `ENOTTY`, the vmmctl device is opened and queried instead.
    ///
    /// # Panics
    ///
    /// Panics if the ioctl succeeds but does not return a positive version.
    pub fn api_version(&self) -> Result<u32> {
        cache_api_version(|| {
            let queried = self.ioctl_usize(ioctls::VMM_INTERFACE_VERSION, 0);
            match queried {
                Ok(vers) => {
                    assert!(vers > 0);
                    Ok(vers as u32)
                }
                Err(err) => {
                    if err.raw_os_error() != Some(libc::ENOTTY) {
                        return Err(err);
                    }
                    // Prior to V6, only the vmmctl device would answer
                    // version queries, so fall back gracefully if the ioctl
                    // is unrecognized.
                    VmmCtlFd::open()?.query_api_version()
                }
            }
        })
    }

    /// Set the time reported by the virtual RTC.
    ///
    /// Arguments:
    /// - `time`: Duration since `UNIX_EPOCH`
    ///
    /// On API versions of at least [`ApiVersion::V12`] the time is passed as a
    /// `timespec` (seconds and nanoseconds); on older versions only the whole
    /// seconds are passed.
    pub fn rtc_settime(&self, time: Duration) -> Result<()> {
        let subsec_capable = self.api_version()? >= ApiVersion::V12 as u32;

        if !subsec_capable {
            // The old RTC_SETTIME only supports seconds precision
            let mut secs: u64 = time.as_secs();
            unsafe { self.ioctl(ioctls::VM_RTC_SETTIME, &mut secs) }?;
            return Ok(());
        }

        let mut ts = libc::timespec {
            tv_sec: time.as_secs() as i64,
            tv_nsec: i64::from(time.subsec_nanos()),
        };
        unsafe { self.ioctl(ioctls::VM_RTC_SETTIME, &mut ts) }?;
        Ok(())
    }

    /// Build a [`VmmDataOp`] with specified `class` and `version` to read or
    /// write data from the in-kernel vmm.
    pub fn data_op(&self, class: u16, version: u16) -> VmmDataOp<'_> {
        VmmDataOp::new(self, class, version)
    }

    /// Report whether `cmd` is one of the vmm ioctls known to function without
    /// any copyin/copyout.
    const fn ioctl_usize_safe(cmd: i32) -> bool {
        match cmd {
            ioctls::VM_PAUSE
            | ioctls::VM_RESUME
            | ioctls::VM_DESTROY_SELF
            | ioctls::VM_SET_AUTODESTRUCT
            | ioctls::VMM_INTERFACE_VERSION
            | ioctls::VM_VCPU_BARRIER => true,
            _ => false,
        }
    }
}

/// Expose the raw descriptor of the wrapped vmm instance [`File`].
impl AsRawFd for VmmFd {
    fn as_raw_fd(&self) -> RawFd {
        AsRawFd::as_raw_fd(&self.0)
    }
}

/// Result type for vmm-data operations, with [`VmmDataError`] as the error.
pub type VmmDataResult<T> = std::result::Result<T, VmmDataError>;

/// Encompasses the configuration and context to perform a vmm-data operation
/// (read or write) against an instance.  The `class` and `version` for the
/// vmm-data operation are established with the parameters passed to
/// [`VmmFd::data_op`].
pub struct VmmDataOp<'a> {
    /// Instance against which the ioctls are issued.
    fd: &'a VmmFd,
    /// Copied into `vdx_class` of every transfer.
    class: u16,
    /// Copied into `vdx_version` of every transfer.
    version: u16,
    /// vCPU selected via `for_vcpu`; `None` is sent to the kernel as `-1`.
    vcpuid: Option<i32>,
}

impl<'a> VmmDataOp<'a> {
    /// Create an operation of the given `class` and `version` against `fd`,
    /// with no vCPU selected.
    pub fn new(fd: &'a VmmFd, class: u16, version: u16) -> Self {
        let vcpuid = None;
        Self { fd, class, version, vcpuid }
    }
}

impl VmmDataOp<'_> {
    /// Dictate that the vmm-data operation be performed in the context of a
    /// specific vCPU, rather than against the VM as a whole.
    pub fn for_vcpu(mut self, vcpuid: i32) -> Self {
        self.vcpuid = Some(vcpuid);
        self
    }

    /// Read item of data, returning the result.
    pub fn read<T: Sized + Copy + Default>(self) -> VmmDataResult<T> {
        let mut item = T::default();
        self.do_read_single(
            &mut item as *mut T as *mut libc::c_void,
            size_of::<T>() as u32,
            false,
        )?;
        Ok(item)
    }

    /// Read item of data into provided buffer
    pub fn read_into<T: Sized>(self, data: &mut T) -> VmmDataResult<()> {
        self.do_read_single(
            data as *mut T as *mut libc::c_void,
            size_of::<T>() as u32,
            false,
        )
    }

    /// Read item of data, specified by identifier existing in provided buffer
    pub fn read_item<T: Sized>(self, data: &mut T) -> VmmDataResult<()> {
        self.do_read_single(
            data as *mut T as *mut libc::c_void,
            size_of::<T>() as u32,
            true,
        )
    }

    /// Issue a read of exactly `read_len` bytes into `data`.
    ///
    /// When `do_copyin` is set, `VDX_FLAG_READ_COPYIN` is added to the
    /// transfer flags.
    ///
    /// # Panics
    ///
    /// Panics if the read succeeds but reports a length other than
    /// `read_len`.
    fn do_read_single(
        self,
        data: *mut libc::c_void,
        read_len: u32,
        do_copyin: bool,
    ) -> VmmDataResult<()> {
        let mut xfer = self.xfer_base(read_len, data);
        if do_copyin {
            xfer.vdx_flags |= VDX_FLAG_READ_COPYIN;
        }

        self.do_read(&mut xfer)
            .map(|bytes_read| assert_eq!(bytes_read, read_len))
    }

    /// Read several data items, selected by the identifiers already present in
    /// the entries of `data`.
    ///
    /// # Panics
    ///
    /// Panics if the read succeeds but reports a length other than the byte
    /// size of `data`.
    pub fn read_many<T: Sized>(self, data: &mut [T]) -> VmmDataResult<()> {
        let read_len = size_of_val(data) as u32;
        let buf = data.as_mut_ptr() as *mut libc::c_void;
        let mut xfer = self.xfer_base(read_len, buf);

        // Reading multiple items means the caller has placed identifiers in
        // the buffer to select which entries are read (unlike read-all, which
        // is indiscriminate).  The kernel must therefore copy the buffer in
        // before performing the read.
        xfer.vdx_flags |= VDX_FLAG_READ_COPYIN;

        self.do_read(&mut xfer)
            .map(|bytes_read| assert_eq!(bytes_read, read_len))
    }

    /// Read all data items offered by this class/version
    ///
    /// A zero-length probe read is issued first, which is expected to fail
    /// with [`VmmDataError::SpaceNeeded`] carrying the total number of bytes
    /// required.  A buffer of that size is then allocated and filled by a
    /// second read.
    ///
    /// # Panics
    ///
    /// - if the zero-length probe read succeeds
    /// - if the reported total is smaller than `size_of::<T>()`, or is not a
    ///   whole multiple of it
    /// - if `T` is zero-sized (the item count computation divides by zero)
    /// - if the second read reports more bytes than the probed total
    pub fn read_all<T: Sized>(self) -> VmmDataResult<Vec<T>> {
        // Probe with an empty buffer to learn the total size required.
        let mut xfer = self.xfer_base(0, std::ptr::null_mut());
        let total_len = match self.do_read(&mut xfer) {
            Err(VmmDataError::SpaceNeeded(sz)) => Ok(sz),
            Err(e) => Err(e),
            Ok(_) => panic!("unexpected success"),
        }?;
        let item_len = size_of::<T>() as u32;
        assert!(total_len >= item_len, "item size exceeds total data size");

        let item_count = total_len / item_len;
        assert_eq!(
            total_len,
            item_count * item_len,
            "per-item sizing does not match total data size"
        );

        // Reserve capacity only; the length stays 0 until the read completes.
        let mut data: Vec<T> = Vec::with_capacity(item_count as usize);
        let mut xfer =
            self.xfer_base(total_len, data.as_mut_ptr() as *mut libc::c_void);

        let bytes_read = self.do_read(&mut xfer)?;
        assert!(bytes_read <= total_len);

        // SAFETY: Data is populated by the ioctl
        unsafe {
            // Only expose as many whole items as the read reported.
            data.set_len((bytes_read / item_len) as usize);
        }
        Ok(data)
    }

    /// Write a single item of data.
    ///
    /// # Panics
    ///
    /// Panics if the write succeeds but reports a length other than
    /// `size_of::<T>()`.
    pub fn write<T: Sized>(self, data: &T) -> VmmDataResult<()> {
        let write_len = size_of::<T>() as u32;
        let src: *const T = data;
        // The transfer struct wants a mutable pointer even for writes.
        let mut xfer = self.xfer_base(write_len, src as *mut libc::c_void);

        self.do_write(&mut xfer)
            .map(|bytes_written| assert_eq!(bytes_written, write_len))
    }

    /// Write several data items in a single transfer.
    ///
    /// # Panics
    ///
    /// Panics if the write succeeds but reports a length other than the byte
    /// size of `data`.
    pub fn write_many<T: Sized>(self, data: &[T]) -> VmmDataResult<()> {
        let write_len = size_of_val(data) as u32;
        // The transfer struct wants a mutable pointer even for writes.
        let src = data.as_ptr() as *mut libc::c_void;
        let mut xfer = self.xfer_base(write_len, src);

        self.do_write(&mut xfer)
            .map(|bytes_written| assert_eq!(bytes_written, write_len))
    }

    /// Build a [`vm_data_xfer`] struct from the parameters established for
    /// this data operation plus the given buffer `data` of `len` bytes.
    ///
    /// When no vCPU was selected, `vdx_vcpuid` is set to `-1`.  All remaining
    /// fields take their `Default` values.
    fn xfer_base(&self, len: u32, data: *mut libc::c_void) -> vm_data_xfer {
        let vdx_vcpuid = self.vcpuid.unwrap_or(-1);
        vm_data_xfer {
            vdx_vcpuid,
            vdx_class: self.class,
            vdx_version: self.version,
            vdx_len: len,
            vdx_data: data,
            ..Default::default()
        }
    }

    /// Issue `VM_DATA_READ` with the given transfer, returning the result
    /// length reported in it.
    fn do_read(
        &self,
        xfer: &mut vm_data_xfer,
    ) -> std::result::Result<u32, VmmDataError> {
        self.do_ioctl(VM_DATA_READ, xfer)
    }

    /// Issue `VM_DATA_WRITE` with the given transfer, returning the result
    /// length reported in it.
    fn do_write(
        &self,
        xfer: &mut vm_data_xfer,
    ) -> std::result::Result<u32, VmmDataError> {
        // If logic is added to VM_DATA_WRITE which actually makes use of
        // [`VDX_FLAG_WRITE_COPYOUT`], then the fact that [`write`] and
        // [`write_many`] accept const references for the data input will need
        // to be revisited.
        self.do_ioctl(VM_DATA_WRITE, xfer)
    }

    /// Execute a vmm-data transfer, translating the ENOSPC error, if emitted.
    ///
    /// On success the `vdx_result_len` of the transfer is returned.  An
    /// `ENOSPC` failure becomes [`VmmDataError::SpaceNeeded`] carrying that
    /// same field; any other failure becomes [`VmmDataError::IoError`].
    fn do_ioctl(
        &self,
        op: i32,
        xfer: &mut vm_data_xfer,
    ) -> std::result::Result<u32, VmmDataError> {
        // Safety: `xfer` is a live `&mut vm_data_xfer`.  The validity of the
        // data pointer/length pair it carries is up to the code which built
        // the transfer.
        let outcome = unsafe { self.fd.ioctl(op, xfer) };

        match outcome {
            Ok(_) => Ok(xfer.vdx_result_len),
            Err(err) if err.raw_os_error() == Some(libc::ENOSPC) => {
                Err(VmmDataError::SpaceNeeded(xfer.vdx_result_len))
            }
            Err(err) => Err(VmmDataError::IoError(err)),
        }
    }
}

/// Errors produced by vmm-data operations issued through [`VmmDataOp`].
#[derive(Debug)]
pub enum VmmDataError {
    /// The ioctl failed with an error other than `ENOSPC`.
    IoError(Error),
    /// The ioctl failed with `ENOSPC`.  Carries the `vdx_result_len` found in
    /// the transfer struct after the call.
    SpaceNeeded(u32),
}

/// Flatten a [`VmmDataError`] into a plain IO [`Error`].
impl From<VmmDataError> for Error {
    fn from(err: VmmDataError) -> Self {
        let c = match err {
            // Already an IO error: hand it back untouched.
            VmmDataError::IoError(e) => return e,
            VmmDataError::SpaceNeeded(c) => c,
        };

        // ErrorKind::StorageFull would more accurately match the underlying
        // ENOSPC but that variant is unstable still
        Error::other(format!("operation requires {c} bytes"))
    }
}

/// Store a cached copy of the queried API version.  Negative values indicate an
/// error occurred during query (and hold the corresponding negated `errno`).
/// A positive value indicates the cached version, and should be less than
/// `u32::MAX`.  A value of 0 indicates that no query has been performed yet.
///
/// Only `cache_api_version` reads or writes this value; it is written at most
/// once, via a compare-exchange against 0.
static VERSION_CACHE: AtomicI64 = AtomicI64::new(0);

/// Query the API version from the kernel VMM component on the system.
///
/// The version (or any error emitted while querying it) is cached for later
/// calls.  The API version may be consulted at runtime while operating the
/// virtual machine, where the delay of querying again would be more directly
/// guest-impactful.
pub fn api_version() -> Result<u32> {
    cache_api_version(|| VmmCtlFd::open()?.query_api_version())
}

/// Return the cached API version, running `do_query` first if nothing has
/// been cached yet.
///
/// The outcome of `do_query` is stored in `VERSION_CACHE`: a version as a
/// positive value, an error as its negated `errno` (`ENOENT` when the error
/// carries no OS error code).  A cached error is returned on every later call.
///
/// # Panics
///
/// Panics if the cache still holds 0 after the query, or if the cached
/// version is not below `u32::MAX`.
fn cache_api_version(do_query: impl FnOnce() -> Result<u32>) -> Result<u32> {
    let unpopulated = VERSION_CACHE.load(Ordering::Acquire) == 0;
    if unpopulated {
        let encoded = do_query().map_or_else(
            |err| -i64::from(err.raw_os_error().unwrap_or(libc::ENOENT)),
            i64::from,
        );
        // A failed exchange means a value was stored in the meantime; that
        // value is what gets reported below.
        let _ = VERSION_CACHE.compare_exchange(
            0,
            encoded,
            Ordering::Relaxed,
            Ordering::Relaxed,
        );
    }

    let cached = VERSION_CACHE.load(Ordering::Acquire);
    if cached == 0 {
        panic!("expected VERSION_CACHE to be initialized")
    } else if cached < 0 {
        Err(Error::from_raw_os_error(-cached as i32))
    } else {
        assert!(cached < i64::from(u32::MAX));

        Ok(cached as u32)
    }
}

/// Thin wrapper over `libc::ioctl` which turns a `-1` return into the last OS
/// error; every other return value is passed through as `Ok`.
///
/// # Safety
///
/// `data` must be appropriate for whatever the kernel does with it while
/// processing `cmd` on `fd`.
#[cfg(target_os = "illumos")]
unsafe fn ioctl(fd: RawFd, cmd: i32, data: *mut libc::c_void) -> Result<i32> {
    let rv = libc::ioctl(fd, cmd, data);
    if rv == -1 {
        Err(Error::last_os_error())
    } else {
        Ok(rv)
    }
}

/// Stand-in for the ioctl wrapper on targets other than illumos: no ioctl is
/// issued and an error is always returned.
///
/// # Safety
///
/// The arguments are ignored, so there are no requirements on them here; the
/// function is `unsafe` to match the signature of the illumos variant.
#[cfg(not(target_os = "illumos"))]
unsafe fn ioctl(
    _fd: RawFd,
    _cmd: i32,
    _data: *mut libc::c_void,
) -> Result<i32> {
    Err(Error::other("illumos required"))
}

/// Convenience constants to provide some documentation on what changes have
/// been introduced in the various bhyve API versions.
///
/// Each variant's discriminant is the numeric interface version, so a variant
/// can be cast with `as u32` for comparison against a queried version.
#[repr(u32)]
#[derive(Copy, Clone, Debug)]
pub enum ApiVersion {
    /// Initial support for CPU perf. counters on AMD
    V18 = 18,

    /// Add support for NPT bitmap operations
    V17 = 17,

    /// VM Suspend behavior reworked, `VM_VCPU_BARRIER` ioctl added
    V16 = 16,

    /// Add flag for exit-when-consistent as part of `VM_RUN`
    V15 = 15,

    /// Reading specific MSRs via vmm-data is fixed.  Access to DEBUGCTL and
    /// LBR-related MSR state is possible (on AMD).
    V14 = 14,

    /// Writes via vmm-data interface are allowed by default
    V13 = 13,

    /// Improved RTC emulation, including sub-second precision
    V12 = 12,

    /// Add support for modifying guest time data via vmm-data interface
    V11 = 11,

    /// Interrupt and exception state is properly saved/restored on VM
    /// pause/resume, and is exposed via vmm-data interface
    V10 = 10,

    /// Revamps ioctls for administrating the VMM memory reservoir and adds
    /// kstat for tracking its capacity and utilization.
    V9 = 9,

    /// Adds flag to enable dirty page tracking for VMs when running on hardware
    /// with adequate support.
    V8 = 8,

    /// Adds pause/resume ioctls to assist with the ability to load or store a
    /// consistent snapshot of VM state
    V7 = 7,

    /// Made hlt-on-exit a required CPU feature, and enabled by default in vmm
    V6 = 6,

    /// Adds ability to control `cpuid` results for guest vCPUs
    V5 = 5,
}
impl ApiVersion {
    /// The newest API version known to this crate.
    ///
    /// A unit test keeps this in step with `VMM_CURRENT_INTERFACE_VERSION`.
    pub const fn current() -> Self {
        Self::V18
    }
}

/// Allow a raw `u32` version to be tested for equality against an
/// [`ApiVersion`] by comparing with the variant's discriminant.
impl PartialEq<ApiVersion> for u32 {
    fn eq(&self, other: &ApiVersion) -> bool {
        let rhs = *other as u32;
        *self == rhs
    }
}
/// Allow a raw `u32` version to be ordered against an [`ApiVersion`] by
/// comparing with the variant's discriminant.  The result is always `Some`.
impl PartialOrd<ApiVersion> for u32 {
    fn partial_cmp(&self, other: &ApiVersion) -> Option<std::cmp::Ordering> {
        let rhs = *other as u32;
        Some(Ord::cmp(self, &rhs))
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// `ApiVersion::current()` must match the interface version constant.
    #[test]
    fn latest_api_version() {
        assert_eq!(VMM_CURRENT_INTERFACE_VERSION, ApiVersion::current() as u32);
    }

    /// Raw `u32` values compare against `ApiVersion` by discriminant.
    #[test]
    fn u32_comparisons() {
        let v5 = ApiVersion::V5;
        assert!(4u32 < v5);
        assert!(5u32 == v5);
        assert!(6u32 > v5);
    }
}


================================================
FILE: crates/bhyve-api/sys/Cargo.toml
================================================
[package]
name = "bhyve_api_sys"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
test = false
doctest = false

[dependencies]
libc.workspace = true
strum = { workspace = true, features = ["derive"] }


================================================
FILE: crates/bhyve-api/sys/src/enums.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use strum::FromRepr;

/// Identifiers for guest registers.
///
/// No discriminants are given explicitly, so values start at 0 and follow
/// declaration order; the order of variants is therefore significant.
/// `VM_REG_LAST` is the final entry (presumably a count/sentinel rather than
/// a real register -- confirm against the header).
#[repr(C)]
#[allow(non_camel_case_types, unused)]
#[derive(Copy, Clone, Debug)]
pub enum vm_reg_name {
    VM_REG_GUEST_RAX,
    VM_REG_GUEST_RBX,
    VM_REG_GUEST_RCX,
    VM_REG_GUEST_RDX,
    VM_REG_GUEST_RSI,
    VM_REG_GUEST_RDI,
    VM_REG_GUEST_RBP,
    VM_REG_GUEST_R8,
    VM_REG_GUEST_R9,
    VM_REG_GUEST_R10,
    VM_REG_GUEST_R11,
    VM_REG_GUEST_R12,
    VM_REG_GUEST_R13,
    VM_REG_GUEST_R14,
    VM_REG_GUEST_R15,
    VM_REG_GUEST_CR0,
    VM_REG_GUEST_CR3,
    VM_REG_GUEST_CR4,
    VM_REG_GUEST_DR7,
    VM_REG_GUEST_RSP,
    VM_REG_GUEST_RIP,
    VM_REG_GUEST_RFLAGS,
    VM_REG_GUEST_ES,
    VM_REG_GUEST_CS,
    VM_REG_GUEST_SS,
    VM_REG_GUEST_DS,
    VM_REG_GUEST_FS,
    VM_REG_GUEST_GS,
    VM_REG_GUEST_LDTR,
    VM_REG_GUEST_TR,
    VM_REG_GUEST_IDTR,
    VM_REG_GUEST_GDTR,
    VM_REG_GUEST_EFER,
    VM_REG_GUEST_CR2,
    VM_REG_GUEST_PDPTE0,
    VM_REG_GUEST_PDPTE1,
    VM_REG_GUEST_PDPTE2,
    VM_REG_GUEST_PDPTE3,
    VM_REG_GUEST_INTR_SHADOW,
    VM_REG_GUEST_DR0,
    VM_REG_GUEST_DR1,
    VM_REG_GUEST_DR2,
    VM_REG_GUEST_DR3,
    VM_REG_GUEST_DR6,
    VM_REG_GUEST_ENTRY_INST_LENGTH,
    VM_REG_GUEST_XCR0,
    VM_REG_LAST,
}

/// VM exit codes.
///
/// Values start at 0 and follow declaration order, so deprecated entries are
/// kept as placeholders rather than removed.  The strum `FromRepr` derive
/// allows a variant to be recovered from its raw `i32` value.
#[repr(i32)]
#[allow(non_camel_case_types, unused)]
#[derive(FromRepr, Debug)]
pub enum vm_exitcode {
    VM_EXITCODE_INOUT,
    VM_EXITCODE_VMX,
    VM_EXITCODE_BOGUS,
    VM_EXITCODE_RDMSR,
    VM_EXITCODE_WRMSR,
    VM_EXITCODE_HLT,
    VM_EXITCODE_MTRAP,
    VM_EXITCODE_PAUSE,
    VM_EXITCODE_PAGING,
    VM_EXITCODE_INST_EMUL,
    VM_EXITCODE_RUN_STATE,
    VM_EXITCODE_MMIO_EMUL,
    /// Formerly `VM_EXITCODE_RUNBLOCK`
    VM_EXITCODE_DEPRECATED,
    VM_EXITCODE_IOAPIC_EOI,
    VM_EXITCODE_SUSPENDED,
    VM_EXITCODE_MMIO,
    VM_EXITCODE_TASK_SWITCH,
    VM_EXITCODE_MONITOR,
    VM_EXITCODE_MWAIT,
    VM_EXITCODE_SVM,
    /// Formerly `VM_EXITCODE_REQIDLE`
    /// Deprecated in v16
    VM_EXITCODE_DEPRECATED2,
    VM_EXITCODE_DEBUG,
    VM_EXITCODE_VMINSN,
    VM_EXITCODE_BPT,
    VM_EXITCODE_HT,
}

/// Kinds of vCPU reset, with explicit `u32` values.
#[repr(u32)]
#[allow(non_camel_case_types, unused)]
pub enum vcpu_reset_kind {
    VRK_RESET = 0,
    VRK_INIT = 1,
}

/// VM entry commands.
///
/// The first four variants take the values 0 through 3.
/// `VEC_FLAG_EXIT_CONSISTENT` occupies the top bit (bit 31) instead of
/// continuing that sequence.
#[repr(u32)]
#[allow(non_camel_case_types, unused)]
pub enum vm_entry_cmds {
    VEC_DEFAULT = 0,
    VEC_DISCARD_INSTR,
    VEC_FULFILL_MMIO,
    VEC_FULFILL_INOUT,
    VEC_FLAG_EXIT_CONSISTENT = 1 << 31,
}

/// VM capability types.
///
/// Values start at 0 and follow declaration order, so the order of variants
/// is significant.
#[repr(i32)]
#[allow(non_camel_case_types, unused)]
pub enum vm_cap_type {
    VM_CAP_HALT_EXIT,
    VM_CAP_MTRAP_EXIT,
    VM_CAP_PAUSE_EXIT,
    VM_CAP_ENABLE_INVPCID,
    VM_CAP_BPT_EXIT,
}

/// Ways in which a VM can be suspended.
///
/// Values start at 0 and follow declaration order.  The strum `FromRepr`
/// derive allows a variant to be recovered from its raw `u32` value.
#[repr(u32)]
#[allow(non_camel_case_types, unused)]
#[derive(FromRepr)]
pub enum vm_suspend_how {
    VM_SUSPEND_NONE,
    VM_SUSPEND_RESET,
    VM_SUSPEND_POWEROFF,
    VM_SUSPEND_HALT,
    VM_SUSPEND_TRIPLEFAULT,
}


================================================
FILE: crates/bhyve-api/sys/src/ioctls.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Define constants from machine/vmm_dev.h

// Each base packs two ASCII bytes into bits 8..24, leaving the low byte free
// for the per-command number which is OR'd in by the definitions below.
const VMMCTL_IOC_BASE: i32 = ((b'V' as i32) << 16) | ((b'M' as i32) << 8);
const VMM_IOC_BASE: i32 = ((b'v' as i32) << 16) | ((b'm' as i32) << 8);
const VMM_LOCK_IOC_BASE: i32 = ((b'v' as i32) << 16) | ((b'l' as i32) << 8);
const VMM_CPU_IOC_BASE: i32 = ((b'v' as i32) << 16) | ((b'p' as i32) << 8);

// Operations performed on the vmmctl device
// (command numbers 0x01-0x04 on VMMCTL_IOC_BASE)
pub const VMM_CREATE_VM: i32 = VMMCTL_IOC_BASE | 0x01;
pub const VMM_DESTROY_VM: i32 = VMMCTL_IOC_BASE | 0x02;
pub const VMM_VM_SUPPORTED: i32 = VMMCTL_IOC_BASE | 0x03;
pub const VMM_INTERFACE_VERSION: i32 = VMMCTL_IOC_BASE | 0x04;

// VMM memory reservoir operations
// (also on VMMCTL_IOC_BASE, with command numbers starting at 0x10)
pub const VMM_RESV_QUERY: i32 = VMMCTL_IOC_BASE | 0x10;
pub const VMM_RESV_SET_TARGET: i32 = VMMCTL_IOC_BASE | 0x11;

// Operations performed in the context of a given vCPU
// (command numbers 0x01-0x1d on VMM_CPU_IOC_BASE)
pub const VM_RUN: i32 = VMM_CPU_IOC_BASE | 0x01;
pub const VM_SET_REGISTER: i32 = VMM_CPU_IOC_BASE | 0x02;
pub const VM_GET_REGISTER: i32 = VMM_CPU_IOC_BASE | 0x03;
pub const VM_SET_SEGMENT_DESCRIPTOR: i32 = VMM_CPU_IOC_BASE | 0x04;
pub const VM_GET_SEGMENT_DESCRIPTOR: i32 = VMM_CPU_IOC_BASE | 0x05;
pub const VM_SET_REGISTER_SET: i32 = VMM_CPU_IOC_BASE | 0x06;
pub const VM_GET_REGISTER_SET: i32 = VMM_CPU_IOC_BASE | 0x07;
pub const VM_INJECT_EXCEPTION: i32 = VMM_CPU_IOC_BASE | 0x08;
pub const VM_SET_CAPABILITY: i32 = VMM_CPU_IOC_BASE | 0x09;
pub const VM_GET_CAPABILITY: i32 = VMM_CPU_IOC_BASE | 0x0a;
pub const VM_PPTDEV_MSI: i32 = VMM_CPU_IOC_BASE | 0x0b;
pub const VM_PPTDEV_MSIX: i32 = VMM_CPU_IOC_BASE | 0x0c;
pub const VM_SET_X2APIC_STATE: i32 = VMM_CPU_IOC_BASE | 0x0d;
pub const VM_GLA2GPA: i32 = VMM_CPU_IOC_BASE | 0x0e;
pub const VM_GLA2GPA_NOFAULT: i32 = VMM_CPU_IOC_BASE | 0x0f;
pub const VM_ACTIVATE_CPU: i32 = VMM_CPU_IOC_BASE | 0x10;
pub const VM_SET_INTINFO: i32 = VMM_CPU_IOC_BASE | 0x11;
pub const VM_GET_INTINFO: i32 = VMM_CPU_IOC_BASE | 0x12;
pub const VM_RESTART_INSTRUCTION: i32 = VMM_CPU_IOC_BASE | 0x13;
pub const VM_SET_KERNEMU_DEV: i32 = VMM_CPU_IOC_BASE | 0x14;
pub const VM_GET_KERNEMU_DEV: i32 = VMM_CPU_IOC_BASE | 0x15;
pub const VM_RESET_CPU: i32 = VMM_CPU_IOC_BASE | 0x16;
pub const VM_GET_RUN_STATE: i32 = VMM_CPU_IOC_BASE | 0x17;
pub const VM_SET_RUN_STATE: i32 = VMM_CPU_IOC_BASE | 0x18;
pub const VM_GET_FPU: i32 = VMM_CPU_IOC_BASE | 0x19;
pub const VM_SET_FPU: i32 = VMM_CPU_IOC_BASE | 0x1a;
pub const VM_GET_CPUID: i32 = VMM_CPU_IOC_BASE | 0x1b;
pub const VM_SET_CPUID: i32 = VMM_CPU_IOC_BASE | 0x1c;
pub const VM_LEGACY_CPUID: i32 = VMM_CPU_IOC_BASE | 0x1d;

// Operations requiring write-locking the VM
//
// Each request number is `VMM_LOCK_IOC_BASE` OR'd with a sequential index
// (0x01 through 0x0b).  `VM_WRLOCK_CYCLE` sits apart from that sequence at
// index 0xff.
pub const VM_REINIT: i32 = VMM_LOCK_IOC_BASE | 0x01;
pub const VM_BIND_PPTDEV: i32 = VMM_LOCK_IOC_BASE | 0x02;
pub const VM_UNBIND_PPTDEV: i32 = VMM_LOCK_IOC_BASE | 0x03;
pub const VM_MAP_PPTDEV_MMIO: i32 = VMM_LOCK_IOC_BASE | 0x04;
pub const VM_ALLOC_MEMSEG: i32 = VMM_LOCK_IOC_BASE | 0x05;
pub const VM_MMAP_MEMSEG: i32 = VMM_LOCK_IOC_BASE | 0x06;
pub const VM_PMTMR_LOCATE: i32 = VMM_LOCK_IOC_BASE | 0x07;
pub const VM_MUNMAP_MEMSEG: i32 = VMM_LOCK_IOC_BASE | 0x08;
pub const VM_UNMAP_PPTDEV_MMIO: i32 = VMM_LOCK_IOC_BASE | 0x09;
pub const VM_PAUSE: i32 = VMM_LOCK_IOC_BASE | 0x0a;
pub const VM_RESUME: i32 = VMM_LOCK_IOC_BASE | 0x0b;

pub const VM_WRLOCK_CYCLE: i32 = VMM_LOCK_IOC_BASE | 0xff;

// All other ioctls
//
// Each request number is `VMM_IOC_BASE` OR'd with a per-operation index.
// The indices run from 0x01 to 0x28, except that 0x1f is not assigned by any
// constant in this list.  `VM_DEVMEM_GETOFFSET` sits apart at index 0xff.
pub const VM_GET_GPA_PMAP: i32 = VMM_IOC_BASE | 0x01;
pub const VM_GET_MEMSEG: i32 = VMM_IOC_BASE | 0x02;
pub const VM_MMAP_GETNEXT: i32 = VMM_IOC_BASE | 0x03;

pub const VM_LAPIC_IRQ: i32 = VMM_IOC_BASE | 0x04;
pub const VM_LAPIC_LOCAL_IRQ: i32 = VMM_IOC_BASE | 0x05;
pub const VM_LAPIC_MSI: i32 = VMM_IOC_BASE | 0x06;

pub const VM_IOAPIC_ASSERT_IRQ: i32 = VMM_IOC_BASE | 0x07;
pub const VM_IOAPIC_DEASSERT_IRQ: i32 = VMM_IOC_BASE | 0x08;
pub const VM_IOAPIC_PULSE_IRQ: i32 = VMM_IOC_BASE | 0x09;

pub const VM_ISA_ASSERT_IRQ: i32 = VMM_IOC_BASE | 0x0a;
pub const VM_ISA_DEASSERT_IRQ: i32 = VMM_IOC_BASE | 0x0b;
pub const VM_ISA_PULSE_IRQ: i32 = VMM_IOC_BASE | 0x0c;
pub const VM_ISA_SET_IRQ_TRIGGER: i32 = VMM_IOC_BASE | 0x0d;

pub const VM_RTC_WRITE: i32 = VMM_IOC_BASE | 0x0e;
pub const VM_RTC_READ: i32 = VMM_IOC_BASE | 0x0f;
pub const VM_RTC_SETTIME: i32 = VMM_IOC_BASE | 0x10;
pub const VM_RTC_GETTIME: i32 = VMM_IOC_BASE | 0x11;

pub const VM_SUSPEND: i32 = VMM_IOC_BASE | 0x12;

pub const VM_IOAPIC_PINCOUNT: i32 = VMM_IOC_BASE | 0x13;
pub const VM_GET_PPTDEV_LIMITS: i32 = VMM_IOC_BASE | 0x14;
pub const VM_GET_HPET_CAPABILITIES: i32 = VMM_IOC_BASE | 0x15;

pub const VM_STATS_IOC: i32 = VMM_IOC_BASE | 0x16;
pub const VM_STAT_DESC: i32 = VMM_IOC_BASE | 0x17;

pub const VM_INJECT_NMI: i32 = VMM_IOC_BASE | 0x18;
pub const VM_GET_X2APIC_STATE: i32 = VMM_IOC_BASE | 0x19;
pub const VM_SET_TOPOLOGY: i32 = VMM_IOC_BASE | 0x1a;
pub const VM_GET_TOPOLOGY: i32 = VMM_IOC_BASE | 0x1b;
pub const VM_GET_CPUS: i32 = VMM_IOC_BASE | 0x1c;
pub const VM_SUSPEND_CPU: i32 = VMM_IOC_BASE | 0x1d;
pub const VM_RESUME_CPU: i32 = VMM_IOC_BASE | 0x1e;
pub const VM_TRACK_DIRTY_PAGES: i32 = VMM_IOC_BASE | 0x20;
pub const VM_DESC_FPU_AREA: i32 = VMM_IOC_BASE | 0x21;
pub const VM_DATA_READ: i32 = VMM_IOC_BASE | 0x22;
pub const VM_DATA_WRITE: i32 = VMM_IOC_BASE | 0x23;
pub const VM_SET_AUTODESTRUCT: i32 = VMM_IOC_BASE | 0x24;
pub const VM_DESTROY_SELF: i32 = VMM_IOC_BASE | 0x25;
pub const VM_DESTROY_PENDING: i32 = VMM_IOC_BASE | 0x26;
pub const VM_VCPU_BARRIER: i32 = VMM_IOC_BASE | 0x27;
pub const VM_NPT_OPERATION: i32 = VMM_IOC_BASE | 0x28;

pub const VM_DEVMEM_GETOFFSET: i32 = VMM_IOC_BASE | 0xff;


================================================
FILE: crates/bhyve-api/sys/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

mod enums;
pub mod ioctls;
mod structs;
mod vmm_data;

pub use enums::*;
pub use ioctls::*;
pub use structs::*;
pub use vmm_data::*;

/// NOTE(review): presumably the maximum number of vCPUs per VM supported by
/// the interface — confirm against the kernel header.
pub const VM_MAXCPU: usize = 32;

/// This is the VMM interface version which bhyve_api expects to operate
/// against.  All constants and structs defined by the crate are done so in
/// terms of that specific version.
pub const VMM_CURRENT_INTERFACE_VERSION: u32 = 18;


================================================
FILE: crates/bhyve-api/sys/src/structs.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::io::{Error, ErrorKind, Result};
use std::os::raw::{c_int, c_uint, c_void};

use libc::size_t;

// Bit layout of the `u32` segment access value.  Bit ranges without a named
// constant are described by the plain comments in between.

// 3:0 - segment type
/// Descriptor type flag (0 = system, 1 = code/data)
pub const SEG_ACCESS_S: u32 = 1 << 4;
// 6:5 - DPL
/// Segment present
pub const SEG_ACCESS_P: u32 = 1 << 7;
// 11:8 reserved
/// Available for use by system software
pub const SEG_ACCESS_AVAIL: u32 = 1 << 12;
/// `L` flag (bit 13)
pub const SEG_ACCESS_L: u32 = 1 << 13;
/// `D/B` flag (bit 14)
pub const SEG_ACCESS_DB: u32 = 1 << 14;
/// `G` flag (bit 15)
pub const SEG_ACCESS_G: u32 = 1 << 15;
/// "Unusable" flag (bit 16)
pub const SEG_ACCESS_UNUSABLE: u32 = 1 << 16;

/// Segment descriptor in C layout, embedded in `vm_seg_desc`.
///
/// NOTE(review): `access` presumably carries the `SEG_ACCESS_*` bits defined
/// in this file — confirm against the kernel header.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct seg_desc {
    pub base: u64,
    pub limit: u32,
    pub access: u32,
}

/// Bit 0 of a `u8` flags value.
///
/// NOTE(review): presumably applies to `vm_inout.flags` and marks an IN (as
/// opposed to OUT) operation — confirm.
pub const INOUT_IN: u8 = 1 << 0;

/// Port I/O payload in C layout, used as the `inout` member of both
/// `vm_exit_payload` and `vm_entry_payload`.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_inout {
    pub eax: u32,
    pub port: u16,
    pub bytes: u8,
    pub flags: u8,

    // fields used only by in-kernel emulation
    // (kept private, so code outside this module cannot read or set them)
    addrsize: u8,
    segment: u8,
}

/// MMIO payload in C layout, used as the `mmio` member of both
/// `vm_exit_payload` and `vm_entry_payload`.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_mmio {
    pub bytes: u8,
    pub read: u8,
    /// Explicit padding: together with the two `u8` fields above it fills
    /// the 8 bytes preceding `gpa`.
    pub _pad: [u16; 3],
    pub gpa: u64,
    pub data: u64,
}

/// MSR access payload in C layout, used as the `msr` member of
/// `vm_exit_payload`.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_rwmsr {
    pub code: u32,
    pub wval: u64,
}

/// VM exit record in C layout: exit code, instruction length, `rip`, and a
/// payload union.
///
/// NOTE(review): which member of `u` is meaningful presumably depends on
/// `exitcode` — confirm before reading the union.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_exit {
    pub exitcode: c_int,
    pub inst_length: c_int,
    pub rip: u64,
    pub u: vm_exit_payload,
}

/// VM entry request in C layout.
///
/// NOTE(review): presumably the argument of the `VM_RUN` ioctl, with
/// `exit_data` pointing at storage for the resulting `vm_exit` — confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_entry {
    pub cpuid: c_int,
    pub cmd: c_uint,
    /// Untyped raw pointer; this struct neither owns nor frees the pointee.
    pub exit_data: *mut c_void,
    pub u: vm_entry_payload,
}

/// Exit-specific data overlaid in a single C-layout union; it is the `u`
/// field of `vm_exit`.
///
/// The private `empty` member exists so the whole union can be initialized to
/// zero (see the `Default` impl).
#[repr(C)]
#[derive(Copy, Clone)]
pub union vm_exit_payload {
    pub inout: vm_inout,
    pub mmio: vm_mmio,
    pub msr: vm_rwmsr,
    pub inst_emul: vm_inst_emul,
    pub suspend: vm_exit_suspend,
    pub paging: vm_paging,
    pub vmx: vm_exit_vmx,
    pub svm: vm_exit_svm,
    // sized to zero entire union
    empty: [u64; 6],
}

impl Default for vm_exit_payload {
    /// Builds a payload through the `empty` member with all six words zero.
    fn default() -> Self {
        let zeroed = [0u64; 6];
        vm_exit_payload { empty: zeroed }
    }
}

/// Entry-specific data overlaid in a single C-layout union; it is the `u`
/// field of `vm_entry`.
///
/// The private `empty` member exists so the whole union can be initialized to
/// zero (see the `Default` impl).
#[repr(C)]
#[derive(Copy, Clone)]
pub union vm_entry_payload {
    pub inout: vm_inout,
    pub mmio: vm_mmio,
    // sized to zero entire union
    empty: [u64; 3],
}

impl Default for vm_entry_payload {
    /// Builds a payload through the `empty` member with all three words zero.
    fn default() -> Self {
        let zeroed = [0u64; 3];
        vm_entry_payload { empty: zeroed }
    }
}

/// C-layout payload used as the `vmx` member of `vm_exit_payload`.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_exit_vmx {
    pub status: c_int,
    pub exit_reason: u32,
    pub exit_qualification: u64,
    pub inst_type: c_int,
    pub inst_error: c_int,
}

/// C-layout payload used as the `svm` member of `vm_exit_payload`.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_exit_svm {
    pub exitcode: u64,
    pub exitinfo1: u64,
    pub exitinfo2: u64,
}

/// C-layout MSR record with the same fields as `vm_rwmsr`.
///
/// Note that `vm_exit_payload` uses `vm_rwmsr`, not this type, for its `msr`
/// member.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_exit_msr {
    pub code: u32,
    pub wval: u64,
}

/// C-layout payload used as the `suspend` member of `vm_exit_payload`.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_exit_suspend {
    pub how: c_int,
    /// Source vCPU ID, if any.
    /// (-1 for non-vCPU-specific suspend conditions)
    pub source: c_int,
    /// When suspend condition was raised, measured in nanoseconds since the VM
    /// boot time.
    pub when: u64,
}

/// C-layout payload used as the `inst_emul` member of `vm_exit_payload`.
///
/// NOTE(review): `num_valid` presumably counts how many leading bytes of
/// `inst` hold instruction bytes — confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_inst_emul {
    pub inst: [u8; 15],
    pub num_valid: u8,
}

/// C-layout payload used as the `paging` member of `vm_exit_payload`.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_paging {
    pub gpa: u64,
    pub fault_type: c_int,
}

/// Memory mapping description in C layout.
///
/// NOTE(review): `flags` presumably takes the `VM_MEMMAP_F_*` values defined
/// right after this struct — confirm against the kernel header.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_memmap {
    pub gpa: u64,
    pub segid: c_int,
    pub segoff: i64,
    pub len: size_t,
    pub prot: c_int,
    pub flags: c_int,
}

/// Flag value 0x01.
pub const VM_MEMMAP_F_WIRED: c_int = 0x01;
/// Flag value 0x02.
#[allow(unused)]
pub const VM_MEMMAP_F_IOMMU: c_int = 0x02;

/// Memory segment description in C layout: segment id, length and a
/// fixed-size name buffer of `VM_MAX_SEG_NAMELEN` bytes.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_memseg {
    pub segid: c_int,
    pub len: size_t,
    pub name: [u8; VM_MAX_SEG_NAMELEN],
}

/// Segment id / offset pair in C layout.
///
/// NOTE(review): presumably the argument of the `VM_DEVMEM_GETOFFSET` ioctl —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_devmem_offset {
    pub segid: c_int,
    pub offset: i64,
}

/// Single register access in C layout: vCPU id, register number and value.
///
/// NOTE(review): presumably the argument of `VM_SET_REGISTER` /
/// `VM_GET_REGISTER` — confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_register {
    pub cpuid: c_int,
    pub regnum: c_int,
    pub regval: u64,
}

/// Segment descriptor access in C layout: vCPU id, register number and the
/// `seg_desc` contents.
///
/// NOTE(review): presumably the argument of `VM_SET_SEGMENT_DESCRIPTOR` /
/// `VM_GET_SEGMENT_DESCRIPTOR` — confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_seg_desc {
    pub cpuid: c_int,
    pub regnum: c_int,
    pub desc: seg_desc,
}

/// Per-vCPU interrupt info in C layout: a vCPU id and two 64-bit info words.
///
/// NOTE(review): presumably the argument of `VM_SET_INTINFO` /
/// `VM_GET_INTINFO` — confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_intinfo {
    pub vcpuid: c_int,
    pub info1: u64,
    pub info2: u64,
}

/// Exception description in C layout.
///
/// NOTE(review): presumably the argument of `VM_INJECT_EXCEPTION`, with
/// `error_code_valid` and `restart_instruction` acting as C-style booleans —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_exception {
    pub cpuid: c_int,
    pub vector: c_int,
    pub error_code: c_uint,
    pub error_code_valid: c_int,
    pub restart_instruction: c_int,
}

/// MSI message/address pair in C layout.
///
/// NOTE(review): presumably the argument of `VM_LAPIC_MSI` — confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_lapic_msi {
    pub msg: u64,
    pub addr: u64,
}

/// vCPU id / vector pair in C layout.
///
/// NOTE(review): presumably the argument of `VM_LAPIC_IRQ` and
/// `VM_LAPIC_LOCAL_IRQ` — confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_lapic_irq {
    pub cpuid: c_int,
    pub vector: c_int,
}

/// Single IRQ number in C layout.
///
/// NOTE(review): presumably the argument of the `VM_IOAPIC_*_IRQ` ioctls —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_ioapic_irq {
    pub irq: c_int,
}

/// Pair of IRQ numbers in C layout, one for the ATPIC and one for the IOAPIC.
///
/// NOTE(review): presumably the argument of the `VM_ISA_*_IRQ` ioctls —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_isa_irq {
    pub atpic_irq: c_int,
    pub ioapic_irq: c_int,
}

/// ATPIC IRQ number together with its trigger mode, in C layout.
///
/// NOTE(review): presumably the argument of `VM_ISA_SET_IRQ_TRIGGER` —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_isa_irq_trigger {
    pub atpic_irq: c_int,
    /// Trigger mode: 0 - Edge triggered, Non-0 - Level triggered
    pub trigger: c_int,
}

/// Offset / byte value pair in C layout.
///
/// NOTE(review): presumably the argument of `VM_RTC_WRITE` / `VM_RTC_READ` —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_rtc_data {
    pub offset: i32,
    pub value: u8,
}

/// Capability access in C layout: vCPU id, capability type and value, plus an
/// `allcpus` field.
///
/// NOTE(review): presumably the argument of `VM_SET_CAPABILITY` /
/// `VM_GET_CAPABILITY` — confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_capability {
    pub cpuid: c_int,
    pub captype: c_int,
    pub capval: c_int,
    pub allcpus: c_int,
}

/// Single vCPU id in C layout.
///
/// NOTE(review): presumably the argument of `VM_INJECT_NMI` — confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_nmi {
    pub cpuid: c_int,
}

/// Suspend request in C layout.
///
/// NOTE(review): presumably the argument of `VM_SUSPEND` — confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_suspend {
    /// Acceptable values defined by `vm_suspend_how`
    pub how: u32,
    pub source: c_int,
}

// bit definitions for `vm_reinit.flags`
/// Bit 0 of `vm_reinit.flags`.
pub const VM_REINIT_F_FORCE_SUSPEND: u64 = 1 << 0;

/// Reinitialization request in C layout, carrying only a flags word.
///
/// NOTE(review): presumably the argument of `VM_REINIT` — confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_reinit {
    pub flags: u64,
}

/// vCPU reset request in C layout: the target vCPU id and a reset kind.
///
/// NOTE(review): presumably the argument of `VM_RESET_CPU` — confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_vcpu_reset {
    pub vcpuid: c_int,
    // kind values defined in vcpu_reset_kind
    pub kind: u32,
}

// bit definitions for `vm_run_state.state`
//
// `VRS_HALT` is the value with no bits set; the remaining constants are
// individual bits (0, 1, 14 and 15).
pub const VRS_HALT: u32 = 0;
pub const VRS_INIT: u32 = 1 << 0;
pub const VRS_RUN: u32 = 1 << 1;
pub const VRS_PEND_INIT: u32 = 1 << 14;
pub const VRS_PEND_SIPI: u32 = 1 << 15;

/// Per-vCPU run state in C layout.
///
/// NOTE(review): presumably the argument of `VM_GET_RUN_STATE` /
/// `VM_SET_RUN_STATE` — confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_run_state {
    pub vcpuid: c_int,
    /// Combination of the `VRS_*` values.
    pub state: u32,
    pub sipi_vector: u8,
    /// Explicit padding after the single `u8` field.
    pub _pad: [u8; 3],
}

/// FPU state transfer description in C layout: vCPU id plus a raw buffer
/// pointer and a length.
///
/// NOTE(review): presumably the argument of `VM_GET_FPU` / `VM_SET_FPU`, with
/// `len` giving the size of `buf` in bytes — confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_fpu_state {
    pub vcpuid: c_int,
    /// Untyped raw pointer; this struct neither owns nor frees the pointee.
    pub buf: *mut c_void,
    pub len: u64,
}

/// One entry of an FPU area description in C layout; `vm_fpu_desc` points at
/// an array of these through `vfd_entry_data`.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_fpu_desc_entry {
    pub vfde_feature: u64,
    pub vfde_size: u32,
    pub vfde_off: u32,
}

/// FPU area description request in C layout.
///
/// NOTE(review): presumably the argument of `VM_DESC_FPU_AREA`, with
/// `vfd_num_entries` counting the entries behind `vfd_entry_data` — confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_fpu_desc {
    pub vfd_entry_data: *mut vm_fpu_desc_entry,
    pub vfd_req_size: u64,
    pub vfd_num_entries: u32,
}
impl Default for vm_fpu_desc {
    /// Yields a description with a null entry pointer and zeroed counters.
    fn default() -> Self {
        let no_entries: *mut vm_fpu_desc_entry = std::ptr::null_mut();
        vm_fpu_desc {
            vfd_num_entries: 0,
            vfd_req_size: 0,
            vfd_entry_data: no_entries,
        }
    }
}

/// Dirty page tracking request in C layout: a starting guest-physical
/// address, a length, and a raw output pointer.
///
/// NOTE(review): presumably the argument of `VM_TRACK_DIRTY_PAGES` — confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vmm_dirty_tracker {
    pub vdt_start_gpa: u64,
    pub vdt_len: size_t,
    /// Untyped raw pointer; this struct neither owns nor frees the pointee.
    pub vdt_pfns: *mut c_void,
}

// Definitions for vm_data_xfer.vdx_flags
pub const VDX_FLAG_READ_COPYIN: u32 = 1 << 0;
pub const VDX_FLAG_WRITE_COPYOUT: u32 = 1 << 1;

// Current max size for vdx_data
pub const VM_DATA_XFER_LIMIT: u32 = 8192;

/// Description of a vmm-data transfer in C layout.
///
/// `vdx_flags` takes the `VDX_FLAG_*` bits and `vdx_data` is a raw pointer to
/// the transfer buffer, which this struct neither owns nor frees.
///
/// NOTE(review): presumably the argument of `VM_DATA_READ` / `VM_DATA_WRITE`
/// — confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_data_xfer {
    pub vdx_vcpuid: c_int,
    pub vdx_class: u16,
    pub vdx_version: u16,
    pub vdx_flags: u32,
    pub vdx_len: u32,
    pub vdx_result_len: u32,
    pub vdx_data: *mut c_void,
}
impl Default for vm_data_xfer {
    /// Yields a transfer with `vdx_vcpuid` set to -1, a null data pointer,
    /// and every other field zero.
    fn default() -> Self {
        let no_buffer: *mut c_void = std::ptr::null_mut();
        Self {
            vdx_data: no_buffer,
            vdx_result_len: 0,
            vdx_len: 0,
            vdx_flags: 0,
            vdx_version: 0,
            vdx_class: 0,
            vdx_vcpuid: -1,
        }
    }
}

/// Use index (ecx) input value when matching entry
pub const VCE_FLAG_MATCH_INDEX: u32 = 1 << 0;

/// A single cpuid entry in C layout: the function/index to match, the
/// `VCE_FLAG_*` bits, and the four register values.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vcpu_cpuid_entry {
    pub vce_function: u32,
    pub vce_index: u32,
    pub vce_flags: u32,
    pub vce_eax: u32,
    pub vce_ebx: u32,
    pub vce_ecx: u32,
    pub vce_edx: u32,
    pub _pad: u32,
}
impl vcpu_cpuid_entry {
    /// Whether `VCE_FLAG_MATCH_INDEX` is set in `vce_flags`.
    fn match_idx(&self) -> bool {
        (self.vce_flags & VCE_FLAG_MATCH_INDEX) != 0
    }
    /// Comparator which orders entries for proper cpuid evaluation by the
    /// kernel VMM.
    ///
    /// Bhyve expects cpuid entries sorted by function and then index, from
    /// least to greatest.  Within one function, entries which must match on
    /// index sort before (less-than) those that do not, so that the former
    /// take precedence in matching.
    ///
    /// Pass this function to `sort_by` to order a list of entries before
    /// loading them into the kernel VMM:
    ///
    /// ```ignore
    /// let mut entries: Vec<vcpu_cpuid_entry> = vec![
    ///     // entries loaded here
    /// ];
    /// entries.sort_by(vcpu_cpuid_entry::eval_sort);
    /// let config = vm_vcpu_cpuid_config {
    ///     vvcc_vcpuid: 0,
    ///     vvcc_flags: 0,
    ///     vvcc_nent: entries.len() as u32,
    ///     vvcc_entries: entries.as_mut_ptr().cast(),
    ///     ..Default::default()
    /// };
    /// // perform ioctl(VM_SET_CPUID, &config) ...
    /// ```
    pub fn eval_sort(a: &Self, b: &Self) -> std::cmp::Ordering {
        a.vce_function
            .cmp(&b.vce_function)
            // `false < true`, so comparing b against a puts index-matching
            // entries first within a function.
            .then_with(|| b.match_idx().cmp(&a.match_idx()))
            .then_with(|| a.vce_index.cmp(&b.vce_index))
    }
}

/// Use legacy hard-coded cpuid masking tables applied to the host CPU
pub const VCC_FLAG_LEGACY_HANDLING: u32 = 1 << 0;

/// Emulate Intel-style fallback behavior (emit highest "standard" entry) if the
/// queried function/index do not match.  If not set, emulate AMD-style, where
/// all zeroes are returned in such cases.
pub const VCC_FLAG_INTEL_FALLBACK: u32 = 1 << 1;

/// Per-vCPU cpuid configuration in C layout.
///
/// `vvcc_entries` is an untyped raw pointer which this struct neither owns
/// nor frees.
///
/// NOTE(review): presumably `vvcc_flags` takes the `VCC_FLAG_*` bits and
/// `vvcc_entries` points at `vvcc_nent` `vcpu_cpuid_entry` records — confirm
/// against the kernel header.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_vcpu_cpuid_config {
    pub vvcc_vcpuid: c_int,
    pub vvcc_flags: u32,
    pub vvcc_nent: u32,
    pub _pad: u32,
    pub vvcc_entries: *mut c_void,
}
impl Default for vm_vcpu_cpuid_config {
    /// Yields a configuration for vCPU 0 with no flags, no entries and a null
    /// entry pointer.
    fn default() -> Self {
        let no_entries: *mut c_void = std::ptr::null_mut();
        vm_vcpu_cpuid_config {
            vvcc_entries: no_entries,
            _pad: 0,
            vvcc_nent: 0,
            vvcc_flags: 0,
            vvcc_vcpuid: 0,
        }
    }
}

/// vCPU id plus the four cpuid registers, in C layout.
///
/// NOTE(review): presumably the argument of `VM_LEGACY_CPUID` — confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vm_legacy_cpuid {
    pub vlc_vcpuid: c_int,
    pub vlc_eax: u32,
    pub vlc_ebx: u32,
    pub vlc_ecx: u32,
    pub vlc_edx: u32,
}

pub const VM_MAX_NAMELEN: usize = 128;
pub const VM_MAX_SEG_NAMELEN: usize = 128;

/// Copy VM name into array appropriately sized for create/destroy request.
/// Advanced checks are left to the kernel logic consuming that value.
///
/// Bytes of the array beyond the end of `value` are left as zero.
///
/// # Errors
///
/// Fails with `ErrorKind::InvalidInput` when `value` is longer than
/// `VM_MAX_NAMELEN` bytes.
fn validate_name(value: &[u8]) -> Result<[u8; VM_MAX_NAMELEN]> {
    let name_len = value.len();

    if name_len <= VM_MAX_NAMELEN {
        let mut padded = [0u8; VM_MAX_NAMELEN];
        padded[..name_len].copy_from_slice(value);
        Ok(padded)
    } else {
        Err(Error::new(
            ErrorKind::InvalidInput,
            "name length exceeds VM_MAX_NAMELEN",
        ))
    }
}

/// VM creation request in C layout: a fixed-size name buffer and a flags
/// word.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_create_req {
    pub name: [u8; VM_MAX_NAMELEN],
    pub flags: u64,
}
impl Default for vm_create_req {
    /// Yields a request with an all-zero name and no flags set.
    fn default() -> Self {
        vm_create_req { flags: 0, name: [0u8; VM_MAX_NAMELEN] }
    }
}
impl vm_create_req {
    /// Builds a request for `name` with no flags set.
    ///
    /// # Errors
    ///
    /// Propagates the error from `validate_name` when `name` does not fit in
    /// the name buffer.
    pub fn new(name: &[u8]) -> Result<Self> {
        validate_name(name).map(|name| Self { name, flags: 0 })
    }
}

// Flag values for use in `vm_create_req`:

/// Allocate guest memory segments from existing reservoir capacity, rather than
/// attempting to create transient allocations.
pub const VCF_RESERVOIR_MEM: u64 = 1;

/// Enable dirty page tracking for the guest.
pub const VCF_TRACK_DIRTY: u64 = 1 << 1;

/// VM destruction request in C layout, carrying only a fixed-size name
/// buffer.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_destroy_req {
    pub name: [u8; VM_MAX_NAMELEN],
}
impl Default for vm_destroy_req {
    /// Yields a request with an all-zero name.
    fn default() -> Self {
        vm_destroy_req { name: [0u8; VM_MAX_NAMELEN] }
    }
}
impl vm_destroy_req {
    /// Builds a request for `name`.
    ///
    /// # Errors
    ///
    /// Propagates the error from `validate_name` when `name` does not fit in
    /// the name buffer.
    pub fn new(name: &[u8]) -> Result<Self> {
        validate_name(name).map(|name| Self { name })
    }
}

/// Four reservoir size values in C layout.
///
/// NOTE(review): presumably filled in by the `VMM_RESV_QUERY` ioctl, with all
/// sizes in bytes — confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vmm_resv_query {
    pub vrq_free_sz: size_t,
    pub vrq_alloc_sz: size_t,
    pub vrq_alloc_transient_sz: size_t,
    pub vrq_limit: size_t,
}

/// Reservoir resize request in C layout.
///
/// NOTE(review): presumably the argument of `VMM_RESV_SET_TARGET` — confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vmm_resv_target {
    /// Target size for VMM reservoir
    pub vrt_target_sz: size_t,

    /// Change of reservoir size to meet target will be done in multiple steps
    /// of chunk size (or smaller)
    pub vrt_chunk_sz: size_t,

    /// Resultant size of reservoir after operation.  Should match target size,
    /// except when interrupted.
    pub vrt_result_sz: size_t,
}

/// Nested page table operation request in C layout: a guest-physical range
/// (`vno_gpa`, `vno_len`), a raw bitmap pointer, and an operation word.
///
/// `vno_bitmap` is a raw pointer which this struct neither owns nor frees.
///
/// NOTE(review): presumably the argument of `VM_NPT_OPERATION`, with
/// `vno_operation` combining a `VNO_OP_*` code and `VNO_FLAG_*` bits —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vm_npt_operation {
    pub vno_gpa: u64,
    pub vno_len: u64,
    pub vno_bitmap: *mut u8,
    pub vno_operation: u32,
}
impl Default for vm_npt_operation {
    /// Yields a request with a null bitmap pointer and every other field
    /// zero.
    fn default() -> Self {
        let no_bitmap: *mut u8 = std::ptr::null_mut();
        vm_npt_operation {
            vno_operation: 0,
            vno_bitmap: no_bitmap,
            vno_len: 0,
            vno_gpa: 0,
        }
    }
}

// Operation & flag definitions for `vm_npt_operation.vno_operation`
//
// The `VNO_OP_*` values are small operation codes, while the `VNO_FLAG_*`
// values are single bits (30 and 31) at the top of the same `u32`.

pub const VNO_OP_RESET_DIRTY: u32 = 0x1;
pub const VNO_OP_SET_DIRTY: u32 = 0x2;
pub const VNO_OP_GET_DIRTY: u32 = 0x3;
pub const VNO_OP_GET_TRACK_DIRTY: u32 = 0x20;
pub const VNO_OP_EN_TRACK_DIRTY: u32 = 0x21;
pub const VNO_OP_DIS_TRACK_DIRTY: u32 = 0x22;
pub const VNO_FLAG_BITMAP_IN: u32 = 1 << 30;
pub const VNO_FLAG_BITMAP_OUT: u32 = 1 << 31;


================================================
FILE: crates/bhyve-api/sys/src/vmm_data.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![allow(non_camel_case_types)]
// VMM Data Classes
//
// Class identifiers are `u16`, matching the `vdx_class` field of
// `vm_data_xfer` and the `vve_class` field of `vdi_version_entry_v1`.

pub const VDC_VERSION: u16 = 1;

// Classes bearing per-vCPU data
//
// Note that `VDC_PMU_AMD` (14) is numbered after the system-wide classes
// below even though it is grouped here.

pub const VDC_REGISTER: u16 = 2;
pub const VDC_MSR: u16 = 3;
pub const VDC_FPU: u16 = 4;
pub const VDC_LAPIC: u16 = 5;
pub const VDC_VMM_ARCH: u16 = 6;
pub const VDC_PMU_AMD: u16 = 14;

// Classes for system-wide device state

pub const VDC_IOAPIC: u16 = 7;
pub const VDC_ATPIT: u16 = 8;
pub const VDC_ATPIC: u16 = 9;
pub const VDC_HPET: u16 = 10;
pub const VDC_PM_TIMER: u16 = 11;
pub const VDC_RTC: u16 = 12;
pub const VDC_VMM_TIME: u16 = 13;

/// Version information for one data class, in C layout.
///
/// NOTE(review): presumably the record format of the `VDC_VERSION` class —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_version_entry_v1 {
    pub vve_class: u16,
    pub vve_version: u16,
    pub vve_len_expect: u16,
    pub vve_len_per_item: u16,
}

/// Identifier/value pair in C layout, with explicit padding between the
/// 32-bit identifier and the 64-bit value.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_field_entry_v1 {
    pub vfe_ident: u32,
    pub _pad: u32,
    pub vfe_value: u64,
}
impl vdi_field_entry_v1 {
    // Consumers should not have to care about `_pad`, so this constructor
    // zeroes it on their behalf.

    /// Create a `vdi_field_entry_v1` from `ident` and `value`
    pub const fn new(ident: u32, value: u64) -> Self {
        vdi_field_entry_v1 { _pad: 0, vfe_value: value, vfe_ident: ident }
    }
}

/// Local APIC register values in C layout; embedded in `vdi_lapic_v1` as
/// `vl_lapic`.
///
/// `vlp_isr`, `vlp_tmr` and `vlp_irr` are each eight 32-bit words, and
/// `vlp_icr` is the only 64-bit field.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_lapic_page_v1 {
    pub vlp_id: u32,
    pub vlp_version: u32,
    pub vlp_tpr: u32,
    pub vlp_apr: u32,
    pub vlp_ldr: u32,
    pub vlp_dfr: u32,
    pub vlp_svr: u32,
    pub vlp_isr: [u32; 8],
    pub vlp_tmr: [u32; 8],
    pub vlp_irr: [u32; 8],
    pub vlp_esr: u32,
    pub vlp_lvt_cmci: u32,
    pub vlp_icr: u64,
    pub vlp_lvt_timer: u32,
    pub vlp_lvt_thermal: u32,
    pub vlp_lvt_pcint: u32,
    pub vlp_lvt_lint0: u32,
    pub vlp_lvt_lint1: u32,
    pub vlp_lvt_error: u32,
    pub vlp_icr_timer: u32,
    pub vlp_dcr_timer: u32,
}

/// Local APIC state in C layout: the register page plus additional state
/// kept outside of it.
///
/// NOTE(review): presumably the record format of the `VDC_LAPIC` class —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_lapic_v1 {
    pub vl_lapic: vdi_lapic_page_v1,
    pub vl_msr_apicbase: u64,
    pub vl_timer_target: i64,
    pub vl_esr_pending: u32,
}

/// IOAPIC state in C layout, with per-pin arrays of 32 entries each.
///
/// NOTE(review): presumably the record format of the `VDC_IOAPIC` class —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_ioapic_v1 {
    pub vi_pin_reg: [u64; 32],
    pub vi_pin_level: [u32; 32],
    pub vi_id: u32,
    pub vi_reg_sel: u32,
}

/// State of a single ATPIT channel in C layout; `vdi_atpit_v1` holds three
/// of these.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_atpit_channel_v1 {
    pub vac_initial: u16,
    pub vac_reg_cr: u16,
    pub vac_reg_ol: u16,
    pub vac_reg_status: u8,
    pub vac_mode: u8,

    /// `vac_status` bits:
    /// - 0b00001 status latched
    /// - 0b00010 output latched
    /// - 0b00100 control register sel
    /// - 0b01000 output latch sel
    /// - 0b10000 free-running timer
    pub vac_status: u8,

    pub vac_time_target: i64,
}

/// ATPIT state in C layout: an array of three channels.
///
/// NOTE(review): presumably the record format of the `VDC_ATPIT` class —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_atpit_v1 {
    pub va_channel: [vdi_atpit_channel_v1; 3],
}

/// State of a single ATPIC chip in C layout; `vdi_atpic_v1` holds two of
/// these.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_atpic_chip_v1 {
    pub vac_icw_state: u8,

    /// vac_status bits:
    /// - 0b00000001 ready
    /// - 0b00000010 auto EOI
    /// - 0b00000100 poll
    /// - 0b00001000 rotate
    /// - 0b00010000 special full nested
    /// - 0b00100000 read isr next
    /// - 0b01000000 intr raised
    /// - 0b10000000 special mask mode
    pub vac_status: u8,
    pub vac_reg_irr: u8,
    pub vac_reg_isr: u8,
    pub vac_reg_imr: u8,
    pub vac_irq_base: u8,
    pub vac_lowprio: u8,
    pub vac_elc: u8,
    pub vac_level: [u32; 8],
}

/// ATPIC state in C layout: an array of two chips.
///
/// NOTE(review): presumably the record format of the `VDC_ATPIC` class —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_atpic_v1 {
    pub va_chip: [vdi_atpic_chip_v1; 2],
}

/// State of a single HPET timer in C layout; `vdi_hpet_v1` holds eight of
/// these.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_hpet_timer_v1 {
    pub vht_config: u64,
    pub vht_msi: u64,
    pub vht_comp_val: u32,
    pub vht_comp_rate: u32,
    pub vht_time_target: i64,
}

/// HPET state in C layout: device-wide fields followed by an array of eight
/// timers.
///
/// NOTE(review): presumably the record format of the `VDC_HPET` class —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_hpet_v1 {
    pub vh_config: u64,
    pub vh_isr: u64,
    pub vh_count_base: u32,
    pub vh_time_base: i64,

    pub vh_timers: [vdi_hpet_timer_v1; 8],
}

/// PM timer state in C layout.
///
/// NOTE(review): presumably the record format of the `VDC_PM_TIMER` class —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_pm_timer_v1 {
    pub vpt_time_base: i64,
    /// During vmm-data reads, `vpt_ioport` carries the IO-port at which the PM
    /// timer is attached.  The field is ignored for vmm-data writes.
    pub vpt_ioport: u16,
}

/// Version 1 RTC state in C layout: 128 content bytes, an address byte, and
/// three time values.
///
/// NOTE(review): presumably version 1 of the `VDC_RTC` record format —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vdi_rtc_v1 {
    pub vr_content: [u8; 128],
    pub vr_addr: u8,
    pub vr_time_base: i64,
    pub vr_rtc_sec: u64,
    pub vr_rtc_nsec: u64,
}
impl Default for vdi_rtc_v1 {
    /// Yields a record with every field, including all content bytes, set to
    /// zero.
    fn default() -> Self {
        let blank_content = [0u8; 128];
        Self {
            vr_rtc_nsec: 0,
            vr_rtc_sec: 0,
            vr_time_base: 0,
            vr_addr: 0,
            vr_content: blank_content,
        }
    }
}

/// Version 2 RTC state in C layout: two time values, 128 content bytes and
/// an address byte.
///
/// NOTE(review): presumably version 2 of the `VDC_RTC` record format —
/// confirm.
#[repr(C)]
#[derive(Copy, Clone)]
pub struct vdi_rtc_v2 {
    pub vr_base_clock: i64,
    pub vr_last_period: i64,
    pub vr_content: [u8; 128],
    pub vr_addr: u8,
}
impl Default for vdi_rtc_v2 {
    /// Yields a record with every field, including all content bytes, set to
    /// zero.
    fn default() -> Self {
        let blank_content = [0u8; 128];
        Self {
            vr_addr: 0,
            vr_content: blank_content,
            vr_last_period: 0,
            vr_base_clock: 0,
        }
    }
}

// VDC_VMM_TIME v1 interface

/// Time-related data of the `VDC_VMM_TIME` class (version 1), in C layout.
///
/// NOTE(review): units are not stated here; the `hrtime` fields are
/// presumably in nanoseconds — confirm against the kernel header.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_time_info_v1 {
    pub vt_guest_freq: u64,
    pub vt_guest_tsc: u64,
    pub vt_boot_hrtime: i64,
    pub vt_hrtime: i64,
    pub vt_hres_sec: u64,
    pub vt_hres_ns: u64,
}

// VDC_VMM_ARCH v1 data identifiers
//
// Identifiers are `u32`, matching the `vfe_ident` field of
// `vdi_field_entry_v1`.

// VM-wide:

/// Guest instance has been placed in paused state
pub const VAI_VM_IS_PAUSED: u32 = 4;

// Time-related data which was superseded by the VMM_TIME interface in API
// version 11, maintained here for reference (and for older versions)

/// Offset of guest TSC from system at time of boot
pub const VAI_TSC_BOOT_OFFSET: u32 = 1;
/// Time that guest (nominally) booted, as hrtime
pub const VAI_BOOT_HRTIME: u32 = 2;
/// Guest TSC frequency measured by hrtime (not affected by wall clock adj.)
pub const VAI_TSC_FREQ: u32 = 3;

// per-vCPU

/// NMI pending injection for vCPU (0 or 1)
pub const VAI_PEND_NMI: u32 = 10;
/// extint pending injection for vCPU (0 or 1)
pub const VAI_PEND_EXTINT: u32 = 11;
/// HW exception pending injection for vCPU
pub const VAI_PEND_EXCP: u32 = 12;
/// exception/interrupt pending injection for vCPU
pub const VAI_PEND_INTINFO: u32 = 13;

// VDC_PMU_AMD v1 interface

/// Data of the `VDC_PMU_AMD` class (version 1), in C layout: six event
/// select values and six counter values.
#[repr(C)]
#[derive(Copy, Clone, Default)]
pub struct vdi_pmu_amd_v1 {
    pub vpa_evtsel: [u64; 6],
    pub vpa_ctr: [u64; 6],
}


================================================
FILE: crates/cpuid-profile-config/Cargo.toml
================================================
[package]
name = "cpuid_profile_config"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
test = false
doctest = false

[dependencies]
serde.workspace = true
serde_derive.workspace = true
toml.workspace = true
thiserror.workspace = true


================================================
FILE: crates/cpuid-profile-config/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::BTreeMap;

use serde::{Deserialize, Serialize};

/// CPU vendor named by a profile.
///
/// Variant names are (de)serialized in lowercase (`amd`, `intel`) because of
/// the `rename_all` attribute.
#[derive(Deserialize, Serialize, Copy, Clone, Debug, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum CpuVendor {
    Amd,
    Intel,
}

/// A `cpuid` profile as it appears in configuration: a vendor plus a set of
/// raw leaf definitions.
///
/// Convert a reference to a profile into `Vec<CpuidEntry>` (via `TryFrom`) to
/// obtain the parsed leaves.
#[derive(Serialize, Deserialize, Debug, PartialEq, Clone)]
pub struct CpuidProfile {
    pub vendor: CpuVendor,
    /// Every key of the profile other than `vendor` lands here, since the
    /// field is flattened.  The `TryFrom` conversion expects each key to be a
    /// hexadecimal function, optionally followed by `-` and a hexadecimal
    /// index, and each value to be an array of four integers.
    #[serde(flatten, default)]
    pub leaf: BTreeMap<String, toml::Value>,
}

/// `cpuid` entry parsed from a configured profile
///
/// Entries are produced by converting a `&CpuidProfile` into
/// `Vec<CpuidEntry>`.  The type is plain data, so besides being `Copy` it can
/// be printed with `{:?}` and compared for equality, like the other public
/// types of this crate.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct CpuidEntry {
    /// Function (eax) to match for `cpuid` leaf
    pub func: u32,
    /// Index (ecx) to (optionally) match for `cpuid` leaf
    pub idx: Option<u32>,

    /// Values (eax, ebx, ecx, edx) for `cpuid` leaf
    pub values: [u32; 4],
}

/// Errors produced while converting a `CpuidProfile` into `CpuidEntry`
/// values.
#[derive(Debug, thiserror::Error)]
pub enum CpuidParseError {
    /// A leaf key (carried as the first field) could not be parsed as
    /// hexadecimal; the second field is the underlying integer parse error.
    #[error("Unable to parse leaf {0}: {1}")]
    Leaf(String, std::num::ParseIntError),
    /// The value attached to a leaf was malformed; the message says how.
    #[error("Unable to parse values: {0}")]
    Values(&'static str),
}

impl TryFrom<&CpuidProfile> for Vec<CpuidEntry> {
    type Error = CpuidParseError;

    /// Parse every leaf of the profile into a `CpuidEntry`.
    ///
    /// Each key must be a hexadecimal function number, optionally followed by
    /// `-` and a hexadecimal index.  Each value must be an array of exactly
    /// four integers which fit in a `u32`.
    ///
    /// # Errors
    ///
    /// Conversion stops at the first offending leaf, yielding
    /// `CpuidParseError::Leaf` for a bad key and `CpuidParseError::Values`
    /// for a bad value.
    fn try_from(value: &CpuidProfile) -> Result<Self, Self::Error> {
        value
            .leaf
            .iter()
            .map(|(leaf, raw)| -> Result<CpuidEntry, CpuidParseError> {
                // Both parts of the key are hexadecimal and report failures
                // against the complete key.
                let parse_hex = |text: &str| {
                    u32::from_str_radix(text, 16)
                        .map_err(|e| CpuidParseError::Leaf(leaf.clone(), e))
                };
                let (func, idx) = match leaf.split_once('-') {
                    Some((func_part, idx_part)) => {
                        let func = parse_hex(func_part)?;
                        (func, Some(parse_hex(idx_part)?))
                    }
                    None => (parse_hex(leaf.as_str())?, None),
                };

                let regs = raw.as_array().ok_or(CpuidParseError::Values(
                    "expected array of values",
                ))?;
                if regs.len() != 4 {
                    return Err(CpuidParseError::Values(
                        "expected 4 cpuid values",
                    ));
                }

                let mut values = [0u32; 4];
                for (slot, reg) in values.iter_mut().zip(regs.iter()) {
                    let num = reg.as_integer().ok_or(
                        CpuidParseError::Values("leaf values must be numeric"),
                    )?;
                    *slot = u32::try_from(num).map_err(|_| {
                        CpuidParseError::Values("leaf values must be valid u32")
                    })?;
                }

                Ok(CpuidEntry { func, idx, values })
            })
            .collect()
    }
}


================================================
FILE: crates/cpuid-utils/Cargo.toml
================================================
[package]
name = "cpuid_utils"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[dependencies]
bitflags.workspace = true
bhyve_api.workspace = true
propolis_api_types = {workspace = true, optional = true}
propolis_types.workspace = true
thiserror.workspace = true

[dev-dependencies]
proptest.workspace = true

[features]
instance-spec = ["propolis_api_types"]


================================================
FILE: crates/cpuid-utils/src/bits.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Bitflags and constants that provide symbolic names for the various bits in
//! various CPUID leaves.
//!
//! Definitions here are taken from the AMD Architecture Programmer's Manual,
//! volume 3, appendix E (Publication 24594, revision 3.36, March 2024).

/// Leaf number at which the standard range of CPUID leaves begins.
pub const STANDARD_BASE_LEAF: u32 = 0;
/// Leaf number at which the hypervisor range of CPUID leaves begins.
pub const HYPERVISOR_BASE_LEAF: u32 = 0x4000_0000;
/// Leaf number at which the extended range of CPUID leaves begins.
pub const EXTENDED_BASE_LEAF: u32 = 0x8000_0000;

/// For leaves with subleaves we don't know ahead of time how many subleaves
/// *are* present. In some cases the top subleaf indicates the maximum subleaf,
/// in other cases there is a leaf-specific maximum described in manuals or
/// implicitly by a sentinel bit pattern. In all cases, we'll use this limit as
/// a backstop so that bogus CPUID leaves don't have us copy gigabytes of
/// subleaves around.
pub const MAX_REASONABLE_SUBLEAVES: u32 = 0x20;

bitflags::bitflags! {
    /// Leaf 1 ecx: instruction feature identifiers.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub struct Leaf1Ecx: u32 {
        const SSE3 = 1 << 0;
        const PCLMULQDQ = 1 << 1;
        const MONITOR = 1 << 3;
        const SSSE3 = 1 << 9;
        const FMA = 1 << 12;
        const CMPXCHG16B = 1 << 13;
        const SSE41 = 1 << 19;
        const SSE42 = 1 << 20;
        const X2APIC = 1 << 21;
        const MOVBE = 1 << 22;
        const POPCNT = 1 << 23;
        const AES = 1 << 25;
        const XSAVE = 1 << 26;
        const OSXSAVE = 1 << 27;
        const AVX = 1 << 28;
        const F16C = 1 << 29;
        const RDRAND = 1 << 30;
        const HV_GUEST = 1 << 31;

        /// The union of every flag named in this struct. Bit positions that
        /// have no named flag above are not included.
        const ALL_FLAGS = Self::SSE3.bits() | Self::PCLMULQDQ.bits() |
            Self::MONITOR.bits() | Self::SSSE3.bits() | Self::FMA.bits() |
            Self::CMPXCHG16B.bits() | Self::SSE41.bits() | Self::SSE42.bits() |
            Self::X2APIC.bits() | Self::MOVBE.bits() | Self::POPCNT.bits() |
            Self::AES.bits() | Self::XSAVE.bits() | Self::OSXSAVE.bits() |
            Self::AVX.bits() | Self::F16C.bits() | Self::RDRAND.bits() |
            Self::HV_GUEST.bits();
    }

    /// Leaf 1 edx: Instruction feature identifiers.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub struct Leaf1Edx: u32 {
        const FPU = 1 << 0;
        const VME = 1 << 1;
        const DE = 1 << 2;
        const PSE = 1 << 3;
        const TSC = 1 << 4;
        const MSR = 1 << 5;
        const PAE = 1 << 6;
        const MCE = 1 << 7;
        const CMPXCHG8B = 1 << 8;
        const APIC = 1 << 9;
        const SYSENTER = 1 << 11;
        const MTRR = 1 << 12;
        const PGE = 1 << 13;
        const MCA = 1 << 14;
        const CMOV = 1 << 15;
        const PAT = 1 << 16;
        const PSE36 = 1 << 17;
        const CLFLUSH = 1 << 19;
        const MMX = 1 << 23;
        const FXSR = 1 << 24;
        const SSE = 1 << 25;
        const SSE2 = 1 << 26;
        const HTT = 1 << 28;

        /// The union of every flag named in this struct. Bit positions that
        /// have no named flag above are not included.
        const ALL_FLAGS = Self::FPU.bits() | Self::VME.bits() |
            Self::DE.bits() | Self::PSE.bits() | Self::TSC.bits() |
            Self::MSR.bits() | Self::PAE.bits() | Self::MCE.bits() |
            Self::CMPXCHG8B.bits() | Self::APIC.bits() | Self::SYSENTER.bits() |
            Self::MTRR.bits() | Self::PGE.bits() | Self::MCA.bits() |
            Self::CMOV.bits() | Self::PAT.bits() | Self::PSE36.bits() |
            Self::CLFLUSH.bits() | Self::MMX.bits() | Self::FXSR.bits() |
            Self::SSE.bits() | Self::SSE2.bits() | Self::HTT.bits();
    }

    /// Leaf 7 subleaf 0 ebx: instruction feature identifiers.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub struct Leaf7Sub0Ebx: u32 {
        const FSGSBASE = 1 << 0;
        const BMI1 = 1 << 3;
        const AVX2 = 1 << 5;
        const SMEP = 1 << 7;
        const BMI2 = 1 << 8;
        const INVPCID = 1 << 10;
        const PQM = 1 << 12;
        const PQE = 1 << 15;
        const RDSEED = 1 << 18;
        const ADX = 1 << 19;
        const SMAP = 1 << 20;
        const CLFLUSHOPT = 1 << 23;
        const CLWB = 1 << 24;
        const SHA = 1 << 29;

        /// The union of every flag named in this struct. Bit positions that
        /// have no named flag above are not included.
        const ALL_FLAGS = Self::FSGSBASE.bits() |
            Self::BMI1.bits() | Self::AVX2.bits() | Self::SMEP.bits() |
            Self::BMI2.bits() | Self::INVPCID.bits() | Self::PQM.bits() |
            Self::PQE.bits() | Self::RDSEED.bits() | Self::ADX.bits() |
            Self::SMAP.bits() | Self::CLFLUSHOPT.bits() | Self::CLWB.bits() |
            Self::SHA.bits();
    }

    /// Leaf 0x8000_0001 ecx: Extended processor feature identifiers.
    ///
    /// NOTE: These definitions are AMD-specific.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub struct AmdExtLeaf1Ecx: u32 {
        const LAHF = 1 << 0;
        const CMP_LEGACY = 1 << 1;
        const SVM = 1 << 2;
        const EXT_APIC_SPACE = 1 << 3;
        const ALT_MOV_CR8 = 1 << 4;
        const ABM = 1 << 5;
        const SSE4A = 1 << 6;
        const MISALIGN_SSE = 1 << 7;
        const THREED_NOW_PREFETCH = 1 << 8;
        const OSVW = 1 << 9;
        const IBS = 1 << 10;
        const XOP = 1 << 11;
        const SKINIT = 1 << 12;
        const WDT = 1 << 13;
        const LWP = 1 << 15;
        const FMA4 = 1 << 16;
        const TCE = 1 << 17;
        const TBM = 1 << 21;
        const TOPOLOGY_EXT = 1 << 22;
        const PMC_EXT_CORE = 1 << 23;
        const PMC_EXT_NB = 1 << 24;
        const DATA_ACCESS_BP = 1 << 26;
        const PERF_TSC = 1 << 27;
        const PMC_EXT_LLC = 1 << 28;
        const MONITORX = 1 << 29;
        const DATA_BP_ADDR_MASK_EXT = 1 << 30;

        /// The union of every flag named in this struct. Bit positions that
        /// have no named flag above are not included.
        const ALL_FLAGS = Self::LAHF.bits() | Self::CMP_LEGACY.bits() |
            Self::SVM.bits() | Self::EXT_APIC_SPACE.bits() |
            Self::ALT_MOV_CR8.bits() | Self::ABM.bits() | Self::SSE4A.bits() |
            Self::MISALIGN_SSE.bits() | Self::THREED_NOW_PREFETCH.bits() |
            Self::OSVW.bits() | Self::IBS.bits() | Self::XOP.bits() |
            Self::SKINIT.bits() | Self::WDT.bits() | Self::LWP.bits() |
            Self::FMA4.bits() | Self::TCE.bits() | Self::TBM.bits() |
            Self::TOPOLOGY_EXT.bits() | Self::PMC_EXT_CORE.bits() |
            Self::PMC_EXT_NB.bits() | Self::DATA_ACCESS_BP.bits() |
            Self::PERF_TSC.bits() | Self::PMC_EXT_LLC.bits() |
            Self::MONITORX.bits() | Self::DATA_BP_ADDR_MASK_EXT.bits();
    }

    /// Leaf 0x8000_0001 edx: Extended processor feature identifiers.
    ///
    /// Flags defined in terms of [`Leaf1Edx`] occupy the same bit positions
    /// here as they do in leaf 1 edx.
    ///
    /// NOTE: These definitions are AMD-specific.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub struct AmdExtLeaf1Edx: u32 {
        const FPU = Leaf1Edx::FPU.bits();
        const VME = Leaf1Edx::VME.bits();
        const DE = Leaf1Edx::DE.bits();
        const PSE = Leaf1Edx::PSE.bits();
        const TSC = Leaf1Edx::TSC.bits();
        const MSR = Leaf1Edx::MSR.bits();
        const PAE = Leaf1Edx::PAE.bits();
        const MCE = Leaf1Edx::MCE.bits();
        const CMPXCHG8B = Leaf1Edx::CMPXCHG8B.bits();
        const APIC = Leaf1Edx::APIC.bits();
        const SYSCALL = 1 << 11;
        const MTRR = Leaf1Edx::MTRR.bits();
        const PGE = Leaf1Edx::PGE.bits();
        const MCA = Leaf1Edx::MCA.bits();
        const CMOV = Leaf1Edx::CMOV.bits();
        const PAT = Leaf1Edx::PAT.bits();
        const PSE36 = Leaf1Edx::PSE36.bits();
        const NX = 1 << 20;
        const MMX_EXT = 1 << 22;
        const MMX = 1 << 23;
        const FXSAVE = 1 << 24;
        const FXSAVE_OPT = 1 << 25;
        const GB_PAGE = 1 << 26;
        const RDTSCP = 1 << 27;
        const LONG_MODE = 1 << 29;
        const THREED_NOW_EXT = 1 << 30;
        const THREED_NOW = 1 << 31;

        /// The union of every flag named in this struct. Bit positions that
        /// have no named flag above are not included.
        const ALL_FLAGS = Self::FPU.bits() | Self::VME.bits() |
            Self::DE.bits() | Self::PSE.bits() | Self::TSC.bits() |
            Self::MSR.bits() | Self::PAE.bits() | Self::MCE.bits() |
            Self::CMPXCHG8B.bits() | Self::APIC.bits() | Self::SYSCALL.bits() |
            Self::MTRR.bits() | Self::PGE.bits() | Self::MCA.bits() |
            Self::CMOV.bits() | Self::PAT.bits() | Self::PSE36.bits() |
            Self::NX.bits() | Self::MMX_EXT.bits() | Self::MMX.bits() |
            Self::FXSAVE.bits() | Self::FXSAVE_OPT.bits() |
            Self::GB_PAGE.bits() | Self::RDTSCP.bits() |
            Self::LONG_MODE.bits() | Self::THREED_NOW_EXT.bits() |
            Self::THREED_NOW.bits();
    }

    /// Leaf 0x8000_001D eax: Cache topology information.
    ///
    /// Unlike the feature-flag structs above, most members here are
    /// multi-bit field masks rather than single-bit flags.
    ///
    /// NOTE: These definitions are AMD-specific.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub struct AmdExtLeaf1DEax: u32 {
        /// Mask covering the 12-bit field in bits 25:14.
        const NUM_SHARING_CACHE_MASK = (0xFFF << 14);
        /// Single flag at bit 9.
        const FULLY_ASSOCIATIVE = 1 << 9;
        /// Single flag at bit 8.
        const SELF_INITIALIZATION = 1 << 8;
        /// Mask covering the 3-bit field in bits 7:5.
        const CACHE_LEVEL_MASK = (0x7 << 5);
        /// Mask covering the 5-bit cache type field in bits 4:0.
        const CACHE_TYPE_MASK = 0x1F;
    }
}

/// The decoded value of the five-bit cache type field in leaf 0x8000001D eax.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum AmdExtLeaf1DCacheType {
    /// Field value 0.
    Null,
    /// Field value 1.
    Data,
    /// Field value 2.
    Instruction,
    /// Field value 3.
    Unified,
    /// Any remaining five-bit field value (4 through 0x1F).
    Reserved,
}

impl AmdExtLeaf1DCacheType {
    /// Returns `true` if this is the [`Self::Null`] cache type.
    pub fn is_null(&self) -> bool {
        *self == Self::Null
    }
}

impl TryFrom<u32> for AmdExtLeaf1DCacheType {
    type Error = ();

    /// Decodes a raw cache type field value.
    ///
    /// Fails with `Err(())` when `value` does not fit in the five bits that
    /// leaf 0x8000001D eax reserves for the cache type. Every value that does
    /// fit maps to some variant, with 4 through 0x1F all decoding to
    /// [`Self::Reserved`].
    fn try_from(value: u32) -> Result<Self, Self::Error> {
        // Largest value representable in the five-bit cache type field.
        const FIELD_MAX: u32 = 0x1F;

        if value > FIELD_MAX {
            return Err(());
        }

        let cache_type = match value {
            0 => Self::Null,
            1 => Self::Data,
            2 => Self::Instruction,
            3 => Self::Unified,
            _ => Self::Reserved,
        };

        Ok(cache_type)
    }
}

impl AmdExtLeaf1DEax {
    /// Decodes the cache type field (the bits selected by
    /// [`Self::CACHE_TYPE_MASK`]) of this eax value.
    pub fn cache_type(&self) -> AmdExtLeaf1DCacheType {
        // Masking first guarantees the value handed to `try_from` fits in
        // the cache type field, so the conversion below cannot fail.
        let raw_type = self.bits() & Self::CACHE_TYPE_MASK.bits();
        let decoded = AmdExtLeaf1DCacheType::try_from(raw_type);
        decoded.expect("invalid bits were already masked")
    }
}


================================================
FILE: crates/cpuid-utils/src/host.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use bhyve_api::{VmmCtlFd, VmmFd};
use propolis_types::{CpuidIdent, CpuidValues, CpuidVendor};
use thiserror::Error;

use crate::{
    bits::{
        AmdExtLeaf1DCacheType, AmdExtLeaf1DEax, Leaf1Ecx, Leaf7Sub0Ebx,
        EXTENDED_BASE_LEAF, MAX_REASONABLE_SUBLEAVES, STANDARD_BASE_LEAF,
    },
    CpuidMapInsertError, CpuidSet,
};

/// Errors that can occur while gathering a set of CPUID values from a CPUID
/// source.
#[derive(Debug, Error)]
pub enum GetHostCpuidError {
    /// Inserting a queried leaf or subleaf into the output CPUID map failed.
    #[error("failed to insert into the CPUID map")]
    CpuidInsertFailed(#[from] CpuidMapInsertError),

    /// The values returned for the base standard leaf could not be converted
    /// to a known CPU vendor. The payload is the message produced by the
    /// failed conversion.
    #[error("CPUID vendor not recognized: {0}")]
    VendorNotRecognized(&'static str),

    /// An operation on a bhyve device handle returned an I/O error.
    #[error("I/O error from bhyve API")]
    BhyveError(#[from] std::io::Error),
}

/// A wrapper around a handle to a bhyve VM that can be used to query bhyve's
/// default CPUID values.
struct Vm(bhyve_api::VmmFd);

impl Vm {
    fn new() -> Result<Self, GetHostCpuidError> {
        let name = format!("cpuid-gen-{}", std::process::id());
        let mut req = bhyve_api::vm_create_req::new(name.as_bytes())
            .expect("valid VM name");

        let ctl = VmmCtlFd::open()?;
        let _ = unsafe { ctl.ioctl(bhyve_api::VMM_CREATE_VM, &mut req) }?;

        let vm = match VmmFd::open(&name) {
            Ok(vm) => vm,
            Err(e) => {
                // Attempt to manually destroy the VM if we cannot open it
                let _ = ctl.vm_destroy(name.as_bytes());
                return Err(e.into());
            }
        };

        Ok(Self(vm))
    }

    fn query(
        &self,
        vlc_eax: u32,
        vlc_ecx: u32,
    ) -> Result<CpuidValues, GetHostCpuidError> {
        let mut data = bhyve_api::vm_legacy_cpuid {
            vlc_eax,
            vlc_ecx,
            ..Default::default()
        };
        unsafe { self.0.ioctl(bhyve_api::VM_LEGACY_CPUID, &mut data) }?;
        Ok(CpuidValues {
            eax: data.vlc_eax,
            ebx: data.vlc_ebx,
            ecx: data.vlc_ecx,
            edx: data.vlc_edx,
        })
    }
}

impl Drop for Vm {
    fn drop(&mut self) {
        let _ = self.0.ioctl_usize(bhyve_api::VM_DESTROY_SELF, 0);
    }
}

/// Executes the CPUID instruction on the calling processor for the leaf named
/// by `leaf` and returns the resulting register values.
///
/// If `leaf.subleaf` is `None`, subleaf 0 is queried.
#[cfg(target_arch = "x86_64")]
pub fn query(leaf: CpuidIdent) -> CpuidValues {
    let subleaf = leaf.subleaf.unwrap_or(0);
    let regs =
        unsafe { core::arch::x86_64::__cpuid_count(leaf.leaf, subleaf) };
    regs.into()
}

/// Stand-in for the host CPUID query on targets other than x86-64, where the
/// CPUID instruction does not exist.
///
/// # Panics
///
/// Always panics.
#[cfg(not(target_arch = "x86_64"))]
pub fn query(leaf: CpuidIdent) -> CpuidValues {
    // The identifier cannot be used on this target; discard it explicitly so
    // that non-x86-64 builds do not emit an unused-variable warning.
    let _ = leaf;
    panic!("host CPUID queries only work on x86-64 hosts")
}

fn collect_cpuid(
    query: &impl Fn(u32, u32) -> Result<CpuidValues, GetHostCpuidError>,
) -> Result<CpuidSet, GetHostCpuidError> {
    let mut set = CpuidSet::default();

    // Enumerate standard leaves and copy their values into the output set.
    //
    // Note that enumeration order matters here: leaf D is only treated as
    // having subleaves if leaf 1 indicates support for XSAVE.
    let std = query(STANDARD_BASE_LEAF, 0)?;
    set.vendor = CpuidVendor::try_from(std)
        .map_err(GetHostCpuidError::VendorNotRecognized)?;
    let mut xsave_supported = false;
    for leaf in 0..=std.eax {
        match leaf {
            0x1 => {
                let data = query(leaf, 0)?;
                xsave_supported = (Leaf1Ecx::from_bits_retain(data.ecx)
                    & Leaf1Ecx::XSAVE)
                    .bits()
                    != 0;
                set.insert(CpuidIdent::leaf(leaf), data)?;
            }
            // Leaf 0x4 is a series of subleaves terminated by a subleaf with
            // "type" (EAX bits 4-0) of 0. In practice there are typically four
            // subleaves but we'll gather them until there an unreasonable
            // number or we find an invalid leaf.
            0x4 => {
                for i in 0..MAX_REASONABLE_SUBLEAVES {
                    let data = query(leaf, i)?;
                    if data.eax & 0x1f == 0 {
                        break;
                    }

                    set.insert(CpuidIdent::subleaf(leaf, i), data)?;
                }
            }
            // Leaf 0x7 subleaf 0 eax indicates the total number of leaf-7
            // subleaves.
            0x7 => {
                let mut data = query(leaf, 0)?;

                // Leaf 0x7 EBX bits 12 and 15 indicate PQM and PQE support on
                // AMD CPUs, and aspects of RDT support on Intel CPUs. In both
                // cases, if the bits are set, leaves 0xF and 0x10 are actually
                // subleaves with further capability information for the
                // corresponding features. We don't support passing these
                // features along, so mask out the bits.
                data.ebx &= !(Leaf7Sub0Ebx::PQM | Leaf7Sub0Ebx::PQE).bits();

                set.insert(CpuidIdent::subleaf(leaf, 0), data)?;
                for subleaf in 1..=data.eax {
                    let sub_data = query(leaf, subleaf)?;
                    set.insert(CpuidIdent::subleaf(leaf, subleaf), sub_data)?;
                }
            }
            // Leaf 0xB contains CPU topology information. Although this leaf
            // can theoretically support many levels of information, bhyve
            // supports only subleaves 0 and 1, so just query those without
            // trying to reason about exactly how many topology nodes the host
            // exposes.
            0xB => {
                set.insert(CpuidIdent::subleaf(leaf, 0), query(leaf, 0)?)?;
                set.insert(CpuidIdent::subleaf(leaf, 1), query(leaf, 1)?)?;
            }
            // Leaf 0xD contains information about extended processor state.
            0xD if xsave_supported => {
                let data = query(leaf, 0)?;
                set.insert(CpuidIdent::subleaf(leaf, 0), data)?;

                // Subleaf 0 edx:eax contains a 64-bit mask indicating what
                // features requiring extended state can be enabled in xcr0.
                let xcr0_bits =
                    u64::from(data.eax) | (u64::from(data.edx) << 32);

                let data = query(leaf, 1)?;
                set.insert(CpuidIdent::subleaf(leaf, 1), data)?;

                // Subleaf 1 edx:ecx contains a 64-bit mask indicating what
                // features requiring extended state can be enabled in the
                // IA32_XSS MSR.
                let xss_bits =
                    u64::from(data.ecx) | (u64::from(data.edx) << 32);

                // Subleaves 2 through 63 are valid if the corresponding mask
                // bit is set either in the xcr0 mask returned by subleaf 0 or
                // the XSS mask returned by subleaf 1.
                for ecx in 2..64 {
                    if (1 << ecx) & (xcr0_bits | xss_bits) == 0 {
                        continue;
                    }

                    set.insert(
                        CpuidIdent::subleaf(leaf, ecx),
                        query(leaf, ecx)?,
                    )?;
                }
            }
            // Leaf 0xF describes Platform QoS Monitoring ("PQM").
            0xF => {
                // Since we're hiding PQM, provide an empty leaf here.
                set.insert(CpuidIdent::leaf(leaf), CpuidValues::default())?;
            }
            // Leaf 0x10 describes Platform QoS Enforcement ("PQE").
            0x10 => {
                // Since we're hiding PQE, provide an empty leaf here.
                set.insert(CpuidIdent::leaf(leaf), CpuidValues::default())?;
            }
            // Leaf 0x18 is similar to leaf 0x4: Intel-only, it is a series of
            // subleaves describing potentially-shared processor structures.
            // Unlike leaf 0x4, subleaf 0 EAX describes the maximum valid
            // subleaf, and subleaves are not guaranteed to be contiguous up to
            // that level. On real systems there are upwards of eight subleaves
            // (at least on Ice Lake).
            0x18 => {
                let top_subleaf = query(leaf, 0)?;

                let limit =
                    std::cmp::min(MAX_REASONABLE_SUBLEAVES, top_subleaf.eax);

                for i in 0..limit {
                    let data = query(leaf, i)?;
                    set.insert(CpuidIdent::subleaf(leaf, i), data)?;
                }
            }
            _ => {
                set.insert(CpuidIdent::leaf(leaf), query(leaf, 0)?)?;
            }
        }
    }

    let extended = query(EXTENDED_BASE_LEAF, 0)?;
    for leaf in EXTENDED_BASE_LEAF..=extended.eax {
        match leaf {
            0x8000_001D => {
                for subleaf in 0..=u32::MAX {
                    let data = query(leaf, subleaf)?;
                    let eax = AmdExtLeaf1DEax::from_bits_retain(data.eax);
                    if eax.cache_type() == AmdExtLeaf1DCacheType::Null {
                        break;
                    }

                    set.insert(CpuidIdent::subleaf(leaf, subleaf), data)?;
                }
            }
            // Leaf 0x8000_0020 has extended AMD PQM/PQE feature information.
            0x8000_0020 => {
                // Since we're hiding PQM and PQE, provide an empty leaf here.
                // Features described in this leaf may relate to PQM or PQE, and
                // it's not immediately clear what to present if only one
                // feature is present.
                //
                // In any case, no CPU exists that supports only one of PQM or
                // PQE.
                set.insert(CpuidIdent::leaf(leaf), CpuidValues::default())?;
            }
            // Leaf 0x8000_0026 has extended AMD CPU topology information.
            0x8000_0026 => {
                // The vCPU topology is unrelated to the underlying hardware
                // topology, so hide the real topology for the time being.
                //
                // Hidden or not, it would be erroneous to use the default
                // querying behavior for this leaf. Querying the first host
                // subleaf and reporting leaf 0x8000_0026 as a normal leaf with
                // that data would result in that first topology subleaf being
                // provided for a guest's query of any subleaf. The APM's
                // guidance on how to query extended CPU topology would then
                // lead software to loop over all values of ECX querying
                // subleaves to enumerate a bogus topology:
                //
                // > The topology level is selected by the value passed to the
                // > instruction in ECX. To discover the topology of a system,
                // > software should execute CPUID Fn8000_0026 with increasing
                // > ECX values, starting with a value of zero, until the
                // > returned hierarchy level type (CPUID
                // > Fn8000_0026_ECX[LevelType]) is equal to zero
                set.insert(CpuidIdent::leaf(leaf), CpuidValues::default())?;
            }
            _ => {
                set.insert(CpuidIdent::leaf(leaf), query(leaf, 0)?)?;
            }
        }
    }

    Ok(set)
}

/// A possible source of CPUID information.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CpuidSource {
    /// Create a temporary VM and ask bhyve what values it would return if one
    /// of its CPUs executed CPUID.
    BhyveDefault,

    /// Execute the CPUID instruction on the host.
    HostCpu,
}

/// Gathers a "complete" set of CPUID values from `source`: every leaf and
/// subleaf that describes the CPU platform the selected source exposes.
///
/// For [`CpuidSource::BhyveDefault`] a temporary VM is created for the
/// duration of the call; failure to create it is reported as an error.
pub fn query_complete(
    source: CpuidSource,
) -> Result<CpuidSet, GetHostCpuidError> {
    match source {
        CpuidSource::BhyveDefault => {
            // The VM lives until enumeration completes and is dropped when
            // this arm finishes.
            let vm = Vm::new()?;
            collect_cpuid(&|eax, ecx| vm.query(eax, ecx))
        }
        CpuidSource::HostCpu => {
            // Host queries are infallible; wrap them to fit the callback's
            // fallible signature.
            collect_cpuid(&|eax, ecx| Ok(query(CpuidIdent::subleaf(eax, ecx))))
        }
    }
}


================================================
FILE: crates/cpuid-utils/src/instance_spec.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Helpers for converting instance spec CPUID entries into this module's types.

use super::*;

use propolis_api_types::instance_spec::components::board::{Cpuid, CpuidEntry};

impl super::CpuidSet {
    pub fn into_instance_spec_cpuid(self) -> Cpuid {
        Cpuid { entries: self.map.into(), vendor: self.vendor }
    }
}

impl From<CpuidMap> for Vec<CpuidEntry> {
    fn from(value: CpuidMap) -> Self {
        value
            .iter()
            .map(
                |(
                    CpuidIdent { leaf, subleaf },
                    CpuidValues { eax, ebx, ecx, edx },
                )| CpuidEntry {
                    leaf,
                    subleaf,
                    eax,
                    ebx,
                    ecx,
                    edx,
                },
            )
            .collect()
    }
}

impl TryFrom<Vec<CpuidEntry>> for CpuidMap {
    type Error = CpuidMapConversionError;

    /// Builds a [`CpuidMap`] from the [`CpuidEntry`] structures in an
    /// instance spec. The conversion is rejected when
    ///
    /// - any entry names a leaf outside the standard and extended ranges
    ///   (0x0-0xFFFF and 0x80000000-0x8000FFFF),
    /// - the same leaf/subleaf pair appears more than once, or
    /// - the same leaf appears both with a subleaf of `None` and with a
    ///   subleaf of `Some`.
    fn try_from(
        value: Vec<CpuidEntry>,
    ) -> Result<Self, CpuidMapConversionError> {
        let mut map = Self::default();

        for entry in value {
            let CpuidEntry { leaf, subleaf, eax, ebx, ecx, edx } = entry;

            let in_known_region = STANDARD_LEAVES.contains(&leaf)
                || EXTENDED_LEAVES.contains(&leaf);
            if !in_known_region {
                return Err(CpuidMapConversionError::LeafOutOfRange(leaf));
            }

            // A subleaf/no-subleaf conflict surfaces as an insert error and
            // is propagated by `?`; a displaced previous value means this
            // exact identifier was already supplied.
            let ident = CpuidIdent { leaf, subleaf };
            let values = CpuidValues { eax, ebx, ecx, edx };
            let previous = map.insert(ident, values)?;
            if previous.is_some() {
                return Err(CpuidMapConversionError::DuplicateLeaf(
                    leaf, subleaf,
                ));
            }
        }

        Ok(map)
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// A leaf may not be given both as a subleaf-free entry and as an entry
    /// with an explicit subleaf.
    #[test]
    fn subleaf_aliasing_forbidden() {
        // Builds an all-zero entry for leaf 0 with the requested subleaf.
        let zeroed_leaf0 = |subleaf| CpuidEntry {
            leaf: 0,
            subleaf,
            eax: 0,
            ebx: 0,
            ecx: 0,
            edx: 0,
        };

        let entries = vec![zeroed_leaf0(None), zeroed_leaf0(Some(0))];

        assert!(CpuidMap::try_from(entries).is_err());
    }
}


================================================
FILE: crates/cpuid-utils/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Utility functions and types for working with CPUID values.
//!
//! # The `CPUID` instruction
//!
//! The x86 CPUID instruction returns information about the executing processor.
//! This information can range from a manufacturer ID to a model number to the
//! processor's supported feature set to the calling logical processor's APIC
//! ID.
//!
//! CPUID takes as input a "leaf" or "function" value, passed in the eax
//! register, which determines what information the processor should return.
//! Some leaves accept a "subleaf" or "index" value, specified in ecx, that
//! further qualifies the information class supplied in eax. For example, leaf 4
//! returns information about a processor's cache topology; it uses the subleaf
//! value as an index that identifies a particular type and level of cache.
//!
//! The CPUID leaf space is divided into "standard" and "extended" regions.
//! Leaves in the standard region (0 to 0xFFFF) have architecturally-defined
//! semantics. Leaves in the extended region (0x80000000 to 0x8000FFFF) have
//! vendor-specific semantics.
//!
//! `propolis-server` can accept a list of CPUID leaf/subleaf/value tuples as
//! part of an instance specification. If a client supplies one, Propolis will
//! initialize each vCPU so that CPUID instructions executed there will return
//! the supplied values, possibly with some adjustments (substituting vCPU
//! numbers into those leaves that return them, adjusting CPU topology leaves
//! based on other settings in the spec, etc.).
//!
//! This module implements two collections that Propolis components can use to
//! hold CPUID information:
//!
//! - [`CpuidMap`] maps from CPUID leaf and subleaf pairs to their associated
//!   values, taking care to ensure that a single leaf is marked either as
//!   ignoring or honoring the subleaf number, but not both.
//! - [`CpuidSet`] pairs a `CpuidMap` with a [`CpuidVendor`] that Propolis can
//!   use to interpret the values of the map leaves in the extended region.
//!
//! # The `instance-spec` feature
//!
//! If this crate is built with the `instance-spec` feature, this module
//! includes mechanisms for converting from instance spec CPUID entries to and
//! from the CPUID map types in the crate. This is feature-gated so that the
//! main Propolis lib can use this library without depending on
//! `propolis-api-types`.

use std::{
    collections::{
        btree_map::{self, Entry},
        BTreeMap, BTreeSet,
    },
    ops::RangeInclusive,
};

pub use propolis_types::{CpuidIdent, CpuidValues, CpuidVendor};
use thiserror::Error;

pub mod bits;
pub mod host;

#[cfg(feature = "instance-spec")]
mod instance_spec;

/// Maps a subleaf index to the values stored for that subleaf.
type CpuidSubleafMap = BTreeMap<u32, CpuidValues>;

/// The result of inserting into a [`CpuidMap`]. `Ok(None)` means the
/// identifier was not previously present; `Ok(Some(_))` presumably carries the
/// value that the insertion displaced (callers treat it as "already present").
type CpuidMapInsertResult = Result<Option<CpuidValues>, CpuidMapInsertError>;

/// Denotes the presence or absence of subleaves for a given CPUID leaf.
///
/// [`CpuidMap`] stores one of these per leaf, which is what prevents a single
/// leaf from holding both per-subleaf values and a subleaf-free value.
#[derive(Clone, Debug, PartialEq, Eq)]
enum Subleaves {
    /// This leaf is configured to have subleaves, whose values are stored in
    /// the inner map.
    Present(CpuidSubleafMap),

    /// This leaf does not have subleaves.
    Absent(CpuidValues),
}

/// [`CpuidMap`]'s insert functions return this error if a request to insert a
/// new value would produce a leaf that has both per-subleaf values and a
/// no-subleaf value.
///
/// The payload of each variant is the number of the conflicting leaf.
#[derive(Debug, Error)]
pub enum CpuidMapInsertError {
    /// A subleaf-free value was supplied for a leaf that already has
    /// per-subleaf entries.
    #[error("leaf {0:x} has entries with subleaves")]
    SubleavesAlreadyPresent(u32),

    /// Per its message, the leaf already has a subleaf-free entry; presumably
    /// returned when a per-subleaf value is supplied for such a leaf.
    #[error("leaf {0:x} has an entry with no subleaves")]
    SubleavesAlreadyAbsent(u32),
}

/// A mapping from CPUID leaf/subleaf pairs to CPUID return values. This struct
/// allows each registered leaf either to have or not to have subleaf values,
/// but not both at once.
///
/// Some CPUID leaves completely ignore the subleaf value passed in ecx. Others
/// pay attention to it and have their own per-leaf semantics that govern what
/// happens if the supplied ecx value is out of the function's expected range.
/// Tracking CPUID values in a simple `BTreeMap` from [`CpuidIdent`] to
/// [`CpuidValues`] allows a single leaf to use both options at once, because
/// `CpuidIdent { leaf: x, subleaf: None }` is not the same as `CpuidIdent {
/// leaf: x, subleaf: Some(y) }`. To avoid this kind of semantic confusion, this
/// type's `insert` method returns a `Result` that indicates whether inserting
/// the requested leaf/subleaf identifier would produce a semantic conflict
/// between a subleaf-bearing and subleaf-free entry.
///
/// This structure allows "holes" in a leaf's subleaf IDs; that is, the
/// structure permits `leaf: 0, subleaf: Some(0)` and `subleaf: Some(2)` to
/// appear in a map where `subleaf: Some(1)` is absent. This is mostly for
/// simplicity (it saves the map type from having to check for discontiguous
/// subleaf domains); the existing subleaf-having CPUID leaves in the Intel and
/// AMD manuals all specify contiguous subleaf domains, and a client who
/// specifies a discontiguous subleaf set may find itself with unhappy guest
/// operating systems.
///
/// Internally the map is keyed by leaf number. Each leaf's entry holds either
/// a single subleaf-free value or a map from subleaf index to value.
#[derive(Clone, Debug, Default)]
pub struct CpuidMap(BTreeMap<u32, Subleaves>);

impl CpuidMap {
    /// Looks up the values registered for `ident`.
    ///
    /// The lookup must match the leaf's flavor: an `ident` without a subleaf
    /// only matches a leaf stored without subleaves, and an `ident` with a
    /// subleaf only matches a leaf that stores per-subleaf values. Returns
    /// `None` when the leaf is missing, the flavors differ, or the requested
    /// subleaf is not present.
    pub fn get(&self, ident: CpuidIdent) -> Option<&CpuidValues> {
        let entry = self.0.get(&ident.leaf)?;
        match (ident.subleaf, entry) {
            (None, Subleaves::Absent(values)) => Some(values),
            (Some(index), Subleaves::Present(by_subleaf)) => {
                by_subleaf.get(&index)
            }
            (None, Subleaves::Present(_))
            | (Some(_), Subleaves::Absent(_)) => None,
        }
    }

    /// Looks up the values registered for `ident` and returns a mutable
    /// reference to them.
    ///
    /// The matching rules are the same as for the shared lookup: the leaf
    /// must exist, its flavor (with or without subleaves) must agree with
    /// `ident`, and any requested subleaf must be present. Otherwise the
    /// result is `None`.
    pub fn get_mut(&mut self, ident: CpuidIdent) -> Option<&mut CpuidValues> {
        let entry = self.0.get_mut(&ident.leaf)?;
        match (ident.subleaf, entry) {
            (None, Subleaves::Absent(values)) => Some(values),
            (Some(index), Subleaves::Present(by_subleaf)) => {
                by_subleaf.get_mut(&index)
            }
            (None, Subleaves::Present(_))
            | (Some(_), Subleaves::Absent(_)) => None,
        }
    }

    /// Check if a leaf is present, either as a sole entry or with subleaves.
    ///
    /// Only the leaf number is consulted, so this returns `true` for a leaf
    /// that stores per-subleaf values even though [`Self::get`] with
    /// `subleaf: None` returns `None` for such a leaf.
    pub fn contains_leaf(&self, leaf: u32) -> bool {
        self.0.contains_key(&leaf)
    }

    /// Stores `values` as the single, subleaf-independent entry for `leaf`.
    ///
    /// Returns `Ok(None)` if the leaf was not present, `Ok(Some(old))` if it
    /// replaced an existing no-subleaf value, and
    /// [`CpuidMapInsertError::SubleavesAlreadyPresent`] if the leaf already
    /// holds per-subleaf values (in which case the map is left unchanged).
    fn insert_leaf_no_subleaf(
        &mut self,
        leaf: u32,
        values: CpuidValues,
    ) -> CpuidMapInsertResult {
        // A brand-new leaf is the easy case; handle it and bail out early.
        let slot = match self.0.entry(leaf) {
            Entry::Occupied(occupied) => occupied.into_mut(),
            Entry::Vacant(vacant) => {
                vacant.insert(Subleaves::Absent(values));
                return Ok(None);
            }
        };

        match slot {
            Subleaves::Absent(current) => {
                Ok(Some(std::mem::replace(current, values)))
            }
            Subleaves::Present(_) => {
                Err(CpuidMapInsertError::SubleavesAlreadyPresent(leaf))
            }
        }
    }

    /// Stores `values` for the (`leaf`, `subleaf`) pair.
    ///
    /// Returns `Ok(None)` if the pair was not present, `Ok(Some(old))` if it
    /// replaced an existing value for that pair, and
    /// [`CpuidMapInsertError::SubleavesAlreadyAbsent`] if the leaf already
    /// holds a no-subleaf value (in which case the map is left unchanged).
    fn insert_leaf_subleaf(
        &mut self,
        leaf: u32,
        subleaf: u32,
        values: CpuidValues,
    ) -> CpuidMapInsertResult {
        // A brand-new leaf gets a fresh one-element subleaf map.
        let slot = match self.0.entry(leaf) {
            Entry::Occupied(occupied) => occupied.into_mut(),
            Entry::Vacant(vacant) => {
                vacant.insert(Subleaves::Present(
                    std::iter::once((subleaf, values)).collect(),
                ));
                return Ok(None);
            }
        };

        match slot {
            Subleaves::Present(by_subleaf) => {
                Ok(by_subleaf.insert(subleaf, values))
            }
            Subleaves::Absent(_) => {
                Err(CpuidMapInsertError::SubleavesAlreadyAbsent(leaf))
            }
        }
    }

    /// Adds the (`ident`, `values`) pair to the map, replacing any value
    /// already stored under exactly that identifier.
    ///
    /// # Return value
    ///
    /// - `Ok(None)`: the leaf/subleaf pair was new to the map.
    /// - `Ok(Some(old))`: the pair already existed; `old` is the value that
    ///   `values` replaced.
    /// - `Err`: the insert would leave the leaf with both a no-subleaf entry
    ///   and a per-subleaf entry, so nothing was inserted.
    pub fn insert(
        &mut self,
        ident: CpuidIdent,
        values: CpuidValues,
    ) -> CpuidMapInsertResult {
        if let Some(subleaf) = ident.subleaf {
            self.insert_leaf_subleaf(ident.leaf, subleaf, values)
        } else {
            self.insert_leaf_no_subleaf(ident.leaf, values)
        }
    }

    /// Takes the entry identified by `ident` out of the map and returns its
    /// value, or returns `None` if there is no such entry.
    ///
    /// The identifier must match the leaf's flavor: an `ident` with no
    /// subleaf only matches a leaf stored without subleaves, and an `ident`
    /// with a subleaf only matches a leaf that stores per-subleaf values.
    /// Removing the last subleaf of a leaf removes the leaf itself.
    pub fn remove(&mut self, ident: CpuidIdent) -> Option<CpuidValues> {
        // Nothing to do if the leaf isn't in the map at all.
        let Entry::Occupied(mut occupied) = self.0.entry(ident.leaf) else {
            return None;
        };

        match ident.subleaf {
            None => {
                // Only a subleaf-free leaf can match a subleaf-free key.
                let Subleaves::Absent(values) = occupied.get() else {
                    return None;
                };

                // Copy the value out first so the borrow of the entry ends
                // before the entry is consumed.
                let values = *values;
                occupied.remove_entry();
                Some(values)
            }
            Some(index) => {
                // Only a subleaf-bearing leaf can match a subleaf key.
                let Subleaves::Present(by_subleaf) = occupied.get_mut() else {
                    return None;
                };

                let values = by_subleaf.remove(&index)?;

                // Don't leave an empty subleaf map behind; other code relies
                // on every subleaf map having at least one entry.
                if by_subleaf.is_empty() {
                    occupied.remove_entry();
                }

                Some(values)
            }
        }
    }

    /// Removes all data for the supplied `leaf`. If the leaf has subleaves, all
    /// their entries are removed.
    ///
    /// Does nothing if the leaf is not in the map; the removed values are
    /// discarded rather than returned.
    pub fn remove_leaf(&mut self, leaf: u32) {
        self.0.remove(&leaf);
    }

    /// Keeps only the entries for which `f` returns `true`.
    ///
    /// `f` is called once per leaf/subleaf/value tuple. A leaf that stores
    /// per-subleaf values is dropped entirely once all of its subleaves have
    /// been filtered out.
    pub fn retain<F>(&mut self, mut f: F)
    where
        F: FnMut(CpuidIdent, CpuidValues) -> bool,
    {
        self.0.retain(|&leaf, entry| match entry {
            Subleaves::Absent(values) => f(CpuidIdent::leaf(leaf), *values),
            Subleaves::Present(by_subleaf) => {
                by_subleaf.retain(|&subleaf, values| {
                    f(CpuidIdent::subleaf(leaf, subleaf), *values)
                });

                // Keep the leaf only while it still has subleaves.
                !by_subleaf.is_empty()
            }
        })
    }

    /// Clears the entire map, removing every leaf along with any subleaf
    /// entries it holds.
    pub fn clear(&mut self) {
        self.0.clear();
    }

    /// Returns `true` if the map has no entries.
    ///
    /// Only the leaf-level map is consulted. This is sufficient because
    /// `remove` and `retain` drop a leaf as soon as its last subleaf is
    /// removed.
    pub fn is_empty(&self) -> bool {
        self.0.is_empty()
    }

    /// Counts the leaf/subleaf/value tuples in the map.
    ///
    /// A leaf without subleaves contributes one tuple; a leaf with subleaves
    /// contributes one tuple per subleaf.
    pub fn len(&self) -> usize {
        self.0
            .values()
            .map(|entry| match entry {
                Subleaves::Absent(_) => 1,
                Subleaves::Present(by_subleaf) => by_subleaf.len(),
            })
            .sum()
    }

    /// Returns an iterator over the ([`CpuidIdent`], [`CpuidValues`]) pairs in
    /// the map.
    ///
    /// Leaves are visited in ascending order, and all of a leaf's subleaves
    /// are yielded (also in ascending order) before the next leaf.
    pub fn iter(&self) -> CpuidMapIterator<'_> {
        CpuidMapIterator::new(self)
    }
}

/// An iterator over a [`CpuidMap`]'s leaf/subleaf/value tuples.
pub struct CpuidMapIterator<'a> {
    /// Walks the map's top-level leaf entries.
    leaf_iter: btree_map::Iter<'a, u32, Subleaves>,
    /// When iteration is partway through a leaf that has subleaves, holds that
    /// leaf's number and the iterator over its remaining subleaves.
    subleaf_iter: Option<(u32, btree_map::Iter<'a, u32, CpuidValues>)>,
}

impl<'a> CpuidMapIterator<'a> {
    /// Creates an iterator positioned before the first leaf of `map`, with no
    /// subleaf iteration in progress.
    fn new(map: &'a CpuidMap) -> Self {
        Self { leaf_iter: map.0.iter(), subleaf_iter: None }
    }
}

impl Iterator for CpuidMapIterator<'_> {
    type Item = (CpuidIdent, CpuidValues);

    /// Yields the next leaf/subleaf/value tuple, finishing all of one leaf's
    /// subleaves before moving on to the next leaf.
    ///
    /// # Panics
    ///
    /// Panics if a leaf's subleaf map is empty.
    fn next(&mut self) -> Option<Self::Item> {
        // Finish any subleaf walk that is already under way. Once it runs
        // dry, forget it and fall through to the next leaf.
        if let Some((leaf, pending)) = self.subleaf_iter.as_mut() {
            match pending.next() {
                Some((subleaf, values)) => {
                    let ident = CpuidIdent::subleaf(*leaf, *subleaf);
                    return Some((ident, *values));
                }
                None => self.subleaf_iter = None,
            }
        }

        // No more leaves means the whole iteration is finished.
        let (leaf, entry) = self.leaf_iter.next()?;

        // A leaf without subleaves is returned directly; a leaf with
        // subleaves starts a new subleaf walk.
        let by_subleaf = match entry {
            Subleaves::Absent(values) => {
                return Some((CpuidIdent::leaf(*leaf), *values));
            }
            Subleaves::Present(by_subleaf) => by_subleaf,
        };

        // This invariant is upheld by insert/remove.
        let mut remaining = by_subleaf.iter();
        let (subleaf, values) = remaining
            .next()
            .expect("subleaf maps always have at least one entry");

        // Remember the leaf number alongside the rest of its subleaves so
        // later calls can rebuild the full leaf/subleaf identifier.
        self.subleaf_iter = Some((*leaf, remaining));
        Some((CpuidIdent::subleaf(*leaf, *subleaf), *values))
    }
}

/// A map from CPUID leaves to CPUID values that includes a vendor ID. Callers
/// can use the vendor ID to interpret the meanings of any extended leaves
/// present in the map.
#[derive(Clone, Debug)]
pub struct CpuidSet {
    /// The leaf/subleaf values in this set.
    map: CpuidMap,
    /// The CPU vendor these values are associated with.
    vendor: CpuidVendor,
}

impl Default for CpuidSet {
    /// Equivalent to [`Self::new_host`].
    ///
    /// # Panics
    ///
    /// Panics under the same conditions as [`Self::new_host`].
    fn default() -> Self {
        Self::new_host()
    }
}

/// A discrepancy between two [`CpuidSet`]s.
#[derive(Debug, Error)]
pub enum CpuidSetMismatch {
    /// The sets have different CPU vendors. `this` is the vendor of the set
    /// the comparison was called on and `other` is the vendor of the set it
    /// was compared against.
    #[error("CPUID set has mismatched vendors (self: {this}, other: {other})")]
    Vendor { this: CpuidVendor, other: CpuidVendor },

    /// The two sets have different leaves or subleaves. The payload contains
    /// all of the leaf identifiers that were present in one set but not the
    /// other.
    #[error("CPUID leaves not found in both sets ({0:?})")]
    LeafSet(Vec<CpuidIdent>),

    /// The two sets disagree on the values to return for one or more
    /// leaf/subleaf pairs. The payload contains the leaf/subleaf ID and values
    /// for all such pairs. Each tuple is (identifier, value in the set the
    /// comparison was called on, value in the other set).
    #[error("CPUID leaves have different values in different sets ({0:?})")]
    Values(Vec<(CpuidIdent, CpuidValues, CpuidValues)>),
}

impl CpuidSet {
    /// Creates an empty `CpuidSet` with the supplied `vendor`.
    ///
    /// Unlike [`Self::new_host`], this does not query the host CPU.
    pub fn new(vendor: CpuidVendor) -> Self {
        Self { map: CpuidMap::default(), vendor }
    }

    /// Creates a new `CpuidSet` with the supplied initial leaf/value `map` and
    /// `vendor`.
    ///
    /// The map is taken as-is; its contents are not checked against `vendor`.
    pub fn from_map(map: CpuidMap, vendor: CpuidVendor) -> Self {
        Self { map, vendor }
    }

    /// Yields this set's vendor, as supplied at construction time.
    pub fn vendor(&self) -> CpuidVendor {
        self.vendor
    }

    /// Builds an empty `CpuidSet` whose vendor is that of the machine this
    /// code is running on, as reported by CPUID leaf 0.
    ///
    /// # Panics
    ///
    /// Panics if the host is not an Intel or AMD CPU (leaf 0 ebx/ecx/edx
    /// contain something other than "GenuineIntel" or "AuthenticAMD").
    pub fn new_host() -> Self {
        let leaf0 = host::query(CpuidIdent::leaf(0));
        let host_vendor = CpuidVendor::try_from(leaf0)
            .expect("host CPU should be from recognized vendor");

        Self::new(host_vendor)
    }

    /// Inserts (`ident`, `values`) into this set's map; the vendor is not
    /// consulted. See [`CpuidMap::insert`] for the meaning of the result.
    pub fn insert(
        &mut self,
        ident: CpuidIdent,
        values: CpuidValues,
    ) -> CpuidMapInsertResult {
        self.map.insert(ident, values)
    }

    /// Looks up the values stored for `ident` in this set's map. See
    /// [`CpuidMap::get`].
    pub fn get(&self, ident: CpuidIdent) -> Option<&CpuidValues> {
        self.map.get(ident)
    }

    /// Looks up the values stored for `ident` in this set's map, returning a
    /// mutable reference. See [`CpuidMap::get_mut`].
    pub fn get_mut(&mut self, ident: CpuidIdent) -> Option<&mut CpuidValues> {
        self.map.get_mut(ident)
    }

    /// Reports whether `leaf` is present in this set's map, with or without
    /// subleaves. See [`CpuidMap::contains_leaf`].
    pub fn contains_leaf(&self, leaf: u32) -> bool {
        self.map.contains_leaf(leaf)
    }

    /// Removes `leaf` and any subleaves it has from this set's map. See
    /// [`CpuidMap::remove_leaf`].
    pub fn remove_leaf(&mut self, leaf: u32) {
        self.map.remove_leaf(leaf);
    }

    /// Keeps only the entries of this set's map for which `f` returns `true`.
    /// See [`CpuidMap::retain`].
    pub fn retain<F>(&mut self, f: F)
    where
        F: FnMut(CpuidIdent, CpuidValues) -> bool,
    {
        self.map.retain(f);
    }

    /// Returns `true` if this set's map has no entries, regardless of the
    /// vendor. See [`CpuidMap::is_empty`].
    pub fn is_empty(&self) -> bool {
        self.map.is_empty()
    }

    /// Returns an iterator over the leaf/subleaf/value tuples in this set's
    /// map. See [`CpuidMap::iter`].
    pub fn iter(&self) -> CpuidMapIterator<'_> {
        self.map.iter()
    }

    /// Compares `self` with `other` and returns `Ok` if they are equivalent.
    ///
    /// The checks run in a fixed order and the first failing check decides
    /// the returned [`CpuidSetMismatch`]:
    ///
    /// 1. the vendors must match;
    /// 2. both sets must contain exactly the same leaf/subleaf identifiers;
    /// 3. every identifier must map to the same values in both sets.
    pub fn is_equivalent_to(
        &self,
        other: &Self,
    ) -> Result<(), CpuidSetMismatch> {
        if self.vendor != other.vendor {
            return Err(CpuidSetMismatch::Vendor {
                this: self.vendor,
                other: other.vendor,
            });
        }

        // Gather each side's identifiers and report any that appear on only
        // one side.
        let idents_of = |set: &Self| -> BTreeSet<CpuidIdent> {
            set.map.iter().map(|(ident, _)| ident).collect()
        };
        let unshared: Vec<CpuidIdent> = idents_of(self)
            .symmetric_difference(&idents_of(other))
            .copied()
            .collect();

        if !unshared.is_empty() {
            return Err(CpuidSetMismatch::LeafSet(unshared));
        }

        // The identifiers agree, so every lookup in `other` must succeed;
        // collect every identifier whose values differ.
        let mismatches: Vec<_> = self
            .map
            .iter()
            .filter_map(|(ident, ours)| {
                let theirs = *other
                    .map
                    .get(ident)
                    .expect("key sets were already found to be equal");

                (ours != theirs).then_some((ident, ours, theirs))
            })
            .collect();

        if mismatches.is_empty() {
            Ok(())
        } else {
            Err(CpuidSetMismatch::Values(mismatches))
        }
    }
}

impl From<CpuidSet> for Vec<bhyve_api::vcpu_cpuid_entry> {
    /// Converts every leaf/subleaf/value tuple in the set into a
    /// `vcpu_cpuid_entry`, in the set's iteration order.
    ///
    /// Entries that name a subleaf carry that subleaf as `vce_index` and set
    /// `VCE_FLAG_MATCH_INDEX`; entries without a subleaf use index 0 and no
    /// flags.
    fn from(value: CpuidSet) -> Self {
        let mut entries = Vec::with_capacity(value.map.len());

        for (ident, regs) in value.map.iter() {
            let (vce_index, vce_flags) = match ident.subleaf {
                Some(index) => (index, bhyve_api::VCE_FLAG_MATCH_INDEX),
                None => (0, 0),
            };

            entries.push(bhyve_api::vcpu_cpuid_entry {
                vce_function: ident.leaf,
                vce_index,
                vce_flags,
                vce_eax: regs.eax,
                vce_ebx: regs.ebx,
                vce_ecx: regs.ecx,
                vce_edx: regs.edx,
                ..Default::default()
            });
        }

        entries
    }
}

/// An error that can occur when converting a list of CPUID entries in an
/// instance spec into a [`CpuidMap`].
#[derive(Debug, Error)]
pub enum CpuidMapConversionError {
    /// The same leaf/subleaf pair appeared more than once. The payload is the
    /// leaf number and the optional subleaf index.
    #[error("duplicate leaf and subleaf ({0:x}, {1:?})")]
    DuplicateLeaf(u32, Option<u32>),

    /// The leaf number (the payload) is in neither the standard nor the
    /// extended leaf range.
    #[error("leaf {0:x} not in standard or extended range")]
    LeafOutOfRange(u32),

    /// Inserting an entry would have given one leaf both a no-subleaf value
    /// and per-subleaf values. Wraps the underlying [`CpuidMapInsertError`].
    #[error(transparent)]
    SubleafConflict(#[from] CpuidMapInsertError),
}

/// The range of standard, architecturally-defined CPUID leaves: `0` through
/// `0xFFFF`, inclusive.
pub const STANDARD_LEAVES: RangeInclusive<u32> = 0..=0xFFFF;

/// The range of extended CPUID leaves: `0x8000_0000` through `0x8000_FFFF`,
/// inclusive. The meanings of these leaves are CPU vendor-specific.
pub const EXTENDED_LEAVES: RangeInclusive<u32> = 0x8000_0000..=0x8000_FFFF;

#[cfg(test)]
mod test {
    use proptest::prelude::*;

    use super::*;

    /// A leaf stored without subleaves must reject a later subleaf insert.
    #[test]
    fn insert_leaf_then_subleaf_fails() {
        let mut map = CpuidMap::default();

        let first = map.insert(CpuidIdent::leaf(0), CpuidValues::default());
        assert_eq!(first.unwrap(), None);

        let second =
            map.insert(CpuidIdent::subleaf(0, 0), CpuidValues::default());
        assert!(second.is_err());
    }

    /// A leaf stored with subleaves must reject a later no-subleaf insert.
    #[test]
    fn insert_subleaf_then_leaf_fails() {
        let mut map = CpuidMap::default();

        let first =
            map.insert(CpuidIdent::subleaf(0, 0), CpuidValues::default());
        assert_eq!(first.unwrap(), None);

        let second = map.insert(CpuidIdent::leaf(0), CpuidValues::default());
        assert!(second.is_err());
    }

    /// Removing a no-subleaf entry frees its leaf to take subleaf entries.
    #[test]
    fn insert_leaf_then_remove_then_insert_subleaf() {
        let whole_leaf = CpuidIdent::leaf(0);
        let one_subleaf = CpuidIdent::subleaf(0, 1);
        let values = CpuidValues { eax: 1, ebx: 2, ecx: 3, edx: 4 };

        let mut map = CpuidMap::default();
        assert_eq!(map.insert(whole_leaf, values).unwrap(), None);

        // The subleaf insert conflicts until the no-subleaf entry is gone.
        assert!(map.insert(one_subleaf, values).is_err());
        assert_eq!(map.remove(whole_leaf), Some(values));

        let retried = map.insert(one_subleaf, CpuidValues::default());
        assert_eq!(retried.unwrap(), None);
    }

    /// Several distinct subleaves can be added to one leaf, each as a new
    /// entry.
    #[test]
    fn insert_multiple_subleaves() {
        let values = CpuidValues { eax: 1, ebx: 2, ecx: 3, edx: 4 };
        let mut map = CpuidMap::default();

        for subleaf in 0..10 {
            let previous =
                map.insert(CpuidIdent::subleaf(0, subleaf), values).unwrap();
            assert_eq!(previous, None);
        }
    }

    /// Exercises every removal path (`remove`, `remove_leaf`, and `retain`)
    /// on a map holding both subleaf-free and subleaf-bearing leaves, checking
    /// `len` after each step.
    #[test]
    fn leaf_subleaf_removal() {
        let mut map = CpuidMap::default();

        // Add some leaves (0-2) with no subleaves.
        for leaf in 0..3 {
            assert_eq!(
                map.insert(CpuidIdent::leaf(leaf), CpuidValues::default())
                    .unwrap(),
                None
            );
        }

        // Add some leaves (3-5) with subleaves.
        for leaf in 3..6 {
            for subleaf in 6..9 {
                assert_eq!(
                    map.insert(
                        CpuidIdent::subleaf(leaf, subleaf),
                        CpuidValues {
                            eax: leaf,
                            ebx: subleaf,
                            ..Default::default()
                        }
                    )
                    .unwrap(),
                    None
                );
            }
        }

        // One entry for each of leaves 0-2 and three entries each for 3-5.
        let mut len = map.len();
        assert_eq!(len, 3 + (3 * 3));

        // Manually remove all of leaf 5's subleaves.
        for subleaf in 6..9 {
            assert_eq!(
                map.remove(CpuidIdent::subleaf(5, subleaf)).unwrap(),
                CpuidValues { eax: 5, ebx: subleaf, ..Default::default() }
            );

            len -= 1;
            assert_eq!(map.len(), len);
        }

        // Leaf 5 should no longer be in the map in any of its guises.
        assert_eq!(map.get(CpuidIdent::leaf(5)), None);
        for subleaf in 6..9 {
            assert_eq!(map.get(CpuidIdent::subleaf(5, subleaf)), None);
        }

        // Removing leaves 3 and 4 without specifying their subleaves should
        // fail.
        assert_eq!(map.remove(CpuidIdent::leaf(3)), None);
        assert_eq!(map.remove(CpuidIdent::leaf(4)), None);
        assert_eq!(map.len(), len);

        // Remove leaf 3 and its subleaves via `remove_leaf`.
        map.remove_leaf(3);
        len -= 3;
        assert_eq!(map.len(), len);

        // Remove leaf 4 via `retain`.
        map.retain(|id, _val| id.leaf != 4);
        len -= 3;
        assert_eq!(map.len(), len);

        // Removing leaf 0, subleaf 0 should fail.
        assert_eq!(map.remove(CpuidIdent::subleaf(0, 0)), None);

        // Remove leaves 0-2 by their IDs.
        for leaf in 0..3 {
            assert!(map.remove(CpuidIdent::leaf(leaf)).is_some());
            len -= 1;
            assert_eq!(map.len(), len);
        }

        assert_eq!(len, 0);
        assert!(map.is_empty());
    }

    /// A randomly generated input for the map iteration property test.
    #[derive(Debug)]
    enum MapEntry {
        /// A leaf number and the single value to store for it with no
        /// subleaf.
        Leaf(u32, CpuidValues),
        /// A leaf number and a list of (subleaf index, value) pairs to store
        /// for it.
        Subleaves(u32, Vec<(u32, CpuidValues)>),
    }

    /// Produces a random CPUID leaf entry. Each entry has a leaf number in
    /// [0..8) and one of the following values:
    ///
    /// - One of sixteen [`CpuidValues`] values (each register is 0 or 1)
    /// - One subleaf with index [0..5) and one of two values
    /// - Two such subleaves
    /// - Three such subleaves
    ///
    /// All of the subleaves in a single entry share the same value, and the
    /// generated subleaf indices may repeat. Counting each ordered list of
    /// indices separately, there are (16 + 10 + 50 + 250) = 326 possible
    /// values for each leaf, for a total of 2,608 possible leaf entries.
    fn map_entry_strategy() -> impl Strategy<Value = MapEntry> {
        const MAX_LEAF: u32 = 8;
        prop_oneof![
            // A leaf with no subleaves and a value built from four 0/1
            // registers.
            (0..MAX_LEAF, prop::array::uniform4(0..2u32)).prop_map(
                |(leaf, value_arr)| { MapEntry::Leaf(leaf, value_arr.into()) }
            ),
            // A leaf with one to three subleaves; the boolean picks which of
            // the two candidate values every subleaf receives.
            (
                0..MAX_LEAF,
                prop::collection::vec(0..5u32, 1..=3),
                proptest::bool::ANY
            )
                .prop_map(|(leaf, subleaves, set_value)| {
                    let value = if set_value {
                        CpuidValues { eax: 1, ebx: 2, ecx: 3, edx: 4 }
                    } else {
                        CpuidValues::default()
                    };
                    let subleaves = subleaves
                        .into_iter()
                        .map(|subleaf| (subleaf, value))
                        .collect();
                    MapEntry::Subleaves(leaf, subleaves)
                })
        ]
    }

    proptest! {
        /// Verifies that a [`CpuidMapIterator`] visits all of the leaf and
        /// subleaf entries in a map and does so in the expected order.
        ///
        /// proptest will generate a set of 3-8 leaves for each test according
        /// to the strategy defined in [`map_entry_strategy`].
        #[test]
        fn map_iteration_order(
            entries in prop::collection::vec(map_entry_strategy(), 3..=8)
        ) {
            let mut map = CpuidMap::default();
            let mut _expected_len = 0;

            // Insert all of the entries into the map. The input array may have
            // some duplicates and may assign both a no-subleaf and a subleaf
            // value to a single leaf; ignore all of the resulting errors and
            // substitutions.
            //
            // Only `Ok(None)` results (brand-new entries) count toward the
            // expected length.
            for entry in entries {
                match entry {
                    MapEntry::Leaf(leaf, values) => {
                        if let Ok(None) = map.insert(
                            CpuidIdent::leaf(leaf),
                            values
                        ) {
                            _expected_len += 1;
                        }
                    }
                    MapEntry::Subleaves(leaf, subleaves) => {
                        for (subleaf, values) in subleaves {
                            if let Ok(None) = map.insert(
                                CpuidIdent::subleaf(leaf, subleaf),
                                values
                            ) {
                                _expected_len += 1;
                            }
                        }
                    }
                }
            }

            assert_eq!(map.len(), _expected_len);

            // The iterator should visit leaves in order and return subleaves
            // before the next leaf. This happens to be the ordering provided by
            // `CpuidIdent`'s `Ord` implementation, so it suffices just to
            // compare identifiers directly.
            let mut _observed_len = 0;
            let output: Vec<(CpuidIdent, CpuidValues)> = map.iter().collect();
            for (first, second) in
                output.as_slice().windows(2).map(|sl| (sl[0].0, sl[1].0)) {
                assert!(first < second, "first: {first:?}, second: {second:?}");
                _observed_len += 1;
            }

            // The `windows(2)` iterator will not count the last entry (it has
            // no successor), so the actual observed length is one more than the
            // number of observed iterations. (Note that by construction the map
            // is not empty, so there is always a last entry.)
            assert_eq!(_observed_len + 1, _expected_len);
        }
    }
}


================================================
FILE: crates/dladm/Cargo.toml
================================================
[package]
name = "dladm"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
test = false
doctest = false

[dependencies]
libc.workspace = true
strum = { workspace = true, features = ["derive"] }


================================================
FILE: crates/dladm/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::ffi::CString;
use std::io::{BufRead, BufReader, Error, ErrorKind, Result};
use std::process::{Command, Stdio};
use std::slice;

#[allow(non_camel_case_types)]
mod sys;

use libc::c_void;
use sys::{datalink_class, dladm_handle_t, dladm_status};

/// An open libdladm handle. The handle is closed when this value is dropped.
pub struct Handle {
    /// The raw handle obtained from `dladm_open`.
    inner: dladm_handle_t,
}
impl Handle {
    /// Opens a new libdladm handle.
    ///
    /// # Errors
    ///
    /// Returns an error if `dladm_open` reports any status other than
    /// `DLADM_STATUS_OK`.
    pub fn new() -> Result<Self> {
        let mut inner: dladm_handle_t = std::ptr::null_mut();

        // SAFETY: `inner` is a live local, so the pointer handed to
        // `dladm_open` is valid for writes for the duration of the call.
        let status = unsafe { sys::dladm_open(&mut inner) };
        Self::handle_dladm_err(status)?;

        Ok(Self { inner })
    }

    /// Looks up the datalink called `name` and returns its link ID, MAC
    /// address, and (when it can be determined) MTU.
    ///
    /// Only VNIC and "misc" class links are supported; the latter covers
    /// direct use of XDE/OPTE ports.
    ///
    /// # Errors
    ///
    /// Returns an error if:
    ///
    /// - `name` contains an interior NUL byte (`ErrorKind::InvalidInput`);
    /// - libdladm cannot resolve the name;
    /// - the link is of an unsupported or unrecognized class
    ///   (`ErrorKind::InvalidInput`);
    /// - the link's MAC address cannot be determined.
    ///
    /// Failure to read the MTU is not an error; `mtu` is left as `None`.
    pub fn query_link(&self, name: &str) -> Result<LinkInfo> {
        // A name with an interior NUL cannot be represented as a C string.
        // Report that as bad input instead of panicking on caller data.
        let name_cstr = CString::new(name).map_err(|_| {
            Error::new(
                ErrorKind::InvalidInput,
                format!("link name {name:?} contains a NUL byte"),
            )
        })?;
        let mut link_id: sys::datalink_id_t = 0;
        let mut class: i32 = 0;
        // SAFETY: `name_cstr` is NUL-terminated and outlives the call, and
        // the non-null out-pointers refer to live locals. Null is passed for
        // the outputs this function does not need.
        Self::handle_dladm_err(unsafe {
            sys::dladm_name2info(
                self.inner,
                name_cstr.to_bytes_with_nul().as_ptr(),
                &mut link_id as *mut sys::datalink_id_t,
                std::ptr::null_mut(),
                &mut class,
                std::ptr::null_mut(),
            )
        })?;

        let mut res = LinkInfo { link_id, ..Default::default() };

        match datalink_class::from_repr(class) {
            // acceptable values: this supports both VNICs
            // and direct use of XDE/OPTE ports.
            Some(datalink_class::DATALINK_CLASS_VNIC) => {
                Self::get_vnic_mac(name, &mut res.mac_addr[..])?;
            }
            Some(datalink_class::DATALINK_CLASS_MISC) => {
                self.get_misc_mac(link_id, &mut res.mac_addr[..])?;
            }
            Some(c) => {
                return Err(Error::new(
                    ErrorKind::InvalidInput,
                    format!("{name} is not vnic/misc class, but {c:?}"),
                ));
            }
            None => {
                return Err(Error::new(
                    ErrorKind::InvalidInput,
                    format!("{name} is of invalid class {class:x}"),
                ));
            }
        }

        // The MTU is best-effort: a failed query leaves it unset.
        res.mtu = Self::get_mtu(name).ok();

        Ok(res)
    }
    /// Reads the MTU of the link `name` by running
    /// `dladm show-linkprop -c -o value -p mtu <name>` and parsing the first
    /// line of its output as a `u16`.
    ///
    /// # Errors
    ///
    /// Returns an error if the command cannot be run, exits unsuccessfully,
    /// or prints a first line that is not a valid `u16`.
    fn get_mtu(name: &str) -> Result<u16> {
        // dladm show-linkprop -c -o value -p mtu <NIC_NAME>
        // 1500
        let mut cmd = Command::new("dladm");
        cmd.args(["show-linkprop", "-c", "-o", "value", "-p", "mtu"])
            .arg(name)
            .stderr(Stdio::null())
            .stdin(Stdio::null())
            .stdout(Stdio::piped());

        let output = cmd.output()?;
        if !output.status.success() {
            return Err(Error::other("failed dladm"));
        }

        // Only the first line of output is considered.
        let first_line = BufReader::new(&output.stdout[..]).lines().next();
        match first_line {
            Some(Ok(line)) => line
                .parse::<u16>()
                .map_err(|_| Error::other("invalid mtu")),
            _ => Err(Error::other("invalid mtu")),
        }
    }
    fn get_vnic_mac(name: &str, mac: &mut [u8]) -> Result<()> {
        // dladm show-vnic -p -o macaddress <VNIC_NAME>
        // 2:8:20:2d:e9:24
        let output = Command::new("dladm")
            .args(["show-vnic", "-p", "-o", "macaddress"])
            .arg(name)
            .stderr(Stdio::null())
            .stdin(Stdio::null())
            .stdout(Stdio::piped())
            .output()?;
        if !output.status.success() {
            return Err(Error::other("failed dladm"));
        }
        let addr = BufReader::new(&output.stdout[..])
            .lines()
            .next()
            .and_then(Result::ok)
            .and_then(|line| {
                let fields: Vec<u8> = line
                    .split(':')
                    .filter_map(|f| u8::from_str_radix(f, 16).ok())
                    .collect();
                match fields.len() {
                    ETHERADDRL => Some(fields),
                    _ => None,
                }
            })
            .ok_or_else(|| Error::other("cannot query mac addr"))?;
        mac.copy_from_slice(&addr[..]);
        Ok(())
    }
    fn get_misc_mac(
        &self,
        linkid: sys::datalink_id_t,
        mac: &mut [u8],
    ) -> Result<()> {
        // Unfortunately, XDE/OPTE creates 'misc' type devices, as it is
        // a pseudo device. `dladm` has no built-in commands for these,
        // and macaddr queries for all other link types go through their
        // dedicated `dladm show-<X>` commands. As a consequence, we have
        // to go to libdladm/libdllink directly here.

        // One-off callback function and arg struct.
        // This will use the first seen mac address attached to the link.
        unsafe extern "C" fn per_macaddr(
            arg: *mut c_void,
            macaddr: *mut sys::dladm_macaddr_attr_t,
        ) -> sys::boolean_t {
            let state = &mut *(arg as *mut Arg);
            state.n_seen += 1;

            if (*macaddr).ma_addrlen == (ETHERADDRL as u32) {
                let ma_addr = slice::from_raw_parts(
                    &raw const (*macaddr).ma_addr as *const u8,
                    ETHERADDRL,
                );
                state.mac.copy_from_slice(ma_addr);
                state.written = true;
                sys::boolean_t::B_FALSE
            } else {
                // Keep going.
                sys::boolean_t::B_TRUE
            }
        }

        struct Arg<'a> {
            mac: &'a mut [u8],
            n_seen: usize,
            written: bool,
        }

        let mut state = Arg { mac, n_seen: 0, written: false };

        // SAFETY: dladm_handle_t is known to be valid, and &mut reference
        // to state is only held inside the callback.
        Self::handle_dladm_err(unsafe {
            sys::dladm_walk_macaddr(
                self.inner,
                linkid,
                &mut state as *mut _ as *mut c_void,
                per_macaddr,
            )
        })?;

        if state.n_seen == 0 {
            return Err(Error::other("no mac addrs found on link"));
        } else if !state.written {
            return Err(Error::other(
                "no mac addrs on link had correct length (6B)",
            ));
        }

        Ok(())
    }

    /// Converts a raw libdladm status code into a `Result`.
    ///
    /// `DLADM_STATUS_OK` becomes `Ok(())`. Any other status becomes an error
    /// whose message is the status's `Debug` name; a code that does not
    /// correspond to a known status is reported as `DLADM_STATUS_FAILED`.
    fn handle_dladm_err(v: i32) -> Result<()> {
        let status = dladm_status::from_repr(v)
            .unwrap_or(dladm_status::DLADM_STATUS_FAILED);

        if matches!(status, dladm_status::DLADM_STATUS_OK) {
            Ok(())
        } else {
            Err(Error::other(format!("{status:?}")))
        }
    }
}
impl Drop for Handle {
    /// Closes the libdladm handle and clears the stored pointer.
    fn drop(&mut self) {
        // SAFETY: presumably `inner` is the handle stored by `Handle::new`
        // after a successful `dladm_open`, and it is closed only here.
        // NOTE(review): confirm no other code constructs a `Handle`.
        unsafe { sys::dladm_close(self.inner) }
        // Null the pointer so the closed handle is not left dangling.
        self.inner = std::ptr::null_mut();
    }
}

/// Length, in bytes, of the MAC addresses this crate reads and stores.
const ETHERADDRL: usize = 6;

/// Information about a datalink, as returned by [`Handle::query_link`].
#[derive(Copy, Clone, Default)]
pub struct LinkInfo {
    /// The link's datalink ID as reported by libdladm.
    pub link_id: u32,
    /// The link's MTU, or `None` if it could not be queried.
    pub mtu: Option<u16>,
    /// The link's MAC address.
    pub mac_addr: [u8; ETHERADDRL],
}


================================================
FILE: crates/dladm/src/sys.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use libc::{c_char, c_int, c_uchar, c_uint, c_void};
use strum::FromRepr;

// Raw bindings to the libdladm functions used by this crate. They are only
// declared (and linked against libdladm) on illumos; other targets get the
// panicking stand-ins from the `compat` module below.
//
// All functions here return their status as a raw `c_int`.
// NOTE(review): those values are presumably `dladm_status` codes; confirm
// against libdladm.h.
#[cfg(target_os = "illumos")]
#[link(name = "dladm")]
extern "C" {
    /// Open a libdladm handle, storing it through `handle`.
    pub fn dladm_open(handle: *mut dladm_handle_t) -> c_int;
    /// Close a handle previously produced by `dladm_open`.
    pub fn dladm_close(handle: dladm_handle_t);
    /// Look up information about the link named by `link`, writing results
    /// through the out-pointers.
    ///
    /// NOTE(review): `link` is presumably a NUL-terminated C string and the
    /// out-pointers presumably may be null when a value is not wanted; confirm
    /// against libdladm before relying on either.
    pub fn dladm_name2info(
        handle: dladm_handle_t,
        link: *const u8,
        linkidp: *mut datalink_id_t,
        flagp: *mut u32,
        // parse to datalink_class
        classp: *mut c_int,
        mediap: *mut u32,
    ) -> c_int;
    /// Walk the MAC addresses of `linkid`, calling `callback` with `arg` and
    /// a pointer to each address's attributes.
    ///
    /// The caller in this crate returns `B_TRUE` from the callback to keep
    /// walking and `B_FALSE` once it has what it needs.
    pub fn dladm_walk_macaddr(
        handle: dladm_handle_t,
        linkid: datalink_id_t,
        arg: *mut c_void,
        callback: unsafe extern "C" fn(
            *mut c_void,
            *mut dladm_macaddr_attr_t,
        ) -> boolean_t,
    ) -> c_int;
}

// Stand-ins for the libdladm bindings on non-illumos targets. They carry the
// same names and signatures as the real `extern "C"` declarations so the
// crate still compiles, but every one of them panics if actually called.
#[cfg(not(target_os = "illumos"))]
mod compat {
    // The parameters exist only to mirror the real signatures.
    #![allow(unused)]
    use super::*;

    /// Stand-in for `dladm_open`; always panics.
    pub unsafe extern "C" fn dladm_open(handle: *mut dladm_handle_t) -> c_int {
        panic!("illumos only");
    }
    /// Stand-in for `dladm_close`; always panics.
    pub unsafe extern "C" fn dladm_close(handle: dladm_handle_t) {
        panic!("illumos only");
    }
    /// Stand-in for `dladm_name2info`; always panics.
    pub unsafe extern "C" fn dladm_name2info(
        handle: dladm_handle_t,
        link: *const u8,
        linkidp: *mut datalink_id_t,
        flagp: *mut u32,
        // parse to datalink_class
        classp: *mut c_int,
        mediap: *mut u32,
    ) -> c_int {
        panic!("illumos only");
    }
    /// Stand-in for `dladm_walk_macaddr`; always panics.
    pub unsafe extern "C" fn dladm_walk_macaddr(
        handle: dladm_handle_t,
        linkid: datalink_id_t,
        arg: *mut c_void,
        callback: unsafe extern "C" fn(
            *mut c_void,
            *mut dladm_macaddr_attr_t,
        ) -> boolean_t,
    ) -> c_int {
        panic!("illumos only");
    }
}
#[cfg(not(target_os = "illumos"))]
pub use compat::*;

/// Opaque dladm handle passed to libdladm functions.
///
/// Declared as an empty enum so it cannot be constructed from Rust; it is
/// only ever used behind the `dladm_handle_t` raw pointer.
pub enum dladm_handle {}
/// Raw pointer to an opaque `dladm_handle`.
pub type dladm_handle_t = *mut dladm_handle;
/// Numeric identifier of a datalink.
pub type datalink_id_t = u32;
/// Attributes of one MAC address, as passed to the `dladm_walk_macaddr`
/// callback.
///
/// `#[repr(C)]`: the field order and types are the C layout and must not be
/// rearranged.
#[repr(C)]
pub struct dladm_macaddr_attr_t {
    // NOTE(review): slot/flag semantics are defined by libdladm and are not
    // visible from this file.
    pub ma_slot: c_uint,
    pub ma_flags: c_uint,
    /// Address bytes, in a buffer of `MAXMACADDRLEN` bytes.
    pub ma_addr: [c_uchar; MAXMACADDRLEN],
    /// Presumably the number of bytes of `ma_addr` that are in use — confirm
    /// against libdladm.
    pub ma_addrlen: c_uint,
    /// Client name, in a buffer of `MAXNAMELEN` bytes.
    pub ma_client_name: [c_char; MAXNAMELEN],
    /// Datalink ID associated with the client.
    pub ma_client_linkid: datalink_id_t,
}
/// C-compatible boolean, returned by the `dladm_walk_macaddr` callback.
#[repr(C)]
pub enum boolean_t {
    B_FALSE = 0,
    B_TRUE = 1,
}

/// Size, in bytes, of the `ma_addr` buffer in `dladm_macaddr_attr_t`.
const MAXMACADDRLEN: usize = 20;
/// Size, in bytes, of the `ma_client_name` buffer in `dladm_macaddr_attr_t`.
const MAXNAMELEN: usize = 256;

/// Datalink classes; the value written through `classp` by
/// `dladm_name2info` is meant to be parsed into this type.
///
/// Each class occupies its own bit. `from_repr` (derived via `FromRepr`) only
/// yields a variant for exactly one of the values listed here.
#[derive(Copy, Clone, Debug, Eq, PartialEq, FromRepr)]
#[repr(i32)]
pub enum datalink_class {
    DATALINK_CLASS_PHYS = 0x01,
    DATALINK_CLASS_VLAN = 0x02,
    DATALINK_CLASS_AGGR = 0x04,
    DATALINK_CLASS_VNIC = 0x08,
    DATALINK_CLASS_ETHERSTUB = 0x10,
    DATALINK_CLASS_SIMNET = 0x20,
    DATALINK_CLASS_BRIDGE = 0x40,
    DATALINK_CLASS_IPTUN = 0x80,
    DATALINK_CLASS_PART = 0x100,
    // NOTE(review): 0x200 has no variant here; confirm against libdladm.h
    // whether a class with that value needs to be representable.
    DATALINK_CLASS_MISC = 0x400,
}

/// Status codes returned by libdladm functions.
///
/// Only the first variant has an explicit value; every later variant takes
/// the next integer in declaration order. The order is therefore part of the
/// ABI: do not insert, remove, or reorder variants without checking the
/// corresponding C enum.
///
/// `from_repr` (derived via `FromRepr`) converts a raw `i32` into a variant and
/// yields `None` for values not listed here.
#[derive(Copy, Clone, Debug, Eq, PartialEq, FromRepr)]
#[repr(i32)]
pub enum dladm_status {
    /// Success; the only status that is not treated as an error.
    DLADM_STATUS_OK = 0,
    DLADM_STATUS_BADARG,
    /// Also used by this crate as the stand-in for unrecognized status codes.
    DLADM_STATUS_FAILED,
    DLADM_STATUS_TOOSMALL,
    DLADM_STATUS_NOTSUP,
    DLADM_STATUS_NOTFOUND,
    DLADM_STATUS_BADVAL,
    DLADM_STATUS_NOMEM,
    DLADM_STATUS_EXIST,
    DLADM_STATUS_LINKINVAL,
    DLADM_STATUS_PROPRDONLY,
    DLADM_STATUS_BADVALCNT,
    DLADM_STATUS_DBNOTFOUND,
    DLADM_STATUS_DENIED,
    DLADM_STATUS_IOERR,
    DLADM_STATUS_TEMPONLY,
    DLADM_STATUS_TIMEDOUT,
    DLADM_STATUS_ISCONN,
    DLADM_STATUS_NOTCONN,
    DLADM_STATUS_REPOSITORYINVAL,
    DLADM_STATUS_MACADDRINVAL,
    DLADM_STATUS_KEYINVAL,
    DLADM_STATUS_INVALIDMACADDRLEN,
    DLADM_STATUS_INVALIDMACADDRTYPE,
    DLADM_STATUS_LINKBUSY,
    DLADM_STATUS_VIDINVAL,
    DLADM_STATUS_NONOTIF,
    DLADM_STATUS_TRYAGAIN,
    DLADM_STATUS_IPTUNTYPE,
    DLADM_STATUS_IPTUNTYPEREQD,
    DLADM_STATUS_BADIPTUNLADDR,
    DLADM_STATUS_BADIPTUNRADDR,
    DLADM_STATUS_ADDRINUSE,
    DLADM_STATUS_BADTIMEVAL,
    DLADM_STATUS_INVALIDMACADDR,
    DLADM_STATUS_INVALIDMACADDRNIC,
    DLADM_STATUS_INVALIDMACADDRINUSE,
    DLADM_STATUS_MACFACTORYSLOTINVALID,
    DLADM_STATUS_MACFACTORYSLOTUSED,
    DLADM_STATUS_MACFACTORYSLOTALLUSED,
    DLADM_STATUS_MACFACTORYNOTSUP,
    DLADM_STATUS_INVALIDMACPREFIX,
    DLADM_STATUS_INVALIDMACPREFIXLEN,
    DLADM_STATUS_BADCPUID,
    DLADM_STATUS_CPUERR,
    DLADM_STATUS_CPUNOTONLINE,
    DLADM_STATUS_BADRANGE,
    DLADM_STATUS_TOOMANYELEMENTS,
    DLADM_STATUS_DB_NOTFOUND,
    DLADM_STATUS_DB_PARSE_ERR,
    DLADM_STATUS_PROP_PARSE_ERR,
    DLADM_STATUS_ATTR_PARSE_ERR,
    DLADM_STATUS_FLOW_DB_ERR,
    DLADM_STATUS_FLOW_DB_OPEN_ERR,
    DLADM_STATUS_FLOW_DB_PARSE_ERR,
    DLADM_STATUS_FLOWPROP_DB_PARSE_ERR,
    DLADM_STATUS_FLOW_ADD_ERR,
    DLADM_STATUS_FLOW_WALK_ERR,
    DLADM_STATUS_FLOW_IDENTICAL,
    DLADM_STATUS_FLOW_INCOMPATIBLE,
    DLADM_STATUS_FLOW_EXISTS,
    DLADM_STATUS_PERSIST_FLOW_EXISTS,
    DLADM_STATUS_INVALID_IP,
    DLADM_STATUS_INVALID_PREFIXLEN,
    DLADM_STATUS_INVALID_PROTOCOL,
    DLADM_STATUS_INVALID_PORT,
    DLADM_STATUS_INVALID_DSF,
    DLADM_STATUS_INVALID_DSFMASK,
    DLADM_STATUS_INVALID_MACMARGIN,
    DLADM_STATUS_NOTDEFINED,
    DLADM_STATUS_BADPROP,
    DLADM_STATUS_MINMAXBW,
    DLADM_STATUS_NO_HWRINGS,
    DLADM_STATUS_PERMONLY,
    DLADM_STATUS_OPTMISSING,
    DLADM_STATUS_POOLCPU,
    DLADM_STATUS_INVALID_PORT_INSTANCE,
    DLADM_STATUS_PORT_IS_DOWN,
    DLADM_STATUS_PKEY_NOT_PRESENT,
    DLADM_STATUS_PARTITION_EXISTS,
    DLADM_STATUS_INVALID_PKEY,
    DLADM_STATUS_NO_IB_HW_RESOURCE,
    DLADM_STATUS_INVALID_PKEY_TBL_SIZE,
    DLADM_STATUS_PORT_NOPROTO,
    DLADM_STATUS_INVALID_MTU,
    DLADM_STATUS_PERSIST_ON_TEMP,
}


================================================
FILE: crates/nvpair/Cargo.toml
================================================
[package]
name = "nvpair"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
doctest = false

[dependencies]
nvpair_sys.workspace = true
libc.workspace = true


================================================
FILE: crates/nvpair/header-check/Cargo.toml
================================================
[package]
name = "nvpair-hdrchk"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"
build = "build.rs"
publish = false

[dependencies]
nvpair_sys = { path = "../sys" }
libc = "0.2"

[build-dependencies]
cc = "1"
ctest2 = "0.4.7"

[[test]]
name = "main"
path = "test/main.rs"
harness = false


================================================
FILE: crates/nvpair/header-check/build.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![deny(warnings)]

/// Configure ctest2 to check the `nvpair_sys` bindings in
/// `../sys/src/lib.rs` against the `libnvpair.h` header, emitting the
/// generated test as `main.rs`.
fn main() {
    let mut generator = ctest2::TestGenerator::new();

    generator.header("libnvpair.h");

    // Spell each Rust type name the way C does: `_t` typedefs are used
    // verbatim, while other structs and unions need their keyword.
    generator.type_name(|ty, is_struct, is_union| {
        if ty.ends_with("_t") {
            ty.to_string()
        } else if is_struct {
            format!("struct {ty}")
        } else if is_union {
            format!("union {ty}")
        } else {
            ty.to_string()
        }
    });

    // Nothing is skipped at present: every constant, struct and field type
    // is checked.
    generator.skip_const(|_name| false);
    generator.skip_struct(|_name| false);
    generator.skip_field_type(|_ty, _field| false);

    generator.generate("../sys/src/lib.rs", "main.rs");
}


================================================
FILE: crates/nvpair/header-check/test/main.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use nvpair_sys::*;

include!(concat!(env!("OUT_DIR"), "/main.rs"));


================================================
FILE: crates/nvpair/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![allow(clippy::new_without_default)]

use nvpair_sys::*;

use std::ffi::CStr;
use std::ptr::NonNull;

/// Owning wrapper around a non-null libnvpair `nvlist_t` pointer.
pub struct NvList(NonNull<nvlist_t>);

impl NvList {
    /// Allocate a new list with `fnvlist_alloc`.
    pub fn new() -> Self {
        // NOTE(review): relies on `fnvlist_alloc` never returning NULL.
        let raw = unsafe { fnvlist_alloc() };
        Self(unsafe { NonNull::new_unchecked(raw) })
    }

    /// Pack the list with `fnvlist_pack`, returning the resulting buffer.
    pub fn pack(&mut self) -> Packed {
        let mut size = 0;
        // NOTE(review): relies on `fnvlist_pack` never returning NULL.
        unsafe {
            let buf = fnvlist_pack(self.0.as_ptr(), &mut size);
            Packed { data: NonNull::new_unchecked(buf.cast()), size }
        }
    }

    /// Unpack a list from the bytes in `buf`.
    ///
    /// See [`NvList::unpack_ptr`] for the error behavior.
    pub fn unpack(buf: &mut [u8]) -> std::io::Result<Self> {
        let (ptr, len) = (buf.as_mut_ptr(), buf.len());
        Self::unpack_ptr(ptr, len)
    }

    /// Unpack a list from `len` bytes starting at `buf` using
    /// `nvlist_unpack`.
    ///
    /// A non-zero return from `nvlist_unpack` is reported as an OS error
    /// with that code.
    pub fn unpack_ptr(buf: *mut u8, len: usize) -> std::io::Result<Self> {
        let mut nvp = std::ptr::null_mut();
        let rc = unsafe { nvlist_unpack(buf.cast(), len, &mut nvp, 0) };
        if rc != 0 {
            return Err(std::io::Error::from_raw_os_error(rc));
        }

        let list = NonNull::new(nvp)
            .expect("nvlist_unpack emits non-NULL pointer on success");
        Ok(Self(list))
    }

    /// Add a name/value pair, converting both arguments with `Into`.
    #[inline(always)]
    pub fn add<'a>(
        &'a mut self,
        name: impl Into<NvName<'a>>,
        value: impl Into<NvData<'a>>,
    ) {
        let (name, value) = (name.into(), value.into());
        self.add_name_value(name, value);
    }

    /// Add `value` under `name`, dispatching to the `fnvlist_add_*` function
    /// that matches the variant of `value`.
    pub fn add_name_value(&mut self, name: NvName, value: NvData) {
        let key = name.as_ptr();
        let list = self.0.as_ptr();

        unsafe {
            match value {
                NvData::Boolean => fnvlist_add_boolean(list, key),
                NvData::BooleanValue(val) => {
                    fnvlist_add_boolean_value(list, key, val.into())
                }
                NvData::Byte(val) => fnvlist_add_byte(list, key, val),
                NvData::Int8(val) => fnvlist_add_int8(list, key, val),
                NvData::UInt8(val) => fnvlist_add_uint8(list, key, val),
                NvData::Int16(val) => fnvlist_add_int16(list, key, val),
                NvData::UInt16(val) => fnvlist_add_uint16(list, key, val),
                NvData::Int32(val) => fnvlist_add_int32(list, key, val),
                NvData::UInt32(val) => fnvlist_add_uint32(list, key, val),
                NvData::Int64(val) => fnvlist_add_int64(list, key, val),
                NvData::UInt64(val) => fnvlist_add_uint64(list, key, val),
                // SAFETY: while this takes a *mut nvlist_t, we are counting
                // on libnvpair to not actually mutate the to-be-added list.
                NvData::NvList(val) => {
                    fnvlist_add_nvlist(list, key, val.0.as_ptr())
                }
                NvData::String(val) => {
                    fnvlist_add_string(list, key, val.as_ptr())
                }
            }
        }
    }
}
impl Drop for NvList {
    /// Release the wrapped list with `fnvlist_free`.
    fn drop(&mut self) {
        let list = self.0.as_ptr();
        // SAFETY: the pointer is non-null by construction and is freed only
        // here.
        unsafe { fnvlist_free(list) }
    }
}

/// Buffer holding the packed form of an nvlist.
///
/// The buffer is handed to `fnvlist_pack_free` when the value is dropped.
pub struct Packed {
    data: NonNull<u8>,
    size: usize,
}
impl Packed {
    /// Read-only pointer to the start of the packed bytes.
    pub fn as_ptr(&self) -> *const u8 {
        self.data.as_ptr() as *const u8
    }
    /// Mutable pointer to the start of the packed bytes.
    pub fn as_mut_ptr(&mut self) -> *mut u8 {
        let start: *mut u8 = self.data.as_ptr();
        start
    }
}
impl AsRef<[u8]> for Packed {
    /// View the packed bytes as a slice of `size` bytes.
    fn as_ref(&self) -> &[u8] {
        let (start, len) = (self.data.as_ptr(), self.size);
        // NOTE(review): relies on `data`/`size` describing the buffer this
        // value was constructed with.
        unsafe { std::slice::from_raw_parts(start, len) }
    }
}
impl Drop for Packed {
    /// Return the buffer to libnvpair.
    fn drop(&mut self) {
        let (buf, len) = (self.data.as_ptr(), self.size);
        unsafe { fnvlist_pack_free(buf.cast(), len) }
    }
}

// Generate a `From` impl that wraps a value in the named `NvData` variant.
//
// Two forms are accepted:
// - `nvdata_from!(&'a T, Variant)`: `impl<'a> From<&'a T> for NvData<'a>`, for
//   variants that borrow their payload;
// - `nvdata_from!(T, Variant)`: `impl From<T> for NvData<'_>`, for variants
//   that hold their payload by value.
macro_rules! nvdata_from {
    (&$l:lifetime $t:ty, $i:ident) => {
        impl<$l> From<& $l $t> for NvData<$l> {
            fn from(value: & $l $t) -> Self {
                Self::$i(value)
            }
        }
    };
    ($t:ty, $i:ident) => {
        impl From<$t> for NvData<'_> {
            fn from(value: $t) -> Self {
                Self::$i(value)
            }
        }
    };
}

/// A value that can be added to an `NvList`.
///
/// Each variant is added with the `fnvlist_add_*` function of the matching
/// type (see `NvList::add_name_value`).
pub enum NvData<'a> {
    /// A value-less boolean entry (`fnvlist_add_boolean`).
    Boolean,
    /// A boolean carrying a value (`fnvlist_add_boolean_value`).
    BooleanValue(bool),
    /// A byte (`fnvlist_add_byte`). There is no `From<u8>` for this variant;
    /// `u8` converts to `UInt8` instead.
    Byte(u8),
    Int8(i8),
    UInt8(u8),
    Int16(i16),
    UInt16(u16),
    Int32(i32),
    UInt32(u32),
    Int64(i64),
    UInt64(u64),
    /// A borrowed nested list (`fnvlist_add_nvlist`).
    NvList(&'a NvList),
    /// A borrowed C string (`fnvlist_add_string`).
    String(&'a CStr),
}

// `From` conversions into `NvData`. `Boolean`, `Byte` and `NvList` have no
// conversion and must be constructed explicitly.
nvdata_from!(bool, BooleanValue);
nvdata_from!(i8, Int8);
nvdata_from!(u8, UInt8);
nvdata_from!(i16, Int16);
nvdata_from!(u16, UInt16);
nvdata_from!(i32, Int32);
nvdata_from!(u32, UInt32);
nvdata_from!(i64, Int64);
nvdata_from!(u64, UInt64);
nvdata_from!(&'a CStr, String);

/// Name of an nvpair, held as bytes that can be handed to libnvpair as a C
/// string.
///
/// Invariant: in both variants the bytes end with the NUL terminator. Every
/// conversion into `NvName` upholds this, so `as_ptr()` always points at a
/// properly terminated string, including after `clone()`.
pub enum NvName<'a> {
    /// Name bytes (with trailing NUL) owned by this value.
    Owned(Vec<u8>),
    /// Name bytes (with trailing NUL) borrowed from the caller.
    Loaned(&'a [u8]),
}
impl NvName<'_> {
    /// Pointer to the first byte of the NUL-terminated name.
    pub fn as_ptr(&self) -> *const i8 {
        match self {
            NvName::Owned(v) => v.as_ptr().cast(),
            NvName::Loaned(s) => s.as_ptr().cast(),
        }
    }
}
impl AsRef<[u8]> for NvName<'_> {
    /// The name bytes, including the trailing NUL.
    fn as_ref(&self) -> &[u8] {
        match self {
            NvName::Owned(b) => b.as_slice(),
            NvName::Loaned(s) => s,
        }
    }
}
impl Clone for NvName<'_> {
    /// Copy the name. The result is always `Owned`, even when cloning a
    /// `Loaned` name, so the copied bytes must already include the NUL.
    fn clone(&self) -> Self {
        match self {
            NvName::Owned(v) => NvName::Owned(v.clone()),
            NvName::Loaned(s) => NvName::Owned(s.to_vec()),
        }
    }
}
impl<'a> From<&'a str> for NvName<'a> {
    /// Build a name from a string slice.
    ///
    /// If the string already contains a NUL byte, the name borrows the bytes
    /// up to and including the first NUL. Otherwise the bytes are copied and
    /// a NUL terminator is appended.
    fn from(value: &'a str) -> Self {
        let bytes = value.as_bytes();
        match bytes.iter().position(|&b| b == 0) {
            Some(nul_idx) => Self::Loaned(&bytes[..=nul_idx]),
            None => {
                let mut copy = Vec::with_capacity(bytes.len() + 1);
                copy.extend_from_slice(bytes);
                copy.push(0);
                Self::Owned(copy)
            }
        }
    }
}
impl<'a> From<&'a CStr> for NvName<'a> {
    /// Borrow the bytes of a C string, including its NUL terminator.
    fn from(value: &'a CStr) -> Self {
        // `to_bytes()` would drop the terminator. A later `clone()` would
        // then copy the bytes into an unterminated `Owned` buffer, and C
        // would read past its end.
        Self::Loaned(value.to_bytes_with_nul())
    }
}


================================================
FILE: crates/nvpair/sys/Cargo.toml
================================================
[package]
name = "nvpair_sys"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
test = false
doctest = false

[dependencies]
libc.workspace = true


================================================
FILE: crates/nvpair/sys/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![allow(non_camel_case_types, non_snake_case)]

use std::ffi::{c_char, c_int};

use libc::size_t;

/// Type tags for nvpair values, as accepted by e.g. `nvlist_remove`.
///
/// Only the first two variants have explicit values; each later variant
/// takes the next integer in declaration order, so the order is part of the
/// ABI and must match the C enum checked by the header-check crate.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
#[repr(i32)]
pub enum data_type_t {
    DATA_TYPE_DONTCARE = -1,
    DATA_TYPE_UNKNOWN = 0,
    DATA_TYPE_BOOLEAN,
    DATA_TYPE_BYTE,
    DATA_TYPE_INT16,
    DATA_TYPE_UINT16,
    DATA_TYPE_INT32,
    DATA_TYPE_UINT32,
    DATA_TYPE_INT64,
    DATA_TYPE_UINT64,
    DATA_TYPE_STRING,
    DATA_TYPE_BYTE_ARRAY,
    DATA_TYPE_INT16_ARRAY,
    DATA_TYPE_UINT16_ARRAY,
    DATA_TYPE_INT32_ARRAY,
    DATA_TYPE_UINT32_ARRAY,
    DATA_TYPE_INT64_ARRAY,
    DATA_TYPE_UINT64_ARRAY,
    DATA_TYPE_STRING_ARRAY,
    DATA_TYPE_HRTIME,
    DATA_TYPE_NVLIST,
    DATA_TYPE_NVLIST_ARRAY,
    DATA_TYPE_BOOLEAN_VALUE,
    DATA_TYPE_INT8,
    DATA_TYPE_UINT8,
    DATA_TYPE_BOOLEAN_ARRAY,
    DATA_TYPE_INT8_ARRAY,
    DATA_TYPE_UINT8_ARRAY,
}

/// Fixed-size header of an nvpair.
///
/// The accessors below locate the name immediately after this header and the
/// value at the next `NV_ALIGN` boundary after the name.
#[derive(Copy, Clone, Debug)]
#[repr(C)]
pub struct nvpair_t {
    pub nvp_size: i32,
    pub nvp_name_sz: i16,
    pub nvp_reserve: i16,
    pub nvp_value_elem: i32,
    pub nvp_type: i32,
}
impl nvpair_t {
    /// Locate the name string of an nvpair: it begins directly after the
    /// `nvpair_t` header.
    ///
    /// # Safety
    /// The `nvpair_t` pointer must be allocated from libnvpair to ensure
    /// expected positioning of name data.
    pub const unsafe fn NVP_NAME(nvp: *mut nvpair_t) -> *mut c_char {
        let past_header = nvp.add(1);
        past_header.cast()
    }
    /// Locate the value of an nvpair: it begins at the first aligned address
    /// at or after the end of the name (header end plus `nvp_name_sz`).
    ///
    /// # Safety
    /// The `nvpair_t` pointer must be allocated from libnvpair to ensure
    /// expected positioning of value data.
    pub unsafe fn NVP_VALUE(nvp: *mut nvpair_t) -> *mut c_char {
        let name_len = (*nvp).nvp_name_sz as usize;
        let name_start = nvp.add(1) as usize;

        NV_ALIGN(name_start + name_len) as *mut c_char
    }
}

/// Header of an nvlist, as exposed by libnvpair.
///
/// `#[repr(C)]`: the field order and types are the C layout and must not be
/// rearranged. Lists are created, modified and freed through the libnvpair
/// functions declared below rather than by touching these fields.
#[derive(Copy, Clone, Debug)]
#[repr(C)]
pub struct nvlist_t {
    pub nvl_version: i32,
    pub nvl_nvflag: u32,
    pub nvl_priv: u64,
    pub nvl_flag: u32,
    pub nvl_pad: i32,
}

// Constants mirrored from libnvpair's header.
// NOTE(review): the groupings below are inferred from the names; confirm the
// exact meaning of each value against the header before relying on it.

// nvlist version.
pub const NV_VERSION: i32 = 0;

// Encoding selectors.
pub const NV_ENCODE_NATIVE: u32 = 0;
pub const NV_ENCODE_XDR: u32 = 1;

// Name-uniqueness flags.
pub const NV_UNIQUE_NAME: u32 = 0x1;
pub const NV_UNIQUE_NAME_TYPE: u32 = 0x2;

// Lookup flag.
pub const NV_FLAG_NOENTOK: u32 = 0x1;

/// Round `addr` up to the next multiple of 8 (values already on an 8-byte
/// boundary are returned unchanged).
pub const fn NV_ALIGN(addr: usize) -> usize {
    const MASK: usize = 7;
    (addr + MASK) & !MASK
}

/// C-compatible boolean used by the libnvpair bindings.
#[repr(C)]
pub enum boolean_t {
    B_FALSE = 0,
    B_TRUE = 1,
}
impl From<bool> for boolean_t {
    /// `false` becomes `B_FALSE`, `true` becomes `B_TRUE`.
    fn from(value: bool) -> Self {
        if value {
            boolean_t::B_TRUE
        } else {
            boolean_t::B_FALSE
        }
    }
}
impl From<boolean_t> for bool {
    /// `B_FALSE` becomes `false`, `B_TRUE` becomes `true`.
    fn from(value: boolean_t) -> Self {
        // Exhaustive on purpose: with no catch-all arm, a new variant cannot
        // be silently mapped to the wrong value.
        match value {
            boolean_t::B_FALSE => false,
            boolean_t::B_TRUE => true,
        }
    }
}

// Raw bindings to libnvpair. The `link` attribute is only applied on illumos;
// on other targets the declarations exist but nothing is linked in for them.
//
// NOTE(review): the comments below group the functions by their names; see
// the libnvpair documentation for the precise contract of each.
#[cfg_attr(target_os = "illumos", link(name = "nvpair"))]
extern "C" {
    // Removal and lookup. These return a `c_int` status.
    pub fn nvlist_remove(
        nvl: *mut nvlist_t,
        name: *const c_char,
        dtype: data_type_t,
    ) -> c_int;
    pub fn nvlist_remove_all(nvl: *mut nvlist_t, name: *const c_char) -> c_int;
    pub fn nvlist_remove_nvpair(
        nvl: *mut nvlist_t,
        nvp: *mut nvpair_t,
    ) -> c_int;
    pub fn nvlist_lookup_nvpair(
        nvl: *mut nvlist_t,
        name: *const c_char,
        nvp: *mut *mut nvpair_t,
    ) -> c_int;

    // Iteration over the pairs of a list.
    pub fn nvlist_next_nvpair(
        nvl: *mut nvlist_t,
        nvp: *mut nvpair_t,
    ) -> *mut nvpair_t;
    pub fn nvlist_prev_nvpair(
        nvl: *mut nvlist_t,
        nvp: *mut nvpair_t,
    ) -> *mut nvpair_t;

    // Predicates.
    pub fn nvlist_exists(nvl: *mut nvlist_t, nvp: *const c_char) -> boolean_t;
    pub fn nvlist_empty(nvl: *mut nvlist_t) -> boolean_t;

    // Unpack a list from `size` bytes at `buf`, storing it through `nvlp`.
    // Returns a `c_int` status; the safe wrapper treats 0 as success.
    pub fn nvlist_unpack(
        buf: *mut c_char,
        size: size_t,
        nvlp: *mut *mut nvlist_t,
        flags: c_int,
    ) -> c_int;

    // The `fnvlist_*` family. None of these return a status code.
    pub fn fnvlist_alloc() -> *mut nvlist_t;
    pub fn fnvlist_free(nvl: *mut nvlist_t);
    pub fn fnvlist_size(nvl: *mut nvlist_t) -> size_t;
    pub fn fnvlist_pack(nvl: *mut nvlist_t, sizep: *mut size_t) -> *mut c_char;
    pub fn fnvlist_pack_free(packed: *mut c_char, size: size_t);
    pub fn fnvlist_unpack(buf: *mut c_char, size: size_t) -> *mut nvlist_t;
    pub fn fnvlist_dup(nvl: *mut nvlist_t) -> *mut nvlist_t;
    pub fn fnvlist_merge(dst_nvl: *mut nvlist_t, src_nvl: *mut nvlist_t);
    pub fn fnvlist_num_pairs(nvl: *mut nvlist_t) -> size_t;

    // Typed adders: one per supported scalar, string and nested-list type.
    pub fn fnvlist_add_boolean(nvl: *mut nvlist_t, name: *const c_char);
    pub fn fnvlist_add_boolean_value(
        nvl: *mut nvlist_t,
        name: *const c_char,
        val: boolean_t,
    );
    pub fn fnvlist_add_byte(nvl: *mut nvlist_t, name: *const c_char, val: u8);
    pub fn fnvlist_add_int8(nvl: *mut nvlist_t, name: *const c_char, val: i8);
    pub fn fnvlist_add_uint8(nvl: *mut nvlist_t, name: *const c_char, val: u8);
    pub fn fnvlist_add_int16(nvl: *mut nvlist_t, name: *const c_char, val: i16);
    pub fn fnvlist_add_uint16(
        nvl: *mut nvlist_t,
        name: *const c_char,
        val: u16,
    );
    pub fn fnvlist_add_int32(nvl: *mut nvlist_t, name: *const c_char, val: i32);
    pub fn fnvlist_add_uint32(
        nvl: *mut nvlist_t,
        name: *const c_char,
        val: u32,
    );
    pub fn fnvlist_add_int64(nvl: *mut nvlist_t, name: *const c_char, val: i64);
    pub fn fnvlist_add_uint64(
        nvl: *mut nvlist_t,
        name: *const c_char,
        val: u64,
    );
    pub fn fnvlist_add_string(
        nvl: *mut nvlist_t,
        name: *const c_char,
        val: *const c_char,
    );
    pub fn fnvlist_add_nvlist(
        nvl: *mut nvlist_t,
        name: *const c_char,
        val: *mut nvlist_t,
    );
    pub fn fnvlist_add_nvpair(nvl: *mut nvlist_t, val: *mut nvpair_t);
    // TODO: add_*_array functions
}


================================================
FILE: crates/pbind/Cargo.toml
================================================
[package]
name = "pbind"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[dependencies]
libc.workspace = true


================================================
FILE: crates/pbind/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// C-style type names follow, opt out of warnings for using names from headers.
#![allow(non_camel_case_types)]

//! Utility functions for binding LWPs to specific CPUs.
//!
//! This is generally a very light wrapper for illumos' `sysconf(3c)` and
//! `processor_bind(2)`, plus a few constants out of related headers.

use std::io::Error;

// From `<sys/types.h>`
/// ABI type of the `id` argument to `processor_bind`.
pub type id_t = i32;

// From `<sys/processor.h>`
/// ABI type of a processor ID.
pub type processorid_t = i32;

// From `<sys/procset.h>`
/// ABI type of the `idtype` argument to `processor_bind`; see [`IdType`] for
/// the values it can take.
pub type idtype_t = i32;

/// The values an `idtype_t` can take.
///
/// `idtype_t` itself stays the plain ABI integer; this enum names its values
/// and is `repr(i32)` so that a variant can be cast directly to `idtype_t`.
/// The discriminants are spelled out because they are part of the ABI.
#[allow(non_camel_case_types)]
#[repr(i32)]
pub enum IdType {
    P_PID = 0,
    P_PPID = 1,
    P_PGID = 2,
    P_SID = 3,
    P_CID = 4,
    P_UID = 5,
    P_GID = 6,
    P_ALL = 7,
    P_LWPID = 8,
    P_TASKID = 9,
    P_PROJID = 10,
    P_POOLID = 11,
    P_ZONEID = 12,
    P_CTID = 13,
    P_CPUID = 14,
    P_PSETID = 15,
}

/// Number of processors currently online, from
/// `sysconf(_SC_NPROCESSORS_ONLN)`.
///
/// The count is returned as an `i32` to match `processorid_t`, so that
/// `0..online_cpus()` yields processor IDs without further translation.
///
/// # Errors
/// Returns the last OS error if `sysconf` reports `-1`, and an error if the
/// reported count does not fit in an `i32`.
pub fn online_cpus() -> Result<i32, Error> {
    let res = unsafe { libc::sysconf(libc::_SC_NPROCESSORS_ONLN) };

    if res == -1 {
        return Err(Error::last_os_error());
    }

    match i32::try_from(res) {
        Ok(count) => Ok(count),
        // sysconf() reports more than 2^31 processors?!
        Err(_) => Err(Error::other(format!("too many processors: {res}"))),
    }
}

#[cfg(target_os = "illumos")]
/// Bind the calling LWP to processor `bind_cpu` using `processor_bind`.
///
/// # Errors
/// Returns the last OS error if `processor_bind` reports failure.
pub fn bind_lwp(bind_cpu: processorid_t) -> Result<(), Error> {
    // From `<sys/types.h>`.
    const P_MYID: id_t = -1;

    extern "C" {
        fn processor_bind(
            idtype: idtype_t,
            id: id_t,
            processorid: processorid_t,
            obind: *mut processorid_t,
        ) -> i32;
    }

    let idtype = IdType::P_LWPID as idtype_t;
    // A null `obind` is passed: the previous binding is not requested.
    let previous_binding = std::ptr::null_mut();

    let rc =
        unsafe { processor_bind(idtype, P_MYID, bind_cpu, previous_binding) };

    match rc {
        0 => Ok(()),
        _ => Err(Error::last_os_error()),
    }
}

#[cfg(not(target_os = "illumos"))]
/// On non-illumos targets, we're not actually running a VM. We do need the
/// crate to compile to be nicer for blanket `cargo test` invocations on other
/// platforms. So a no-op function will do.
///
/// This variant ignores `_bind_cpu`, performs no binding, and always returns
/// `Ok(())`.
pub fn bind_lwp(_bind_cpu: processorid_t) -> Result<(), Error> {
    Ok(())
}


================================================
FILE: crates/propolis-api-types/Cargo.toml
================================================
[package]
name = "propolis_api_types"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
doctest = false

[dependencies]
crucible-client-types.workspace = true
propolis-api-types-versions.workspace = true


================================================
FILE: crates/propolis-api-types/src/disk.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Disk-related types.

pub use propolis_api_types_versions::latest::disk::*;


================================================
FILE: crates/propolis-api-types/src/instance.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Instance management types.

pub use propolis_api_types_versions::latest::instance::*;


================================================
FILE: crates/propolis-api-types/src/instance_spec/components/backends.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Backend configuration data: the structs that tell Propolis how to configure
//! its components to talk to other services supplied by the host OS or the
//! larger rack.

pub use propolis_api_types_versions::latest::components::backends::*;


================================================
FILE: crates/propolis-api-types/src/instance_spec/components/board.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! VM mainboard components. Every VM has a board, even if it has no other
//! peripherals.

pub use propolis_api_types_versions::latest::components::board::*;


================================================
FILE: crates/propolis-api-types/src/instance_spec/components/devices.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Device configuration data: components that define VM properties that are
//! visible to a VM's guest software.

pub use propolis_api_types_versions::latest::components::devices::*;


================================================
FILE: crates/propolis-api-types/src/instance_spec/components/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Specifications for components that can be attached to a Propolis VM.
//!
//! # Versioning and compatibility
//!
//! Components are 'versionless' and can be added to any specification of any
//! format. Existing components must only change in backward-compatible ways
//! (i.e. so that old versions of the component can deserialize into an
//! equivalent new-version component). If possible, changes to a new component
//! should be expressed such that older versions of the component are forward-
//! compatible with the new version (i.e. such that the new component will
//! serialize, if possible, into a form that can be deserialized by an old
//! version of this library into an equivalent old-version component).

pub mod backends;
pub mod board;
pub mod devices;


================================================
FILE: crates/propolis-api-types/src/instance_spec/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Instance specifications: abstract descriptions of a VM's devices and config.
//!
//! An instance spec describes a VM's virtual devices, backends, and other
//! guest environment configuration supplied by the Propolis VMM. RFD 283
//! contains more details about how specs are used throughout the Oxide stack
//! and about the versioning considerations described below.
//!
//! # Module layout
//!
//! The data types in this module are taxonomized into "components" and
//! versioned "spec structures." Components are the "leaves" of a spec; each
//! component specifies an individual component or piece of functionality that
//! can be added to a VM. The strongly versioned structure types arrange these
//! components in a specific way; each organization is a version of the overall
//! instance spec structure.
//!
//! # Versioning & compatibility
//!
//! **NOTE:** This section is likely out of date with respect to our current
//! versioning scheme. In particular, we have decided to use strongly versioned
//! endpoint types as part of server-side versioning ([RFD
//! 532](https://rfd.shared.oxide.computer/rfd/532)), as opposed to the more
//! fine-grained versioning scheme described below. The implications for
//! Propolis are a bit more nuanced, though, because Propolis instances can talk
//! to each other directly.
//!
//! Instance specs may be sent between Propolises, sled agents, and Nexus
//! processes that use different versions of this library, so the library needs
//! to provide a versioning scheme that allows specs to be extended over time.
//! This scheme must balance safety against developer toil. Strongly versioning
//! data types--requiring a new API endpoint or type definition every time
//! something changes--minimizes the risk that a data structure will be
//! misinterpreted, but is very toilsome to maintain, since changing one
//! structure may require many other structures to be revised and
//! `From`/`TryFrom` impls to be added for all the new version combinations.
//! Weaker versioning schemes require less toil to maintain but run the risk
//! that a spec user will be too permissive and will misconfigure a VM because
//! it missed some important context in a spec that it was passed.
//!
//! This module balances these concerns as follows:
//!
//! - **Components** are versionless but are allowed to be extended in backward-
//!   compatible ways (i.e., such that a spec produced by an old library can be
//!   interpreted correctly by a newer library). Breaking changes to components
//!   are not allowed and require a new component to be defined.
//! - **Spec structures** are strongly versioned. Backward-compatible changes to
//!   an existing version are technically allowed, but completely restructuring
//!   a spec requires a new spec version and a corresponding variant in the
//!   `VersionedInstanceSpec` structure.
//!
//! This scheme assumes that (a) components are likely to be added or changed
//! much more frequently than the spec structure itself will be revised, and (b)
//! most changes to existing components can easily be made backward-compatible
//! (e.g. by wrapping new functionality in an `Option` and taking a `None` value
//! to mean "do what all previous versions did").
//!
//! ## Compatibility rules & breaking changes
//!
//! Changes to existing data types must be backward compatible with older spec
//! versions: a spec produced by an old version of the library must always be
//! deserializable by a new version of the library.
//!
//! The following component changes are not backward compatible:
//!
//! - Adding a new required field to a struct or enum variant
//! - Removing a field from a struct or enum variant
//! - Renaming structs, enums, or their fields or variants
//!
//! Adding new *optional* fields to a struct or enum variant is OK provided that
//! the default value's semantics match the semantics expected by users of older
//! specs that don't provide the optional data.
//!
//! Forward compatibility--writing the library so that old versions can
//! interpret specs generated by new versions--is not generally guaranteed.
//! Where possible, however, spec components should be written so that it is
//! possible to downgrade from a newer spec version to an older one if a
//! component's configuration can be represented in both versions.
//!
//! ## Serde attributes
//!
//! This module doesn't directly verify that a specific Propolis version can
//! support all of the features in any particular specification. However, users
//! can generally expect that if Propolis is willing to deserialize a spec, then
//! it should be able (in at least some circumstances) to support all of the
//! features that can be expressed in that spec. To help guarantee this property
//! (i.e., if Propolis can deserialize it, then it's at least well-formed), this
//! module uses a few common `serde` attributes.
//!
//! Structs and enums in this module should be tagged with the
//! `#[serde(deny_unknown_fields)]` attribute to reduce the risk that old code
//! will silently drop information from a spec produced by newer code with more
//! available fields.
//!
//! New optional fields should use the `#[serde(default)]` field attribute to
//! provide backward compatibility to old specs. They can also use the
//! `#[serde(skip_serializing_if)]` attribute to avoid serializing new fields
//! that have their default values.
//!
//! ### Example
//!
//! As an example, consider a (hypothetical) virtio device that has backend name
//! and PCI path fields:
//!
//! ```ignore
//! use serde::{Serialize, Deserialize};
//! use schemars::JsonSchema;
//! use propolis_types::PciPath;
//!
//! #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
//! #[serde(deny_unknown_fields)]
//! struct VirtioComponent {
//!     backend_name: String,
//!     pci_path: PciPath
//! }
//! ```
//!
//! Suppose Propolis then adds support for configuring the number of virtqueues
//! this device exposes to the guest. This can be expressed compatibly as
//! follows:
//!
//! ```ignore
//! use serde::{Serialize, Deserialize};
//! use schemars::JsonSchema;
//! use propolis_types::PciPath;
//!
//! #[derive(Clone, Debug, Serialize, Deserialize, JsonSchema)]
//! #[serde(deny_unknown_fields)]
//! struct VirtioComponent {
//!     backend_name: String,
//!     pci_path: PciPath,
//!
//!     #[serde(default, skip_serializing_if = "Option::is_none")]
//!     num_virtqueues: Option<usize>
//! }
//! ```
//!
//! Old component specs will continue to deserialize with `num_virtqueues` set
//! to `None`. In this case Propolis ensures that the device gets the default
//! number of virtqueues it had before this configuration option was added. If
//! this spec is serialized again, the `num_virtqueues` option is omitted, so
//! the spec can be deserialized by downlevel versions of the library. Note
//! again that the former behavior (new library accepts old spec) is required,
//! while the latter behavior (old library accepts new spec) is nice to have and
//! may not always be possible to provide (e.g. if the value is `Some`).
//!
//! ## Naming of versioned structures
//!
//! Dropshot's OpenAPI schema generator has a known limitation. If a type or one
//! of its dependent types appears in an API, Dropshot adds to the API's schema
//! an object type with the type's name. If two separate types with the same
//! name but *different module paths* appear in the API, Dropshot chooses one
//! to include and silently ignores the rest. This issue is
//! [dropshot#383](https://github.com/oxidecomputer/dropshot/issues/383). To
//! avoid it, strongly versioned types in this module use a "V#" suffix in their
//! names, even though they may reside in separate versioned modules.

pub mod components;

pub use propolis_api_types_versions::latest::instance_spec::*;


================================================
FILE: crates/propolis-api-types/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Definitions for types exposed by the propolis-server API.
//!
//! This crate re-exports the latest versions of all API types from
//! `propolis-api-types-versions`. For versioned type access, depend
//! on that crate directly.

pub mod disk;
pub mod instance;
pub mod instance_spec;
pub mod migration;
pub mod serial;

// Re-export volume construction requests since they're part of a disk request.
pub use crucible_client_types::VolumeConstructionRequest;


================================================
FILE: crates/propolis-api-types/src/migration.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Migration types.

pub use propolis_api_types_versions::latest::migration::*;


================================================
FILE: crates/propolis-api-types/src/serial.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Serial console types.

pub use propolis_api_types_versions::latest::serial::*;


================================================
FILE: crates/propolis-api-types-versions/Cargo.toml
================================================
[package]
name = "propolis-api-types-versions"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
doctest = false

[dependencies]
crucible-client-types.workspace = true
propolis_types.workspace = true
schemars.workspace = true
serde.workspace = true
thiserror.workspace = true
uuid.workspace = true

[dev-dependencies]
serde_json.workspace = true


================================================
FILE: crates/propolis-api-types-versions/src/add_vsock/api.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! API request and response types for the ADD_VSOCK API version.

use std::{collections::BTreeMap, net::SocketAddr};

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use uuid::Uuid;

use super::instance_spec::InstanceSpec;
use crate::v1::instance::{InstanceProperties, ReplacementComponent};
use crate::v1::instance_spec::SpecKey;
use crate::v2;

// The ways an instance ensure request can initialize a VM in the ADD_VSOCK
// API version.
//
// Serialized in serde's adjacently tagged form: the variant name is stored
// under "method" and the variant's fields under "value".
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
#[serde(tag = "method", content = "value")]
pub enum InstanceInitializationMethod {
    // Initialize the VM from a complete instance spec supplied in the request.
    Spec {
        spec: InstanceSpec,
    },
    // Initialize the VM as the target of a migration.
    //
    // NOTE(review): presumably `src_addr` is the address of the migration
    // source and `replace_components` lists the components to substitute on
    // the target, keyed by their spec keys -- confirm against the server's
    // ensure handler.
    MigrationTarget {
        migration_id: Uuid,
        src_addr: SocketAddr,
        replace_components: BTreeMap<SpecKey, ReplacementComponent>,
    },
}

// The ADD_VSOCK-version instance ensure request: the instance's properties
// paired with the method used to initialize it.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
pub struct InstanceEnsureRequest {
    // Properties of the instance being ensured.
    pub properties: InstanceProperties,
    // How to initialize the instance (see `InstanceInitializationMethod`).
    pub init: InstanceInitializationMethod,
}

impl From<v2::api::InstanceInitializationMethod>
    for InstanceInitializationMethod
{
    fn from(old: v2::api::InstanceInitializationMethod) -> Self {
        match old {
            v2::api::InstanceInitializationMethod::Spec { spec } => {
                Self::Spec { spec: spec.into() }
            }
            v2::api::InstanceInitializationMethod::MigrationTarget {
                migration_id,
                src_addr,
                replace_components,
            } => Self::MigrationTarget {
                migration_id,
                src_addr,
                replace_components,
            },
        }
    }
}

impl From<v2::api::InstanceEnsureRequest> for InstanceEnsureRequest {
    fn from(old: v2::api::InstanceEnsureRequest) -> Self {
        Self { properties: old.properties, init: old.init.into() }
    }
}


================================================
FILE: crates/propolis-api-types-versions/src/add_vsock/components/devices.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

use crate::v1::instance_spec::PciPath;

/// A socket device that presents a virtio-socket interface to the guest.
//
// This component is defined by the ADD_VSOCK API version. Unrecognized fields
// are rejected during deserialization (`deny_unknown_fields`) so that data
// this version does not understand is never silently dropped.
#[derive(
    Clone, Copy, Deserialize, Serialize, Debug, PartialEq, Eq, JsonSchema,
)]
#[serde(deny_unknown_fields)]
pub struct VirtioSocket {
    /// The guest's Context ID.
    pub guest_cid: u64,

    /// The PCI path at which to attach this device.
    pub pci_path: PciPath,
}


================================================
FILE: crates/propolis-api-types-versions/src/add_vsock/components/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

pub mod devices;


================================================
FILE: crates/propolis-api-types-versions/src/add_vsock/instance_spec.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::BTreeMap;

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

use crate::v1::components::backends;
use crate::v1::components::board;
use crate::v1::components::devices as v1_devices;
use crate::v1::instance::{InstanceProperties, InstanceState};
use crate::v1::instance_spec::Component as V1Component;
use crate::v1::instance_spec::SpecKey;
use crate::v2;
use crate::v2::instance_spec::SmbiosType1Input;

pub use super::components::devices::VirtioSocket;

// The kinds of component that can appear in an ADD_VSOCK-version instance
// spec. Every variant except `VirtioSocket` wraps a v1 component type
// unchanged; `VirtioSocket` wraps the device type defined by this version.
//
// Serialized in serde's adjacently tagged form: the snake_case variant name
// is stored under "type" and the wrapped component under "component".
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(
    deny_unknown_fields,
    tag = "type",
    content = "component",
    rename_all = "snake_case"
)]
pub enum Component {
    // Devices.
    VirtioDisk(v1_devices::VirtioDisk),
    NvmeDisk(v1_devices::NvmeDisk),
    VirtioNic(v1_devices::VirtioNic),
    SerialPort(v1_devices::SerialPort),
    PciPciBridge(v1_devices::PciPciBridge),
    QemuPvpanic(v1_devices::QemuPvpanic),
    BootSettings(v1_devices::BootSettings),
    // The only variant without a counterpart in the v1 `Component` enum.
    VirtioSocket(VirtioSocket),
    SoftNpuPciPort(v1_devices::SoftNpuPciPort),
    SoftNpuPort(v1_devices::SoftNpuPort),
    SoftNpuP9(v1_devices::SoftNpuP9),
    P9fs(v1_devices::P9fs),
    MigrationFailureInjector(v1_devices::MigrationFailureInjector),
    // Backends.
    CrucibleStorageBackend(backends::CrucibleStorageBackend),
    FileStorageBackend(backends::FileStorageBackend),
    BlobStorageBackend(backends::BlobStorageBackend),
    VirtioNetworkBackend(backends::VirtioNetworkBackend),
    DlpiNetworkBackend(backends::DlpiNetworkBackend),
}

// The ADD_VSOCK-version description of a VM: its mainboard plus a keyed set
// of components.
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
pub struct InstanceSpec {
    // The VM's mainboard.
    pub board: board::Board,
    // The VM's devices and backends, keyed by spec key.
    pub components: BTreeMap<SpecKey, Component>,
    // NOTE(review): presumably optional inputs for the guest's SMBIOS type 1
    // table -- confirm against the v2 definition of `SmbiosType1Input`.
    pub smbios: Option<SmbiosType1Input>,
}

// Whether an instance's spec is available.
//
// Serialized in serde's adjacently tagged form: the variant name is stored
// under "type" and any payload under "value".
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
#[serde(tag = "type", content = "value")]
pub enum InstanceSpecStatus {
    // NOTE(review): presumably reported by a migration target that has not
    // yet received a spec from its source -- confirm against the server.
    WaitingForMigrationSource,
    // The spec is available and is carried in this variant.
    Present(InstanceSpec),
}

// The ADD_VSOCK-version response describing an instance: its properties, its
// current state, and its spec (if one is available yet).
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
pub struct InstanceSpecGetResponse {
    pub properties: InstanceProperties,
    pub state: InstanceState,
    // Either the instance's spec or a marker that it is not yet present.
    pub spec: InstanceSpecStatus,
}

/// The error produced when a [`Component`] cannot be converted to a v1
/// component because v1 has no equivalent component kind.
///
/// The rejected component is kept inside the error and appears, in its
/// `Debug` form, in the error message.
#[derive(thiserror::Error, Debug)]
#[error("no such v1 component: {0:?}")]
pub struct InvalidV1Component(Component);

impl TryFrom<Component> for V1Component {
    type Error = InvalidV1Component;

    fn try_from(value: Component) -> Result<Self, Self::Error> {
        Ok(match value {
            Component::VirtioDisk(c) => V1Component::VirtioDisk(c),
            Component::NvmeDisk(c) => V1Component::NvmeDisk(c),
            Component::VirtioNic(c) => V1Component::VirtioNic(c),
            Component::SerialPort(c) => V1Component::SerialPort(c),
            Component::PciPciBridge(c) => V1Component::PciPciBridge(c),
            Component::QemuPvpanic(c) => V1Component::QemuPvpanic(c),
            Component::BootSettings(c) => V1Component::BootSettings(c),
            component @ Component::VirtioSocket(_) => {
                return Err(InvalidV1Component(component))
            }
            Component::SoftNpuPciPort(c) => V1Component::SoftNpuPciPort(c),
            Component::SoftNpuPort(c) => V1Component::SoftNpuPort(c),
            Component::SoftNpuP9(c) => V1Component::SoftNpuP9(c),
            Component::P9fs(c) => V1Component::P9fs(c),
            Component::MigrationFailureInjector(c) => {
                V1Component::MigrationFailureInjector(c)
            }
            Component::CrucibleStorageBackend(c) => {
                V1Component::CrucibleStorageBackend(c)
            }
            Component::FileStorageBackend(c) => {
                V1Component::FileStorageBackend(c)
            }
            Component::BlobStorageBackend(c) => {
                V1Component::BlobStorageBackend(c)
            }
            Component::VirtioNetworkBackend(c) => {
                V1Component::VirtioNetworkBackend(c)
            }
            Component::DlpiNetworkBackend(c) => {
                V1Component::DlpiNetworkBackend(c)
            }
        })
    }
}

impl From<InstanceSpec> for v2::instance_spec::InstanceSpec {
    fn from(new: InstanceSpec) -> Self {
        Self {
            board: new.board,
            components: new
                .components
                .into_iter()
                .filter_map(|(k, v)| {
                    V1Component::try_from(v).ok().map(|c| (k, c))
                })
                .collect(),
            smbios: new.smbios,
        }
    }
}

impl From<V1Component> for Component {
    fn from(old: V1Component) -> Self {
        match old {
            V1Component::VirtioDisk(c) => Component::VirtioDisk(c),
            V1Component::NvmeDisk(c) => Component::NvmeDisk(c),
            V1Component::VirtioNic(c) => Component::VirtioNic(c),
            V1Component::SerialPort(c) => Component::SerialPort(c),
            V1Component::PciPciBridge(c) => Component::PciPciBridge(c),
            V1Component::QemuPvpanic(c) => Component::QemuPvpanic(c),
            V1Component::BootSettings(c) => Component::BootSettings(c),
            V1Component::SoftNpuPciPort(c) => Component::SoftNpuPciPort(c),
            V1Component::SoftNpuPort(c) => Component::SoftNpuPort(c),
            V1Component::SoftNpuP9(c) => Component::SoftNpuP9(c),
            V1Component::P9fs(c) => Component::P9fs(c),
            V1Component::MigrationFailureInjector(c) => {
                Component::MigrationFailureInjector(c)
            }
            V1Component::CrucibleStorageBackend(c) => {
                Component::CrucibleStorageBackend(c)
            }
            V1Component::FileStorageBackend(c) => {
                Component::FileStorageBackend(c)
            }
            V1Component::BlobStorageBackend(c) => {
                Component::BlobStorageBackend(c)
            }
            V1Component::VirtioNetworkBackend(c) => {
                Component::VirtioNetworkBackend(c)
            }
            V1Component::DlpiNetworkBackend(c) => {
                Component::DlpiNetworkBackend(c)
            }
        }
    }
}

/// Downgrades a spec status to its v2 form, converting the contained spec
/// when one is present.
impl From<InstanceSpecStatus> for v2::instance_spec::InstanceSpecStatus {
    fn from(new: InstanceSpecStatus) -> Self {
        match new {
            InstanceSpecStatus::Present(spec) => {
                let downgraded = spec.into();
                Self::Present(downgraded)
            }
            InstanceSpecStatus::WaitingForMigrationSource => {
                Self::WaitingForMigrationSource
            }
        }
    }
}

impl From<InstanceSpecGetResponse>
    for v2::instance_spec::InstanceSpecGetResponse
{
    fn from(new: InstanceSpecGetResponse) -> Self {
        Self {
            properties: new.properties,
            state: new.state,
            spec: new.spec.into(),
        }
    }
}

/// Upgrades a v2 spec to an ADD_VSOCK spec by converting each component and
/// carrying the board and SMBIOS settings over unchanged.
impl From<v2::instance_spec::InstanceSpec> for InstanceSpec {
    fn from(old: v2::instance_spec::InstanceSpec) -> Self {
        let components = old
            .components
            .into_iter()
            .map(|(key, component)| (key, Component::from(component)))
            .collect();

        Self { board: old.board, components, smbios: old.smbios }
    }
}


================================================
FILE: crates/propolis-api-types-versions/src/add_vsock/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Version `ADD_VSOCK` of the Propolis Server API.
//!
//! This version adds support for the virtio-socket device.

pub mod api;
pub mod components;
pub mod instance_spec;


================================================
FILE: crates/propolis-api-types-versions/src/crucible_volume_info/disk.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Disk and volume types for the CRUCIBLE_VOLUME_INFO API version.

use schemars::JsonSchema;
use serde::Serialize;

// The CRUCIBLE_VOLUME_INFO-version volume status: a wrapper around the
// `VolumeInfo` type from `crucible_client_types`.
//
// This type derives `Serialize` but not `Deserialize`.
#[derive(Debug, Serialize, JsonSchema)]
pub struct VolumeStatus {
    pub volume_info: crucible_client_types::VolumeInfo,
}


================================================
FILE: crates/propolis-api-types-versions/src/crucible_volume_info/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Version `CRUCIBLE_VOLUME_INFO` of the Propolis Server API.
//!
//! This version changes the `disk_volume_status` endpoint to use the new
//! `VolumeInfo` query.

pub mod disk;


================================================
FILE: crates/propolis-api-types-versions/src/impls/instance.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Functional code for instance types.

use crate::latest::instance::{ErrorCode, InstanceProperties};

impl InstanceProperties {
    /// Returns the name of the VMM resource that backs this VM, which is the
    /// string form of the instance's ID.
    pub fn vm_name(&self) -> String {
        let Self { id, .. } = self;
        id.to_string()
    }
}

/// Displays an error code using exactly its `Debug` representation.
impl std::fmt::Display for ErrorCode {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        // Delegate to `Debug` with the caller's formatter so that any
        // formatting flags are handled the same way in both representations.
        <Self as std::fmt::Debug>::fmt(self, f)
    }
}

impl std::str::FromStr for ErrorCode {
    type Err = &'static str;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.trim() {
            s if s.eq_ignore_ascii_case("NoInstance") => Ok(Self::NoInstance),
            s if s.eq_ignore_ascii_case("AlreadyInitialized") => {
                Ok(ErrorCode::AlreadyInitialized)
            }
            s if s.eq_ignore_ascii_case("AlreadyRunning") => {
                Ok(ErrorCode::AlreadyRunning)
            }
            s if s.eq_ignore_ascii_case("CreateFailed") => {
                Ok(ErrorCode::CreateFailed)
            }
            _ => Err("unknown error code, expected one of: \
                'NoInstance', 'AlreadyInitialized', 'AlreadyRunning', \
                'CreateFailed'"),
        }
    }
}


================================================
FILE: crates/propolis-api-types-versions/src/impls/instance_spec.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Functional code for instance spec types.

use uuid::Uuid;

use crate::latest::instance_spec::SpecKey;

/// Displays a spec key as its bare UUID or name, without naming the variant.
impl std::fmt::Display for SpecKey {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick whichever payload this key carries, then print it.
        let rendered: &dyn std::fmt::Display = match self {
            Self::Uuid(id) => id,
            Self::Name(label) => label,
        };
        write!(f, "{rendered}")
    }
}

/// Parses a spec key from a string. This never fails: the string is handed to
/// the `From<&str>` conversion, which always yields a key.
impl std::str::FromStr for SpecKey {
    type Err = core::convert::Infallible;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Ok(Self::from(s))
    }
}

/// Builds a spec key from a string slice: strings that parse as a UUID become
/// UUID-type keys, and everything else becomes a name-type key.
impl From<&str> for SpecKey {
    fn from(s: &str) -> Self {
        Uuid::parse_str(s)
            .map_or_else(|_| Self::Name(s.to_owned()), Self::Uuid)
    }
}

/// Builds a spec key from an owned string: strings that parse as a UUID
/// become UUID-type keys; otherwise the string itself is reused as the name.
impl From<String> for SpecKey {
    fn from(value: String) -> Self {
        if let Ok(uuid) = Uuid::parse_str(&value) {
            Self::Uuid(uuid)
        } else {
            Self::Name(value)
        }
    }
}

impl From<Uuid> for SpecKey {
    fn from(value: Uuid) -> Self {
        Self::Uuid(value)
    }
}

#[cfg(test)]
mod test {
    use std::collections::BTreeMap;

    use uuid::Uuid;

    use super::SpecKey;
    use crate::latest::components::devices::QemuPvpanic;
    use crate::latest::instance_spec::Component;

    type TestMap = BTreeMap<SpecKey, Component>;

    /// Serializes a one-entry component map keyed by `key` to JSON, reads it
    /// back, and returns the key found in the deserialized map.
    fn roundtrip_key(key: SpecKey) -> SpecKey {
        let original = TestMap::from([(
            key,
            Component::QemuPvpanic(QemuPvpanic { enable_isa: true }),
        )]);

        let json = serde_json::to_string(&original).unwrap();
        let decoded: TestMap = serde_json::from_str(&json).unwrap();
        decoded.into_keys().next().expect("one key in the map")
    }

    /// A UUID-type key must still be a UUID-type key, with the same value,
    /// after a serialization round trip.
    #[test]
    fn spec_key_uuid_roundtrip() {
        let id = Uuid::new_v4();

        match roundtrip_key(SpecKey::Uuid(id)) {
            SpecKey::Uuid(got_id) => assert_eq!(got_id, id),
            key => panic!("expected SpecKey::Uuid, got {key}"),
        }
    }

    /// A name-type key whose name happens to be the string form of a UUID
    /// comes back from a round trip as a UUID-type key.
    #[test]
    fn spec_key_uuid_string_deserializes_as_uuid_variant() {
        let id = Uuid::new_v4();

        match roundtrip_key(SpecKey::Name(id.to_string())) {
            SpecKey::Uuid(got_id) => assert_eq!(got_id, id),
            key => panic!("expected SpecKey::Uuid, got {key}"),
        }
    }
}


================================================
FILE: crates/propolis-api-types-versions/src/impls/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Functional code for the latest versions of types.

mod instance;
mod instance_spec;


================================================
FILE: crates/propolis-api-types-versions/src/initial/components/backends.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Backend configuration data: the structs that tell Propolis how to configure
//! its components to talk to other services supplied by the host OS or the
//! larger rack.

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use std::num::NonZeroUsize;

/// A Crucible storage backend.
//
// `Debug` is deliberately not derived for this type: the hand-written impl
// that follows prints a placeholder instead of `request_json`.
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct CrucibleStorageBackend {
    /// A serialized `[crucible_client_types::VolumeConstructionRequest]`. This
    /// is stored in serialized form so that breaking changes to the definition
    /// of a `VolumeConstructionRequest` do not inadvertently break instance
    /// spec deserialization.
    ///
    /// When using a spec to initialize a new instance, the spec author must
    /// ensure this request is well-formed and can be deserialized by the
    /// version of `crucible_client_types` used by the target Propolis.
    pub request_json: String,

    /// Indicates whether the storage is read-only.
    pub readonly: bool,
}

/// Formats the backend for diagnostics without exposing the serialized volume
/// construction request: `request_json` is always shown as `"<redacted>"`,
/// while `readonly` is printed as-is.
impl std::fmt::Debug for CrucibleStorageBackend {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Redact the contents of the VCR since they may contain volume
        // encryption keys.
        //
        // The placeholder is passed as a borrowed `&str`, which formats
        // exactly like a `String` would, so no allocation is needed here.
        f.debug_struct("CrucibleStorageBackend")
            .field("request_json", &"<redacted>")
            .field("readonly", &self.readonly)
            .finish()
    }
}

/// A storage backend backed by a file in the host system's file system.
//
// Unrecognized fields are rejected during deserialization
// (`deny_unknown_fields`).
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct FileStorageBackend {
    /// A path to a file that backs a disk.
    pub path: String,

    /// Indicates whether the storage is read-only.
    pub readonly: bool,

    /// Block size of the backend
    // NOTE(review): presumably expressed in bytes -- confirm with the code
    // that consumes this value.
    pub block_size: u32,

    /// Optional worker threads for the file backend, exposed for testing only.
    // `NonZeroUsize` means a worker count of zero cannot be represented.
    pub workers: Option<NonZeroUsize>,
}

/// A storage backend for a disk whose initial contents are given explicitly
/// by the specification.
//
// `Debug` is deliberately not derived for this type: the hand-written impl
// that follows prints a placeholder instead of the `base64` contents.
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct BlobStorageBackend {
    /// The disk's initial contents, encoded as a base64 string.
    pub base64: String,

    /// Indicates whether the storage is read-only.
    pub readonly: bool,
}

/// Formats the backend for diagnostics without printing the disk contents:
/// `base64` is always shown as `"<redacted>"`, while `readonly` is printed
/// as-is.
impl std::fmt::Debug for BlobStorageBackend {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Never print the encoded disk contents. The placeholder is passed as
        // a borrowed `&str`, which formats exactly like a `String` would, so
        // no allocation is needed here.
        f.debug_struct("BlobStorageBackend")
            .field("base64", &"<redacted>")
            .field("readonly", &self.readonly)
            .finish()
    }
}

/// A network backend associated with a virtio-net (viona) VNIC on the host.
//
// Unrecognized fields are rejected during deserialization
// (`deny_unknown_fields`).
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct VirtioNetworkBackend {
    /// The name of the viona VNIC to use as a backend.
    pub vnic_name: String,
}

/// A network backend associated with a DLPI VNIC on the host.
//
// Structurally identical to `VirtioNetworkBackend` (a single `vnic_name`
// field) but kept as a distinct type. Unrecognized fields are rejected during
// deserialization (`deny_unknown_fields`).
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct DlpiNetworkBackend {
    /// The name of the VNIC to use as a backend.
    pub vnic_name: String,
}


================================================
FILE: crates/propolis-api-types-versions/src/initial/components/board.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! VM mainboard components. Every VM has a board, even if it has no other
//! peripherals.

use std::collections::BTreeSet;

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

use crate::v1::instance_spec::CpuidVendor;

/// An Intel 440FX-compatible chipset.
//
// This is the payload of `Chipset::I440Fx`. Unrecognized fields are rejected
// during deserialization (`deny_unknown_fields`).
#[derive(
    Clone, Copy, Deserialize, Serialize, Debug, PartialEq, Eq, JsonSchema,
)]
#[serde(deny_unknown_fields)]
pub struct I440Fx {
    /// Specifies whether the chipset should allow PCI configuration space
    /// to be accessed through the PCIe extended configuration mechanism.
    pub enable_pcie: bool,
}

/// A kind of virtual chipset.
//
// Serialized in serde's adjacently tagged form: the snake_case variant name
// is stored under "type" and the variant's payload under "value". The
// `Default` impl for this type selects an i440FX chipset with PCIe disabled.
#[derive(
    Clone, Copy, Deserialize, Serialize, Debug, PartialEq, Eq, JsonSchema,
)]
#[serde(
    deny_unknown_fields,
    rename_all = "snake_case",
    tag = "type",
    content = "value"
)]
pub enum Chipset {
    /// An Intel 440FX-compatible chipset.
    I440Fx(I440Fx),
}

impl Default for Chipset {
    fn default() -> Self {
        Self::I440Fx(I440Fx { enable_pcie: false })
    }
}

/// A set of CPUID values to expose to a guest.
//
// This type only carries the data; the uniqueness and range requirements
// described on `entries` are checked by Propolis servers, not by
// deserialization.
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct Cpuid {
    /// A list of CPUID leaves/subleaves and their associated values.
    ///
    /// Propolis servers require that each entry's `leaf` be unique and that it
    /// falls in either the "standard" (0 to 0xFFFF) or "extended" (0x8000_0000
    /// to 0x8000_FFFF) function ranges, since these are the only valid input
    /// ranges currently defined by Intel and AMD. See the Intel 64 and IA-32
    /// Architectures Software Developer's Manual (June 2024) Table 3-17 and the
    /// AMD64 Architecture Programmer's Manual (March 2024) Volume 3's
    /// documentation of the CPUID instruction.
    //
    // It would be nice if this were an associative collection type.
    // Unfortunately, the most natural keys for such a collection are
    // structs or tuples, and JSON doesn't allow objects to be used as
    // property names. Instead of converting leaf/subleaf pairs to and from
    // strings, just accept a flat Vec and have servers verify that e.g. no
    // leaf/subleaf pairs are duplicated.
    pub entries: Vec<CpuidEntry>,

    /// The CPU vendor to emulate.
    ///
    /// CPUID leaves in the extended range (0x8000_0000 to 0x8000_FFFF) have
    /// vendor-defined semantics. Propolis uses this value to determine
    /// these semantics when deciding whether it needs to specialize the
    /// supplied template values for these leaves.
    pub vendor: CpuidVendor,
}

/// A full description of a CPUID leaf/subleaf and the values it produces.
//
// One entry pairs a CPUID input (`leaf` plus an optional `subleaf`) with the
// four 32-bit register values returned for that input. Entries are collected
// in `Cpuid::entries`.
#[derive(
    Clone, Copy, Deserialize, Serialize, Debug, PartialEq, Eq, JsonSchema,
)]
#[serde(deny_unknown_fields)]
pub struct CpuidEntry {
    /// The leaf (function) number for this entry.
    pub leaf: u32,

    /// The subleaf (index) number for this entry, if it uses subleaves.
    pub subleaf: Option<u32>,

    /// The value to return in eax.
    pub eax: u32,

    /// The value to return in ebx.
    pub ebx: u32,

    /// The value to return in ecx.
    pub ecx: u32,

    /// The value to return in edx.
    pub edx: u32,
}

/// Flags that enable "simple" Hyper-V enlightenments that require no
/// feature-specific configuration.
//
// NOTE: This enum's variants should never have any associated data (note that
// the type doesn't use serde's `tag` and `content` attributes). If a future
// enlightenment requires associated data, it should be put into a
// `HyperVExtendedFeatures` struct (or similar), and the `HyperV` variant of
// `GuestHypervisorInterface` should be extended to `Option`ally include that
// struct.
//
// `Ord` and `Eq` are derived because these flags are stored in the `BTreeSet`
// held by `GuestHypervisorInterface::HyperV`.
#[derive(
    Clone,
    Deserialize,
    Serialize,
    Debug,
    JsonSchema,
    Ord,
    PartialOrd,
    Eq,
    PartialEq,
)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
pub enum HyperVFeatureFlag {
    // Serialized as "reference_tsc". NOTE(review): presumably enables the
    // Hyper-V reference TSC enlightenment -- confirm against the server's
    // Hyper-V support.
    ReferenceTsc,
}

/// A hypervisor interface to expose to the guest.
//
// Serialized in serde's adjacently tagged form: the snake_case variant name
// is stored under "type" and the variant's fields under "value". `Bhyve` is
// the default; `Board` relies on that to omit a default interface when
// serializing (see `is_default`).
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema, Default)]
#[serde(
    deny_unknown_fields,
    rename_all = "snake_case",
    tag = "type",
    content = "value"
)]
pub enum GuestHypervisorInterface {
    /// Expose a bhyve-like interface ("bhyve bhyve " as the hypervisor ID in
    /// leaf 0x4000_0000 and no additional leaves or features).
    #[default]
    Bhyve,

    /// Expose a Hyper-V-compatible hypervisor interface with the supplied
    /// features enabled.
    HyperV { features: BTreeSet<HyperVFeatureFlag> },
}

impl GuestHypervisorInterface {
    /// Reports whether this value is the default (`Bhyve`) interface.
    ///
    /// `Board::guest_hv_interface` names this function in its
    /// `skip_serializing_if` attribute so that the default interface is left
    /// out of serialized specs.
    pub(crate) fn is_default(&self) -> bool {
        match self {
            Self::Bhyve => true,
            Self::HyperV { .. } => false,
        }
    }
}

/// A VM's mainboard.
//
// `cpus`, `memory_mb` and `chipset` carry no `default` attribute, so they must
// be present when deserializing; the remaining fields may be omitted.
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct Board {
    /// The number of virtual logical processors attached to this VM.
    // A `u8`, so at most 255 can be expressed here.
    pub cpus: u8,

    /// The amount of guest RAM attached to this VM.
    // NOTE(review): the `_mb` suffix suggests the unit; whether that is MB or
    // MiB is not stated here -- confirm against the consumer.
    pub memory_mb: u64,

    /// The chipset to expose to guest software.
    pub chipset: Chipset,

    /// The hypervisor platform to expose to the guest. The default is a
    /// bhyve-compatible interface with no additional features.
    ///
    /// For compatibility with older versions of Propolis, this field is only
    /// serialized if it specifies a non-default interface.
    // `default` fills in `GuestHypervisorInterface::default()` when the key is
    // absent on input; `skip_serializing_if` omits the key on output whenever
    // `is_default` returns true.
    #[serde(
        default,
        skip_serializing_if = "GuestHypervisorInterface::is_default"
    )]
    pub guest_hv_interface: GuestHypervisorInterface,

    /// The CPUID values to expose to the guest. If `None`, bhyve will derive
    /// default values from the host's CPUID values.
    // An absent key deserializes to `None`, and `None` is not written out.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cpuid: Option<Cpuid>,
    // TODO: Processor and NUMA topology.
}


================================================
FILE: crates/propolis-api-types-versions/src/initial/components/devices.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Device configuration data: components that define VM properties that are
//! visible to a VM's guest software.

use crate::v1::instance_spec::{PciPath, SpecKey};
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

/// A disk that presents a virtio-block interface to the guest.
//
// This is only the guest-facing device description; the storage itself is a
// separate backend component referenced by key. Unknown keys are rejected on
// deserialization (`deny_unknown_fields`).
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct VirtioDisk {
    /// The name of the disk's backend component.
    pub backend_id: SpecKey,

    /// The PCI bus/device/function at which this disk should be attached.
    pub pci_path: PciPath,
}

/// A disk that presents an NVMe interface to the guest.
//
// Same shape as `VirtioDisk` plus the NVMe-specific serial number. Unknown
// keys are rejected on deserialization (`deny_unknown_fields`).
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct NvmeDisk {
    /// The name of the disk's backend component.
    pub backend_id: SpecKey,

    /// The PCI bus/device/function at which this disk should be attached.
    pub pci_path: PciPath,

    /// The serial number to return in response to an NVMe Identify Controller
    /// command.
    // Fixed-size: exactly 20 bytes, carried as raw `u8` values rather than a
    // string.
    pub serial_number: [u8; 20],
}

/// A network card that presents a virtio-net interface to the guest.
//
// Guest-facing half of a NIC; the host-side attachment is described by the
// backend component named in `backend_id`. Unknown keys are rejected on
// deserialization (`deny_unknown_fields`).
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct VirtioNic {
    /// The name of the device's backend.
    pub backend_id: SpecKey,

    /// A caller-defined correlation identifier for this interface. If Propolis
    /// is configured to collect network interface kstats in its Oximeter
    /// metrics, the metric series for this interface will be associated with
    /// this identifier.
    // Not an `Option`: callers must always supply an ID.
    pub interface_id: uuid::Uuid,

    /// The PCI path at which to attach this device.
    pub pci_path: PciPath,
}

/// A serial port identifier, which determines what I/O ports a guest can use to
/// access a port.
//
// `rename_all = "snake_case"` means the variants serialize as `com1` through
// `com4`. `Hash` is derived in addition to `Eq`, so the value can be used as a
// hash-map/set key.
#[derive(
    Clone, Copy, Deserialize, Serialize, Debug, PartialEq, Eq, JsonSchema, Hash,
)]
#[serde(deny_unknown_fields, rename_all = "snake_case")]
pub enum SerialPortNumber {
    Com1,
    Com2,
    Com3,
    Com4,
}

/// A serial port device.
//
// The only configurable property is which COM port this device occupies.
// Unknown keys are rejected on deserialization (`deny_unknown_fields`).
#[derive(
    Clone, Copy, Deserialize, Serialize, Debug, PartialEq, Eq, JsonSchema,
)]
#[serde(deny_unknown_fields)]
pub struct SerialPort {
    /// The serial port number for this port.
    pub num: SerialPortNumber,
}

/// A PCI-PCI bridge.
//
// Two distinct locations are involved: `pci_path` is where the bridge itself
// sits, and `downstream_bus` is the bus number that devices behind the bridge
// use in their own PCI paths.
#[derive(
    Clone, Copy, Deserialize, Serialize, Debug, PartialEq, Eq, JsonSchema,
)]
#[serde(deny_unknown_fields)]
pub struct PciPciBridge {
    /// The logical bus number of this bridge's downstream bus. Other devices
    /// may use this bus number in their PCI paths to indicate they should be
    /// attached to this bridge's bus.
    pub downstream_bus: u8,

    /// The PCI path at which to attach this bridge.
    pub pci_path: PciPath,
}

// Configuration for the QEMU PVPANIC device.
//
// `Default` is derived, so a default-constructed value has `enable_isa` set to
// `false`.
//
// NOTE(review): this type has no `///` summary. It derives `JsonSchema`, and
// schemars turns doc comments into schema descriptions, so adding one would
// presumably change the generated API document -- confirm before doing so.
#[derive(
    Clone,
    Copy,
    Deserialize,
    Serialize,
    Debug,
    PartialEq,
    Eq,
    JsonSchema,
    Default,
)]
#[serde(deny_unknown_fields)]
pub struct QemuPvpanic {
    /// Enable the QEMU PVPANIC ISA bus device (I/O port 0x505).
    // No serde `default` attribute: the key must be present when deserializing.
    pub enable_isa: bool,
    // TODO(eliza): add support for the PCI PVPANIC device...
}

/// Settings supplied to the guest's firmware image that specify the order in
/// which it should consider its options when selecting a device to try to boot
/// from.
//
// `Default` is derived, yielding an empty `order` list.
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema, Default)]
#[serde(deny_unknown_fields)]
pub struct BootSettings {
    /// An ordered list of components to attempt to boot from.
    // A `Vec`, so position in the list is the boot priority.
    pub order: Vec<BootOrderEntry>,
}

/// An entry in the boot order stored in a [`BootSettings`] component.
//
// NOTE(review): unlike the other device types in this file, this struct does
// not set `#[serde(deny_unknown_fields)]`, so unrecognized keys are ignored
// when deserializing an entry. Changing that would alter the accepted wire
// format -- confirm whether the omission is intentional.
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
pub struct BootOrderEntry {
    /// The ID of another component in the spec that Propolis should try to
    /// boot from.
    ///
    /// Currently, only disk device components are supported.
    pub id: SpecKey,
}

//
// Structs for Falcon devices. These devices don't support live migration.
//

/// Describes a SoftNPU PCI device.
///
/// This is only supported by Propolis servers compiled with the `falcon`
/// feature.
//
// The type itself is declared unconditionally (there is no `cfg` gate on it
// here); the feature restriction described above is enforced elsewhere.
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct SoftNpuPciPort {
    /// The PCI path at which to attach the guest to this port.
    pub pci_path: PciPath,
}

/// Describes a port in a SoftNPU emulated ASIC.
///
/// This is only supported by Propolis servers compiled with the `falcon`
/// feature.
//
// Unlike the PCI-attached devices in this file, this port has no `pci_path`;
// it is identified by a link name and paired with a backend by key.
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct SoftNpuPort {
    /// The data link name for this port.
    pub link_name: String,

    /// The name of the port's associated DLPI backend.
    pub backend_id: SpecKey,
}

/// Describes a PCI device that shares host files with the guest using the P9
/// protocol.
///
/// This is only supported by Propolis servers compiled with the `falcon`
/// feature.
//
// Only the attachment point is configurable. Unknown keys are rejected on
// deserialization (`deny_unknown_fields`).
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct SoftNpuP9 {
    /// The PCI path at which to attach the guest to this port.
    pub pci_path: PciPath,
}

/// Describes a filesystem to expose through a P9 device.
///
/// This is only supported by Propolis servers compiled with the `falcon`
/// feature.
//
// All four fields are required; none carries a serde `default`.
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct P9fs {
    /// The host source path to mount into the guest.
    // Carried as a plain `String`; nothing in this type validates the path.
    pub source: String,

    /// The 9P target filesystem tag.
    pub target: String,

    /// The chunk size to use in the 9P protocol. Vanilla Helios images should
    /// use 8192. Falcon Helios base images and Linux can use up to 65536.
    // The limits above are guidance only; the `u32` type does not enforce them.
    pub chunk_size: u32,

    /// The PCI path at which to attach the guest to this P9 filesystem.
    pub pci_path: PciPath,
}

/// Describes a synthetic device that registers for VM lifecycle notifications
/// and returns errors during attempts to migrate.
///
/// This is only supported by Propolis servers compiled with the
/// `failure-injection` feature.
//
// The two counters are independent: one budgets export failures, the other
// import failures.
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct MigrationFailureInjector {
    /// The number of times this device should fail requests to export state.
    pub fail_exports: u32,

    /// The number of times this device should fail requests to import state.
    pub fail_imports: u32,
}


================================================
FILE: crates/propolis-api-types-versions/src/initial/components/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Specifications for components that can be attached to a Propolis VM.
//!
//! Components are 'versionless' and can be added to any specification of any
//! format. Existing components must only change in backward-compatible ways.

pub mod backends;
pub mod board;
pub mod devices;


================================================
FILE: crates/propolis-api-types-versions/src/initial/disk.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Disk and volume types for the INITIAL API version.

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use uuid::Uuid;

// Request body carrying a replacement VCR for a disk.
//
// NOTE(review): `vcr_json` is presumably a JSON-encoded volume construction
// request passed through as an opaque string -- confirm against the handler.
// Nothing in this type parses or validates it.
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
pub struct InstanceVCRReplace {
    pub vcr_json: String,
}

/// Path parameters for snapshot requests.
//
// Deserialize-only: this type is read from a request path and never written
// back out.
#[derive(Deserialize, JsonSchema)]
pub struct SnapshotRequestPathParams {
    // Carried as a free-form string, unlike `snapshot_id`, which must parse as
    // a UUID.
    pub id: String,
    pub snapshot_id: Uuid,
}

/// Path parameters for VCR requests.
//
// Deserialize-only: this type is read from a request path and never written
// back out.
#[derive(Deserialize, JsonSchema)]
pub struct VCRRequestPathParams {
    // Free-form string identifier; no UUID parsing is applied at this layer.
    pub id: String,
}

/// Path parameters for volume status requests.
//
// Deserialize-only: this type is read from a request path and never written
// back out.
#[derive(Deserialize, JsonSchema)]
pub struct VolumeStatusPathParams {
    // Free-form string identifier; no UUID parsing is applied at this layer.
    pub id: String,
}

// Response body for a volume status query.
//
// NOTE(review): the exact meaning of `active` is defined by whatever populates
// this value and is not visible here -- confirm against the handler.
#[derive(Debug, Serialize, Deserialize, JsonSchema)]
pub struct VolumeStatus {
    pub active: bool,
}


================================================
FILE: crates/propolis-api-types-versions/src/initial/instance.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Instance management types for the INITIAL API version.
//!
//! This module contains types for instance properties, state management,
//! initialization, and monitoring.
//!
//! See also: [`super::instance_spec`].

use std::{collections::BTreeMap, net::SocketAddr};

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use uuid::Uuid;

use super::components::{backends, devices};
use super::instance_spec::{InstanceSpec, SpecKey};
use super::migration::InstanceMigrateInitiateResponse;

// Identifiers describing where an instance lives. `InstanceProperties`
// documents its `metadata` field (of this type) as being used to track
// statistics for the instance.
//
// All fields are required; the three `*_id` fields must parse as UUIDs.
#[derive(Clone, Debug, Deserialize, Eq, JsonSchema, PartialEq, Serialize)]
pub struct InstanceMetadata {
    pub silo_id: Uuid,
    pub project_id: Uuid,
    pub sled_id: Uuid,
    pub sled_serial: String,
    pub sled_revision: u32,
    pub sled_model: String,
}

// Identity and descriptive properties of an instance. This is embedded in
// `Instance`, `InstanceEnsureRequest` and `InstanceSpecGetResponse`.
#[derive(Clone, Debug, Deserialize, PartialEq, Eq, Serialize, JsonSchema)]
pub struct InstanceProperties {
    /// Unique identifier for this Instance.
    pub id: Uuid,
    /// Human-readable name of the Instance.
    pub name: String,
    /// Free-form text description of an Instance.
    pub description: String,
    /// Metadata used to track statistics for this Instance.
    pub metadata: InstanceMetadata,
}

/// Current state of an Instance.
//
// No serde renaming is applied, so each variant serializes under its Rust
// name exactly as written (e.g. `Running`). Renaming a variant therefore
// changes the wire format.
#[derive(
    Clone, Copy, Debug, Deserialize, PartialEq, Eq, Serialize, JsonSchema,
)]
pub enum InstanceState {
    Creating,
    Starting,
    Running,
    Stopping,
    Stopped,
    Rebooting,
    Migrating,
    Repairing,
    Failed,
    Destroyed,
}

// An instance as reported to API clients: its properties paired with its
// current state.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
pub struct Instance {
    pub properties: InstanceProperties,
    pub state: InstanceState,
}

// Response wrapper around a single `Instance`. The wrapper adds an
// `instance` key around the payload rather than returning it bare.
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
pub struct InstanceGetResponse {
    pub instance: Instance,
}

/// Requested state of an Instance.
//
// Wrapper struct around the requested transition; see
// `InstanceStateRequested` for the permitted values.
#[derive(Clone, Copy, Deserialize, Serialize, JsonSchema)]
pub struct InstanceStateChange {
    pub state: InstanceStateRequested,
}

// The state transitions a client may request. This is deliberately a much
// smaller set than `InstanceState`, which has ten variants.
//
// No serde renaming is applied, so variants serialize under their Rust names
// (`Run`, `Stop`, `Reboot`).
#[derive(Clone, Copy, Debug, Deserialize, Serialize, JsonSchema)]
pub enum InstanceStateRequested {
    Run,
    Stop,
    Reboot,
}

// Request body for the instance state monitor.
//
// NOTE(review): `gen` is presumably a generation number pairing with
// `InstanceStateMonitorResponse::gen` -- confirm against the handler.
//
// NOTE(review): `gen` is a reserved keyword in the Rust 2024 edition; moving
// this crate to that edition would require spelling the field `r#gen`.
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
pub struct InstanceStateMonitorRequest {
    pub gen: u64,
}

// Response body for the instance state monitor: a generation value, the
// instance state, and the status of the most recent migrations in and out.
//
// NOTE(review): `gen` is presumably the generation at which `state` was
// observed -- confirm against the handler. As on the request type, `gen` is a
// reserved keyword in the Rust 2024 edition.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
pub struct InstanceStateMonitorResponse {
    pub gen: u64,
    pub state: InstanceState,
    pub migration: super::migration::InstanceMigrateStatusResponse,
}

/// An instance spec component that should be replaced during a live migration.
//
// When a caller asks Propolis to initialize via live migration, the target VM
// inherits the migration source's current instance spec. For the most part,
// the target can (and indeed in some cases must) use this spec without
// modifying it; this helps Propolis ensure that guest-visible configuration
// remains unchanged when a VM migrates. However, there are some components
// with no guest-visible state that may need to be reconfigured when a VM
// migrates. These include the following:
//
// - Crucible disks: After migrating, the target Propolis presents itself as a
//   new client of the Crucible downstairs servers backing the VM's disks.
//   Crucible requires the target to present a newer client generation number
//   to allow the target to connect. In a full Oxide deployment, these numbers
//   are managed by the control plane (i.e. it is not safe for Propolis to
//   manage these values directly--new Crucible volume connection information
//   must always come from Nexus).
// - Virtio network devices: Each virtio NIC in the guest needs to bind to a
//   named VNIC object on the host. These names can change when a VM migrates
//   from host to host.
//
// Each component that can be reconfigured this way has a variant in this enum;
// components not in the enum can't be reconfigured during migration. This
// saves the initialization API from having to reason about requests to replace
// a component that can't legally be replaced.
//
// Wire format: adjacently tagged, with the variant name under the `component`
// key and the variant's payload under the `spec` key. No `rename_all` is
// applied, so variant names appear exactly as written below.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
#[serde(deny_unknown_fields, tag = "component", content = "spec")]
pub enum ReplacementComponent {
    MigrationFailureInjector(devices::MigrationFailureInjector),
    CrucibleStorageBackend(backends::CrucibleStorageBackend),
    VirtioNetworkBackend(backends::VirtioNetworkBackend),
}

// How a new instance should be initialized: either from an explicit spec or
// as the target of a live migration.
//
// Wire format: adjacently tagged, with the variant name under the `method` key
// and the variant's fields under the `value` key.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
#[serde(tag = "method", content = "value")]
pub enum InstanceInitializationMethod {
    // Initialize directly from the supplied instance spec.
    Spec {
        spec: InstanceSpec,
    },
    // Initialize by migrating in from `src_addr`. `replace_components` maps
    // component keys to the replacement to use for each; see
    // `ReplacementComponent` for which components may be replaced.
    MigrationTarget {
        migration_id: Uuid,
        src_addr: SocketAddr,
        replace_components: BTreeMap<SpecKey, ReplacementComponent>,
    },
}

// Request body for instance creation: the instance's properties plus the
// method by which it should be initialized.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
pub struct InstanceEnsureRequest {
    pub properties: InstanceProperties,
    pub init: InstanceInitializationMethod,
}

// Response body for instance creation.
//
// NOTE(review): `migrate` is presumably `Some` only when the instance was
// initialized as a migration target -- confirm against the handler.
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
pub struct InstanceEnsureResponse {
    pub migrate: Option<InstanceMigrateInitiateResponse>,
}

/// Path parameters for instance endpoints that identify instances by name.
//
// The field is called `instance_id` even though it carries a name; it is a
// free-form `String`, in contrast to `InstancePathParams::instance_id`, which
// must parse as a UUID.
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
pub struct InstanceNameParams {
    pub instance_id: String,
}

/// Path parameters for instance endpoints that identify instances by UUID.
//
// Typing the field as `Uuid` means a path segment that is not a valid UUID
// fails deserialization.
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
pub struct InstancePathParams {
    pub instance_id: Uuid,
}

/// Error codes used to populate the `error_code` field of Dropshot API responses.
//
// No serde renaming is applied, so each code serializes under its Rust variant
// name (e.g. `NoInstance`). Clients matching on these strings depend on the
// names staying stable.
#[derive(
    Clone, Copy, Debug, Deserialize, PartialEq, Eq, Serialize, JsonSchema,
)]
pub enum ErrorCode {
    /// This `propolis-server` process has not received an `InstanceEnsure`
    /// request yet.
    NoInstance,
    /// This `propolis-server` process has already received an `InstanceEnsure`
    /// request with a different ID.
    AlreadyInitialized,
    /// Cannot update a running server.
    AlreadyRunning,
    /// Instance creation failed
    CreateFailed,
}


================================================
FILE: crates/propolis-api-types-versions/src/initial/instance_spec.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Instance specification types for the INITIAL API version.

use std::collections::BTreeMap;

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use uuid::Uuid;

pub use propolis_types::{CpuidIdent, CpuidValues, CpuidVendor, PciPath};

use super::components::{backends, board, devices};

/// A key identifying a component in an instance spec.
//
// Some of the components Omicron attaches to Propolis VMs, like network
// interfaces and Crucible disks, are described by database records with UUID
// primary keys. It's natural to reuse these UUIDs as component identifiers in
// Propolis, especially because it lets Omicron functions that need to identify
// a specific component (e.g. a specific Crucible backend that should handle a
// disk snapshot request) pass that component's ID directly to Propolis.
//
// In some cases it's not desirable or possible to use UUIDs this way:
//
// - Some components (like the cloud-init disk) don't have their own rows in the
//   database and so don't have obvious UUIDs to use.
// - Some objects (like Crucible disks) require both a device and a backend
//   component in the spec, and these can't share the same key.
// - Propolis users outside the control plane may not have any component UUIDs
//   at all and may just want to use strings to identify all their components.
//
// For these reasons, the key type may be represented as either a UUID or a
// String. This allows the more compact, more-easily-compared UUID format to be
// used wherever it is practical while still allowing callers to use strings as
// names if they have no UUIDs available or the most obvious UUID is in use
// elsewhere. The key type's From impls will try to parse strings into UUIDs
// before storing keys as strings.
//
// `Ord`/`PartialOrd` are derived because this type is used as a `BTreeMap`
// key (e.g. in `InstanceSpec::components`). `JsonSchema` is intentionally not
// in the derive list: it is implemented by hand for this type.
#[derive(
    Clone, Debug, Serialize, Deserialize, Eq, PartialEq, Ord, PartialOrd,
)]
// Direct serde to use an untagged enum representation for this type. Since both
// Uuid and String serialize to strings, this allows other types that contain a
// Map<K = SpecKey> to derive Serialize and successfully serialize to JSON.
// (This doesn't work with a tagged representation because JSON doesn't allow
// maps to be used as map keys.)
//
// Note that this makes the order of variants matter: serde will pick the first
// variant into which it can successfully deserialize an untagged enum value,
// and the point is to use the UUID representation for any value that can be
// interpreted as a UUID.
#[serde(untagged)]
pub enum SpecKey {
    // Must stay first: see the note on variant order above.
    Uuid(Uuid),
    // Fallback for any string that does not parse as a UUID.
    Name(String),
}

// Manually implement JsonSchema to help Progenitor generate the expected enum
// type for spec keys.
impl JsonSchema for SpecKey {
    fn schema_name() -> String {
        "SpecKey".to_owned()
    }

    fn json_schema(
        generator: &mut schemars::gen::SchemaGenerator,
    ) -> schemars::schema::Schema {
        use schemars::schema::*;
        fn label_schema(label: &str, schema: Schema) -> Schema {
            SchemaObject {
                metadata: Some(
                    Metadata {
                        title: Some(label.to_string()),
                        ..Default::default()
                    }
                    .into(),
                ),
                subschemas: Some(
                    SubschemaValidation {
                        all_of: Some(vec![schema]),
                        ..Default::default()
                    }
                    .into(),
                ),
                ..Default::default()
            }
            .into()
        }

        SchemaObject {
            metadata: Some(
                Metadata {
                    description: Some(
                        "A key identifying a component in an instance spec."
                            .to_string(),
                    ),
                    ..Default::default()
                }
                .into(),
            ),
            subschemas: Some(Box::new(SubschemaValidation {
                one_of: Some(vec![
                    label_schema("uuid", generator.subschema_for::<Uuid>()),
                    label_schema("name", generator.subschema_for::<String>()),
                ]),
                ..Default::default()
            })),
            ..Default::default()
        }
        .into()
    }
}

// Every kind of component that can appear in an `InstanceSpec`'s component
// map: guest-visible devices first, then backends.
//
// Wire format: adjacently tagged, with the snake_case variant name under the
// `type` key and the variant's payload under the `component` key.
//
// The `schemars(rename = ...)` attribute makes the type appear as
// `ComponentV0` in generated schemas even though its Rust name is `Component`.
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(
    deny_unknown_fields,
    tag = "type",
    content = "component",
    rename_all = "snake_case"
)]
#[schemars(rename = "ComponentV0")]
pub enum Component {
    VirtioDisk(devices::VirtioDisk),
    NvmeDisk(devices::NvmeDisk),
    VirtioNic(devices::VirtioNic),
    SerialPort(devices::SerialPort),
    PciPciBridge(devices::PciPciBridge),
    QemuPvpanic(devices::QemuPvpanic),
    BootSettings(devices::BootSettings),
    SoftNpuPciPort(devices::SoftNpuPciPort),
    SoftNpuPort(devices::SoftNpuPort),
    SoftNpuP9(devices::SoftNpuP9),
    P9fs(devices::P9fs),
    MigrationFailureInjector(devices::MigrationFailureInjector),
    CrucibleStorageBackend(backends::CrucibleStorageBackend),
    FileStorageBackend(backends::FileStorageBackend),
    BlobStorageBackend(backends::BlobStorageBackend),
    VirtioNetworkBackend(backends::VirtioNetworkBackend),
    DlpiNetworkBackend(backends::DlpiNetworkBackend),
}

// A complete description of a VM: one mainboard plus a keyed collection of
// components.
//
// `components` is a `BTreeMap`, so each `SpecKey` identifies at most one
// component and iteration (and therefore serialization) follows key order.
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct InstanceSpec {
    pub board: board::Board,
    pub components: BTreeMap<SpecKey, Component>,
}

/// DEPRECATED: A versioned instance spec.
///
/// This structure is deprecated. It is notionally incompatible with
/// dropshot API versioning. If you wanted to add a `V1` variant to
/// `VersionedInstanceSpec`, it would change the existing blessed V1 OpenAPI
/// spec. Therefore you'd have to rename this to `VersionedInstanceSpecV0` and
/// create a new type with the new variant. This makes little sense however,
/// and is a remnant of an attempt at propolis versioning prior to dropshot
/// API versioning.
///
/// Luckily this type is only exposed via `InstanceSpecGetResponse` which is not
/// used in Omicron. Therefore we can limit that method to the V1 OpenAPI spec
/// and stop any further use of this type.
///
/// In addition to the disparate versioning mechanisms, there is also a
/// fundamental flaw in how this type was used in the existing code. It was
/// constructed in some cases from a `MaybeSpec` which contains a `Box<Spec>`.
/// Unfortunately, `Spec` is a type erased container of any version of an
/// instance spec such as `InstanceSpecV0`,`InstanceSpecV1`, or future types.
/// There is no guarantee that we could take a `Spec` and figure out which
/// versioned spec it is supposed to convert to. This only worked in the initial
/// code because the only versioned spec was `InstanceSpecV0`. It's best to stop
/// using this type altogether.
//
// Wire format: adjacently tagged, with the variant name (`V0`) under the
// `version` key and the spec itself under the `spec` key.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
#[serde(deny_unknown_fields, tag = "version", content = "spec")]
pub enum VersionedInstanceSpec {
    V0(InstanceSpec),
}

// Whether an instance's spec is available yet.
//
// Wire format: adjacently tagged, with the variant name under the `type` key
// and, for `Present`, the versioned spec under the `value` key.
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
#[serde(tag = "type", content = "value")]
pub enum InstanceSpecStatus {
    // NOTE(review): presumably reported while a migration target has not yet
    // received its spec from the source -- confirm against the handler.
    WaitingForMigrationSource,
    Present(VersionedInstanceSpec),
}

// Response body for a spec query: the instance's properties and state along
// with the status of its spec.
//
// This is the type through which the deprecated `VersionedInstanceSpec` is
// exposed (via `InstanceSpecStatus::Present`).
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
pub struct InstanceSpecGetResponse {
    pub properties: super::instance::InstanceProperties,
    pub state: super::instance::InstanceState,
    pub spec: InstanceSpecStatus,
}


================================================
FILE: crates/propolis-api-types-versions/src/initial/migration.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Migration types for the INITIAL API version.

use std::net::SocketAddr;

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use uuid::Uuid;

/// Request to initiate a migration.
//
// NOTE(review): `src_addr` / `src_uuid` presumably identify the migration
// source's server address and instance ID -- confirm against the handler.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
pub struct InstanceMigrateInitiateRequest {
    pub migration_id: Uuid,
    pub src_addr: SocketAddr,
    pub src_uuid: Uuid,
}

// Response identifying a migration by ID. This is also embedded (optionally)
// in `InstanceEnsureResponse`.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
pub struct InstanceMigrateInitiateResponse {
    pub migration_id: Uuid,
}

/// Request to start a migration.
//
// Carries only the ID of the migration to start.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
pub struct InstanceMigrateStartRequest {
    pub migration_id: Uuid,
}

/// The status of an individual live migration.
//
// Derives `PartialEq`/`Eq`, so two statuses can be compared directly for
// equality.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)]
pub struct InstanceMigrationStatus {
    /// The ID of this migration, supplied either by the external migration
    /// requester (for targets) or the other side of the migration (for
    /// sources).
    pub id: Uuid,
    /// The current phase the migration is in.
    pub state: MigrationState,
}

/// The statuses of the most recent attempts to live migrate into and out of
/// this Propolis.
///
/// If a VM is initialized by migration in and then begins to migrate out, this
/// structure will contain statuses for both migrations. This ensures that
/// clients can always obtain the status of a successful migration in even after
/// a migration out begins.
///
/// This structure only reports the status of the most recent migration in a
/// single direction. That is, if a migration in or out fails, and a new
/// migration attempt begins, the new migration's status replaces the old's.
//
// Neither field has a serde `skip_serializing_if`, so a `None` is written out
// explicitly (as JSON `null`) rather than omitted.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq, Eq)]
pub struct InstanceMigrateStatusResponse {
    /// The status of the most recent attempt to initialize the current instance
    /// via migration in, or `None` if the instance has never been a migration
    /// target.
    pub migration_in: Option<InstanceMigrationStatus>,
    /// The status of the most recent attempt to migrate out of the current
    /// instance, or `None` if the instance has never been a migration source.
    pub migration_out: Option<InstanceMigrationStatus>,
}

// The phases a live migration can be in.
//
// `PartialOrd`/`Ord` are derived, so comparisons follow declaration order
// (`Sync` is the smallest value, `Error` the largest). Reordering or inserting
// variants therefore changes comparison results as well as the schema.
//
// No serde renaming is applied; variants serialize under their Rust names.
#[derive(
    Clone,
    Copy,
    Debug,
    Deserialize,
    PartialEq,
    Eq,
    PartialOrd,
    Ord,
    Serialize,
    JsonSchema,
)]
pub enum MigrationState {
    Sync,
    RamPush,
    Pause,
    RamPushDirty,
    Device,
    Resume,
    RamPull,
    Server,
    Finish,
    Error,
}


================================================
FILE: crates/propolis-api-types-versions/src/initial/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Version `INITIAL` of the Propolis Server API.
//!
//! This is the first version of the API.

pub mod components;
pub mod disk;
pub mod instance;
pub mod instance_spec;
pub mod migration;
pub mod serial;


================================================
FILE: crates/propolis-api-types-versions/src/initial/serial.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Serial console types for the INITIAL API version.

use std::net::SocketAddr;

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

/// Request a specific range of an Instance's serial console output history.
//
// The mutual-exclusivity rule between `from_start` and `most_recent` described
// below is not expressed in the type: both are plain `Option`s, so it has to
// be enforced by the code that consumes this request.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)]
pub struct InstanceSerialConsoleHistoryRequest {
    /// Character index in the serial buffer from which to read, counting the
    /// bytes output since instance start. If this is not provided,
    /// `most_recent` must be provided, and if this *is* provided, `most_recent`
    /// must *not* be provided.
    pub from_start: Option<u64>,
    /// Character index in the serial buffer from which to read, counting
    /// *backward* from the most recently buffered data retrieved from the
    /// instance. (See note on `from_start` about mutual exclusivity)
    pub most_recent: Option<u64>,
    /// Maximum number of bytes of buffered serial console contents to return.
    /// If the requested range runs to the end of the available buffer, the data
    /// returned will be shorter than `max_bytes`.
    pub max_bytes: Option<u64>,
}

/// Contents of an Instance's serial console buffer.
//
// Pairs the returned bytes with an offset that lets a client continue reading
// from where this response ended.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
pub struct InstanceSerialConsoleHistoryResponse {
    /// The bytes starting from the requested offset up to either the end of the
    /// buffer or the request's `max_bytes`. Provided as a u8 array rather than
    /// a string, as it may not be UTF-8.
    pub data: Vec<u8>,
    /// The absolute offset since boot (suitable for use as `byte_offset` in a
    /// subsequent request) of the last byte returned in `data`.
    // NOTE(review): the request types in this file have no `byte_offset`
    // field; the matching request parameter appears to be `from_start`. The
    // doc text above feeds the generated schema, so it is left unchanged here
    // -- confirm and correct it together with the API document.
    pub last_byte_offset: u64,
}

/// Connect to an Instance's serial console via websocket, optionally sending
/// bytes from the buffered history first.
//
// Both fields are optional. As with the history request, the rule that at most
// one of them may be set is not expressed in the type and has to be enforced
// by the consumer.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema, PartialEq)]
pub struct InstanceSerialConsoleStreamRequest {
    /// Character index in the serial buffer from which to read, counting the
    /// bytes output since instance start. If this is provided, `most_recent`
    /// must *not* be provided.
    // TODO: if neither is specified, send enough serial buffer history to
    // reconstruct the current contents and cursor state of an interactive
    // terminal
    pub from_start: Option<u64>,
    /// Character index in the serial buffer from which to read, counting
    /// *backward* from the most recently buffered data retrieved from the
    /// instance. (See note on `from_start` about mutual exclusivity)
    pub most_recent: Option<u64>,
}

/// Control message(s) sent through the websocket to serial console clients.
///
/// Note: Because this is associated with the websocket, and not some REST
/// endpoint, Dropshot lacks the ability to communicate it via the OpenAPI
/// document underpinning the exposed interfaces. As such, clients (including
/// the `propolis-client` crate) are expected to define their own identical copy
/// of this type in order to consume it.
//
// Consistent with the note above, this type does not derive `JsonSchema`.
// Because clients keep their own copy, any change to variant or field names
// here must be mirrored in those copies.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub enum InstanceSerialConsoleControlMessage {
    // NOTE(review): presumably tells the client that the instance is
    // migrating, with `destination` being the address to reconnect to and
    // `from_start` the serial-history offset to resume from -- confirm against
    // the code that sends this message.
    Migrating { destination: SocketAddr, from_start: u64 },
}


================================================
FILE: crates/propolis-api-types-versions/src/latest.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Re-exports of the latest versions of all published types.
//!
//! Business logic should use these re-exports rather than versioned
//! identifiers directly.

pub mod components {
    pub mod backends {
        pub use crate::v1::components::backends::BlobStorageBackend;
        pub use crate::v1::components::backends::CrucibleStorageBackend;
        pub use crate::v1::components::backends::DlpiNetworkBackend;
        pub use crate::v1::components::backends::FileStorageBackend;
        pub use crate::v1::components::backends::VirtioNetworkBackend;
    }

    pub mod board {
        pub use crate::v1::components::board::Board;
        pub use crate::v1::components::board::Chipset;
        pub use crate::v1::components::board::Cpuid;
        pub use crate::v1::components::board::CpuidEntry;
        pub use crate::v1::components::board::GuestHypervisorInterface;
        pub use crate::v1::components::board::HyperVFeatureFlag;
        pub use crate::v1::components::board::I440Fx;
    }

    pub mod devices {
        pub use crate::v1::components::devices::BootOrderEntry;
        pub use crate::v1::components::devices::BootSettings;
        pub use crate::v1::components::devices::MigrationFailureInjector;
        pub use crate::v1::components::devices::NvmeDisk;
        pub use crate::v1::components::devices::P9fs;
        pub use crate::v1::components::devices::PciPciBridge;
        pub use crate::v1::components::devices::QemuPvpanic;
        pub use crate::v1::components::devices::SerialPort;
        pub use crate::v1::components::devices::SerialPortNumber;
        pub use crate::v1::components::devices::SoftNpuP9;
        pub use crate::v1::components::devices::SoftNpuPciPort;
        pub use crate::v1::components::devices::SoftNpuPort;
        pub use crate::v1::components::devices::VirtioDisk;
        pub use crate::v1::components::devices::VirtioNic;

        pub use crate::v3::components::devices::VirtioSocket;
    }
}

/// Latest published versions of the disk-related types: request/path types
/// from `v1`, and `VolumeStatus` from `v5`.
pub mod disk {
    pub use crate::v1::disk::{
        InstanceVCRReplace, SnapshotRequestPathParams, VCRRequestPathParams,
        VolumeStatusPathParams,
    };
    pub use crate::v5::disk::VolumeStatus;
}

/// Latest published versions of the instance lifecycle types: most come from
/// `v1`, while the ensure request and initialization method come from `v3`.
pub mod instance {
    pub use crate::v1::instance::{
        ErrorCode, Instance, InstanceEnsureResponse, InstanceGetResponse,
        InstanceMetadata, InstanceNameParams, InstancePathParams,
        InstanceProperties, InstanceState, InstanceStateChange,
        InstanceStateMonitorRequest, InstanceStateMonitorResponse,
        InstanceStateRequested, ReplacementComponent,
    };

    pub use crate::v3::api::{
        InstanceEnsureRequest, InstanceInitializationMethod,
    };
}

/// Latest published versions of the instance spec types, drawn from `v1`,
/// `v2` (SMBIOS input) and `v3` (the spec itself and its status/response
/// wrappers).
pub mod instance_spec {
    pub use crate::v1::instance_spec::{
        CpuidIdent, CpuidValues, CpuidVendor, PciPath, SpecKey,
        VersionedInstanceSpec,
    };

    pub use crate::v2::instance_spec::SmbiosType1Input;

    pub use crate::v3::instance_spec::{
        Component, InstanceSpec, InstanceSpecGetResponse, InstanceSpecStatus,
    };
}

pub mod migration {
    pub use crate::v1::migration::InstanceMigrateInitiateRequest;
    pub use crate::v1::migration::InstanceMigrateInitiateResponse;
    pub use crate::v1::migration::InstanceMigrateStartRequest;
    pub use crate::v1::migration::InstanceMigrateStatusResponse;
    pub use crate::v1::migration::InstanceMigrationStatus;
    pub use crate::v1::migration::MigrationState;
}

pub mod serial {
    pub use crate::v1::serial::InstanceSerialConsoleControlMessage;
    pub use crate::v1::serial::InstanceSerialConsoleHistoryRequest;
    pub use crate::v1::serial::InstanceSerialConsoleHistoryResponse;
    pub use crate::v1::serial::InstanceSerialConsoleStreamRequest;
}


================================================
FILE: crates/propolis-api-types-versions/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Versioned types for the Propolis Server API.
//!
//! # Adding a new API version
//!
//! When adding a new API version N with added or changed types:
//!
//! 1. Create <version_name>/mod.rs, where <version_name> is the lowercase
//!    form of the new version's identifier, as defined in the API trait's
//!    `api_versions!` macro.
//!
//! 2. Add to the end of this list:
//!
//!    ```rust,ignore
//!    #[path = "<version_name>/mod.rs"]
//!    pub mod vN;
//!    ```
//!
//! 3. Add your types to the new module, mirroring the module structure from
//!    earlier versions.
//!
//! 4. Update `latest.rs` with new and updated types from the new version.
//!
//! For more information, see the [detailed guide] and [RFD 619].
//!
//! [detailed guide]: https://github.com/oxidecomputer/dropshot-api-manager/blob/main/guides/new-version.md
//! [RFD 619]: https://rfd.shared.oxide.computer/rfd/619

mod impls;
pub mod latest;
// Each versioned module is exposed as `vN` but lives in a directory named
// after the API version's identifier, hence the `#[path]` attributes.
#[path = "initial/mod.rs"]
pub mod v1;
#[path = "programmable_smbios/mod.rs"]
pub mod v2;
#[path = "add_vsock/mod.rs"]
pub mod v3;
// NOTE(review): there is no `v4` module in this list; presumably API version 4
// did not add or change any types — confirm before adding one.
#[path = "crucible_volume_info/mod.rs"]
pub mod v5;


================================================
FILE: crates/propolis-api-types-versions/src/programmable_smbios/api.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! API request and response types for the PROGRAMMABLE_SMBIOS API version.

use std::{collections::BTreeMap, net::SocketAddr};

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use uuid::Uuid;

use super::instance_spec::InstanceSpec;
use crate::v1;
use crate::v1::instance::{InstanceProperties, ReplacementComponent};
use crate::v1::instance_spec::SpecKey;

// The two ways an instance can be initialized in this API version: from an
// explicit instance spec, or as the target of a migration from `src_addr`.
//
// Serialized adjacently tagged: the variant name is stored under the
// "method" key and the variant's fields under the "value" key.
//
// (Plain `//` comments are used here so the derived schema is unaffected.)
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
#[serde(tag = "method", content = "value")]
pub enum InstanceInitializationMethod {
    // Uses this version's `InstanceSpec` (the one with the optional `smbios`
    // field), not the `v1` spec.
    Spec {
        spec: InstanceSpec,
    },
    // Same fields as the `v1` variant; the `From` conversion from `v1` copies
    // them across unchanged.
    MigrationTarget {
        migration_id: Uuid,
        src_addr: SocketAddr,
        replace_components: BTreeMap<SpecKey, ReplacementComponent>,
    },
}

// Request body pairing the (unchanged, `v1`) instance properties with this
// version's `InstanceInitializationMethod`.
#[derive(Clone, Debug, Deserialize, Serialize, JsonSchema)]
pub struct InstanceEnsureRequest {
    pub properties: InstanceProperties,
    pub init: InstanceInitializationMethod,
}

impl From<v1::instance::InstanceInitializationMethod>
    for InstanceInitializationMethod
{
    fn from(old: v1::instance::InstanceInitializationMethod) -> Self {
        match old {
            v1::instance::InstanceInitializationMethod::Spec { spec } => {
                Self::Spec { spec: spec.into() }
            }
            v1::instance::InstanceInitializationMethod::MigrationTarget {
                migration_id,
                src_addr,
                replace_components,
            } => Self::MigrationTarget {
                migration_id,
                src_addr,
                replace_components,
            },
        }
    }
}

impl From<v1::instance::InstanceEnsureRequest> for InstanceEnsureRequest {
    fn from(old: v1::instance::InstanceEnsureRequest) -> Self {
        Self { properties: old.properties, init: old.init.into() }
    }
}


================================================
FILE: crates/propolis-api-types-versions/src/programmable_smbios/instance_spec.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Instance specification types for the PROGRAMMABLE_SMBIOS API version.

use std::collections::BTreeMap;

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

use crate::v1;
use crate::v1::components::board;
use crate::v1::instance::{InstanceProperties, InstanceState};
use crate::v1::instance_spec::{Component, SpecKey};

// Caller-supplied values for the SMBIOS Type 1 table that this API version
// makes programmable.
//
// `deny_unknown_fields` makes deserialization fail if the input contains any
// key other than the four fields below.
//
// (Plain `//` comments are used here so the derived schema is unaffected.)
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
#[serde(deny_unknown_fields)]
pub struct SmbiosType1Input {
    pub manufacturer: String,
    pub product_name: String,
    pub serial_number: String,
    pub version: u64,
}

// This version's instance spec: the `v1` board and component map plus an
// optional SMBIOS Type 1 input.
//
// Converting to the `v1` spec drops `smbios`; converting from `v1` sets it to
// `None` (see the `From` impls in this file).
#[derive(Clone, Deserialize, Serialize, Debug, JsonSchema)]
pub struct InstanceSpec {
    pub board: board::Board,
    pub components: BTreeMap<SpecKey, Component>,
    pub smbios: Option<SmbiosType1Input>,
}

// Whether an instance spec is available yet.
//
// Serialized adjacently tagged: the variant name is stored under the "type"
// key and, for `Present`, the spec under the "value" key.
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
#[serde(tag = "type", content = "value")]
pub enum InstanceSpecStatus {
    // NOTE(review): presumably reported by a migration target that has not
    // yet received its spec from the source — confirm.
    WaitingForMigrationSource,
    Present(InstanceSpec),
}

// Response carrying an instance's properties and state together with the
// status of its spec, using this version's `InstanceSpecStatus`.
#[derive(Clone, Deserialize, Serialize, JsonSchema)]
pub struct InstanceSpecGetResponse {
    pub properties: InstanceProperties,
    pub state: InstanceState,
    pub spec: InstanceSpecStatus,
}

impl From<InstanceSpec> for v1::instance_spec::InstanceSpec {
    fn from(new: InstanceSpec) -> Self {
        Self { board: new.board, components: new.components }
    }
}

impl From<InstanceSpecStatus> for v1::instance_spec::InstanceSpecStatus {
    fn from(new: InstanceSpecStatus) -> Self {
        match new {
            InstanceSpecStatus::WaitingForMigrationSource => {
                Self::WaitingForMigrationSource
            }
            InstanceSpecStatus::Present(spec) => Self::Present(
                v1::instance_spec::VersionedInstanceSpec::V0(spec.into()),
            ),
        }
    }
}

impl From<InstanceSpecGetResponse>
    for v1::instance_spec::InstanceSpecGetResponse
{
    fn from(new: InstanceSpecGetResponse) -> Self {
        Self {
            properties: new.properties,
            state: new.state,
            spec: new.spec.into(),
        }
    }
}

impl From<v1::instance_spec::InstanceSpec> for InstanceSpec {
    /// Upgrades a `v1` spec to this version.
    ///
    /// `v1` carries no SMBIOS input, so `smbios` is always `None`.
    fn from(old: v1::instance_spec::InstanceSpec) -> Self {
        let v1::instance_spec::InstanceSpec { board, components } = old;
        Self { board, components, smbios: None }
    }
}


================================================
FILE: crates/propolis-api-types-versions/src/programmable_smbios/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Version `PROGRAMMABLE_SMBIOS` of the Propolis Server API.
//!
//! This version adds support for programmable SMBIOS Type 1 tables.

pub mod api;
pub mod instance_spec;


================================================
FILE: crates/propolis-config-toml/Cargo.toml
================================================
[package]
name = "propolis-config-toml"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
test = false
doctest = false

[dependencies]
cpuid_profile_config.workspace = true
propolis-client.workspace = true
serde.workspace = true
serde_derive.workspace = true
toml.workspace = true
thiserror.workspace = true
uuid.workspace = true


================================================
FILE: crates/propolis-config-toml/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::BTreeMap;
use std::num::NonZeroUsize;
use std::path::Path;
use std::str::FromStr;

use serde_derive::{Deserialize, Serialize};
use thiserror::Error;

pub use cpuid_profile_config::{
    CpuVendor, CpuidEntry, CpuidParseError, CpuidProfile,
};

pub mod spec;

/// Configuration for the Propolis server.
///
/// Every section is marked `#[serde(default)]`, so any of them may be omitted
/// from the TOML input; an omitted section deserializes to its empty default.
// NOTE: This is expected to change over time; portions of the hard-coded
// configuration will likely become more dynamic.
#[derive(Serialize, Deserialize, Debug, PartialEq)]
pub struct Config {
    /// PCI-PCI bridges, read from the `pci_bridge` key.
    #[serde(default, rename = "pci_bridge")]
    pub pci_bridges: Vec<PciBridge>,

    /// Free-form chipset options, read from the `chipset` table.
    #[serde(default)]
    pub chipset: Chipset,

    /// Devices keyed by name, read from the `dev` table (e.g. `[dev.drv0]`).
    #[serde(default, rename = "dev")]
    pub devices: BTreeMap<String, Device>,

    /// Block device backends keyed by name, read from the `block_dev` table
    /// (e.g. `[block_dev.block0]`).
    #[serde(default, rename = "block_dev")]
    pub block_devs: BTreeMap<String, BlockDevice>,

    /// CPUID profiles keyed by name, read from the `cpuid` table.
    #[serde(default, rename = "cpuid")]
    pub cpuid_profiles: BTreeMap<String, CpuidProfile>,
}
impl Default for Config {
    fn default() -> Self {
        Self {
            pci_bridges: Vec::new(),
            chipset: Chipset { options: BTreeMap::new() },
            devices: BTreeMap::new(),
            block_devs: BTreeMap::new(),
            cpuid_profiles: BTreeMap::new(),
        }
    }
}

/// The instance's chipset.
///
/// The chipset has no fixed fields: every key in the `[chipset]` table is
/// collected into `options` (via `#[serde(flatten)]`) as a raw TOML value.
#[derive(Default, Serialize, Deserialize, Debug, PartialEq)]
pub struct Chipset {
    /// All keys of the chipset table, with their untyped TOML values.
    #[serde(flatten, default)]
    pub options: BTreeMap<String, toml::Value>,
}

impl Chipset {
    /// Returns the value of option `key` if it exists and is a TOML string.
    ///
    /// Returns `None` both when the key is absent and when its value is of
    /// any other TOML type (integer, boolean, ...).
    pub fn get_string<S: AsRef<str>>(&self, key: S) -> Option<&str> {
        let value = self.options.get(key.as_ref());
        value.and_then(|v| v.as_str())
    }

    /// Parses the string value of option `key` into a `T`.
    ///
    /// Only string-valued options are considered (see
    /// [`Chipset::get_string`]); a missing key, a non-string value and a
    /// failed parse all yield `None`.
    pub fn get<T: FromStr, S: AsRef<str>>(&self, key: S) -> Option<T> {
        match self.get_string(key) {
            Some(text) => text.parse::<T>().ok(),
            None => None,
        }
    }
}

/// A PCI-PCI bridge.
///
/// Both fields are read from kebab-case TOML keys (`pci-path` and
/// `downstream-bus`).
#[derive(Default, Serialize, Deserialize, Debug, PartialEq, Eq)]
pub struct PciBridge {
    /// The bus/device/function of this bridge as a device in the PCI topology.
    ///
    /// Kept as the raw string from the configuration; it is not validated at
    /// deserialization time.
    #[serde(rename = "pci-path")]
    pub pci_path: String,

    /// The logical bus number to assign to this bridge's downstream bus.
    ///
    /// Note: This bus number is only used at configuration time to attach
    /// devices downstream of this bridge. The bridge's secondary bus number
    /// (used by the guest to address traffic to devices on this bus) is
    /// set by the guest at runtime.
    #[serde(rename = "downstream-bus")]
    pub downstream_bus: u8,
}

/// A hard-coded device, either enabled by default or accessible locally
/// on a machine.
///
/// Only `driver` is a fixed field; every other key in the device's table is
/// collected into `options` (via `#[serde(flatten)]`) as a raw TOML value.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq)]
pub struct Device {
    /// Name of the driver for this device (e.g. `"nvme"` in the tests below).
    pub driver: String,

    /// All remaining keys of the device's table, with untyped TOML values.
    #[serde(flatten, default)]
    pub options: BTreeMap<String, toml::Value>,
}

impl Device {
    /// Returns the value of option `key` if it exists and is a TOML string.
    ///
    /// Returns `None` both when the key is absent and when its value is of
    /// any other TOML type (integer, boolean, ...).
    pub fn get_string<S: AsRef<str>>(&self, key: S) -> Option<&str> {
        let value = self.options.get(key.as_ref());
        value.and_then(|v| v.as_str())
    }

    /// Parses the string value of option `key` into a `T`.
    ///
    /// Only string-valued options are considered (see
    /// [`Device::get_string`]); a missing key, a non-string value and a
    /// failed parse all yield `None`.
    pub fn get<T: FromStr, S: AsRef<str>>(&self, key: S) -> Option<T> {
        match self.get_string(key) {
            Some(text) => text.parse::<T>().ok(),
            None => None,
        }
    }
}

/// Typed, optional settings of a block device backend.
///
/// These are flattened into [`BlockDevice`], so they appear as plain keys
/// (`block_size`, `read_only`, `skip_flush`, `workers`) in the backend's
/// TOML table. Each one may be omitted.
#[derive(Debug, Deserialize, Serialize, PartialEq)]
pub struct BlockOpts {
    /// Block size; units are not stated here (presumably bytes — confirm).
    pub block_size: Option<u32>,
    /// Whether the backend is read-only.
    pub read_only: Option<bool>,
    /// Whether flushes should be skipped.
    pub skip_flush: Option<bool>,
    /// Number of workers; must be non-zero when given.
    pub workers: Option<NonZeroUsize>,
}

/// A block device backend, as described by a `[block_dev.<name>]` table.
#[derive(Serialize, Deserialize, Debug, PartialEq)]
pub struct BlockDevice {
    /// Kind of backend, read from the `type` key (e.g. `"file"`). Defaults
    /// to the empty string when the key is omitted.
    #[serde(default, rename = "type")]
    pub bdtype: String,

    /// Typed options, read from the same table as the other keys.
    #[serde(flatten)]
    pub opts: BlockOpts,

    /// Backend-specific keys of the table (e.g. `path` for a file backend),
    /// with untyped TOML values.
    #[serde(flatten, default)]
    pub options: BTreeMap<String, toml::Value>,
}

/// Errors which may be returned when parsing the server configuration.
#[derive(Error, Debug)]
pub enum ParseError {
    /// The input was not valid TOML, or did not match the [`Config`] schema.
    #[error("Cannot parse toml: {0}")]
    Toml(#[from] toml::de::Error),

    /// The configuration file could not be read.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// A key (first field) was not found in some container (second field).
    // NOTE(review): not constructed by `parse` in this file; presumably used
    // by callers elsewhere — confirm.
    #[error("Key {0} not found in {1}")]
    KeyNotFound(String, String),

    /// A value (first field) could not be converted by the named function
    /// (second field).
    // NOTE(review): not constructed by `parse` in this file; presumably used
    // by callers elsewhere — confirm.
    #[error("Could not unmarshall {0} with function {1}")]
    AsError(String, String),
}

/// Parses a TOML file into a configuration object.
pub fn parse<P: AsRef<Path>>(path: P) -> Result<Config, ParseError> {
    let contents = std::fs::read_to_string(path.as_ref())?;
    let cfg = toml::from_str::<Config>(&contents)?;
    Ok(cfg)
}

#[cfg(test)]
mod test {
    use super::*;

    // A default `Config` must survive a serialize/deserialize round trip
    // through TOML unchanged.
    #[test]
    fn config_can_be_serialized_as_toml() {
        let dummy_config = Config { ..Default::default() };
        let serialized = toml::ser::to_string(&dummy_config).unwrap();
        let deserialized: Config = toml::de::from_str(&serialized).unwrap();
        assert_eq!(dummy_config, deserialized);
    }

    // Parses a small hand-written configuration and checks that chipset,
    // device and block device tables land in the expected fields, with
    // unknown keys collected into the free-form `options` maps.
    #[test]
    fn parse_basic_config() {
        let raw = r#"
[chipset]
chipset-opt = "copt"

[dev.drv0]
driver = "nvme"
other-opt = "value"

[dev.drv1]
driver = "widget"
foo = "bar"

[block_dev.block0]
type = "cement"
slump = "4in"

[block_dev.block1]
type = "file"
path = "/etc/passwd"
"#;
        let cfg: Config = toml::de::from_str(raw).unwrap();

        use toml::Value;

        assert_eq!(cfg.chipset.get_string("chipset-opt"), Some("copt"));

        assert!(cfg.devices.contains_key("drv0"));
        assert!(cfg.devices.contains_key("drv1"));
        let dev0 = cfg.devices.get("drv0").unwrap();
        let dev1 = cfg.devices.get("drv1").unwrap();

        // `driver` is a fixed field; the other keys come back via `options`.
        assert_eq!(dev0.driver, "nvme");
        assert_eq!(dev0.get_string("other-opt"), Some("value"));
        assert_eq!(dev1.driver, "widget");
        assert_eq!(dev1.get_string("foo"), Some("bar"));

        assert!(cfg.block_devs.contains_key("block0"));
        assert!(cfg.block_devs.contains_key("block1"));
        let bdev0 = cfg.block_devs.get("block0").unwrap();
        let bdev1 = cfg.block_devs.get("block1").unwrap();

        // The `type` key maps to `bdtype`; deserialization accepts any
        // string here (note the made-up "cement" type).
        assert_eq!(bdev0.bdtype, "cement");
        assert_eq!(
            bdev0.options.get("slump").map(Value::as_str).unwrap(),
            Some("4in")
        );
        assert_eq!(bdev1.bdtype, "file");
        assert_eq!(
            bdev1.options.get("path").map(Value::as_str).unwrap(),
            Some("/etc/passwd")
        );
    }
}


================================================
FILE: crates/propolis-config-toml/src/spec.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Functions for converting a [`super::Config`] into instance spec elements.

use std::{
    collections::BTreeMap,
    str::{FromStr, ParseBoolError},
};

use propolis_client::{
    instance_spec::{
        Component, Cpuid, CpuidVendor, DlpiNetworkBackend, FileStorageBackend,
        MigrationFailureInjector, NvmeDisk, P9fs, PciPath, PciPciBridge,
        SoftNpuP9, SoftNpuPciPort, SoftNpuPort, SpecKey, VirtioDisk,
        VirtioNetworkBackend, VirtioNic, VirtioSocket,
    },
    support::nvme_serial_from_str,
};
use thiserror::Error;

/// Reserved device name: a `[dev.<name>]` entry with exactly this name is
/// converted into a `MigrationFailureInjector` component regardless of its
/// `driver` value.
pub const MIGRATION_FAILURE_DEVICE_NAME: &str = "test-migration-failure";

#[derive(Debug, Error)]
pub enum TomlToSpecError {
    #[error("unrecognized device type {0:?}")]
    UnrecognizedDeviceType(String),

    #[error("invalid value {0:?} for enable-pcie flag in chipset")]
    EnablePcieParseFailed(String),

    #[error("failed to get PCI path for device {0:?}")]
    InvalidPciPath(String),

    #[error("failed to parse PCI path string {0:?}")]
    PciPathParseFailed(String, #[source] std::io::Error),

    #[error("spec key {0:?} defined multiple times")]
    DuplicateSpecKey(SpecKey),

    #[error("invalid storage device kind {kind:?} for device {name:?}")]
    InvalidStorageDeviceType { kind: String, name: String },

    #[error("no backend name for storage device {0:?}")]
    NoBackendNameForStorageDevice(String),

    #[error("invalid storage backend kind {kind:?} for backend {name:?}")]
    InvalidStorageBackendType { kind: String, name: String },

    #[error("couldn't find storage device {device:?}'s backend {backend:?}")]
    StorageDeviceBackendNotFound { device: String, backend: String },

    #[error("couldn't get path for file backend {0:?}")]
    InvalidFileBackendPath(String),

    #[error("failed to parse read-only option for file backend {0:?}")]
    FileBackendReadonlyParseFailed(String, #[source] ParseBoolError),

    #[error("failed to get VNIC name for device {0:?}")]
    NoVnicName(String),

    #[error("failed to get source for p9 device {0:?}")]
    NoP9Source(String),

    #[error("failed to get source for p9 device {0:?}")]
    NoP9Target(String),

    #[error("failed to get guest_cid for vsock device {0:?}")]
    NoVsockGuestCid(String),
}

/// The instance spec elements derived from a TOML [`super::Config`].
#[derive(Clone, Debug, Default)]
pub struct SpecConfig {
    /// Value of the chipset's `enable-pcie` option; `false` when the option
    /// is absent.
    pub enable_pcie: bool,
    /// Every device, backend and bridge component produced from the
    /// configuration, keyed by its spec key.
    pub components: BTreeMap<SpecKey, Component>,
}

// Inspired by `api_spec_v0.rs`'s `insert_component` and
// `propolis-cli/src/main.rs`'s `add_component_to_spec`. Same purpose as both of
// them.
//
// Before either of those are transforming one kind of spec to another, TOML
// configurations are parsed to a SpecConfig in this file, where there is *also*
// an opportunity for duplicate keys to clobber spec items.
#[track_caller]
fn spec_component_add(
    spec: &mut SpecConfig,
    key: SpecKey,
    component: Component,
) -> Result<(), TomlToSpecError> {
    if spec.components.contains_key(&key) {
        return Err(TomlToSpecError::DuplicateSpecKey(key));
    }

    spec.components.insert(key, component);
    Ok(())
}

impl TryFrom<&super::Config> for SpecConfig {
    type Error = TomlToSpecError;

    fn try_from(config: &super::Config) -> Result<Self, Self::Error> {
        let mut spec = SpecConfig {
            enable_pcie: config
                .chipset
                .options
                .get("enable-pcie")
                .map(|v| {
                    v.as_bool().ok_or_else(|| {
                        TomlToSpecError::EnablePcieParseFailed(v.to_string())
                    })
                })
                .transpose()?
                .unwrap_or(false),
            ..Default::default()
        };

        for (device_name, device) in config.devices.iter() {
            let device_id = SpecKey::Name(device_name.clone());
            let driver = device.driver.as_str();
            if device_name == MIGRATION_FAILURE_DEVICE_NAME {
                const FAIL_EXPORTS: &str = "fail_exports";
                const FAIL_IMPORTS: &str = "fail_imports";
                let fail_exports = device
                    .options
                    .get(FAIL_EXPORTS)
                    .and_then(|val| val.as_integer())
                    .unwrap_or(0)
                    .max(0) as u32;
                let fail_imports = device
                    .options
                    .get(FAIL_IMPORTS)
                    .and_then(|val| val.as_integer())
                    .unwrap_or(0)
                    .max(0) as u32;

                spec_component_add(
                    &mut spec,
                    SpecKey::Name(MIGRATION_FAILURE_DEVICE_NAME.to_owned()),
                    Component::MigrationFailureInjector(
                        MigrationFailureInjector { fail_exports, fail_imports },
                    ),
                )?;

                continue;
            }

            match driver {
                // If this is a storage device, parse its "block_dev" property
                // to get the name of its corresponding backend.
                "pci-virtio-block" | "pci-nvme" => {
                    let (device_spec, backend_id) =
                        parse_storage_device_from_config(device_name, device)?;

                    let backend_name = backend_id.to_string();
                    let backend_config =
                        config.block_devs.get(&backend_name).ok_or_else(
                            || TomlToSpecError::StorageDeviceBackendNotFound {
                                device: device_name.to_owned(),
                                backend: backend_name.to_string(),
                            },
                        )?;

                    let backend_spec = parse_storage_backend_from_config(
                        &backend_name,
                        backend_config,
                    )?;

                    spec_component_add(&mut spec, device_id, device_spec)?;
                    spec_component_add(&mut spec, backend_id, backend_spec)?;
                }
                "pci-virtio-viona" => {
                    let ParsedNic { device_spec, backend_spec, backend_id } =
                        parse_network_device_from_config(device_name, device)?;

                    spec_component_add(
                        &mut spec,
                        device_id,
                        Component::VirtioNic(device_spec),
                    )?;

                    spec_component_add(
                        &mut spec,
                        backend_id,
                        Component::VirtioNetworkBackend(backend_spec),
                    )?;
                }
                "softnpu-pci-port" => {
                    let pci_path: PciPath =
                        device.get("pci-path").ok_or_else(|| {
                            TomlToSpecError::InvalidPciPath(
                                device_name.to_owned(),
                            )
                        })?;

                    spec_component_add(
                        &mut spec,
                        device_id,
                        Component::SoftNpuPciPort(SoftNpuPciPort { pci_path }),
                    )?;
                }
                "softnpu-port" => {
                    let vnic_name =
                        device.get_string("vnic").ok_or_else(|| {
                            TomlToSpecError::NoVnicName(device_name.to_owned())
                        })?;

                    let backend_name =
                        SpecKey::Name(format!("{device_id}:backend"));

                    spec_component_add(
                        &mut spec,
                        device_id,
                        Component::SoftNpuPort(SoftNpuPort {
                            link_name: device_name.to_string(),
                            backend_id: backend_name.clone(),
                        }),
                    )?;

                    spec_component_add(
                        &mut spec,
                        backend_name,
                        Component::DlpiNetworkBackend(DlpiNetworkBackend {
                            vnic_name: vnic_name.to_owned(),
                        }),
                    )?;
                }
                "softnpu-p9" => {
                    let pci_path: PciPath =
                        device.get("pci-path").ok_or_else(|| {
                            TomlToSpecError::InvalidPciPath(
                                device_name.to_owned(),
                            )
                        })?;

                    spec_component_add(
                        &mut spec,
                        device_id,
                        Component::SoftNpuP9(SoftNpuP9 { pci_path }),
                    )?;
                }
                "pci-virtio-9p" => {
                    spec_component_add(
                        &mut spec,
                        device_id,
                        Component::P9fs(parse_p9fs_from_config(
                            device_name,
                            device,
                        )?),
                    )?;
                }
                "pci-virtio-socket" => {
                    spec_component_add(
                        &mut spec,
                        device_id,
                        Component::VirtioSocket(parse_vsock_from_config(
                            device_name,
                            device,
                        )?),
                    )?;
                }
                _ => {
                    return Err(TomlToSpecError::UnrecognizedDeviceType(
                        driver.to_owned(),
                    ))
                }
            }
        }

        for bridge in config.pci_bridges.iter() {
            let pci_path =
                PciPath::from_str(&bridge.pci_path).map_err(|e| {
                    TomlToSpecError::PciPathParseFailed(
                        bridge.pci_path.to_string(),
                        e,
                    )
                })?;

            spec_component_add(
                &mut spec,
                SpecKey::Name(format!("pci-bridge-{}", bridge.pci_path)),
                Component::PciPciBridge(PciPciBridge {
                    downstream_bus: bridge.downstream_bus,
                    pci_path,
                }),
            )?;
        }

        Ok(spec)
    }
}

/// Builds the storage device component for device `name` and returns it
/// together with the spec key of the backend it refers to.
///
/// The device's driver selects the interface (`pci-virtio-block` or
/// `pci-nvme`), its `block_dev` option names the backend, and its `pci-path`
/// option gives its PCI location. NVMe disks additionally get a serial number
/// derived from the device name.
///
/// Errors, in the order they are checked: an unsupported driver, a missing or
/// non-string `block_dev`, then a missing or unparseable `pci-path`.
fn parse_storage_device_from_config(
    name: &str,
    device: &super::Device,
) -> Result<(Component, SpecKey), TomlToSpecError> {
    let is_nvme = match device.driver.as_str() {
        "pci-nvme" => true,
        "pci-virtio-block" => false,
        other => {
            return Err(TomlToSpecError::InvalidStorageDeviceType {
                kind: other.to_owned(),
                name: name.to_owned(),
            });
        }
    };

    let backend_name = match device.get_string("block_dev") {
        Some(backend) => backend,
        None => {
            return Err(TomlToSpecError::NoBackendNameForStorageDevice(
                name.to_owned(),
            ));
        }
    };
    let backend_id = SpecKey::from_str(backend_name)
        .expect("SpecKey::from_str is infallible");

    let pci_path: PciPath = match device.get("pci-path") {
        Some(path) => path,
        None => return Err(TomlToSpecError::InvalidPciPath(name.to_owned())),
    };

    // The component keeps its own copy of the backend key; the original is
    // handed back to the caller.
    let component = if is_nvme {
        Component::NvmeDisk(NvmeDisk {
            backend_id: backend_id.clone(),
            pci_path,
            serial_number: nvme_serial_from_str(name, b' '),
        })
    } else {
        Component::VirtioDisk(VirtioDisk {
            backend_id: backend_id.clone(),
            pci_path,
        })
    };

    Ok((component, backend_id))
}

/// Builds the storage backend component for the backend named `name`.
///
/// Only backends of type `file` are supported. Such a backend needs a string
/// `path` option; `readonly` may be given as a boolean or as a string that
/// parses as one, and defaults to `false`. The block size defaults to 512
/// when not configured.
///
/// Errors, in the order they are checked: an unsupported backend type, a
/// missing or non-string `path`, then a string `readonly` value that is not a
/// boolean.
fn parse_storage_backend_from_config(
    name: &str,
    backend: &super::BlockDevice,
) -> Result<Component, TomlToSpecError> {
    if backend.bdtype != "file" {
        return Err(TomlToSpecError::InvalidStorageBackendType {
            kind: backend.bdtype.clone(),
            name: name.to_owned(),
        });
    }

    let path = match backend.options.get("path").and_then(|p| p.as_str()) {
        Some(path) => path.to_string(),
        None => {
            return Err(TomlToSpecError::InvalidFileBackendPath(
                name.to_owned(),
            ));
        }
    };

    // Any `readonly` value that is neither a boolean nor a string is
    // ignored and treated like an absent option.
    let readonly = match backend.options.get("readonly") {
        Some(toml::Value::Boolean(ro)) => *ro,
        Some(toml::Value::String(text)) => match text.parse::<bool>() {
            Ok(ro) => ro,
            Err(e) => {
                return Err(TomlToSpecError::FileBackendReadonlyParseFailed(
                    name.to_owned(),
                    e,
                ));
            }
        },
        _ => false,
    };

    Ok(Component::FileStorageBackend(FileStorageBackend {
        path,
        readonly,
        block_size: backend.opts.block_size.unwrap_or(512),
        workers: backend.opts.workers,
    }))
}

/// The result of parsing a network device: the NIC itself, its backend, and
/// the spec key under which the backend should be stored.
struct ParsedNic {
    /// The virtio NIC device; its `backend_id` equals `backend_id` below.
    device_spec: VirtioNic,
    /// The backend the NIC is attached to.
    backend_spec: VirtioNetworkBackend,
    /// Spec key for `backend_spec`.
    backend_id: SpecKey,
}

/// Parses a TOML network device entry into a virtio NIC spec and its
/// backend.
///
/// The entry must carry a `vnic` string (else
/// [`TomlToSpecError::NoVnicName`]) and a parseable `pci-path` (else
/// [`TomlToSpecError::InvalidPciPath`]). The backend is keyed as
/// `"{name}-backend"` and the NIC's interface ID is the nil UUID.
fn parse_network_device_from_config(
    name: &str,
    device: &super::Device,
) -> Result<ParsedNic, TomlToSpecError> {
    let Some(vnic_name) = device.get_string("vnic") else {
        return Err(TomlToSpecError::NoVnicName(name.to_owned()));
    };

    let pci_path: PciPath = match device.get("pci-path") {
        Some(path) => path,
        None => return Err(TomlToSpecError::InvalidPciPath(name.to_owned())),
    };

    // The device and the returned key both refer to the same backend name.
    let backend_id = SpecKey::Name(format!("{name}-backend"));
    let device_spec = VirtioNic {
        backend_id: backend_id.clone(),
        interface_id: uuid::Uuid::nil(),
        pci_path,
    };
    let backend_spec =
        VirtioNetworkBackend { vnic_name: vnic_name.to_owned() };

    Ok(ParsedNic { device_spec, backend_spec, backend_id })
}

/// Parses a TOML device entry describing a 9p filesystem share.
///
/// `source`, `target`, and `pci-path` are required; each missing option maps
/// to its own error ([`TomlToSpecError::NoP9Source`],
/// [`TomlToSpecError::NoP9Target`], [`TomlToSpecError::InvalidPciPath`]).
/// `chunk_size` is optional and defaults to 65536.
fn parse_p9fs_from_config(
    name: &str,
    device: &super::Device,
) -> Result<P9fs, TomlToSpecError> {
    // Fields are evaluated in the order written, so errors are reported for
    // `source` first, then `target`, then `pci-path`.
    Ok(P9fs {
        source: device
            .get_string("source")
            .ok_or_else(|| TomlToSpecError::NoP9Source(name.to_owned()))?
            .to_owned(),
        target: device
            .get_string("target")
            .ok_or_else(|| TomlToSpecError::NoP9Target(name.to_owned()))?
            .to_owned(),
        pci_path: device
            .get("pci-path")
            .ok_or_else(|| TomlToSpecError::InvalidPciPath(name.to_owned()))?,
        chunk_size: device.get("chunk_size").unwrap_or(65536),
    })
}

/// Parses a TOML device entry describing a virtio socket device.
///
/// Both `guest_cid` and `pci-path` are required; a missing or unparseable
/// value yields [`TomlToSpecError::NoVsockGuestCid`] or
/// [`TomlToSpecError::InvalidPciPath`] respectively.
fn parse_vsock_from_config(
    name: &str,
    device: &super::Device,
) -> Result<VirtioSocket, TomlToSpecError> {
    // `guest_cid` is checked before `pci-path` because struct fields are
    // evaluated in the order they are written.
    Ok(VirtioSocket {
        guest_cid: device.get("guest_cid").ok_or_else(|| {
            TomlToSpecError::NoVsockGuestCid(name.to_owned())
        })?,
        pci_path: device
            .get("pci-path")
            .ok_or_else(|| TomlToSpecError::InvalidPciPath(name.to_owned()))?,
    })
}

/// Converts a `CpuidEntry` parsed from TOML into the `propolis-server`
/// API-style `CpuidEntry`.
///
/// This is a field-for-field mapping: `func` becomes `leaf`, `idx` becomes
/// `subleaf`, and the four-element `values` array is spread across
/// `eax`/`ebx`/`ecx`/`edx` in that order. The two types are kept separate
/// because using the API-style shape in TOML would make for clumsier text.
fn translate_cpuid_entry(
    toml_entry: super::CpuidEntry,
) -> propolis_client::instance_spec::CpuidEntry {
    let [eax, ebx, ecx, edx] = toml_entry.values;

    propolis_client::instance_spec::CpuidEntry {
        leaf: toml_entry.func,
        subleaf: toml_entry.idx,
        eax,
        ebx,
        ecx,
        edx,
    }
}

/// Converts a TOML CPUID profile into the spec-level [`Cpuid`] description.
///
/// The profile's entries are extracted (which may fail with a
/// `CpuidParseError`), each one is translated to the API-style entry, and the
/// vendor is mapped one-to-one.
///
/// Not a `TryFrom` or `TryInto` because we're re-exporting types from
/// `cpuid-profile-config`, so they're actually defined in a foreign crate.
pub fn toml_cpuid_to_spec_cpuid(
    profile: &super::CpuidProfile,
) -> Result<Cpuid, super::CpuidParseError> {
    let parsed = Vec::<super::CpuidEntry>::try_from(profile)?;

    Ok(Cpuid {
        entries: parsed.into_iter().map(translate_cpuid_entry).collect(),
        vendor: match profile.vendor {
            super::CpuVendor::Amd => CpuidVendor::Amd,
            super::CpuVendor::Intel => CpuidVendor::Intel,
        },
    })
}


================================================
FILE: crates/propolis-server-api/Cargo.toml
================================================
[package]
name = "propolis-server-api"
version = "0.1.0"
license = "MPL-2.0"
edition = "2024"

[dependencies]
crucible-client-types.workspace = true
dropshot.workspace = true
dropshot-api-manager-types.workspace = true
propolis-api-types-versions.workspace = true


================================================
FILE: crates/propolis-server-api/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use dropshot::{
    HttpError, HttpResponseCreated, HttpResponseOk,
    HttpResponseUpdatedNoContent, Path, Query, RequestContext, TypedBody,
    WebsocketChannelResult, WebsocketConnection,
};
use dropshot_api_manager_types::api_versions;
use propolis_api_types_versions::{latest, v1, v2};

// Declares every version of this API as `(integer, IDENT)` pairs, newest
// first. Each `IDENT` is referenced below as `VERSION_<IDENT>` in the
// `versions = ...` attribute of the endpoints in `PropolisServerApi`.
api_versions!([
    // WHEN CHANGING THE API (part 1 of 2):
    //
    // +- Pick a new semver and define it in the list below.  The list MUST
    // |  remain sorted, which generally means that your version should go at
    // |  the very top.
    // |
    // |  Duplicate this line, uncomment the *second* copy, update that copy for
    // |  your new API version, and leave the first copy commented out as an
    // |  example for the next person.
    // v
    // (next_int, IDENT),
    (5, CRUCIBLE_VOLUME_INFO),
    (4, DROPSHOT_BUMP_WEBSOCKET),
    (3, ADD_VSOCK),
    (2, PROGRAMMABLE_SMBIOS),
    (1, INITIAL),
]);

// WHEN CHANGING THE API (part 2 of 2):
//
// The call to `api_versions!` above defines constants of type
// `semver::Version` that you can use in your Dropshot API definition to specify
// the version when a particular endpoint was added or removed.  For example, if
// you used:
//
//     (2, ADD_FOOBAR)
//
// Then you could use `VERSION_ADD_FOOBAR` as the version in which endpoints
// were added or removed.

// The versioned HTTP/WebSocket API exposed by propolis-server.
//
// Endpoints that have changed shape over time appear several times: the
// unsuffixed method handles the newest version range, while `_v1`/`_v2`
// methods cover older ranges and are provided as default methods that convert
// to or from the newer types.
//
// NOTE(review): the comments added in this trait are deliberately plain `//`
// comments rather than `///` doc comments, on the assumption that doc
// comments on endpoints feed into the generated API description -- confirm
// before converting any of them.
#[dropshot::api_description]
pub trait PropolisServerApi {
    type Context;

    // PUT /instance, current form (VERSION_ADD_VSOCK and later).
    #[endpoint {
        method = PUT,
        path = "/instance",
        versions = VERSION_ADD_VSOCK..
    }]
    async fn instance_ensure(
        rqctx: RequestContext<Self::Context>,
        request: TypedBody<latest::instance::InstanceEnsureRequest>,
    ) -> Result<
        HttpResponseCreated<latest::instance::InstanceEnsureResponse>,
        HttpError,
    >;

    // PUT /instance for versions from VERSION_PROGRAMMABLE_SMBIOS up to (but
    // not including) VERSION_ADD_VSOCK. Converts the v2 request body into the
    // latest request type and delegates to `instance_ensure`.
    #[endpoint {
        operation_id = "instance_ensure",
        method = PUT,
        path = "/instance",
        versions = VERSION_PROGRAMMABLE_SMBIOS..VERSION_ADD_VSOCK
    }]
    async fn instance_ensure_v2(
        rqctx: RequestContext<Self::Context>,
        request: TypedBody<v2::api::InstanceEnsureRequest>,
    ) -> Result<
        HttpResponseCreated<latest::instance::InstanceEnsureResponse>,
        HttpError,
    > {
        Self::instance_ensure(
            rqctx,
            request.map(latest::instance::InstanceEnsureRequest::from),
        )
        .await
    }

    // PUT /instance for versions before VERSION_PROGRAMMABLE_SMBIOS. Converts
    // the v1 request body into the v2 type and delegates to
    // `instance_ensure_v2`, which in turn converts to the latest type.
    #[endpoint {
        operation_id = "instance_ensure",
        method = PUT,
        path = "/instance",
        versions = ..VERSION_PROGRAMMABLE_SMBIOS
    }]
    async fn instance_ensure_v1(
        rqctx: RequestContext<Self::Context>,
        request: TypedBody<v1::instance::InstanceEnsureRequest>,
    ) -> Result<
        HttpResponseCreated<latest::instance::InstanceEnsureResponse>,
        HttpError,
    > {
        Self::instance_ensure_v2(
            rqctx,
            request.map(v2::api::InstanceEnsureRequest::from),
        )
        .await
    }

    // GET /instance/spec, current form (VERSION_ADD_VSOCK and later).
    #[endpoint {
        method = GET,
        path = "/instance/spec",
        versions = VERSION_ADD_VSOCK..
    }]
    async fn instance_spec_get(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<
        HttpResponseOk<latest::instance_spec::InstanceSpecGetResponse>,
        HttpError,
    >;

    // GET /instance/spec for versions from VERSION_PROGRAMMABLE_SMBIOS up to
    // (but not including) VERSION_ADD_VSOCK. Calls `instance_spec_get` and
    // converts the latest response into the v2 response type.
    #[endpoint {
        operation_id = "instance_spec_get",
        method = GET,
        path = "/instance/spec",
        versions = VERSION_PROGRAMMABLE_SMBIOS..VERSION_ADD_VSOCK
    }]
    async fn instance_spec_get_v2(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<
        HttpResponseOk<v2::instance_spec::InstanceSpecGetResponse>,
        HttpError,
    > {
        Ok(Self::instance_spec_get(rqctx)
            .await?
            .map(v2::instance_spec::InstanceSpecGetResponse::from))
    }

    // GET /instance/spec for versions before VERSION_PROGRAMMABLE_SMBIOS.
    // Calls `instance_spec_get_v2` and converts its response into the v1
    // response type.
    #[endpoint {
        operation_id = "instance_spec_get",
        method = GET,
        path = "/instance/spec",
        versions = ..VERSION_PROGRAMMABLE_SMBIOS
    }]
    async fn instance_spec_get_v1(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<
        HttpResponseOk<v1::instance_spec::InstanceSpecGetResponse>,
        HttpError,
    > {
        Ok(Self::instance_spec_get_v2(rqctx)
            .await?
            .map(v1::instance_spec::InstanceSpecGetResponse::from))
    }

    // GET /instance. No `versions` attribute is given for this endpoint.
    #[endpoint {
        method = GET,
        path = "/instance",
    }]
    async fn instance_get(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<HttpResponseOk<latest::instance::InstanceGetResponse>, HttpError>;

    // GET /instance/state-monitor. Note that this GET endpoint takes a typed
    // request body.
    #[endpoint {
        method = GET,
        path = "/instance/state-monitor",
    }]
    async fn instance_state_monitor(
        rqctx: RequestContext<Self::Context>,
        request: TypedBody<latest::instance::InstanceStateMonitorRequest>,
    ) -> Result<
        HttpResponseOk<latest::instance::InstanceStateMonitorResponse>,
        HttpError,
    >;

    // PUT /instance/state. Responds with no content on success.
    #[endpoint {
        method = PUT,
        path = "/instance/state",
    }]
    async fn instance_state_put(
        rqctx: RequestContext<Self::Context>,
        request: TypedBody<latest::instance::InstanceStateRequested>,
    ) -> Result<HttpResponseUpdatedNoContent, HttpError>;

    // GET /instance/serial/history. The request is carried in the query
    // string.
    #[endpoint {
        method = GET,
        path = "/instance/serial/history",
    }]
    async fn instance_serial_history_get(
        rqctx: RequestContext<Self::Context>,
        query: Query<latest::serial::InstanceSerialConsoleHistoryRequest>,
    ) -> Result<
        HttpResponseOk<latest::serial::InstanceSerialConsoleHistoryResponse>,
        HttpError,
    >;

    // WebSocket channel at /instance/serial. Stream parameters are carried in
    // the query string.
    #[channel {
        protocol = WEBSOCKETS,
        path = "/instance/serial",
    }]
    async fn instance_serial(
        rqctx: RequestContext<Self::Context>,
        query: Query<latest::serial::InstanceSerialConsoleStreamRequest>,
        websock: WebsocketConnection,
    ) -> WebsocketChannelResult;

    // See the note on instance_migrate_start below. /instance/vnc is not
    // currently used (as of 2025-10), but before it's used we'll want to think
    // about versioning considerations for the WebSocket protocol, similar to
    // instance_migrate_start.
    #[channel {
        protocol = WEBSOCKETS,
        path = "/instance/vnc",
        unpublished = true,
    }]
    async fn instance_vnc(
        rqctx: RequestContext<Self::Context>,
        _query: Query<()>,
        websock: WebsocketConnection,
    ) -> dropshot::WebsocketChannelResult;

    /// DO NOT USE THIS IF YOU'RE NOT PROPOLIS-SERVER.
    ///
    /// Internal API called during a migration from a destination instance to
    /// the source instance as part of the HTTP connection upgrade used to
    /// establish the migration link. This API is exported via OpenAPI purely
    /// to verify that its shape hasn't changed.
    //
    // # Versioning notes
    //
    // This API is expected to work even if the source and destination
    // propolis-server instances are on different versions. There are two parts
    // to versioning:
    //
    // 1. The parameters passed into the initial request.
    // 2. The protocol used for WebSocket communication.
    //
    // Part 1 is verified by the Dropshot API manager. For part 2,
    // propolis-server has internal support for protocol negotiation.
    //
    // Note that we currently bypass Progenitor and always pass in
    // VERSION_INITIAL. See `migration_start_connect` in
    // propolis-server/src/lib/migrate/destination.rs for where we do it. If we
    // introduce a change to this API, we'll have to carefully consider version
    // skew between the source and destination servers.
    #[channel {
        protocol = WEBSOCKETS,
        path = "/instance/migrate/{migration_id}/start",
    }]
    async fn instance_migrate_start(
        rqctx: RequestContext<Self::Context>,
        path_params: Path<latest::migration::InstanceMigrateStartRequest>,
        websock: WebsocketConnection,
    ) -> dropshot::WebsocketChannelResult;

    // GET /instance/migration-status.
    #[endpoint {
        method = GET,
        path = "/instance/migration-status"
    }]
    async fn instance_migrate_status(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<
        HttpResponseOk<latest::migration::InstanceMigrateStatusResponse>,
        HttpError,
    >;

    /// Issues a snapshot request to a crucible backend.
    #[endpoint {
        method = POST,
        path = "/instance/disk/{id}/snapshot/{snapshot_id}",
    }]
    async fn instance_issue_crucible_snapshot_request(
        rqctx: RequestContext<Self::Context>,
        path_params: Path<latest::disk::SnapshotRequestPathParams>,
    ) -> Result<HttpResponseOk<()>, HttpError>;

    // Older form of the volume status endpoint, served from VERSION_INITIAL
    // up to (but not including) VERSION_CRUCIBLE_VOLUME_INFO. Unlike the
    // other `_v1` methods in this trait it has no default body, so
    // implementors must provide it.
    /// Gets the status of a Crucible volume backing a disk
    #[endpoint {
        operation_id = "disk_volume_status",
        method = GET,
        path = "/instance/disk/{id}/status",
        versions = VERSION_INITIAL..VERSION_CRUCIBLE_VOLUME_INFO,
    }]
    async fn disk_volume_status_v1(
        rqctx: RequestContext<Self::Context>,
        path_params: Path<v1::disk::VolumeStatusPathParams>,
    ) -> Result<HttpResponseOk<v1::disk::VolumeStatus>, HttpError>;

    // Current form of the volume status endpoint
    // (VERSION_CRUCIBLE_VOLUME_INFO and later).
    /// Gets the status of a Crucible volume backing a disk
    #[endpoint {
        method = GET,
        path = "/instance/disk/{id}/status",
        versions = VERSION_CRUCIBLE_VOLUME_INFO..,
    }]
    async fn disk_volume_status(
        rqctx: RequestContext<Self::Context>,
        path_params: Path<latest::disk::VolumeStatusPathParams>,
    ) -> Result<HttpResponseOk<latest::disk::VolumeStatus>, HttpError>;

    /// Issues a volume_construction_request replace to a crucible backend.
    #[endpoint {
        method = PUT,
        path = "/instance/disk/{id}/vcr",
    }]
    async fn instance_issue_crucible_vcr_request(
        rqctx: RequestContext<Self::Context>,
        path_params: Path<latest::disk::VCRRequestPathParams>,
        request: TypedBody<latest::disk::InstanceVCRReplace>,
    ) -> Result<HttpResponseOk<crucible_client_types::ReplaceResult>, HttpError>;

    /// Issues an NMI to the instance.
    #[endpoint {
        method = POST,
        path = "/instance/nmi",
    }]
    async fn instance_issue_nmi(
        rqctx: RequestContext<Self::Context>,
    ) -> Result<HttpResponseOk<()>, HttpError>;
}


================================================
FILE: crates/propolis-types/Cargo.toml
================================================
[package]
name = "propolis_types"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
doctest = false

[dependencies]
schemars = { workspace = true, features = [ "uuid1" ] }
serde.workspace = true

[dev-dependencies]
serde_json.workspace = true
serde_test.workspace = true


================================================
FILE: crates/propolis-types/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Fundamental types shared by other Propolis crates.
//!
//! This crate defines some basic types that are shared by multiple other
//! Propolis crates (library, client, server, and/or standalone) such that they
//! can all use those types (and implement their own conversions to/from them)
//! without any layering oddities.

use std::fmt::Display;
use std::io::{Error, ErrorKind};
use std::str::FromStr;

use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

// Exclusive upper bounds used by `PciPath::new` to validate the device and
// function numbers of a PCI path.
const PCI_DEVICES_PER_BUS: u8 = 32;
const PCI_FUNCTIONS_PER_DEVICE: u8 = 8;

/// A PCI bus/device/function tuple.
//
// N.B. Field names here should be kept in sync with the helper struct in the
// Deserialize impl below.
//
// The fields are private and `Deserialize` is implemented by hand (rather
// than derived) so that every value is built through `PciPath::new`, which
// range-checks `device` and `function`.
#[derive(
    Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug, JsonSchema, Serialize,
)]
pub struct PciPath {
    // Any `u8` is accepted for the bus number.
    bus: u8,
    // Checked by `PciPath::new` to be below `PCI_DEVICES_PER_BUS`.
    device: u8,
    // Checked by `PciPath::new` to be below `PCI_FUNCTIONS_PER_DEVICE`.
    function: u8,
}

impl PciPath {
    /// Creates a PCI path from its bus, device, and function numbers.
    ///
    /// # Errors
    ///
    /// Returns an [`ErrorKind::InvalidInput`] error if `device` is not below
    /// `PCI_DEVICES_PER_BUS` or `function` is not below
    /// `PCI_FUNCTIONS_PER_DEVICE`. The device number is checked first.
    pub fn new(
        bus: u8,
        device: u8,
        function: u8,
    ) -> Result<Self, std::io::Error> {
        let complaint = if device >= PCI_DEVICES_PER_BUS {
            Some(format!(
                "PCI device {device} outside range of 0-{}",
                PCI_DEVICES_PER_BUS - 1
            ))
        } else if function >= PCI_FUNCTIONS_PER_DEVICE {
            Some(format!(
                "PCI function {function} outside range of 0-{}",
                PCI_FUNCTIONS_PER_DEVICE - 1
            ))
        } else {
            None
        };

        match complaint {
            Some(msg) => Err(Error::new(ErrorKind::InvalidInput, msg)),
            None => Ok(Self { bus, device, function }),
        }
    }

    /// Returns the bus number.
    #[inline]
    pub fn bus(&self) -> u8 {
        let Self { bus, .. } = *self;
        bus
    }

    /// Returns the device number.
    #[inline]
    pub fn device(&self) -> u8 {
        let Self { device, .. } = *self;
        device
    }

    /// Returns the function number.
    #[inline]
    pub fn function(&self) -> u8 {
        let Self { function, .. } = *self;
        function
    }
}

impl FromStr for PciPath {
    type Err = std::io::Error;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let mut fields = Vec::with_capacity(3);
        for f in s.split('.') {
            fields.push(u8::from_str(f).map_err(|e| {
                Self::Err::new(
                    ErrorKind::InvalidInput,
                    format!("Failed to parse PCI path {s}: {e}"),
                )
            })?);
        }

        if fields.len() != 3 {
            return Err(Self::Err::new(
                ErrorKind::InvalidInput,
                format!(
                    "Expected 3 fields in PCI path {s}, got {}",
                    fields.len()
                ),
            ));
        }

        Self::new(fields[0], fields[1], fields[2])
    }
}

impl Display for PciPath {
    /// Writes the path as `bus.device.function` in decimal, the same form
    /// accepted by the `FromStr` implementation.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}.{}.{}", self.bus, self.device, self.function)
    }
}

impl<'de> Deserialize<'de> for PciPath {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // N.B. The field names here should be kept in sync with the actual
        // PciPath structure above.
        #[derive(Deserialize)]
        struct Raw {
            bus: u8,
            device: u8,
            function: u8,
        }

        let raw = Raw::deserialize(deserializer)?;

        Self::new(raw.bus, raw.device, raw.function)
            .map_err(|e| serde::de::Error::custom(e.to_string()))
    }
}

#[cfg(test)]
mod test {
    use super::PciPath;
    use std::str::FromStr;

    /// Checks string parsing against a table of valid and invalid inputs.
    #[test]
    fn pci_path_from_str() {
        const TEST_CASES: &[(&str, Result<PciPath, ()>)] = &[
            ("0.7.0", Ok(PciPath { bus: 0, device: 7, function: 0 })),
            ("1.2.3", Ok(PciPath { bus: 1, device: 2, function: 3 })),
            ("0.40.0", Err(())),
            ("0.1.9", Err(())),
            ("255.254.253", Err(())),
            ("1000.0.0", Err(())),
            ("4/3/4", Err(())),
            ("a.b.c", Err(())),
            ("1.5#4", Err(())),
            ("", Err(())),
            ("alas, poor PCI device", Err(())),
        ];

        for (input, expected) in TEST_CASES {
            if let Ok(path) = PciPath::from_str(input) {
                assert_eq!(path, expected.unwrap());
            } else {
                assert!(
                    expected.is_err(),
                    "Expected error parsing PCI path {input}"
                );
            }
        }
    }

    /// Deserializes `input` as JSON and panics unless the outcome (success
    /// with an equal value, or failure) matches `expected`.
    fn check_pci_path_deserialization<E>(
        input: &str,
        expected: Result<PciPath, E>,
    ) {
        match serde_json::from_str::<PciPath>(input) {
            Ok(parsed) => match expected {
                Ok(wanted) => assert_eq!(parsed, wanted),
                Err(_) => {
                    panic!("expected to fail to deserialize input: {input}")
                }
            },
            Err(e) => {
                if expected.is_ok() {
                    panic!("failed to deserialize input {input}: {e}")
                }
            }
        }
    }

    /// Checks JSON deserialization against a small table of cases.
    #[test]
    fn pci_path_deserialization() {
        const TEST_CASES: &[(&str, Result<PciPath, ()>)] = &[
            (
                r#"{"bus": 0, "device": 7, "function": 0}"#,
                Ok(PciPath { bus: 0, device: 7, function: 0 }),
            ),
            (
                r#"{"bus": 1, "device": 2, "function": 3}"#,
                Ok(PciPath { bus: 1, device: 2, function: 3 }),
            ),
            (r#"{"bus": 0, "device": 40, "function": 0}"#, Err(())),
            (r#"{"bus": 0, "device": 1, "function": 9}"#, Err(())),
        ];

        for &(input, expected) in TEST_CASES {
            check_pci_path_deserialization(input, expected);
        }
    }

    // This test is expensive, so don't run it by default.
    /// Compares deserialization against `PciPath::new` for every possible
    /// bus/device/function combination.
    #[test]
    #[ignore]
    fn pci_path_deserialization_exhaustive() {
        for bus in 0..=u8::MAX {
            for device in 0..=u8::MAX {
                for function in 0..=u8::MAX {
                    let json = format!(
                        r#"{{"bus": {bus},"device": {device},"function": {function}}}"#
                    );
                    let expected = PciPath::new(bus, device, function);

                    check_pci_path_deserialization(&json, expected);
                }
            }
        }
    }
}

/// A CPUID leaf/subleaf (function/index) specifier.
//
// The derived `PartialOrd`/`Ord` compare fields in declaration order: first
// by `leaf`, then by `subleaf`, where `None` sorts before any `Some(_)`.
#[derive(
    Clone,
    Copy,
    PartialEq,
    Eq,
    PartialOrd,
    Ord,
    Debug,
    JsonSchema,
    Serialize,
    Deserialize,
)]
pub struct CpuidIdent {
    /// A leaf number.
    pub leaf: u32,

    /// A subleaf number, or `None` if the leaf is not expected to use
    /// subleaves.
    ///
    /// When matching CPUID input values to a [`CpuidIdent`], a subleaf of
    /// `None` matches any value in ecx, while a value of `Some(s)` only matches
    /// inputs where ecx is equal to `s`.
    pub subleaf: Option<u32>,
}

impl CpuidIdent {
    /// Constructs an identifier that describes a specific leaf with no subleaf.
    pub fn leaf(leaf: u32) -> Self {
        Self { leaf, subleaf: None }
    }

    /// Constructs an identifier that specifies a leaf and subleaf.
    pub fn subleaf(leaf: u32, subleaf: u32) -> Self {
        Self { leaf, subleaf: Some(subleaf) }
    }
}

/// Values returned by a CPUID instruction.
//
// The derived `Default` yields a value with all four registers set to zero.
#[derive(
    Clone,
    Copy,
    PartialEq,
    Eq,
    Debug,
    JsonSchema,
    Serialize,
    Deserialize,
    Default,
)]
pub struct CpuidValues {
    pub eax: u32,
    pub ebx: u32,
    pub ecx: u32,
    pub edx: u32,
}

impl CpuidValues {
    /// Returns an iterator yielding mutable references to the four registers
    /// in the order eax, ebx, ecx, edx.
    pub fn iter_mut(&mut self) -> impl Iterator<Item = &mut u32> {
        let Self { eax, ebx, ecx, edx } = self;
        IntoIterator::into_iter([eax, ebx, ecx, edx])
    }

    /// Reports whether every register (eax, ebx, ecx, and edx) holds zero.
    pub fn all_zero(&self) -> bool {
        [self.eax, self.ebx, self.ecx, self.edx].iter().all(|&reg| reg == 0)
    }
}

/// Copies the four registers of a `CpuidResult` into a `CpuidValues`.
#[cfg(target_arch = "x86_64")]
impl From<core::arch::x86_64::CpuidResult> for CpuidValues {
    fn from(value: core::arch::x86_64::CpuidResult) -> Self {
        let core::arch::x86_64::CpuidResult { eax, ebx, ecx, edx } = value;
        Self { eax, ebx, ecx, edx }
    }
}

/// Interprets a four-element array as `[eax, ebx, ecx, edx]`.
impl From<[u32; 4]> for CpuidValues {
    fn from(value: [u32; 4]) -> Self {
        let [eax, ebx, ecx, edx] = value;
        Self { eax, ebx, ecx, edx }
    }
}

/// A CPU vendor to use when interpreting the meanings of CPUID leaves in the
/// extended ID range (0x80000000 to 0x8000FFFF).
//
// With `rename_all = "snake_case"` the variants are (de)serialized under the
// names `amd` and `intel`; the `Display` impl below prints "AMD" and "Intel"
// instead.
#[derive(
    Clone, Copy, PartialEq, Eq, Debug, JsonSchema, Serialize, Deserialize,
)]
#[serde(rename_all = "snake_case")]
pub enum CpuidVendor {
    Amd,
    Intel,
}

impl std::fmt::Display for CpuidVendor {
    /// Writes the vendor's conventional name: "AMD" or "Intel".
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Amd => "AMD",
            Self::Intel => "Intel",
        };
        f.write_str(label)
    }
}

impl CpuidVendor {
    /// Returns `true` if this vendor is AMD.
    pub fn is_amd(self) -> bool {
        matches!(self, Self::Amd)
    }

    /// Returns `true` if this vendor is Intel.
    pub fn is_intel(self) -> bool {
        matches!(self, Self::Intel)
    }
}

impl TryFrom<CpuidValues> for CpuidVendor {
    type Error = &'static str;

    /// Identifies the vendor from the 12-byte vendor string carried in the
    /// ebx, edx, and ecx registers (in that order, each little-endian).
    ///
    /// eax is ignored. Any string other than `AuthenticAMD` or
    /// `GenuineIntel` yields the error `"unrecognized vendor"`.
    fn try_from(value: CpuidValues) -> Result<Self, Self::Error> {
        // Reassemble the vendor string: ebx holds bytes 0-3, edx bytes 4-7,
        // and ecx bytes 8-11.
        let mut vendor = [0u8; 12];
        vendor[0..4].copy_from_slice(&value.ebx.to_le_bytes());
        vendor[4..8].copy_from_slice(&value.edx.to_le_bytes());
        vendor[8..12].copy_from_slice(&value.ecx.to_le_bytes());

        match &vendor {
            b"AuthenticAMD" => Ok(Self::Amd),
            b"GenuineIntel" => Ok(Self::Intel),
            _ => Err("unrecognized vendor"),
        }
    }
}


================================================
FILE: crates/rfb/Cargo.toml
================================================
[package]
name = "rfb"
version = "0.0.0"
description = "Implementation of the RFB protocol (RFC 6143)"
readme = "README.md"
license = "MPL-2.0"
edition = "2021"

[lib]
name = "rfb"
path = "src/lib.rs"

# Prevent cargo from building shared code like a binary
[[example]]
name = "example-shared"
path = "examples/shared.rs"
crate-type = ["rlib"]

[[example]]
name = "example-socket"
path = "examples/socket.rs"

[[example]]
name = "example-websock"
path = "examples/websock.rs"
required-features = ["tungstenite"]


[dependencies]
ascii = { version = "1.1", default-features = false }
bitflags.workspace = true
futures.workspace = true
thiserror.workspace = true
rgb_frame.workspace = true
strum = { workspace = true, features = ["derive"] }
tokio = { workspace = true, features = ["full"] }
tokio-util = { workspace = true, features = ["codec"] }
tokio-tungstenite = { workspace = true, optional = true }
zerocopy = { workspace = true, features = ["derive"] }

[dev-dependencies]
anyhow.workspace = true
clap = { workspace = true, features = ["derive"] }
image = { version = "0.25.1", default-features = false, features = ["png"] }
dropshot.workspace = true
slog.workspace = true
slog-envlogger = "2.2.0"
slog-term.workspace = true

[features]
default = []
tungstenite = ["dep:tokio-tungstenite"]


================================================
FILE: crates/rfb/README.md
================================================
# RFB

This crate provides a server-side implementation of the Remote Framebuffer
Protocol. Consumers of the crate can use the implementation while providing
their own framebuffer data by implementing the trait `rfb::server::Server`.

RFB is the protocol used to implement VNC. See [RFC
6143](https://www.rfc-editor.org/rfc/rfc6143.html) for details.

## Example Server

See the [socket example](examples/socket.rs) for a trivial
implementation.

To run the example, run:
```bash
$ cargo build --example example-socket
$ ./target/debug/examples/example-socket
```

Then connect to the VNC server with your favorite client (such as
[noVNC](https://github.com/novnc/noVNC)) at localhost:9000.


================================================
FILE: crates/rfb/examples/shared.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// Copyright 2024 Oxide Computer Company

#![allow(dead_code)]

use std::io::{BufReader, Cursor};

use clap::ValueEnum;
use image::io::Reader as ImageReader;
use rgb_frame::*;
use slog::Drain;

use rfb::encodings::RawEncoding;
use rfb::proto::{
    FramebufferUpdate, PixelFormat, Position, Rectangle, Resolution,
};

// PNG images embedded into the binary at compile time; they are decoded on
// demand by `generate_image`.
const IMG_OXIDE: &[u8] = include_bytes!("images/oxide.png");
const IMG_COLORBARS: &[u8] = include_bytes!("images/color-bars.png");

// What the example backend displays: one of the two embedded PNG images, or
// a frame filled with a single solid color (see `generate_frame`).
//
// NOTE(review): plain `//` comments are used here on the assumption that doc
// comments on a `ValueEnum` would show up in the CLI help -- confirm before
// converting.
#[derive(ValueEnum, Debug, Copy, Clone)]
pub enum Image {
    Oxide,
    ColorBars,
    Red,
    Green,
    Blue,
    White,
    Black,
}
/// Example framebuffer source that always renders the same [`Image`].
#[derive(Clone)]
pub struct ExampleBackend(Image);

impl ExampleBackend {
    /// Creates a backend that renders `img`.
    pub fn new(img: Image) -> Self {
        Self(img)
    }

    /// Renders the configured image at `width` x `height` and wraps it in a
    /// single raw-encoded rectangle positioned at the origin.
    ///
    /// If `format` can be converted to a FourCC, the frame is converted to
    /// that pixel format first; otherwise it is sent as generated. The
    /// rectangle dimensions are `width`/`height` cast to `u16`.
    pub async fn generate(
        &self,
        width: usize,
        height: usize,
        format: &PixelFormat,
    ) -> FramebufferUpdate {
        let mut frame = generate_frame(Size { width, height }, self.0);

        // A pixel format with no FourCC equivalent is silently left
        // unconverted.
        match format.try_into() {
            Ok(fourcc) => {
                frame.convert(fourcc);
            }
            Err(_) => {}
        }

        let dimensions =
            Resolution { width: width as u16, height: height as u16 };
        let payload = RawEncoding::new(frame.bytes().to_vec());

        FramebufferUpdate(vec![Rectangle {
            position: Position { x: 0, y: 0 },
            dimensions,
            data: Box::new(payload),
        }])
    }
}

/// Frame dimensions in pixels.
#[derive(Copy, Clone)]
struct Size {
    width: usize,
    height: usize,
}

impl Size {
    /// Returns the number of bytes needed for a tightly packed frame of this
    /// size with `bytes_per_pixel` bytes per pixel.
    const fn len(&self, bytes_per_pixel: usize) -> usize {
        let Self { width, height } = *self;
        width * height * bytes_per_pixel
    }
}

/// Decodes `img_bytes` and copies it into a new `AB24` frame of the
/// requested size.
///
/// Pixels are copied from the decoded image's top-left corner; any frame
/// pixel that falls outside the decoded image is filled with opaque black.
///
/// # Panics
///
/// Panics if the image format cannot be guessed or the image fails to
/// decode.
fn generate_image(size: Size, img_bytes: &[u8]) -> Frame {
    let reader = ImageReader::new(BufReader::new(Cursor::new(img_bytes)));
    let decoded =
        reader.with_guessed_format().unwrap().decode().unwrap().into_rgba8();

    let spec = Spec::new(size.width, size.height, FourCC::AB24);
    Frame::new_uninit(spec, |data, stride| {
        for y in 0..size.height {
            for x in 0..size.width {
                // Fall back to opaque black outside the source image.
                let rgba = decoded
                    .get_pixel_checked(x as u32, y as u32)
                    .map_or([0, 0, 0, 0xff], |px| px.0);

                // Each pixel occupies four consecutive bytes in its row.
                let base = y * stride.get() + x * 4;
                for (offset, channel) in rgba.into_iter().enumerate() {
                    data[base + offset].write(channel);
                }
            }
        }
    })
}

/// Builds a frame of the requested size in which every pixel is the opaque
/// color `rgb_pixel` (given as `[red, green, blue]`).
///
/// Each pixel is written as the four bytes red, green, blue, alpha (0xff),
/// which is the same in-memory layout `generate_image` produces. The frame
/// is therefore tagged `FourCC::AB24`, matching `generate_image`; tagging
/// this byte order as `BA24` would cause the channels to be misinterpreted
/// when the frame is later converted to the client's pixel format.
fn generate_solid(size: Size, rgb_pixel: [u8; 3]) -> Frame {
    Frame::new_uninit(
        Spec::new(size.width, size.height, FourCC::AB24),
        |data, stride| {
            for y in 0..size.height {
                for x in 0..size.width {
                    // Each pixel occupies four consecutive bytes in its row.
                    let idx = y * stride.get() + x * 4;
                    data[idx].write(rgb_pixel[0]);
                    data[idx + 1].write(rgb_pixel[1]);
                    data[idx + 2].write(rgb_pixel[2]);
                    // Fully opaque.
                    data[idx + 3].write(0xff);
                }
            }
        },
    )
}

/// Produces the frame for `img`: the two picture variants decode the
/// corresponding embedded PNG, and every other variant yields a solid fill
/// of that color.
fn generate_frame(size: Size, img: Image) -> Frame {
    let fill = match img {
        // Picture variants are handled by the PNG decoder.
        Image::Oxide => return generate_image(size, IMG_OXIDE),
        Image::ColorBars => return generate_image(size, IMG_COLORBARS),
        // Solid colors, as [red, green, blue].
        Image::Red => [255, 0, 0],
        Image::Green => [0, 255, 0],
        Image::Blue => [0, 0, 255],
        Image::White => [255, 255, 255],
        Image::Black => [0, 0, 0],
    };

    generate_solid(size, fill)
}

/// Builds the root logger used by the examples: a terminal drain with the
/// full format, wrapped in an `EnvLogger`, then in a `Mutex`, with no
/// key-value pairs attached to the root.
pub fn build_logger() -> slog::Logger {
    let decorator = slog_term::TermDecorator::new().build();
    let terminal = slog_term::FullFormat::new(decorator).build().fuse();
    let filtered = slog_envlogger::EnvLogger::new(terminal).fuse();
    let shared = std::sync::Mutex::new(filtered).fuse();

    slog::Logger::root(shared, slog::o!())
}


================================================
FILE: crates/rfb/examples/socket.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// Copyright 2022 Oxide Computer Company

use std::net::{IpAddr, Ipv4Addr, SocketAddr};

use anyhow::Result;
use clap::Parser;
use futures::StreamExt;
use slog::info;
use tokio::net::TcpListener;
use tokio_util::codec::FramedRead;

use rfb::proto::{
    ClientMessageDecoder, PixelFormat, ProtoVersion, Resolution, SecurityType,
    SecurityTypes,
};
use rgb_frame::FourCC;

mod shared;
use shared::{ExampleBackend, Image};

// Framebuffer dimensions, in pixels, advertised to connecting clients as the
// server's resolution.
const WIDTH: usize = 1024;
const HEIGHT: usize = 768;

// Command-line arguments for the socket example.
//
// NOTE(review): the `///` text below is presumably what clap prints as help
// output, so it is left untouched here. It refers to the binary as
// `./example-server`, whereas this example is declared in Cargo.toml as
// `example-socket` -- confirm and update the help text if that is stale.
#[derive(Parser, Debug)]
/// A simple VNC server that displays a single image or color, in a given pixel format
///
/// By default, the server will display the Oxide logo image using little-endian
/// xBGR as its pixel format.
///
/// To specify an alternate image or color, use the `-i` flag:
/// ./example-server -i colorbars
/// ./example-server -i red
///
/// To specify an alternate pixel format, use the --fourcc flag. The server will
/// transform the input image/color to the pixel format corresponding to the
/// specified fourcc and use the format for the RFB protocol.
///
/// For example, to use big-endian xRGB:
/// ./example-server --fourcc XR24
///
struct Args {
    /// Image/color to display from the server
    #[clap(value_enum, short, long, default_value_t = Image::Oxide)]
    image: Image,

    /// FourCC for pixel format
    #[clap(long, default_value_t = FourCC::XB24)]
    fourcc: FourCC,
}

/// Runs the example RFB server over a plain TCP socket.
///
/// The server listens on `127.0.0.1:9000`. Each accepted connection goes
/// through the RFB handshake and is then handed to its own task, which
/// answers every `FramebufferUpdateRequest` with the image selected on the
/// command line until the client disconnects or an error occurs.
///
/// # Errors
///
/// Returns an error if the listening socket cannot be bound or if accepting
/// a connection fails.
#[tokio::main]
async fn main() -> Result<()> {
    let log = shared::build_logger();

    let args = Args::parse();

    let pf: PixelFormat = args.fourcc.into();
    info!(
        log,
        "Starting server: image: {:?}, pixel format; {:#?}", args.image, pf
    );

    let backend = ExampleBackend::new(args.image);

    // Bind to loopback only: this example offers `SecurityType::None`, so it
    // should not be reachable from other hosts by default. This also matches
    // the websocket example.
    let listener = TcpListener::bind(SocketAddr::new(
        IpAddr::V4(Ipv4Addr::LOCALHOST),
        9000,
    ))
    .await?;

    loop {
        // Surface accept failures through the `Result` instead of panicking.
        let (mut sock, addr) = listener.accept().await?;

        info!(log, "New connection from {:?}", addr);
        let log_child = log.new(slog::o!("sock" => addr));

        let init_res = rfb::server::initialize(
            &mut sock,
            rfb::server::InitParams {
                version: ProtoVersion::Rfb38,

                sec_types: SecurityTypes(vec![
                    SecurityType::None,
                    SecurityType::VncAuthentication,
                ]),

                name: "rfb-example-server".to_string(),

                resolution: Resolution {
                    width: WIDTH as u16,
                    height: HEIGHT as u16,
                },
                format: pf.clone(),
            },
        )
        .await;

        // A failed handshake only affects this client; keep serving others.
        if let Err(e) = init_res {
            slog::info!(log_child, "Error during client init {:?}", e);
            continue;
        }

        let be_clone = backend.clone();
        let input_pf = pf.clone();
        tokio::spawn(async move {
            // Until the client sends SetPixelFormat, updates are generated in
            // the format that was advertised during init.
            let mut output_pf = input_pf.clone();
            let mut decoder =
                FramedRead::new(sock, ClientMessageDecoder::default());
            loop {
                let msg = match decoder.next().await {
                    Some(Ok(m)) => m,
                    Some(Err(e)) => {
                        slog::info!(
                            log_child,
                            "Error reading client msg: {:?}",
                            e
                        );
                        return;
                    }
                    // Stream ended: the client went away.
                    None => {
                        return;
                    }
                };
                // Replies are written to the same socket the decoder reads.
                let sock = decoder.get_mut();

                use rfb::proto::ClientMessage;

                match msg {
                    ClientMessage::SetPixelFormat(out_pf) => {
                        output_pf = out_pf;
                    }
                    ClientMessage::FramebufferUpdateRequest(_req) => {
                        let fbu =
                            be_clone.generate(WIDTH, HEIGHT, &output_pf).await;

                        if let Err(e) = fbu.write_to(sock).await {
                            slog::info!(
                                log_child,
                                "Error sending FrambufferUpdate: {:?}",
                                e
                            );
                            return;
                        }
                    }
                    _ => {
                        slog::debug!(log_child, "RX: Client msg {:?}", msg);
                    }
                }
            }
        });
    }
}


================================================
FILE: crates/rfb/examples/websock.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// Copyright 2022 Oxide Computer Company

use std::net::{IpAddr, Ipv4Addr, SocketAddr};

use anyhow::Result;
use clap::Parser;
use dropshot::{
    channel, ApiDescription, ConfigDropshot, HttpServerStarter, Query,
    RequestContext, WebsocketConnection,
};
use futures::StreamExt;
use slog::info;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
use tokio_tungstenite::tungstenite::protocol::Role;
use tokio_util::codec::FramedRead;

use rfb::proto::{
    ClientMessageDecoder, PixelFormat, ProtoVersion, Resolution, SecurityType,
    SecurityTypes,
};
use rfb::{self, tungstenite::BinaryWs};
use rgb_frame::FourCC;

mod shared;
use shared::{build_logger, ExampleBackend, Image};

/// Framebuffer width: advertised to clients in the init parameters and passed
/// to the backend when generating each framebuffer update.
const WIDTH: usize = 1024;
/// Framebuffer height: advertised to clients in the init parameters and
/// passed to the backend when generating each framebuffer update.
const HEIGHT: usize = 768;

// NOTE: the `///` comments on this struct and its fields are consumed by
// clap's derive macro and become the `--help` output, so treat them as
// user-facing text rather than internal documentation.
#[derive(Parser, Debug)]
/// A simple VNC server that displays a single image or color, in a given pixel format
struct Args {
    /// Image/color to display from the server
    #[clap(value_enum, short, long, default_value_t = Image::Oxide)]
    image: Image,

    /// FourCC for pixel format
    #[clap(long, default_value_t = FourCC::XB24)]
    fourcc: FourCC,
}

/// Server-wide context handed to dropshot; each websocket connection clones
/// both fields before starting its RFB session.
struct AppCtx {
    /// Backend that produces framebuffer updates.
    be: ExampleBackend,
    /// Pixel format selected on the command line, advertised during init.
    pf: PixelFormat,
}

/// Drives one RFB session over an already-established websocket.
///
/// Performs the RFB handshake advertising `input_pf`, then processes client
/// messages until the stream ends or an error occurs. Each
/// `FramebufferUpdateRequest` is answered with a frame from `be`, rendered in
/// the pixel format most recently requested by the client (initially
/// `input_pf`). Errors are logged to `log` and end the session.
async fn run_server(
    mut sock: BinaryWs<impl AsyncRead + AsyncWrite + Unpin>,
    be: ExampleBackend,
    input_pf: PixelFormat,
    log: &slog::Logger,
) {
    use rfb::proto::ClientMessage;

    let params = rfb::server::InitParams {
        version: ProtoVersion::Rfb38,
        sec_types: SecurityTypes(vec![
            SecurityType::None,
            SecurityType::VncAuthentication,
        ]),
        name: "rfb-ws-example".to_string(),
        resolution: Resolution { width: WIDTH as u16, height: HEIGHT as u16 },
        format: input_pf.clone(),
    };

    match rfb::server::initialize(&mut sock, params).await {
        Err(e) => {
            slog::info!(log, "Error during client init {:?}", e);
            return;
        }
        Ok(client_init) => {
            slog::debug!(log, "Client initialized {:?}", client_init);
        }
    }

    // The client may override the output format later via SetPixelFormat.
    let mut output_pf = input_pf;
    let mut decoder = FramedRead::new(sock, ClientMessageDecoder::default());

    // The loop ends on its own once the client closes the stream.
    while let Some(decoded) = decoder.next().await {
        let msg = match decoded {
            Ok(m) => m,
            Err(e) => {
                slog::info!(log, "Error reading client msg: {:?}", e);
                return;
            }
        };

        match msg {
            ClientMessage::SetPixelFormat(out_pf) => {
                output_pf = out_pf;
            }
            ClientMessage::FramebufferUpdateRequest(_req) => {
                let fbu = be.generate(WIDTH, HEIGHT, &output_pf).await;

                // Replies go out on the same socket the decoder reads from.
                let sock = decoder.get_mut();
                if let Err(e) = fbu.write_to(sock).await {
                    slog::info!(log, "Error sending FrambufferUpdate: {:?}", e);
                    return;
                }
                if let Err(e) = sock.flush().await {
                    slog::info!(
                        log,
                        "Error flushing after FrambufferUpdate: {:?}",
                        e
                    );
                    return;
                }
            }
            other => {
                slog::debug!(log, "RX: Client msg {:?}", other);
            }
        }
    }
}

#[tokio::main]
async fn main() -> Result<(), String> {
    let log = build_logger();

    let args = Args::parse();

    let pf = args.fourcc.into();
    let backend = ExampleBackend::new(args.image);
    let app = AppCtx { be: backend, pf };

    // Build a description of the API.
    let mut api = ApiDescription::new();
    api.register(ws_websockify).unwrap();

    // Set up the server.
    let config_dropshot = ConfigDropshot {
        bind_address: SocketAddr::new(
            IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)),
            3030,
        ),
        ..Default::default()
    };
    let server = HttpServerStarter::new(&config_dropshot, api, app, &log)
        .map_err(|error| format!("failed to create server: {error}"))?
        .start();

    server.await
}

// HTTP API interface

// Websocket endpoint for RFB clients.
//
// Wraps the upgraded connection in a server-role tungstenite stream, adapts
// it with `BinaryWs`, and runs one RFB session on it using the backend and
// pixel format stored in the server context. The handler returns `Ok(())`
// once `run_server` finishes, whether it ended cleanly or on an error.
//
// (Plain `//` comments are used here on purpose so the text does not feed
// into the `#[channel]` macro.)
#[channel {
    protocol = WEBSOCKETS,
    path = "/websockify",
}]
async fn ws_websockify(
    rqctx: RequestContext<AppCtx>,
    _qp: Query<()>,
    upgraded: WebsocketConnection,
) -> dropshot::WebsocketChannelResult {
    let ws = tokio_tungstenite::WebSocketStream::from_raw_socket(
        upgraded.into_inner(),
        Role::Server,
        None,
    )
    .await;

    info!(rqctx.log, "New connection from {}", rqctx.request.remote_addr());
    // Clone the shared state so the session owns its own copies.
    let be = rqctx.server.private.be.clone();
    let pf = rqctx.server.private.pf.clone();
    run_server(BinaryWs::new(ws), be, pf, &rqctx.log).await;

    Ok(())
}


================================================
FILE: crates/rfb/src/encodings.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// Copyright 2022 Oxide Computer Company

use crate::proto::{Position, Resolution};

use strum::FromRepr;

/// Encoding and pseudo-encoding identifiers known to this crate.
///
/// Each discriminant is the signed 32-bit value used for that encoding on
/// the wire; `FromRepr` provides the reverse mapping from a raw `i32`. The
/// variants with negative values are the pseudo-encodings.
#[derive(Debug, FromRepr, Ord, PartialOrd, Eq, PartialEq)]
#[repr(i32)]
pub enum EncodingType {
    Raw = 0,
    CopyRect = 1,
    RRE = 2,
    CoRRE = 4,
    Hextile = 5,
    Zlib = 6,
    TRLE = 15,
    ZRLE = 16,
    JPEG = 21,
    JRLE = 22,
    ZRLE2 = 24,
    DesktopSizePseudo = -223,
    LastRectPseudo = -224,
    CursorPseudo = -239,
    ContinuousUpdatesPseudo = -313,
}

/// Pixel data for one rectangle, in some encoding.
///
/// Implementations must be `Send` so rectangles can cross task boundaries.
pub trait Encoding: Send {
    /// The encoding identifier that describes the bytes from [`encode`].
    ///
    /// [`encode`]: Encoding::encode
    fn get_type(&self) -> EncodingType;

    /// Returns the encoded bytes for this rectangle, borrowed from `self`,
    /// in the form that can be passed to the client.
    fn encode(&self) -> &[u8];
}

/// Section 7.7.1
///
/// Raw encoding: the pixel bytes are handed to the client exactly as they
/// were supplied to [`RawEncoding::new`].
pub struct RawEncoding {
    pixels: Vec<u8>,
}

impl RawEncoding {
    /// Wraps already-formatted pixel bytes.
    pub fn new(pixels: Vec<u8>) -> Self {
        RawEncoding { pixels }
    }
}

impl Encoding for RawEncoding {
    fn get_type(&self) -> EncodingType {
        EncodingType::Raw
    }

    fn encode(&self) -> &[u8] {
        self.pixels.as_slice()
    }
}

// The types below are unfinished placeholders for encodings that are not
// implemented yet: none of them implements `Encoding`, and each carries
// `#[allow(dead_code)]` because nothing constructs it so far.

/// Placeholder for the RRE encoding: a background pixel plus sub-rectangles.
#[allow(dead_code)]
struct RREncoding {
    background_pixel: Pixel,
    sub_rectangles: Vec<RRESubrectangle>,
}

/// A single pixel value held as raw bytes.
#[allow(dead_code)]
struct Pixel {
    bytes: Vec<u8>,
}

/// One solid-colour sub-rectangle of an RRE-encoded rectangle.
#[allow(dead_code)]
struct RRESubrectangle {
    pixel: Pixel,
    position: Position,
    dimensions: Resolution,
}

/// Placeholder for the Hextile encoding: a two-dimensional grid of tiles.
#[allow(dead_code)]
struct HextileEncoding {
    tiles: Vec<Vec<HextileTile>>,
}

/// A Hextile tile, either raw pixel bytes or an encoded description.
#[allow(dead_code)]
enum HextileTile {
    Raw(Vec<u8>),
    Encoded(HextileTileEncoded),
}

/// Encoded Hextile tile contents (incomplete).
#[allow(dead_code)]
struct HextileTileEncoded {
    background: Option<Pixel>,
    foreground: Option<Pixel>,
    // TODO: finish this
}


================================================
FILE: crates/rfb/src/keysym.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// Copyright 2022 Oxide Computer Company

pub use ascii::AsciiChar;
use ascii::ToAsciiChar;
use KeySym::*;

// Raw keysym values recognized by `KeySym::try_from`. Each constant below is
// matched against the 32-bit key value and mapped to the `KeySym` variant of
// the corresponding name.

// ascii characters have the same values as their keysym
const ASCII_MAX: u32 = 0x7f;

const KEYSYM_BACKSPACE: u32 = 0xff08;
const KEYSYM_TAB: u32 = 0xff09;
const KEYSYM_RETURN_ENTER: u32 = 0xff0d;
const KEYSYM_ESCAPE: u32 = 0xff1b;
const KEYSYM_INSERT: u32 = 0xff63;
const KEYSYM_DELETE: u32 = 0xffff;
const KEYSYM_HOME: u32 = 0xff50;
const KEYSYM_END: u32 = 0xff57;
const KEYSYM_PAGE_UP: u32 = 0xff55;
const KEYSYM_PAGE_DOWN: u32 = 0xff56;
const KEYSYM_PRINT: u32 = 0xff61;
const KEYSYM_PAUSE: u32 = 0xff13;
const KEYSYM_CAPS_LOCK: u32 = 0xffe5;
const KEYSYM_SUPER_LEFT: u32 = 0xffeb;
const KEYSYM_SUPER_RIGHT: u32 = 0xffec;
const KEYSYM_MENU: u32 = 0xff67;

const KEYSYM_LEFT: u32 = 0xff51;
const KEYSYM_UP: u32 = 0xff52;
const KEYSYM_RIGHT: u32 = 0xff53;
const KEYSYM_DOWN: u32 = 0xff54;
// function keys are in the range: 0xffbe to 0xffc9, in order
const KEYSYM_F1: u32 = 0xffbe;
const KEYSYM_F12: u32 = 0xffc9;
const KEYSYM_SHIFT_LEFT: u32 = 0xffe1;
const KEYSYM_SHIFT_RIGHT: u32 = 0xffe2;
const KEYSYM_CTRL_LEFT: u32 = 0xffe3;
const KEYSYM_CTRL_RIGHT: u32 = 0xffe4;

// XXX(JPH): do we need to support meta keys?

const KEYSYM_ALT_LEFT: u32 = 0xffe9;
const KEYSYM_ALT_RIGHT: u32 = 0xffea;
const KEYSYM_SCROLL_LOCK: u32 = 0xff14;
const KEYSYM_NUM_LOCK: u32 = 0xff7f;

// Number keypad. Each digit key is listed next to the keysym of the
// navigation function that shares the key.
const KEYSYM_KP_ENTER: u32 = 0xff8d;
const KEYSYM_KP_SLASH: u32 = 0xffaf;
const KEYSYM_KP_ASTERISK: u32 = 0xffaa;
const KEYSYM_KP_MINUS: u32 = 0xffad;
const KEYSYM_KP_PLUS: u32 = 0xffab;
const KEYSYM_KP_7: u32 = 0xffb7;
const KEYSYM_KP_HOME: u32 = 0xff95;
const KEYSYM_KP_8: u32 = 0xffb8;
const KEYSYM_KP_UP: u32 = 0xff97;
const KEYSYM_KP_9: u32 = 0xffb9;
const KEYSYM_KP_PGUP: u32 = 0xff9a;
const KEYSYM_KP_4: u32 = 0xffb4;
const KEYSYM_KP_LEFT: u32 = 0xff96;
const KEYSYM_KP_5: u32 = 0xffb5;
const KEYSYM_KP_EMPTY: u32 = 0xff9d;
const KEYSYM_KP_6: u32 = 0xffb6;
const KEYSYM_KP_RIGHT: u32 = 0xff98;
const KEYSYM_KP_1: u32 = 0xffb1;
const KEYSYM_KP_END: u32 = 0xff9c;
const KEYSYM_KP_2: u32 = 0xffb2;
const KEYSYM_KP_DOWN: u32 = 0xff99;
const KEYSYM_KP_3: u32 = 0xffb3;
const KEYSYM_KP_PGDOWN: u32 = 0xff9b;
const KEYSYM_KP_0: u32 = 0xffb0;
const KEYSYM_KP_INSERT: u32 = 0xff9e;
const KEYSYM_KP_PERIOD: u32 = 0xffae;
const KEYSYM_KP_DELETE: u32 = 0xff9f;

/// A key identified by a keysym value.
///
/// Obtained from a raw `u32` via `TryFrom<u32>`; values outside the set
/// listed here fail to convert.
#[derive(Debug, Copy, Clone)]
pub enum KeySym {
    /// Any keysym in the ASCII range (`0..=0x7f`).
    Ascii(ascii::AsciiChar),
    Backspace,
    Tab,
    ReturnOrEnter,
    Escape,
    Insert,
    Delete,
    Home,
    End,
    PageUp,
    PageDown,
    Print,
    Pause,
    CapsLock,

    // "super" = windows/command key
    SuperLeft,
    SuperRight,

    // usb-only
    Menu,

    Left,
    Up,
    Right,
    Down,

    /// Function key, numbered from 1 (`FunctionKey(1)` is F1) through 12.
    FunctionKey(u8),

    ShiftLeft,
    ShiftRight,
    ControlLeft,
    ControlRight,
    AltLeft,
    AltRight,
    ScrollLock,

    // Number Keypad
    NumLock,
    KeypadSlash,
    KeypadAsterisk,
    KeypadMinus,
    KeypadPlus,
    KeypadEnter,
    KeypadPeriod,
    KeypadDelete,
    Keypad0,
    KeypadInsert,
    Keypad1,
    KeypadEnd,
    Keypad2,
    KeypadDown,
    Keypad3,
    KeypadPgDown,
    Keypad4,
    KeypadLeft,
    Keypad5,
    KeypadEmpty,
    Keypad6,
    KeypadRight,
    Keypad7,
    KeypadHome,
    Keypad8,
    KeypadUp,
    Keypad9,
    KeypadPgUp,
}

impl TryFrom<u32> for KeySym {
    type Error = ();

    fn try_from(value: u32) -> Result<Self, Self::Error> {
        match value {
            // SAFETY: we're within the valid ascii range
            0..=ASCII_MAX => {
                Ok(Ascii(unsafe { value.to_ascii_char_unchecked() }))
            }
            KEYSYM_BACKSPACE => Ok(Backspace),
            KEYSYM_TAB => Ok(Tab),
            KEYSYM_RETURN_ENTER => Ok(ReturnOrEnter),
            KEYSYM_ESCAPE => Ok(Escape),
            KEYSYM_INSERT => Ok(Insert),
            KEYSYM_DELETE => Ok(Delete),
            KEYSYM_HOME => Ok(Home),
            KEYSYM_END => Ok(End),
            KEYSYM_PAGE_UP => Ok(PageUp),
            KEYSYM_PRINT => Ok(Print),
            KEYSYM_PAUSE => Ok(Pause),
            KEYSYM_CAPS_LOCK => Ok(CapsLock),
            KEYSYM_SUPER_LEFT => Ok(SuperLeft),
            KEYSYM_SUPER_RIGHT => Ok(SuperRight),
            KEYSYM_MENU => Ok(Menu),

            KEYSYM_PAGE_DOWN => Ok(PageDown),
            KEYSYM_LEFT => Ok(Left),
            KEYSYM_UP => Ok(Up),
            KEYSYM_RIGHT => Ok(Right),
            KEYSYM_DOWN => Ok(Down),

            f if (KEYSYM_F1..=KEYSYM_F12).contains(&f) => {
                let n = f - KEYSYM_F1 + 1;
                // TODO: handle cast
                Ok(FunctionKey(n as u8))
            }

            KEYSYM_SHIFT_LEFT => Ok(ShiftLeft),
            KEYSYM_SHIFT_RIGHT => Ok(ShiftRight),
            KEYSYM_CTRL_LEFT => Ok(ControlLeft),
            KEYSYM_CTRL_RIGHT => Ok(ControlRight),
            KEYSYM_ALT_LEFT => Ok(AltLeft),
            KEYSYM_ALT_RIGHT => Ok(AltRight),

            KEYSYM_SCROLL_LOCK => Ok(ScrollLock),
            KEYSYM_NUM_LOCK => Ok(NumLock),

            KEYSYM_KP_ENTER => Ok(KeypadEnter),
            KEYSYM_KP_SLASH => Ok(KeypadSlash),
            KEYSYM_KP_ASTERISK => Ok(KeypadAsterisk),
            KEYSYM_KP_MINUS => Ok(KeypadMinus),
            KEYSYM_KP_PLUS => Ok(KeypadPlus),
            KEYSYM_KP_7 => Ok(Keypad7),
            KEYSYM_KP_HOME => Ok(KeypadHome),
            KEYSYM_KP_8 => Ok(Keypad8),
            KEYSYM_KP_UP => Ok(KeypadUp),
            KEYSYM_KP_9 => Ok(Keypad9),
            KEYSYM_KP_PGUP => Ok(KeypadPgUp),
            KEYSYM_KP_4 => Ok(Keypad4),
            KEYSYM_KP_LEFT => Ok(KeypadLeft),
            KEYSYM_KP_5 => Ok(Keypad5),
            KEYSYM_KP_EMPTY => Ok(KeypadEmpty),
            KEYSYM_KP_6 => Ok(Keypad6),
            KEYSYM_KP_RIGHT => Ok(KeypadRight),
            KEYSYM_KP_1 => Ok(Keypad1),
            KEYSYM_KP_END => Ok(KeypadEnd),
            KEYSYM_KP_2 => Ok(Keypad2),
            KEYSYM_KP_DOWN => Ok(KeypadDown),
            KEYSYM_KP_3 => Ok(Keypad3),
            KEYSYM_KP_PGDOWN => Ok(KeypadPgDown),
            KEYSYM_KP_0 => Ok(Keypad0),
            KEYSYM_KP_INSERT => Ok(KeypadInsert),
            KEYSYM_KP_PERIOD => Ok(KeypadPeriod),
            KEYSYM_KP_DELETE => Ok(KeypadDelete),

            _ => Err(()),
        }
    }
}


================================================
FILE: crates/rfb/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// Copyright 2022 Oxide Computer Company

pub mod encodings;
pub mod keysym;
pub mod proto;
pub mod server;

#[cfg(feature = "tungstenite")]
pub mod tungstenite;


================================================
FILE: crates/rfb/src/proto.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// Copyright 2022 Oxide Computer Company

use std::mem::size_of;

use bitflags::bitflags;
use rgb_frame::FourCC;
use strum::FromRepr;
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
use tokio_util::bytes::{Buf, BytesMut};
use tokio_util::codec::Decoder;
use zerocopy::{FromBytes, Immutable, IntoBytes};

use crate::encodings::{Encoding, EncodingType};
use crate::keysym::KeySym;

/// Errors produced while reading or writing RFB protocol messages.
#[derive(Debug, Error)]
pub enum ProtocolError {
    /// The 12-byte version string did not match any supported version.
    #[error("invalid protocol version")]
    InvalidProtocolVersion,

    /// The peer sent a security type value that is not recognized.
    #[error("invalid security type: {0}")]
    InvalidSecurityType(u8),

    #[error("invalid text encoding")]
    InvalidTextEncoding,

    /// The first byte of a client message is not a known message type.
    #[error("unknown client message type ({0})")]
    UnknownMessageType(u8),

    #[error("unknown keysym: {0}")]
    UnknownKeysym(u32),

    /// A message would exceed the configured size limit.
    #[error("message too large: {0}")]
    TooLarge(usize),

    /// The peer asked for something this crate does not implement.
    #[error("unsupported feature: {0}")]
    UnsupportedFeat(&'static str),

    /// The underlying stream failed.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),
}

/// Result alias used throughout this module, with [`ProtocolError`] as the
/// error type.
pub type Result<T> = std::result::Result<T, ProtocolError>;

/// RFB protocol versions understood by this crate.
///
/// Variants are declared from oldest to newest, so the derived `PartialOrd`
/// orders them by protocol version.
#[derive(Debug, Copy, Clone, PartialEq, PartialOrd)]
pub enum ProtoVersion {
    Rfb33,
    Rfb37,
    Rfb38,
}

impl ProtoVersion {
    /// The exact 12-byte version string exchanged for this version.
    fn handshake_bytes(self) -> &'static [u8; 12] {
        match self {
            Self::Rfb33 => b"RFB 003.003\n",
            Self::Rfb37 => b"RFB 003.007\n",
            Self::Rfb38 => b"RFB 003.008\n",
        }
    }

    /// Reads a 12-byte version string from `stream`.
    ///
    /// Fails with [`ProtocolError::InvalidProtocolVersion`] if the bytes do
    /// not match one of the known versions exactly, or with an I/O error if
    /// the read fails.
    pub async fn read_from(
        stream: &mut (impl AsyncRead + Unpin),
    ) -> Result<Self> {
        let mut wire = [0u8; 12];
        stream.read_exact(&mut wire).await?;

        [Self::Rfb33, Self::Rfb37, Self::Rfb38]
            .iter()
            .copied()
            .find(|candidate| candidate.handshake_bytes() == &wire)
            .ok_or(ProtocolError::InvalidProtocolVersion)
    }

    /// Writes this version's 12-byte version string to `stream`.
    pub async fn write_to(
        self,
        stream: &mut (impl AsyncWrite + Unpin),
    ) -> Result<()> {
        stream.write_all(self.handshake_bytes()).await?;
        Ok(())
    }
}

// Section 7.1.2
/// The list of security types a server offers to a connecting client.
#[derive(Debug, Clone)]
pub struct SecurityTypes(pub Vec<SecurityType>);

/// A security type that can be negotiated during the handshake.
#[derive(Clone, PartialEq, Debug)]
pub enum SecurityType {
    /// No authentication.
    None,
    /// VNC authentication.
    VncAuthentication,
}

impl SecurityTypes {
    pub async fn write_to(
        self,
        stream: &mut (impl AsyncWrite + Unpin),
    ) -> Result<()> {
        // TODO: fix cast
        stream.write_u8(self.0.len() as u8).await?;
        for t in self.0.into_iter() {
            t.write_to(stream).await?;
        }

        Ok(())
    }
}

impl SecurityType {
    // Wire identifiers from RFC 6143 section 7.1.2. Zero is reserved for
    // "Invalid" and must never be sent as an offered type.
    const NONE_ID: u8 = 1;
    const VNC_AUTHENTICATION_ID: u8 = 2;

    /// Reads a one-byte security type from `stream`.
    ///
    /// # Errors
    ///
    /// Returns [`ProtocolError::InvalidSecurityType`] with the offending
    /// byte if the value is not recognized, or [`ProtocolError::Io`] if the
    /// read fails.
    pub async fn read_from(
        stream: &mut (impl AsyncRead + Unpin),
    ) -> Result<Self> {
        let t = stream.read_u8().await?;
        match t {
            Self::NONE_ID => Ok(SecurityType::None),
            Self::VNC_AUTHENTICATION_ID => Ok(SecurityType::VncAuthentication),
            v => Err(ProtocolError::InvalidSecurityType(v)),
        }
    }

    /// Writes this security type to `stream` as a single byte, using the
    /// same identifiers that [`SecurityType::read_from`] accepts.
    pub async fn write_to(
        self,
        stream: &mut (impl AsyncWrite + Unpin),
    ) -> Result<()> {
        let val = match self {
            SecurityType::None => Self::NONE_ID,
            SecurityType::VncAuthentication => Self::VNC_AUTHENTICATION_ID,
        };
        stream.write_u8(val).await?;

        Ok(())
    }
}

// Section 7.1.3
pub enum SecurityResult {
    Success,
    Failure(String),
}

impl SecurityResult {
    pub async fn write_to(
        self,
        stream: &mut (impl AsyncWrite + Unpin),
    ) -> Result<()> {
        match self {
            SecurityResult::Success => {
                stream.write_u32(0).await?;
            }
            SecurityResult::Failure(s) => {
                stream.write_u32(1).await?;
                stream.write_all(s.as_bytes()).await?;
            }
        };
        Ok(())
    }
}

// Section 7.3.1
/// The ClientInit message: a single flag byte sent by the client.
#[derive(Debug)]
pub struct ClientInit {
    /// `true` when the flag byte sent by the client was non-zero.
    pub shared: bool,
}

impl ClientInit {
    /// Reads the one-byte ClientInit message from `stream`.
    pub async fn read_from(
        stream: &mut (impl AsyncRead + Unpin),
    ) -> Result<Self> {
        let shared = stream.read_u8().await? != 0;
        Ok(Self { shared })
    }
}

// Section 7.3.2
#[derive(Debug)]
pub struct ServerInit {
    pub initial_resolution: Resolution,
    pub pixel_format: PixelFormat,
    pub name: String,
}

impl ServerInit {
    pub async fn write_to(
        self,
        stream: &mut (impl AsyncWrite + Unpin),
    ) -> Result<()> {
        self.initial_resolution.write_to(stream).await?;
        self.pixel_format.write_to(stream).await?;

        // TODO: cast properly
        stream.write_u32(self.name.len() as u32).await?;
        stream.write_all(self.name.as_bytes()).await?;

        Ok(())
    }
}

pub struct FramebufferUpdate(pub Vec<Rectangle>);

impl FramebufferUpdate {
    pub async fn write_to(
        self,
        stream: &mut (impl AsyncWrite + Unpin),
    ) -> Result<()> {
        let header = raw::FramebufferUpdateHeader::new(self.0.len() as u16);
        stream.write_all(header.as_bytes()).await?;

        // rectangles
        for r in self.0.into_iter() {
            r.write_to(stream).await?;
        }

        Ok(())
    }
}

/// An x/y coordinate pair, used as the position of a rectangle.
#[derive(Debug, Copy, Clone)]
pub struct Position {
    pub x: u16,
    pub y: u16,
}

/// A width/height pair, used for the framebuffer size and for rectangle
/// dimensions.
#[derive(Debug, Copy, Clone)]
pub struct Resolution {
    pub width: u16,
    pub height: u16,
}

impl Resolution {
    /// Writes the width and then the height to `stream` as `u16` values.
    pub async fn write_to(
        self,
        stream: &mut (impl AsyncWrite + Unpin),
    ) -> Result<()> {
        for dimension in [self.width, self.height] {
            stream.write_u16(dimension).await?;
        }
        Ok(())
    }
}

/// One rectangle of a framebuffer update: where it goes, how big it is, and
/// its pixel data in some encoding.
pub struct Rectangle {
    pub position: Position,
    pub dimensions: Resolution,
    pub data: Box<dyn Encoding>,
}

impl Rectangle {
    /// Writes the rectangle to `stream`: x, y, width and height as `u16`
    /// values, the encoding type as an `i32`, then the encoded bytes.
    pub async fn write_to(
        self,
        stream: &mut (impl AsyncWrite + Unpin),
    ) -> Result<()> {
        let Rectangle { position, dimensions, data } = self;

        let geometry =
            [position.x, position.y, dimensions.width, dimensions.height];
        for field in geometry {
            stream.write_u16(field).await?;
        }

        stream.write_i32(data.get_type() as i32).await?;
        stream.write_all(data.encode()).await?;

        Ok(())
    }
}

// Section 7.4
/// Describes how pixel values are laid out on the wire.
#[derive(Debug, Clone, PartialEq, Immutable)]
pub struct PixelFormat {
    pub bits_per_pixel: u8, // TODO: must be 8, 16, or 32
    pub depth: u8,          // TODO: must be <= bits_per_pixel
    pub big_endian: bool,
    pub color_spec: ColorSpecification,
}

impl PixelFormat {
    /// Converts this format to its raw wire representation and writes those
    /// bytes to `stream`.
    pub async fn write_to(
        self,
        stream: &mut (impl AsyncWrite + Unpin),
    ) -> Result<()> {
        let raw: raw::PixelFormat = self.into();
        stream.write_all(raw.as_bytes()).await?;
        Ok(())
    }
}
impl TryFrom<raw::PixelFormat> for PixelFormat {
    type Error = ProtocolError;

    fn try_from(
        value: raw::PixelFormat,
    ) -> std::result::Result<Self, Self::Error> {
        if value.true_color_flag == 0 {
            // Punt until we choose to support ColorMap
            Err(ProtocolError::UnsupportedFeat("ColorMap (non-truecolor) spec"))
        } else {
            let color_spec = ColorSpecification::ColorFormat(ColorFormat {
                red_max: value.red_max.get(),
                green_max: value.green_max.get(),
                blue_max: value.blue_max.get(),
                red_shift: value.red_shift,
                green_shift: value.green_shift,
                blue_shift: value.blue_shift,
            });

            Ok(Self {
                bits_per_pixel: value.bits_per_pixel,
                depth: value.depth,
                big_endian: value.big_endian_flag != 0,
                color_spec,
            })
        }
    }
}
impl From<PixelFormat> for raw::PixelFormat {
    /// Builds the raw wire representation of a [`PixelFormat`].
    fn from(value: PixelFormat) -> Self {
        // `ColorSpecification` has a single variant, so this pattern is
        // irrefutable.
        let ColorSpecification::ColorFormat(colors) = value.color_spec;

        Self {
            bits_per_pixel: value.bits_per_pixel,
            depth: value.depth,
            big_endian_flag: value.big_endian as u8,
            // Without ColorMap support, all PFs are true-color for now
            true_color_flag: 1,
            red_max: colors.red_max.into(),
            green_max: colors.green_max.into(),
            blue_max: colors.blue_max.into(),
            red_shift: colors.red_shift,
            green_shift: colors.green_shift,
            blue_shift: colors.blue_shift,
            _padding: [0; 3],
        }
    }
}

// While rgb_frame supports only 4-byte truecolor formats, we can make some
// simple assumptions about PixelFormat conversions.
impl From<FourCC> for PixelFormat {
    /// Describes `value` as a 32-bits-per-pixel, depth-24, little-endian
    /// true-colour format. Each channel's shift is eight times the index
    /// reported for it by `le_idx_rgba`.
    fn from(value: FourCC) -> Self {
        let idx = value.le_idx_rgba();

        let channels = ColorFormat {
            red_max: 255,
            green_max: 255,
            blue_max: 255,
            red_shift: idx.0 as u8 * 8,
            green_shift: idx.1 as u8 * 8,
            blue_shift: idx.2 as u8 * 8,
        };

        Self {
            bits_per_pixel: 32,
            depth: 24,
            big_endian: false,
            color_spec: ColorSpecification::ColorFormat(channels),
        }
    }
}
impl TryInto<FourCC> for &PixelFormat {
    type Error = &'static str;

    /// Finds the [`FourCC`] that corresponds to this pixel format.
    ///
    /// Only little-endian, 32-bits-per-pixel, depth-24 formats with 8-bit
    /// channel maxima are accepted, and only the two channel orders matched
    /// below. Anything else yields a static message naming the first check
    /// that failed.
    fn try_into(self) -> std::result::Result<FourCC, Self::Error> {
        let ColorSpecification::ColorFormat(cformat) = &self.color_spec;

        if !(self.bits_per_pixel == 32 && self.depth == 24) {
            return Err("format is not 4-bytes-per-pixel truecolor");
        }
        if self.big_endian {
            return Err("big endian not supported");
        }

        let maxes = [cformat.red_max, cformat.green_max, cformat.blue_max];
        if maxes.iter().any(|&max| max != 255) {
            return Err("max color values for truecolor not found");
        }

        let shifts =
            [cformat.red_shift, cformat.green_shift, cformat.blue_shift];
        match shifts {
            [0, 8, 16] => Ok(FourCC::XB24),
            [16, 8, 0] => Ok(FourCC::XR24),
            _ => Err("matching color shifts not found"),
        }
    }
}

/// How colours are represented within a pixel value.
///
/// Only the true-colour form exists today, so patterns on this enum are
/// irrefutable.
#[derive(Debug, Clone, PartialEq, Immutable)]
pub enum ColorSpecification {
    ColorFormat(ColorFormat),
    // Not covered: colormap support
}

/// Per-channel maximum value and bit shift for a true-colour pixel format.
#[derive(Debug, Clone, PartialEq, Immutable)]
pub struct ColorFormat {
    // TODO: maxes must be 2^N - 1 for N bits per color
    pub red_max: u16,
    pub green_max: u16,
    pub blue_max: u16,
    pub red_shift: u8,
    pub green_shift: u8,
    pub blue_shift: u8,
}

// Section 7.5
/// A decoded client-to-server message.
#[derive(Debug)]
pub enum ClientMessage {
    /// The client requests a different pixel format.
    SetPixelFormat(PixelFormat),
    /// The client's list of encodings.
    SetEncodings {
        /// Encodings with a type recognized by this crate
        encodings: Vec<EncodingType>,
        /// Raw values of unrecognized encodings
        unknown: Vec<i32>,
    },
    /// The client asks for a framebuffer update.
    FramebufferUpdateRequest(FramebufferUpdateRequest),
    KeyEvent(KeyEvent),
    PointerEvent(PointerEvent),
    ClientCutText(String),
}

/// Message-type byte that begins every client message.
///
/// The decoder maps the first byte of a message to one of these with
/// `from_repr`; any other value (including 1, which has no variant here) is
/// reported as an unknown message type.
#[derive(FromRepr)]
#[repr(u8)]
enum ClientMessageType {
    SetPixelFormat = 0,
    SetEncodings = 2,
    FramebufferUpdateRequest = 3,
    KeyEvent = 4,
    PointerEvent = 5,
    ClientCutText = 6,
}

/// Parses a `T` from the front of `buf` and consumes its bytes.
///
/// Returns `None`, leaving `buf` untouched, when a `T` cannot be read from
/// the start of the buffer.
fn read_data<T: FromBytes>(buf: &mut BytesMut) -> Option<T> {
    // It'd be kind of nice to return the error here instead of an Option, but
    // because the error borrows the buf we're going to try parsing from, rustc
    // believes the buffer to be immutably borrowed when we advance it below.
    // As an Option, the Err and its borrow are discarded so we avoid the issue.
    let (parsed, _) = T::read_from_prefix(buf).ok()?;
    buf.advance(size_of::<T>());
    Some(parsed)
}

/// Decoder state for client messages.
pub struct ClientMessageDecoder {
    /// Limit to how many bytes decode is willing to buffer for client messages
    pub buffer_limit: usize,
}
impl Default for ClientMessageDecoder {
    /// Creates a decoder with a 16 KiB buffer limit.
    fn default() -> Self {
        // Client messages are small, so 16k should be more than enough
        Self { buffer_limit: 16 * 1024 }
    }
}

impl Decoder for ClientMessageDecoder {
    type Item = ClientMessage;
    type Error = ProtocolError;

    /// Attempt to decode one client message from the front of `src`.
    ///
    /// Returns `Ok(None)` when `src` does not yet hold a complete message (no
    /// bytes are consumed in that case), `Ok(Some(msg))` after consuming
    /// exactly one message, and an error when the message type is unknown,
    /// the message would exceed `buffer_limit`, or its contents fail to
    /// convert.
    fn decode(
        &mut self,
        src: &mut BytesMut,
    ) -> std::result::Result<Option<Self::Item>, Self::Error> {
        if src.is_empty() {
            return Ok(None);
        }
        let type_byte = src[0];
        let message_type = ClientMessageType::from_repr(type_byte)
            .ok_or(ProtocolError::UnknownMessageType(type_byte))?;

        // Size of the message body, not counting the leading type byte
        let msg_sz_reqd = match message_type {
            ClientMessageType::SetPixelFormat => {
                // 3 bytes padding + message
                3 + size_of::<raw::PixelFormat>()
            }

            ClientMessageType::SetEncodings => {
                if src.len() < 4 {
                    return Ok(None);
                }

                // 1 byte padding + u16 len + len * u32
                let num_encoding =
                    u16::from_be_bytes(src[2..4].try_into().unwrap());
                1 + size_of::<u16>()
                    + (num_encoding as usize * size_of::<u32>())
            }

            ClientMessageType::FramebufferUpdateRequest => {
                size_of::<raw::FramebufferUpdateRequest>()
            }
            ClientMessageType::KeyEvent => size_of::<raw::KeyEvent>(),
            ClientMessageType::PointerEvent => size_of::<raw::PointerEvent>(),
            ClientMessageType::ClientCutText => {
                if src.len() < 8 {
                    return Ok(None);
                }
                // 3 bytes of padding + u32 len + string
                let data_len =
                    u32::from_be_bytes(src[4..8].try_into().unwrap());
                // Saturate so a huge client-supplied length cannot wrap on
                // targets with a 32-bit usize; the limit check below rejects
                // it either way.
                (3 + size_of::<u32>()).saturating_add(data_len as usize)
            }
        };
        let total_sz_reqd = msg_sz_reqd.saturating_add(1);
        if total_sz_reqd > self.buffer_limit {
            return Err(ProtocolError::TooLarge(msg_sz_reqd));
        }
        if src.len() < total_sz_reqd {
            // `reserve` takes the number of bytes wanted beyond the current
            // length, and is a no-op if that much capacity already exists.
            src.reserve(total_sz_reqd - src.len());
            return Ok(None);
        }

        // Now that we're sure that enough data is buffered to decode a whole
        // message, consume the type byte, and pass the rest on to the decoding
        // logic.
        src.advance(1);
        match message_type {
            ClientMessageType::SetPixelFormat => {
                // 3 bytes padding
                src.advance(3);
                let raw = read_data::<raw::PixelFormat>(src).unwrap();
                Ok(Some(ClientMessage::SetPixelFormat(raw.try_into()?)))
            }
            ClientMessageType::SetEncodings => {
                // 1 byte padding
                src.advance(1);

                let count = src.get_u16() as usize;
                let mut encodings = Vec::with_capacity(count);
                let mut unknown = Vec::new();
                for _n in 0..count {
                    let raw = src.get_i32();
                    match EncodingType::from_repr(raw) {
                        Some(enc) => encodings.push(enc),
                        None => unknown.push(raw),
                    }
                }
                Ok(Some(ClientMessage::SetEncodings { encodings, unknown }))
            }
            ClientMessageType::FramebufferUpdateRequest => {
                let raw =
                    read_data::<raw::FramebufferUpdateRequest>(src).unwrap();
                Ok(Some(ClientMessage::FramebufferUpdateRequest(raw.into())))
            }
            ClientMessageType::KeyEvent => {
                let raw = read_data::<raw::KeyEvent>(src).unwrap();
                let converted: KeyEvent = raw.try_into()?;
                Ok(Some(ClientMessage::KeyEvent(converted)))
            }
            ClientMessageType::PointerEvent => {
                let raw = read_data::<raw::PointerEvent>(src).unwrap();
                Ok(Some(ClientMessage::PointerEvent(raw.into())))
            }
            ClientMessageType::ClientCutText => {
                // 3 bytes padding
                src.advance(3);

                let len = src.get_u32() as usize;
                // Split the text off the front of the buffer so that its
                // bytes are consumed; merely copying them would leave the
                // text behind to be misparsed as the next message.
                let buf = src.split_to(len).to_vec();

                // TODO: The encoding RFB specifies is ISO 8859-1 (Latin-1).
                // Only its ASCII range is byte-compatible with UTF-8, so
                // Latin-1 text using bytes >= 0x80 is rejected here.
                // Determine if this is the right approach.
                let text = String::from_utf8(buf)
                    .map_err(|_| ProtocolError::InvalidTextEncoding)?;

                Ok(Some(ClientMessage::ClientCutText(text)))
            }
        }
    }
}

/// Decoded FramebufferUpdateRequest client message.
#[derive(Debug, Copy, Clone)]
pub struct FramebufferUpdateRequest {
    /// True when the `incremental` byte of the wire message was non-zero
    pub incremental: bool,
    /// Position carried by the request
    pub position: Position,
    /// Resolution (width and height) carried by the request
    pub resolution: Resolution,
}

impl From<raw::FramebufferUpdateRequest> for FramebufferUpdateRequest {
    /// Convert the wire representation into native types; any non-zero
    /// `incremental` byte is treated as `true`.
    fn from(value: raw::FramebufferUpdateRequest) -> Self {
        let incremental = value.incremental != 0;
        let position = value.position.into();
        let resolution = value.resolution.into();
        Self { incremental, position, resolution }
    }
}

/// Decoded KeyEvent client message.
#[derive(Debug, Copy, Clone)]
pub struct KeyEvent {
    /// True when the `down_flag` byte of the wire message was non-zero
    pub is_pressed: bool,
    /// Key symbol parsed from `keysym_raw`
    pub keysym: KeySym,
    /// Key value exactly as received from the client
    pub keysym_raw: u32,
}
impl TryFrom<raw::KeyEvent> for KeyEvent {
    type Error = ProtocolError;

    /// Convert the wire representation of a key event.
    ///
    /// Fails with [`ProtocolError::UnknownKeysym`] when the key value does
    /// not map to a [`KeySym`].
    fn try_from(
        value: raw::KeyEvent,
    ) -> std::result::Result<Self, Self::Error> {
        let keysym_raw = value.key.get();
        let is_pressed = value.down_flag != 0;
        match KeySym::try_from(keysym_raw) {
            Ok(keysym) => Ok(Self { is_pressed, keysym, keysym_raw }),
            Err(_) => Err(ProtocolError::UnknownKeysym(keysym_raw)),
        }
    }
}

bitflags! {
    /// Button state bitmask of a pointer event, one bit per button.
    ///
    /// Only bits 0 through 6 are named; `PointerEvent` conversion drops any
    /// other bit via `from_bits_truncate`.
    #[derive(Debug, Copy, Clone)]
    pub struct MouseButtons: u8 {
        const LEFT = 1 << 0;
        const MIDDLE = 1 << 1;
        const RIGHT = 1 << 2;
        const SCROLL_A = 1 << 3;
        const SCROLL_B = 1 << 4;
        const SCROLL_C = 1 << 5;
        const SCROLL_D = 1 << 6;
    }
}

/// Decoded PointerEvent client message.
#[derive(Debug, Copy, Clone)]
pub struct PointerEvent {
    /// Pointer position carried by the event
    pub position: Position,
    /// Buttons flagged in the event's button mask
    pub pressed: MouseButtons,
}
impl From<raw::PointerEvent> for PointerEvent {
    /// Convert the wire representation of a pointer event, discarding any
    /// button-mask bits which do not correspond to a known button.
    fn from(value: raw::PointerEvent) -> Self {
        let pressed = MouseButtons::from_bits_truncate(value.button_mask);
        let position = value.position.into();
        Self { position, pressed }
    }
}

/// Wire-format (packed, big-endian) representations of protocol structures.
///
/// These types are parsed from, or serialized to, raw bytes via `zerocopy`
/// and then converted to/from the friendlier types in the parent module.
mod raw {
    use zerocopy::big_endian::{U16, U32};
    use zerocopy::{FromBytes, Immutable, IntoBytes};

    /// Wire layout of a pixel format (16 bytes, including trailing padding)
    #[allow(dead_code)]
    #[derive(Copy, Clone, FromBytes, IntoBytes, Immutable)]
    #[repr(C, packed)]
    pub(crate) struct PixelFormat {
        pub bits_per_pixel: u8,
        pub depth: u8,
        pub big_endian_flag: u8,
        pub true_color_flag: u8,
        pub red_max: U16,
        pub green_max: U16,
        pub blue_max: U16,
        pub red_shift: u8,
        pub green_shift: u8,
        pub blue_shift: u8,
        pub _padding: [u8; 3],
    }

    /// Body of a FramebufferUpdateRequest message (without the type byte)
    #[derive(Copy, Clone, FromBytes, Immutable)]
    #[repr(C, packed)]
    pub(crate) struct FramebufferUpdateRequest {
        pub incremental: u8,
        pub position: Position,
        pub resolution: Resolution,
    }

    /// Body of a KeyEvent message (without the type byte)
    #[derive(Copy, Clone, FromBytes)]
    #[repr(C, packed)]
    pub(crate) struct KeyEvent {
        pub down_flag: u8,
        pub _padding: [u8; 2],
        pub key: U32,
    }

    /// Body of a PointerEvent message (without the type byte)
    #[derive(Copy, Clone, FromBytes)]
    #[repr(C, packed)]
    pub(crate) struct PointerEvent {
        pub button_mask: u8,
        pub position: Position,
    }

    /// Big-endian x/y coordinate pair
    #[derive(Copy, Clone, FromBytes, Immutable)]
    #[repr(C, packed)]
    pub(crate) struct Position {
        pub x: U16,
        pub y: U16,
    }
    impl From<Position> for super::Position {
        fn from(value: Position) -> Self {
            Self { x: value.x.get(), y: value.y.get() }
        }
    }

    /// Big-endian width/height pair
    #[derive(Copy, Clone, FromBytes, Immutable)]
    #[repr(C, packed)]
    pub(crate) struct Resolution {
        width: U16,
        height: U16,
    }
    impl From<Resolution> for super::Resolution {
        fn from(value: Resolution) -> Self {
            Self { width: value.width.get(), height: value.height.get() }
        }
    }

    /// Header of a framebuffer update message: message type, one byte of
    /// padding, and the number of rectangles which follow.
    #[derive(Copy, Clone, IntoBytes, Immutable)]
    #[repr(C, packed)]
    #[allow(dead_code)]
    pub(crate) struct FramebufferUpdateHeader {
        msg_type: u8,
        _padding: u8,
        pub num_rects: U16,
    }
    impl FramebufferUpdateHeader {
        /// Build a header announcing `num_rects` rectangles, with the message
        /// type and padding bytes set to zero.
        pub fn new(num_rects: u16) -> Self {
            Self { msg_type: 0, _padding: 0, num_rects: U16::new(num_rects) }
        }
    }
}


================================================
FILE: crates/rfb/src/server.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// Copyright 2022 Oxide Computer Company

use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};

use crate::proto::{
    ClientInit, PixelFormat, ProtoVersion, Resolution, SecurityResult,
    SecurityType, SecurityTypes, ServerInit,
};

/// Errors which can occur while performing the initial handshake with a
/// client.
#[derive(Error, Debug)]
pub enum InitError {
    /// The client replied with a protocol version the handshake rejects
    #[error("unsupported client version {0:?}")]
    UnsupportedVersion(ProtoVersion),

    /// The client chose a security type which was not offered to it
    #[error("unsupported security type {0:?}")]
    UnsupportedSecurityType(SecurityType),

    /// A protocol-level error from reading or writing a handshake message
    #[error("protocol error {source}")]
    Protocol {
        #[from]
        source: crate::proto::ProtocolError,
    },

    /// An I/O error on the underlying stream
    #[error("IO error {source}")]
    Io {
        #[from]
        source: std::io::Error,
    },
}

pub type Result<T> = std::result::Result<T, InitError>;

/// Server-side parameters for the initialization handshake performed by
/// [`initialize`].
pub struct InitParams {
    /// Supported protocol version
    pub version: ProtoVersion,
    /// Supported security types
    ///
    /// Must not be empty: [`initialize`] asserts that at least one is present.
    pub sec_types: SecurityTypes,

    /// Server name
    pub name: String,

    /// Initial framebuffer resolution
    pub resolution: Resolution,
    /// Initial framebuffer pixel format
    pub format: PixelFormat,
}

/// Perform the ProtocolVersion and Security handshakes with a client.
///
/// The server's `version` and supported `sec_types` are sent to the client,
/// and its replies are validated.  Fails with
/// [`InitError::UnsupportedVersion`] if the client answers with a version
/// lower than `version`, and with [`InitError::UnsupportedSecurityType`]
/// (after notifying the client) if it picks a security type which was not
/// offered.
async fn rfb_handshake(
    s: &mut (impl AsyncRead + AsyncWrite + Unpin),
    version: ProtoVersion,
    sec_types: SecurityTypes,
) -> Result<()> {
    // ProtocolVersion handshake
    version.write_to(s).await?;
    s.flush().await?;

    let client_version = ProtoVersion::read_from(s).await?;
    if client_version < version {
        return Err(InitError::UnsupportedVersion(client_version));
    }

    // Security Handshake
    let supported_types = sec_types.clone();
    supported_types.write_to(s).await?;
    s.flush().await?;

    let client_choice = SecurityType::read_from(s).await?;
    if !sec_types.0.contains(&client_choice) {
        let failure =
            SecurityResult::Failure("unsupported security type".to_string());
        failure.write_to(s).await?;
        // Flush like every other write in the handshake: on a buffering
        // transport the failure reason would otherwise never reach the client
        // before the caller abandons the connection.
        s.flush().await?;
        return Err(InitError::UnsupportedSecurityType(client_choice));
    }

    let res = SecurityResult::Success;
    res.write_to(s).await?;
    s.flush().await?;

    Ok(())
}

/// Exchange the ClientInit and ServerInit messages.
///
/// Reads the client's init message first, then replies with a ServerInit
/// built from the provided resolution, pixel format and name.  The client's
/// message is returned to the caller.
async fn rfb_initialization(
    s: &mut (impl AsyncRead + AsyncWrite + Unpin),
    initial_resolution: Resolution,
    pixel_format: PixelFormat,
    name: String,
) -> Result<ClientInit> {
    let from_client = ClientInit::read_from(s).await?;

    let reply = ServerInit { initial_resolution, pixel_format, name };
    reply.write_to(s).await?;
    s.flush().await?;

    Ok(from_client)
}

/// Perform server initialization handshake with client
pub async fn initialize(
    sock: &mut (impl AsyncRead + AsyncWrite + Unpin),
    params: InitParams,
) -> Result<ClientInit> {
    assert!(
        !params.sec_types.0.is_empty(),
        "at least one security type must be defined"
    );

    rfb_handshake(sock, params.version, params.sec_types).await?;
    rfb_initialization(sock, params.resolution, params.format, params.name)
        .await
}


================================================
FILE: crates/rfb/src/tungstenite.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// Copyright 2024 Oxide Computer Company

//! Utilities for using rfb over a tungstenite websocket

use core::pin::Pin;
use core::task::{Context, Poll};
use std::io;

use futures::{sink::Sink, stream::Stream};
use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
use tokio_tungstenite::tungstenite::error::Error as TungError;
use tokio_tungstenite::tungstenite::protocol::Message;
use tokio_tungstenite::WebSocketStream;

/// Convert from Tungstenite error to io::Error
///
/// An error which already wraps an [io::Error] is unwrapped; anything else
/// is boxed up via [io::Error::other].
fn tung_err_to_io(err: TungError) -> io::Error {
    if let TungError::Io(inner) = err {
        inner
    } else {
        io::Error::other(err)
    }
}

/// Wrap a [WebSocketStream] so it implements [AsyncRead] and [AsyncWrite]
///
/// Writes are sent as binary websocket messages, and reads yield the payloads
/// of received binary messages.
pub struct BinaryWs<T> {
    /// The wrapped websocket
    ws: WebSocketStream<T>,
    /// A received message which has not been fully read yet, along with the
    /// number of its bytes already handed out to the reader
    buf: Option<(Vec<u8>, usize)>,
}

impl<T: AsyncRead + AsyncWrite + Unpin> BinaryWs<T> {
    /// Wrap `ws`, starting with no buffered read data.
    pub fn new(ws: WebSocketStream<T>) -> Self {
        Self { ws, buf: None }
    }
}
impl<T: AsyncRead + AsyncWrite + Unpin> AsyncWrite for BinaryWs<T> {
    fn poll_write(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &[u8],
    ) -> Poll<Result<usize, io::Error>> {
        let ws = Pin::new(&mut self.ws);
        match ws.poll_ready(cx) {
            Poll::Ready(Ok(())) => {
                let ws = Pin::new(&mut self.ws);
                let msg = Message::binary(buf);
                if let Err(e) = ws.start_send(msg) {
                    Poll::Ready(Err(tung_err_to_io(e)))
                } else {
                    Poll::Ready(Ok(buf.len()))
                }
            }
            Poll::Ready(Err(e)) => Poll::Ready(Err(tung_err_to_io(e))),
            Poll::Pending => Poll::Pending,
        }
    }

    fn poll_flush(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Result<(), io::Error>> {
        let ws = Pin::new(&mut self.ws);
        ws.poll_flush(cx).map_err(tung_err_to_io)
    }

    fn poll_shutdown(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Result<(), io::Error>> {
        let ws = Pin::new(&mut self.ws);
        ws.poll_close(cx).map_err(tung_err_to_io)
    }
}
impl<T: AsyncRead + AsyncWrite + Unpin> AsyncRead for BinaryWs<T> {
    fn poll_read(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> Poll<io::Result<()>> {
        loop {
            // Emit cached data which has not been read yet
            if let Some((msg, consumed)) = self.buf.take() {
                let (_used, remain) = msg.split_at(consumed);
                let to_write = buf.remaining().min(remain.len());
                buf.put_slice(&remain[..to_write]);
                if to_write < remain.len() {
                    self.buf = Some((msg, consumed + to_write))
                }
                return Poll::Ready(Ok(()));
            }

            // Otherwise poll for more data to receive
            let ws = Pin::new(&mut self.ws);
            match ws.poll_next(cx) {
                Poll::Pending => return Poll::Pending,
                Poll::Ready(None) => return Poll::Ready(Ok(())),
                Poll::Ready(Some(Err(e))) => match e {
                    tokio_tungstenite::tungstenite::Error::Io(ioe) => {
                        return Poll::Ready(Err(ioe));
                    }
                    _ => {
                        return Poll::Ready(Err(std::io::Error::other(e)));
                    }
                },
                Poll::Ready(Some(Ok(rmsg))) => {
                    if let Message::Binary(msgbuf) = rmsg {
                        self.buf = Some((msgbuf, 0));
                        continue;
                    }
                    // For all other types, ignore and continue polling
                }
            }
        }
    }
}


================================================
FILE: crates/rgb-frame/Cargo.toml
================================================
[package]
name = "rgb_frame"
version = "0.0.0"
edition = "2021"

[lib]
doctest = false

[dependencies]
strum = { workspace = true, features = ["derive"] }


================================================
FILE: crates/rgb-frame/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
//
// Copyright 2024 Oxide Computer Company

use std::mem::MaybeUninit;
use std::num::NonZeroUsize;

/// Dimensions and pixel format describing a [Frame]
#[derive(Clone, Copy)]
pub struct Spec {
    /// Width of Frame in pixels
    pub width: NonZeroUsize,
    /// Height of Frame in pixels
    pub height: NonZeroUsize,
    /// Width (in bytes) of each row of pixels.
    ///
    /// May be larger than `width * bytes_per_pixel` in order to better align
    /// pixel data in memory.
    pub stride: NonZeroUsize,
    /// Pixel format of the Frame
    pub fourcc: FourCC,
}
impl Spec {
    /// Create a [Spec] for a tightly packed frame of the given dimensions
    /// (in pixels) and pixel format.
    ///
    /// The stride is set to the minimum row size: `width` multiplied by the
    /// number of bytes per pixel of `fourcc`.
    ///
    /// # Panics
    ///
    /// Panics if `width` or `height` is zero, or if the stride calculation
    /// overflows.
    pub fn new(width: usize, height: usize, fourcc: FourCC) -> Self {
        let width = NonZeroUsize::new(width).expect("width is non-zero");
        let height = NonZeroUsize::new(height).expect("height is non-zero");
        // Stride is measured in bytes per row, so it is derived from the
        // width and pixel size (not the height).  The product of two non-zero
        // values is itself non-zero, so no unsafe construction is required.
        let stride = width
            .checked_mul(fourcc.bytes_per_pixel())
            .expect("stride does not overflow");

        Self { width, height, stride, fourcc }
    }
}

/// A frame of pixel data and accompanying metadata
pub struct Frame {
    /// Dimensions, stride and pixel format of `data`
    spec: Spec,
    /// Raw pixel bytes
    data: Vec<u8>,
}

impl Frame {
    /// Create a new Frame, sized based on the provided [Spec]
    ///
    /// The contents of the pixel buffer for this frame will be initialized with
    /// all zeroes.  The `stride` of the provided [Spec] is replaced with the
    /// one chosen for the allocation.
    pub fn new(spec: Spec) -> Self {
        let (mut data, stride) = Self::allocate_for_spec(&spec);
        // Size the frame from its dimensions rather than from `capacity()`,
        // which is only guaranteed to be *at least* what was requested.  This
        // product was already overflow-checked by `allocate_for_spec`.
        let len = stride.get() * spec.height.get();
        data.resize(len, 0);

        Self { spec: Spec { stride, ..spec }, data }
    }

    /// Create a new Frame, sized based on the provided [Spec], with its pixel
    /// contents initialized via the `populate` function.
    ///
    /// The responsibility is left to the caller to properly initialize the
    /// entire [`MaybeUninit<u8>`] slice provided to the `populate` argument.  The
    /// stride length of that buffer is provided as the second argument.
    pub fn new_uninit<F>(spec: Spec, populate: F) -> Self
    where
        F: FnOnce(&mut [MaybeUninit<u8>], NonZeroUsize),
    {
        let (mut data, stride) = Self::allocate_for_spec(&spec);
        // Already overflow-checked by `allocate_for_spec`
        let len = stride.get() * spec.height.get();
        // Hand out exactly the bytes belonging to the frame, even if the
        // allocator provided additional capacity.
        populate(&mut data.spare_capacity_mut()[..len], stride);
        // SAFETY: `len` does not exceed the capacity reserved by
        // `allocate_for_spec`, and `populate` is required (per the contract
        // documented above) to have initialized all `len` bytes it was given.
        unsafe {
            data.set_len(len);
        };
        Self { spec: Spec { stride, ..spec }, data }
    }

    /// Allocate an (empty) buffer with capacity for a frame described by
    /// `spec`, returning it along with the stride chosen for that buffer.
    ///
    /// # Panics
    ///
    /// Panics if the size of a line, or of the whole frame, overflows.
    fn allocate_for_spec(spec: &Spec) -> (Vec<u8>, NonZeroUsize) {
        let bytepp = spec.fourcc.bytes_per_pixel();

        let line_sz = bytepp
            .get()
            .checked_mul(spec.width.get())
            .expect("line size does not overflow");

        // TODO: align allocate for SIMD ops
        let stride = line_sz;
        let frame_sz = stride
            .checked_mul(spec.height.get())
            .expect("frame size does not overflow");
        let buf = Vec::with_capacity(frame_sz);

        (buf, NonZeroUsize::new(stride).expect("stride is non-zero"))
    }

    /// Get the [Spec] for this frame.
    pub fn spec(&self) -> Spec {
        self.spec
    }

    /// Access to the raw pixel bytes of this frame
    pub fn bytes(&self) -> &[u8] {
        &self.data
    }

    /// Mutable access to the raw pixel bytes of this frame
    pub fn bytes_mut(&mut self) -> &mut [u8] {
        &mut self.data
    }

    /// Convert between recognized 4-byte pixel formats
    ///
    /// The pixel data is rewritten in place and the frame's [Spec] is updated
    /// to the `target` format.  The alpha byte of every rewritten pixel is set
    /// to fully opaque (0xff); any alpha data in the source is not preserved.
    pub fn convert(&mut self, target: FourCC) {
        let source = self.spec.fourcc;
        if source == target {
            return;
        }

        self.spec.fourcc = target;

        let source_rgba = source.le_idx_rgba();
        let target_rgba = target.le_idx_rgba();

        if source_rgba == target_rgba && !target.has_alpha() {
            // order is already the same, and the new format does not need the
            // alpha channel populated
            return;
        }

        // TODO: rub some SIMD on this, when possible
        for pixel in self.data.chunks_exact_mut(4) {
            // Read every source channel before writing any, since source and
            // target positions overlap within the same 4 bytes.
            let red = pixel[source_rgba.0];
            let green = pixel[source_rgba.1];
            let blue = pixel[source_rgba.2];
            // TODO: alpha assumed to be 100% for now
            let alpha = 0xff;

            pixel[target_rgba.0] = red;
            pixel[target_rgba.1] = green;
            pixel[target_rgba.2] = blue;
            pixel[target_rgba.3] = alpha;
        }
    }
}

/// Four-character code identifying a 32-bit RGB pixel format.
///
/// The numeric value of each variant is its four-character name packed as a
/// little-endian `u32` (the first character in the lowest byte), so that, for
/// example, `XR24` is `u32::from_le_bytes(*b"XR24")`.
#[derive(
    Copy,
    Clone,
    Eq,
    PartialEq,
    Debug,
    strum::FromRepr,
    strum::EnumString,
    strum::Display,
    strum::VariantNames,
)]
#[repr(u32)]
#[strum(serialize_all = "UPPERCASE")]
pub enum FourCC {
    /// x:R:G:B `\[` 31:0 `\]` little endian
    XR24 = 0x34325258,
    /// R:G:B:x `\[` 31:0 `\]` little endian
    RX24 = 0x34325852,
    /// x:B:G:R `\[` 31:0 `\]` little endian
    XB24 = 0x34324258,
    /// B:G:R:x `\[` 31:0 `\]` little endian
    BX24 = 0x34325842,

    /// A:R:G:B `\[` 31:0 `\]` little endian
    AR24 = 0x34325241,
    /// R:G:B:A `\[` 31:0 `\]` little endian
    RA24 = 0x34324152,
    /// A:B:G:R `\[` 31:0 `\]` little endian
    AB24 = 0x34324241,
    /// B:G:R:A `\[` 31:0 `\]` little endian
    BA24 = 0x34324142,
}
impl FourCC {
    /// Does this FourCC contain an alpha channel?
    pub const fn has_alpha(self) -> bool {
        matches!(self, Self::AR24 | Self::RA24 | Self::AB24 | Self::BA24)
    }
    /// Returns the (little-endian) byte index of red/green/blue/alpha
    /// components (respectively) in a pixel of this format
    ///
    /// For formats without an alpha channel, the "alpha" index refers to the
    /// unused (`x`) byte.
    pub const fn le_idx_rgba(self) -> (usize, usize, usize, usize) {
        match self {
            FourCC::XR24 | FourCC::AR24 => (2, 1, 0, 3),
            FourCC::RX24 | FourCC::RA24 => (3, 2, 1, 0),
            FourCC::BX24 | FourCC::BA24 => (1, 2, 3, 0),
            FourCC::XB24 | FourCC::AB24 => (0, 1, 2, 3),
        }
    }
    /// Number of bytes occupied by one pixel of this format
    pub const fn bytes_per_pixel(self) -> NonZeroUsize {
        // Our existing definitions are all 4-byte formats.  The constant is
        // non-zero, so the `None` arm can never be taken; matching on the
        // checked constructor avoids any need for `unsafe`.
        match NonZeroUsize::new(4) {
            Some(bytes) => bytes,
            None => unreachable!(),
        }
    }
}


================================================
FILE: crates/viona-api/Cargo.toml
================================================
[package]
name = "viona_api"
version = "0.0.0"
license = "MPL-2.0"
edition = "2021"

[lib]
doctest = false

[dependencies]
libc.workspace = true

# nvpair dependency only enabled when building on illumos to avoid any attempts
# to link to an absent libnvpair
[target.'cfg(target_os = "illumos")'.dependencies]
nvpair.workspace = true

[features]
falcon = []


================================================
FILE: crates/viona-api/header-check/Cargo.toml
================================================
[package]
name = "viona_api-hdrchk"
version = "0.0.0"
license = "MPL-2.0"
build = "build.rs"
publish = false
edition = "2021"

[dependencies]
viona_api = { path = ".." }
libc = "0.2"

[build-dependencies]
cc = "1"
ctest2 = "0.4.7"

[[test]]
name = "main"
path = "test/main.rs"
harness = false


================================================
FILE: crates/viona-api/header-check/build.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![deny(warnings)]

use std::convert::TryFrom;
use std::env;
use std::path::PathBuf;

/// Generate the ctest2 harness which checks the definitions in
/// `../src/ffi.rs` against the viona C headers.
///
/// The illumos-gate source tree is located through the `GATE_SRC` environment
/// variable; the process exits with status 1 if it is not set.
fn main() {
    let mut cfg = ctest2::TestGenerator::new();

    let gate_dir = match env::var("GATE_SRC").map(PathBuf::try_from) {
        Ok(Ok(dir)) => dir,
        _ => {
            eprintln!("Must specify path to illumos-gate sources with GATE_SRC env var");
            std::process::exit(1);
        }
    };

    cfg.include("/usr/include");
    for subdir in ["usr/src/uts/intel", "usr/src/uts/common"] {
        cfg.include(gate_dir.join(subdir));
    }

    for header in ["sys/types.h", "sys/viona_io.h"] {
        cfg.header(header);
    }

    cfg.skip_const(|name| name == "VIONA_DEV_PATH");

    // C header currently lacks explicit pad fields
    cfg.skip_field(|name, field| {
        field == "_pad"
            && matches!(
                name,
                "vioc_intr_poll_mq" | "vioc_ring_init_modern" | "vioc_ring_msi"
            )
    });

    // lack of explicit padding causes round-trip problems
    cfg.skip_roundtrip(|name| {
        matches!(name, "vioc_ring_init" | "vioc_ring_msi")
    });

    cfg.generate("../src/ffi.rs", "main.rs");
}


================================================
FILE: crates/viona-api/header-check/test/main.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use viona_api::*;

include!(concat!(env!("OUT_DIR"), "/main.rs"));


================================================
FILE: crates/viona-api/src/ffi.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![allow(non_camel_case_types)]

use libc::size_t;
use std::ffi::c_void;

/// Build a viona ioctl command number: the ASCII bytes `'V'` and `'C'` occupy
/// bits 16-23 and 8-15 respectively, and are OR-ed with `ioc`.
const fn vna_ioc(ioc: i32) -> i32 {
    const PREFIX: i32 = ((b'V' as i32) << 16) | ((b'C' as i32) << 8);
    PREFIX | ioc
}

// Device-level commands (0x01 - 0x04)
pub const VNA_IOC_CREATE: i32 = vna_ioc(0x01);
pub const VNA_IOC_DELETE: i32 = vna_ioc(0x02);
pub const VNA_IOC_VERSION: i32 = vna_ioc(0x03);
pub const VNA_IOC_DEFAULT_PARAMS: i32 = vna_ioc(0x04);

// Ring commands (0x10 - 0x18)
pub const VNA_IOC_RING_INIT: i32 = vna_ioc(0x10);
pub const VNA_IOC_RING_RESET: i32 = vna_ioc(0x11);
pub const VNA_IOC_RING_KICK: i32 = vna_ioc(0x12);
pub const VNA_IOC_RING_SET_MSI: i32 = vna_ioc(0x13);
pub const VNA_IOC_RING_INTR_CLR: i32 = vna_ioc(0x14);
pub const VNA_IOC_RING_SET_STATE: i32 = vna_ioc(0x15);
pub const VNA_IOC_RING_GET_STATE: i32 = vna_ioc(0x16);
pub const VNA_IOC_RING_PAUSE: i32 = vna_ioc(0x17);
pub const VNA_IOC_RING_INIT_MODERN: i32 = vna_ioc(0x18);

// Interrupt polling, feature, and parameter commands (0x20 - 0x2a)
pub const VNA_IOC_INTR_POLL: i32 = vna_ioc(0x20);
pub const VNA_IOC_SET_FEATURES: i32 = vna_ioc(0x21);
pub const VNA_IOC_GET_FEATURES: i32 = vna_ioc(0x22);
pub const VNA_IOC_SET_NOTIFY_IOP: i32 = vna_ioc(0x23);
pub const VNA_IOC_SET_PROMISC: i32 = vna_ioc(0x24);
pub const VNA_IOC_GET_PARAMS: i32 = vna_ioc(0x25);
pub const VNA_IOC_SET_PARAMS: i32 = vna_ioc(0x26);
pub const VNA_IOC_GET_MTU: i32 = vna_ioc(0x27);
pub const VNA_IOC_SET_MTU: i32 = vna_ioc(0x28);
pub const VNA_IOC_SET_NOTIFY_MMIO: i32 = vna_ioc(0x29);
pub const VNA_IOC_INTR_POLL_MQ: i32 = vna_ioc(0x2a);

/// VirtIO 1.2 queue pair support.
pub const VNA_IOC_GET_PAIRS: i32 = vna_ioc(0x30);
pub const VNA_IOC_SET_PAIRS: i32 = vna_ioc(0x31);
pub const VNA_IOC_GET_USEPAIRS: i32 = vna_ioc(0x32);
pub const VNA_IOC_SET_USEPAIRS: i32 = vna_ioc(0x33);

#[cfg(test)]
mod test {
    use super::*;

    /// Check the ioctl number encoding: 'V' (0x56) and 'C' (0x43) in the
    /// upper bytes, followed by the command byte.
    #[test]
    fn test_vna_ioc() {
        assert_eq!(vna_ioc(0x22), 0x00_56_43_22);
    }
}

/// The minimum number of queue pairs supported by a device.
pub const VIONA_MIN_QPAIR: usize = 1;

/// The maximum number of queue pairs supported by a device.
///
/// Note that the VirtIO limit is much higher (0x8000); Viona artificially
/// limits the number to 256 pairs, which makes it possible to implement
/// interrupt notification with a reasonably sized bitmap.
pub const VIONA_MAX_QPAIR: usize = 0x100;

/// Number of `y`-sized units needed to hold `x` items, i.e. `x / y` rounded
/// up.
///
/// Panics (at compile time when const-evaluated) if `y` is zero.
const fn howmany(x: usize, y: usize) -> usize {
    assert!(y > 0);
    let whole = x / y;
    if x % y == 0 {
        whole
    } else {
        whole + 1
    }
}

/// The number of 32-bit words required to detect interrupts for the maximum
/// number of supported queue pairs.  Note the factor of two here: interrupts
/// are per-queue, not per-pair.
pub const VIONA_INTR_WORDS: usize = howmany(VIONA_MAX_QPAIR * 2, 32);

/// Argument for `VNA_IOC_CREATE`: the link and VM file descriptor with which
/// a viona instance is associated.
#[repr(C)]
pub struct vioc_create {
    pub c_linkid: u32,
    pub c_vmfd: i32,
}

/// Ring initialization parameters with separate descriptor, available and
/// used addresses (presumably the argument for `VNA_IOC_RING_INIT_MODERN` --
/// confirm against the viona headers).
#[repr(C)]
#[derive(Default)]
pub struct vioc_ring_init_modern {
    pub rim_index: u16,
    pub rim_qsize: u16,
    // Explicit padding so the 64-bit fields which follow are 8-byte aligned
    pub _pad: [u16; 2],
    pub rim_qaddr_desc: u64,
    pub rim_qaddr_avail: u64,
    pub rim_qaddr_used: u64,
}

/// MSI address and message for a ring (presumably the argument for
/// `VNA_IOC_RING_SET_MSI` -- confirm against the viona headers).
#[repr(C)]
#[derive(Default)]
pub struct vioc_ring_msi {
    pub rm_index: u16,
    // Explicit padding so the 64-bit fields which follow are 8-byte aligned
    pub _pad: [u16; 3],
    pub rm_addr: u64,
    pub rm_msg: u64,
}

/// Multi-queue interrupt poll data (presumably the argument for
/// `VNA_IOC_INTR_POLL_MQ` -- confirm against the viona headers).
#[repr(C)]
#[derive(Default)]
pub struct vioc_intr_poll_mq {
    pub vipm_nrings: u16,
    pub _pad: u16,
    /// Status words, sized to `VIONA_INTR_WORDS` 32-bit entries
    pub vipm_status: [u32; VIONA_INTR_WORDS],
}

/// Address and size of an MMIO notification region (presumably the argument
/// for `VNA_IOC_SET_NOTIFY_MMIO` -- confirm against the viona headers).
#[repr(C)]
#[derive(Default)]
pub struct vioc_notify_mmio {
    pub vim_address: u64,
    pub vim_size: u32,
}

/// State of a ring: its indexes, size and queue addresses (presumably used
/// with `VNA_IOC_RING_SET_STATE` / `VNA_IOC_RING_GET_STATE` -- confirm
/// against the viona headers).
#[repr(C)]
#[derive(Default)]
pub struct vioc_ring_state {
    pub vrs_index: u16,
    pub vrs_avail_idx: u16,
    pub vrs_used_idx: u16,
    pub vrs_qsize: u16,
    pub vrs_qaddr_desc: u64,
    pub vrs_qaddr_avail: u64,
    pub vrs_qaddr_used: u64,
}

pub const VIONA_PROMISC_NONE: i32 = 0;
pub const VIONA_PROMISC_MULTI: i32 = 1;
pub const VIONA_PROMISC_ALL: i32 = 2;
#[cfg(feature = "falcon")]
pub const VIONA_PROMISC_ALL_VLAN: i32 = 3;

/// Buffer pointer and size for fetching parameters (presumably the argument
/// for `VNA_IOC_GET_PARAMS` -- confirm against the viona headers).
#[repr(C)]
#[derive(Default)]
pub struct vioc_get_params {
    pub vgp_param: *mut c_void,
    pub vgp_param_sz: size_t,
}

/// Argument for `VNA_IOC_SET_PARAMS`: a buffer holding the parameters to set
/// along with a buffer into which error detail may be written.
#[repr(C)]
#[derive(Default)]
pub struct vioc_set_params {
    pub vsp_param: *mut c_void,
    pub vsp_param_sz: size_t,
    pub vsp_error: *mut c_void,
    pub vsp_error_sz: size_t,
}

/// This is the viona interface version which viona_api expects to operate
/// against.  All constants and structs defined by the crate are done so in
/// terms of that specific version.
pub const VIONA_CURRENT_INTERFACE_VERSION: u32 = 6;

/// Maximum size of packed nvlists used in viona parameter ioctls
pub const VIONA_MAX_PARAM_NVLIST_SZ: usize = 4096;


================================================
FILE: crates/viona-api/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::fs::{File, OpenOptions};
use std::io::{Error, ErrorKind, Result};
use std::os::fd::*;
use std::os::unix::fs::MetadataExt;

mod ffi;

pub use ffi::*;

// Hide libnvpair usage when not building on illumos to avoid linking errors
#[cfg(target_os = "illumos")]
pub use nvpair::NvList;

pub const VIONA_DEV_PATH: &str = "/dev/viona";

/// Handle to an open instance of the viona device (`VIONA_DEV_PATH`)
pub struct VionaFd(File);
impl VionaFd {
    /// Open viona device and associate it with a given link and vmm instance,
    /// provided in `link_id` and `vm_fd`, respectively.
    pub fn new(link_id: u32, vm_fd: RawFd) -> Result<Self> {
        let this = Self::open()?;

        let mut vna_create = vioc_create { c_linkid: link_id, c_vmfd: vm_fd };
        let _ = unsafe { this.ioctl(VNA_IOC_CREATE, &mut vna_create) }?;
        Ok(this)
    }

    /// Open viona device instance without performing any other initialization
    pub fn open() -> Result<Self> {
        let fp =
            OpenOptions::new().read(true).write(true).open(VIONA_DEV_PATH)?;

        Ok(Self(fp))
    }

    #[cfg(target_os = "illumos")]
    pub fn set_parameters(
        &self,
        params: &mut NvList,
    ) -> std::result::Result<(), ParamError> {
        let mut errbuf: Vec<u8> = Vec::with_capacity(VIONA_MAX_PARAM_NVLIST_SZ);

        let mut packed = params.pack();
        let vsp_param_sz = packed.as_ref().len();

        let mut ioc = vioc_set_params {
            vsp_param: packed.as_mut_ptr().cast(),
            vsp_param_sz,
            vsp_error: errbuf.as_mut_ptr().cast(),
            vsp_error_sz: errbuf.capacity(),
        };
        match unsafe { self.ioctl(VNA_IOC_SET_PARAMS, &mut ioc) } {
            Ok(_) if ioc.vsp_error_sz == 0 => Ok(()),
            Ok(_) => {
                assert!(ioc.vsp_error_sz <= errbuf.capacity());
                unsafe { errbuf.set_len(ioc.vsp_error_sz) };

                match NvList::unpack(&mut errbuf[..]) {
                    Ok(detail) => Err(ParamError::Detailed(detail)),
                    Err(e) => Err(ParamError::Io(e)),
                }
            }
            Err(e) => Err(ParamError::Io(e)),
        }
    }

    #[cfg(target_os = "illumos")]
    pub fn get_parameters(&self) -> Result<NvList> {
        let mut buf: Vec<u8> = Vec::with_capacity(VIONA_MAX_PARAM_NVLIST_SZ);

        let mut ioc = vioc_get_params {
            vgp_param: buf.as_mut_ptr().cast(),
            vgp_param_sz: buf.capacity(),
        };
        let _ = unsafe { self.ioctl(VNA_IOC_GET_PARAMS, &mut ioc) }?;

        assert!(ioc.vgp_param_sz <= buf.capacity());
        unsafe { buf.set_len(ioc.vgp_param_sz) };

        NvList::unpack(&mut buf[..])
    }

    /// Issue ioctl against open viona instance
    ///
    /// # Safety
    ///
    /// Caller is charged with providing `data` argument which is adequate for
    /// any copyin/copyout actions which may occur as part of the ioctl
    /// processing.
    pub unsafe fn ioctl<T>(&self, cmd: i32, data: *mut T) -> Result<i32> {
        ioctl(self.as_raw_fd(), cmd, data as *mut libc::c_void)
    }

    pub fn ioctl_usize(&self, cmd: i32, data: usize) -> Result<i32> {
        if !Self::ioctl_usize_safe(cmd) {
            return Err(Error::new(
                ErrorKind::InvalidInput,
                "unsafe cmd provided",
            ));
        }
        // Safety: Since we are explicitly filtering for vmm ioctls which will
        // not assume the data argument is a pointer for copyin/copyout, we can
        // dismiss those dangers.  The caller is assumed to be cognizant of
        // other potential side effects.
        unsafe { ioctl(self.as_raw_fd(), cmd, data as *mut libc::c_void) }
    }

    /// Query the API version exposed by the kernel VMM.
    pub fn api_version(&self) -> Result<u32> {
        let vers = self.ioctl_usize(VNA_IOC_VERSION, 0)?;

        // We expect and demand a positive version number from the
        // VNA_IOC_VERSION interface.
        assert!(vers > 0);
        Ok(vers as u32)
    }

    /// Retrieve the minor number of the viona device instance.
    /// This is used for matching kernel statistic entries to the viona device.
    pub fn instance_id(&self) -> Result<u32> {
        let meta = self.0.metadata()?;
        Ok(minor(&meta))
    }

    /// Check viona ioctl command against those known to not require any
    /// copyin/copyout to function.
    const fn ioctl_usize_safe(cmd: i32) -> bool {
        matches!(
            cmd,
            VNA_IOC_DELETE
                | VNA_IOC_RING_RESET
                | VNA_IOC_RING_KICK
                | VNA_IOC_RING_PAUSE
                | VNA_IOC_RING_INTR_CLR
                | VNA_IOC_VERSION
                | VNA_IOC_SET_NOTIFY_IOP
                | VNA_IOC_SET_PROMISC
                | VNA_IOC_GET_MTU
                | VNA_IOC_SET_MTU
                | VNA_IOC_GET_PAIRS
                | VNA_IOC_SET_PAIRS
                | VNA_IOC_GET_USEPAIRS
                | VNA_IOC_SET_USEPAIRS,
        )
    }
}
impl AsRawFd for VionaFd {
    fn as_raw_fd(&self) -> RawFd {
        self.0.as_raw_fd()
    }
}

/// Failure modes of `VionaFd::set_parameters`.
#[cfg(target_os = "illumos")]
pub enum ParamError {
    /// The ioctl itself failed, or the error nvlist it returned could not be
    /// unpacked.
    Io(std::io::Error),
    /// The ioctl completed but returned a non-empty error nvlist, carried
    /// here in unpacked form.
    Detailed(NvList),
}

/// Invoke `libc::ioctl`, translating a `-1` return into the last OS error
/// and passing any other return value through unchanged.
#[cfg(target_os = "illumos")]
unsafe fn ioctl(fd: RawFd, cmd: i32, data: *mut libc::c_void) -> Result<i32> {
    let rc = libc::ioctl(fd, cmd, data);
    if rc == -1 {
        Err(Error::last_os_error())
    } else {
        Ok(rc)
    }
}

/// Stand-in used when not building for illumos: never touches the
/// descriptor and always fails.
#[cfg(not(target_os = "illumos"))]
unsafe fn ioctl(
    _fd: RawFd,
    _cmd: i32,
    _data: *mut libc::c_void,
) -> Result<i32> {
    let unsupported = Error::other("illumos required");
    Err(unsupported)
}

/// Extract the minor number from the device ID recorded in `meta`.
#[cfg(target_os = "illumos")]
fn minor(meta: &std::fs::Metadata) -> u32 {
    // Once #4208 was backported into libc-0.2, minor() became a const-fn on
    // practically every UNIX-y platform except illumos.
    //
    // Until that is addressed, this wrapper papers over the difference.
    // Viona is not usable anywhere but illumos.
    let rdev = meta.rdev();
    unsafe { libc::minor(rdev) }
}
/// Stand-in used when not building for illumos: always panics.
#[cfg(not(target_os = "illumos"))]
fn minor(meta: &std::fs::Metadata) -> u32 {
    let _unused_rdev = meta.rdev();
    panic!("illumos required")
}

/// Named viona API versions, each documenting what that version of the
/// interface introduced.
#[repr(u32)]
#[derive(Copy, Clone)]
pub enum ApiVersion {
    /// Adds multi-queue support and change the data structure for per-queue
    /// interrupt polling to a compact bitmap.
    V6 = 6,

    /// Adds support for VirtIO 1.0 (modern) virtqueues.
    V5 = 5,

    /// Adds support for getting/setting MTU
    V4 = 4,

    /// Adds support for interface parameters
    V3 = 3,

    /// Adds support for non-vnic datalink devices
    V2 = 2,

    /// Initial version available for query
    V1 = 1,
}
impl ApiVersion {
    /// The most recent version known to this crate.
    pub const fn current() -> Self {
        ApiVersion::V6
    }
}
/// Allows a raw `u32` version to be tested for equality against a named
/// version.
impl PartialEq<ApiVersion> for u32 {
    fn eq(&self, other: &ApiVersion) -> bool {
        let rhs = *other as u32;
        *self == rhs
    }
}
/// Allows a raw `u32` version to be ordered against a named version.  The
/// comparison is total, so the result is never `None`.
impl PartialOrd<ApiVersion> for u32 {
    fn partial_cmp(&self, other: &ApiVersion) -> Option<std::cmp::Ordering> {
        let rhs = *other as u32;
        Some(Ord::cmp(self, &rhs))
    }
}

use std::sync::atomic::{AtomicI64, Ordering};

/// Store a cached copy of the queried API version.  Negative values indicate an
/// error occurred during query (and hold the corresponding negated `errno`).
/// A positive value indicates the cached version, and should be less than
/// `u32::MAX`.  A value of 0 indicates that no query has been performed yet.
///
/// The cell is only ever written by a compare-exchange from 0, so the first
/// recorded outcome is the one which sticks.
static VERSION_CACHE: AtomicI64 = AtomicI64::new(0);

/// Report the API version of the viona device on this system.
///
/// The first call opens the device and queries it; the outcome (a version or
/// an error) is cached and served to every later call.  The version may be
/// consulted at runtime while operating the virtual machine, where the delay
/// of querying again would be more directly guest-impactful.
pub fn api_version() -> Result<u32> {
    cache_api_version(|| VionaFd::open()?.api_version())
}

/// Serve the API version from `VERSION_CACHE`, running `do_query` to populate
/// the cache if nothing has been recorded yet.
///
/// A failed query is recorded as its negated OS error number (falling back to
/// `ENOENT` when the error carries none), so subsequent calls return an error
/// built from that number without querying again.
///
/// # Panics
///
/// Panics if the cache is still empty after the query (which happens if
/// `do_query` yields version 0), or if the cached version is not below
/// `u32::MAX`.
fn cache_api_version(do_query: impl FnOnce() -> Result<u32>) -> Result<u32> {
    let unpopulated = VERSION_CACHE.load(Ordering::Acquire) == 0;
    if unpopulated {
        let encoded = do_query().map_or_else(
            |e| -i64::from(e.raw_os_error().unwrap_or(libc::ENOENT)),
            i64::from,
        );
        // Losing the race to another caller is fine: whichever outcome
        // landed first is the one which gets reported below.
        let _ = VERSION_CACHE.compare_exchange(
            0,
            encoded,
            Ordering::Relaxed,
            Ordering::Relaxed,
        );
    }

    let cached = VERSION_CACHE.load(Ordering::Acquire);
    if cached == 0 {
        panic!("expected VERSION_CACHE to be initialized")
    }
    if cached < 0 {
        return Err(Error::from_raw_os_error((-cached) as i32));
    }

    let y = cached;
    assert!(y < i64::from(u32::MAX));
    Ok(y as u32)
}

#[cfg(test)]
mod test {
    use super::*;

    /// The advertised interface version constant must track the newest
    /// `ApiVersion` variant.
    #[test]
    fn latest_api_version() {
        assert_eq!(
            VIONA_CURRENT_INTERFACE_VERSION,
            ApiVersion::current() as u32
        );
    }

    /// Raw `u32` versions compare sensibly against named versions.
    #[test]
    fn u32_comparisons() {
        let v2 = ApiVersion::V2;
        assert!(1u32 < v2);
        assert!(2u32 == v2);
        assert!(3u32 > v2);
    }
}


================================================
FILE: docs/lifecycle.md
================================================
This is, for now, an aspirational description of the various states a Propolis
instance could travel through over the course of its life, and what events (and
their requisite details) might trigger those state transitions.

Notably absent from this ideation is mention of states or events related to
live migration.  That will be added once more of the system has been
prototyped.

# States

## Start

Propolis is empty.  No VMM resources exist, nor do any emulated device
definitions.  It waits in this state until it receives a valid creation payload.

## Initialize

A creation payload has been received.  In-kernel VMM resources are allocated.
Any devices spelled out by the payload which are emulated in userspace are
instantiated.  Resources backing those devices are attached.  Worker threads
running vCPUs, driving device emulation (like block IO), and event handling are
started.  The vCPU threads themselves are held outside the running state.  The
vCPUs themselves are initialized with the architecturally-defined INIT state.

## Boot

If guest boot was made conditional on certain state changes, such as a console
connection being established, it is here that progress will be blocked until
those requirements are met.

Once all conditions are fulfilled, the vCPU threads are released from their
holds, allowing them to enter running context.  Emulation for all devices
defined in the initial machine manifest is running as well.  The in-guest boot
software (UEFI) begins execution.

## Run

The VM is running its guest workload.

## Quiesce

The VM ceases to run its guest workload.  All vCPU threads are to exit their
run-loops.  All emulated devices are notified of the quiesce, and are to report
when they have completed termination of all pending operations.


## Halt

All emulated devices must already be quiesced before entering halt state, so
little is left to do.  It is in the Halt state that any diagnostic data could
be collected from the instance (if it were to be halted for a fault, say),
prior to its reboot, or destruction.

## Reboot

All emulated devices are notified that the VM is undergoing a reboot.  The
vCPUs are set to their architecturally defined RESET states, with vCPU threads
re-entering their run-loops to wait at a hold point.

Once all device reboot processing is complete, the instance proceeds back to
the Boot state.

## Destroy

Destruction of the VM and all its associated emulation resources, such as
in-kernel VMM state and guest DRAM, is initiated.  Persistent resources
utilized by the VM, such as the backing for block devices, or vnics provisioned
in the host network stack are not destroyed, but rather just detached from.

Once all necessary destruction and clean-up has occurred, Propolis proceeds
back to Start, or the process exits, depending on its configuration.

# Event Progression

At any given time, a Propolis instance will be in one of the above states.  It
will also have an optional Target State, set by external events, which it will
use to drive through the state graph as defined above.

Example events would include:
- Forced Poweroff: Target State set to Destroy
- Forced Reset: Target State set to Reboot
- Guest triple-fault: Target State set to Reboot
- "Soft" Reset: No Target State change.  ACPI notification only
- "Soft" Poweroff: No Target State change.  ACPI notification only


================================================
FILE: docs/migrate-with-crucible.md
================================================
# Running a live migration "by hand" with a crucible boot disk

In the product, live migration is managed by nexus. Still, it is extremely
useful for development to be able to test software components in isolation. One
obstacle for testing inter-machine migration in propolis without a full control
plane is the need for shared storage — in particular, a source of shared storage
for the guest's boot disk.

Since crucible will be providing storage in the product, I chose to get
that working over other options. This document has some instructions on how to
get propolis to use crucible as a backend for a boot disk. At the moment it's
not the most user-friendly experience, but since it took some effort to figure
out, I wanted to at least capture what I did.

## Requirements

For this setup, you'll need:
- a "source" propolis server
- a "destination" propolis server (on the same machine or otherwise)
- a propolis CLI
- a copy of [crucible](https://github.com/oxidecomputer/crucible) that **matches the revision that propolis is compiled with** and a place to
  run downstairs processes that the source/destination machines can access
- an OS image that you'd like to boot

## Setup

### Seed crucible downstairs with the OS image

From the machine where you will run the crucible downstairs, set up 3 crucible
downstairs regions. Use the `--import-path` flag to specify where the OS image
is on the filesystem. Specify the address and port the downstairs will listen
on using the `-a` and the `-p` flags, respectively. The address may be
`localhost` or the external IP address of the machine where the downstairs is
running. Note that the IP:port specification will be used again later in the
JSON file that gets passed to propolis.

For example:
```
$ ./target/release/crucible-downstairs create --import-path /home/jordan/images/helios-generic-ttya-base_20230109.raw --data region8810 --uuid $(uuidgen) --extent-size 64000 --extent-count 64

$ ./target/release/crucible-downstairs create --import-path /home/jordan/images/helios-generic-ttya-base_20230109.raw --data region8820 --uuid $(uuidgen) --extent-size 64000 --extent-count 64

$ ./target/release/crucible-downstairs create --import-path /home/jordan/images/helios-generic-ttya-base_20230109.raw --data region8830 --uuid $(uuidgen) --extent-size 64000 --extent-count 64
```

Each `create` will set up a region file. In the above example, these files are
`region8810`, `region8820`, and `region8830`, respectively.

### Run the crucible downstairs

After seeding the downstairs with the image, run the downstairs processes.

For example:
```
$ ./target/release/crucible-downstairs run -d region8810 -p 8810 -a 172.20.3.73
$ ./target/release/crucible-downstairs run -d region8820 -p 8820 -a 172.20.3.73
$ ./target/release/crucible-downstairs run -d region8830 -p 8830 -a 172.20.3.73
```

### Create a JSON file with disk requests

Now that we've got a crucible volume set up, we need to configure propolis to be
aware of it as a backend. One can do this by passing the `--crucible-disks`
flag and a JSON file of an array of `DiskRequest`s when creating or migrating
a VM.

On the source machine, create a JSON file like this:

```
[
{
    "device": "virtio",
    "name": "helios-blockdev",
    "read_only": false,
    "slot": 1,
    "volume_construction_request": {
        "type": "volume",
        "block_size": 512,
        "id": "0cedae45-3d6e-4d90-b2cb-56f1a1a42a89",
        "read_only_parent": null,
        "sub_volumes": [
            {
                "type": "region",
                "block_size": 512,
                "blocks_per_extent": 64000,
                "extent_count": 64,
                "gen": 1,
                "opts": {
                    "cert_pem": null,
                    "control": null,
                    "flush_timeout": null,
                    "id": "0cedae45-3d6e-4d90-b2cb-56f1a1a42a89",
                    "key": null,
                    "key_pem": null,
                    "lossy": false,
                    "read_only": false,
                    "root_cert_pem": null,
                    "target": ["172.20.3.73:8810",
                             "172.20.3.73:8820",
                             "172.20.3.73:8830"
                    ]
                }
            }
        ]
    }
}
]
```

Several fields in this file must match the parameters specified when the
crucible downstairs processes were created, specifically: `block_size` (note
that it occurs twice in the JSON file), `blocks_per_extent`, and
`extent_count`. The `target` field is an array of IP:port addresses where the
downstairs are expected to be running.

One important thing to know is that the generation number field (`gen`) must be
bumped manually each time a VM is created (or migrated). (In the product, the
generation number is tracked by nexus.) A fresh crucible downstairs will start
with generation number 1.

To see the current generation number of a downstairs, you can dump the region
and check the highest generation number. Use the `-d` flag to select the
directory containing the region:

```
$ ./target/debug/crucible-downstairs dump -d region8810
EXT          BLOCKS GEN0   FL0  D0
  0 0000000-0063999   10   608   F
  1 0064000-0127999    0     1   F
  2 0128000-0191999    0     1   F
  3 0192000-0255999    0     1   F

... (output elided)

Max gen: 11,  Max flush: 642
```

You will need to use the max generation number of all 3 downstairs.

### Create the VM on the source server

On the source machine, run the propolis server with whatever TOML configuration
you desire, except for the boot disk, which will be specified through the API.

Create the VM using the `--crucible-disks` flag and the JSON file. For example:
```
$ ./target/debug/propolis-cli -s 172.20.3.73 -p 8000 new --crucible-disks disks.json vm0
```

Run the VM:
```
$ ./target/debug/propolis-cli -s 172.20.3.73 -p 8000 state run
```

You may wish to watch the console to make sure it boots:
```
$ ./target/debug/propolis-cli -s 172.20.3.73 -p 8000 serial
```

### Migrate the VM to the destination server

Now it's time to migrate the VM. The destination server will need to have the
same instance spec as the source server, so run the destination server with the
same TOML configuration as the source server. Similarly, the destination server
will need to know about the crucible backend. Like with the `new` command, we
can tell the destination server about this disk by passing the
`--crucible-disks` flag to the `migrate` command.

Ensure the destination server is running. Make a copy of the JSON file you
created above and increment the generation number. Then, from the source, run
something like:
```
$ ./target/debug/propolis-cli -s 172.20.3.73 -p 8000 migrate 172.20.3.71 -p 8000 --crucible-disks disks2.json
```

If successful, you should be able to run the VM and see the serial console on
the destination side.


================================================
FILE: docs/server-send-vcr.md
================================================
# How to use the VCR replacement endpoint in propolis-server

This document describes how to use the `/instance/disk/{id}/vcr` API endpoint
to replace a downstairs of a Crucible volume attached to a Propolis instance.
We will use both propolis-server and propolis-cli to do this.

## Setup

You will need:
 * `propolis-server` and `propolis-cli` binaries
 * A bootable VM image file
 * The OVMF file
 * A server.toml file for propolis-server
 * A crucible-disks file for crucible configuration.
 * A VCR replace json file.
 * A copy of the `crucible-downstairs` and `dsc` binaries; these should
   match the version Propolis expects.

## Start with crucible downstairs

In one window, run `dsc` to create and start four downstairs on four
different regions.

`--ds-bin` tells `dsc` where to find the `crucible-downstairs` binary.

The `--extent-size`, `--extent-count`, and `--block-size` values here are all
used in later config files, and they must match; otherwise crucible will
fail to start.

```
dsc start --create --cleanup --extent-size 16384 --extent-count 128 --block-size 4096 --region-count 4 --ds-bin ./target/release/crucible-downstairs
```

## Start propolis-server

In another window, start propolis server.
I used this toml file.  You will need to change the paths to your OVMF file
and your bootable VM image file.
```
bootrom = "/home/alan/vm/OVMF_CODE.fd"

[block_dev.ubuntu]
type = "file"
path = "/home/alan/vm/large-focal.raw"

[dev.block0]
driver = "pci-nvme"
block_dev = "ubuntu"
pci-path = "0.4.0"
```

To start the server, run this:

```
pfexec ./target/release/propolis-server run server.toml 127.0.0.1:55400
```

Leave this window running, it will show output from propolis and crucible.

## Create the VM with a crucible disk:

Next we create a crucible NVMe disk using `propolis-cli`.

Here is an example crucible-disks.json file.  Note the values you used above
for dsc need to match what is in this file.
```
[
{
    "device": "nvme",
    "name": "block2",
    "read_only": false,
    "slot": 3,
    "volume_construction_request": {
        "type": "volume",
        "block_size": 4096,
        "id": "0cedae45-3d6e-4d90-b2cb-56f1a1a42a89",
        "read_only_parent": null,
        "sub_volumes": [
            {
                "type": "region",
                "block_size": 4096,
                "blocks_per_extent": 16384,
                "extent_count": 128,
                "gen": 1,
                "opts": {
                    "cert_pem": null,
                    "control": null,
                    "flush_timeout": null,
                    "id": "0cedae45-3d6e-4d90-b2cb-56f1a1a42a89",
                    "key": null,
                    "key_pem": null,
                    "lossy": false,
                    "read_only": false,
                    "root_cert_pem": null,
                    "target": ["127.0.0.1:8810",
                             "127.0.0.1:8820",
                             "127.0.0.1:8830"
                    ]
                }
            }
        ]
    }
}
]
```

Using a third window, create the VM and add the crucible disk like this:

```
propolis-cli -s 127.0.0.1 -p 55400 new crub --crucible-disks crucible-disks.json
```

Then, start the VM:
```
propolis-cli -s 127.0.0.1 -p 55400 state run
```

## Replace a downstairs.

To replace a downstairs, we use almost the same VCR that we used
to create our crucible disk, but with two differences:
1. The generation number has increased by one.
2. One (and only one) of the `target`s has changed to a different IP:Port.

The `.json` file we use for this looks similar to our previous one, but
because the replacement VCR is now considered a string input to propolis,
we have to stuff our new VCR into a string.

This means, take a valid VCR, put it all on one line, and put a \\ in front
of all quotes.

Here is an example file:
```
{
    "name": "block2",
    "vcr_json": "{ \"type\": \"volume\", \"block_size\": 4096, \"id\": \"0cedae45-3d6e-4d90-b2cb-56f1a1a42a89\", \"read_only_parent\": null, \"sub_volumes\": [ { \"type\": \"region\", \"block_size\": 4096, \"blocks_per_extent\": 16384, \"extent_count\": 128, \"gen\": 7, \"opts\": { \"cert_pem\": null, \"control\": null, \"flush_timeout\": null, \"id\": \"0cedae45-3d6e-4d90-b2cb-56f1a1a42a89\", \"key\": null, \"key_pem\": null, \"lossy\": false, \"read_only\": false, \"root_cert_pem\": null, \"target\": [\"127.0.0.1:8810\", \"127.0.0.1:8820\", \"127.0.0.1:8840\" ] } } ] }"
}
```

You might notice in this second file, our generation number has gone up one,
and `target[2]` is different.  Send this replace.json file over to the server
with this propolis-cli command:

```
propolis-cli -s 127.0.0.1 -p 55400 vcr -u 0cedae45-3d6e-4d90-b2cb-56f1a1a42a89 --vcr-replace ./replace.json
```

This should result in more messages on the propolis-server window, and,
eventually, a new downstairs.


================================================
FILE: docs/standalone-with-crucible.md
================================================
# Run propolis-standalone with crucible disks

This document serves as an overview for running propolis-standalone using
crucible disks.

## Background: Why standalone?

In the product, the userspace VMM component for instances is
[propolis-server](../bin/propolis-server), which exposes API endpoints for the
rest of the control plane to manage instances. One can run propolis-server in
isolation, such as for development, and interact with its endpoints using a
client such as the [propolis CLI](../bin/propolis-cli).


The [standalone](../bin/propolis-standalone) version of propolis is a useful
development tool to get a VM up and running quickly without having to hit any
API endpoints. It takes as input a single TOML file, sets up a unix domain
socket that is connected to a VM's uart, and starts the guest when the user
connects to the socket. It also cleans up the VM gracefully when the user sends
CTRL+C to the running `propolis-standalone` program.

Beyond these differences, there are some differences in the state machine
related to the instance lifecycle as well as the emulation.

TODO: flesh out more of these differences, and maybe capture them in a
higher-level README.

## Requirements
### Building `propolis-standalone`
- Clone this repository on an illumos box (e.g. `atrium`)
- In that folder, run
  `cargo build --release -ppropolis-standalone --features=crucible`
- This will produce a `propolis-standalone` binary in `target/release/`
- Copy this binary to your target Gimlet

### Building `crucible`
- Clone [`oxidecomputer/crucible`](https://github.com/oxidecomputer/crucible) on
  an illumos box (e.g. `atrium`)
- In that folder, run `cargo build --release -pcrucible-downstairs -pdsc`
- This will produce `crucible-downstairs` and `dsc` binaries in `target/release`
- Copy those files to your target Gimlet

### VM stuff
See the [`propolis-standalone` README](../bin/propolis-standalone/README.md)
for details on how to get
  * VM Image file
  * VM OVMF file

Copy those files to your target Gimlet.

## Instructions

### Onetime setup on the Gimlet.

Setup for a virtual NIC to be used by the VM.

```
dladm create-vnic -t -l igb0 -m 02:08:20:ac:e9:16 vnic_prop0
```

Setup of a zpool on three SSDs.

Crucible downstairs runs on top of a filesystem (ZFS in our case).
On your bench Gimlet, you should select three NVMe disks, and create a zpool
on each of them.  You can use an existing zpool.

If you're creating new zpools, start by running `format` to list disk names:
```
BRM42220012 # format
Searching for disks...done


AVAILABLE DISK SELECTIONS:
       0. c1t00A0750130082207d0 <NVMe-Micron_7300_MTFDHBG1T9TDF-95420260-1.75TB>
          /pci@0,0/pci1de,fff9@1,3/pci1344,3100@0/blkdev@w00A0750130082207,0
       1. c2t0014EE81000BC481d0 <NVMe-WUS4C6432DSP3X3-R2210000-2.91TB>
          /pci@0,0/pci1de,fff9@3,2/pci1b96,0@0/blkdev@w0014EE81000BC481,0
       2. c3t0014EE81000BC783d0 <NVMe-WUS4C6432DSP3X3-R2210000-2.91TB>
          /pci@0,0/pci1de,fff9@3,3/pci1b96,0@0/blkdev@w0014EE81000BC783,0
       3. c4t0014EE81000BC78Fd0 <NVMe-WUS4C6432DSP3X3-R2210000-2.91TB>
          /pci@0,0/pci1de,fff9@3,4/pci1b96,0@0/blkdev@w0014EE81000BC78F,0
       4. c5t0014EE81000BC37Dd0 <NVMe-WUS4C6432DSP3X3-R2210000-2.91TB>
          /pci@38,0/pci1de,fff9@1,2/pci1b96,0@0/blkdev@w0014EE81000BC37D,0
       5. c6t0014EE81000BC28Ad0 <NVMe-WUS4C6432DSP3X3-R2210000-2.91TB>
          /pci@38,0/pci1de,fff9@1,3/pci1b96,0@0/blkdev@w0014EE81000BC28A,0
       6. c7t00A0750130082248d0 <NVMe-Micron_7300_MTFDHBG1T9TDF-95420260-1.75TB>
          /pci@38,0/pci1de,fff9@3,3/pci1344,3100@0/blkdev@w00A0750130082248,0
       7. c8t0014EE81000BC39Bd0 <NVMe-WUS4C6432DSP3X3-R2210000-2.91TB>
          /pci@ab,0/pci1de,fff9@1,1/pci1b96,0@0/blkdev@w0014EE81000BC39B,0
       8. c9t0014EE81000BC3C8d0 <NVMe-WUS4C6432DSP3X3-R2210000-2.91TB>
          /pci@ab,0/pci1de,fff9@1,2/pci1b96,0@0/blkdev@w0014EE81000BC3C8,0
       9. c10t0014EE81000BC4CCd0 <NVMe-WUS4C6432DSP3X3-R2210000-2.91TB>
          /pci@ab,0/pci1de,fff9@1,3/pci1b96,0@0/blkdev@w0014EE81000BC4CC,0
      10. c11t0014EE81000BC786d0 <NVMe-WUS4C6432DSP3X3-R2210000-2.91TB>
          /pci@ab,0/pci1de,fff9@1,4/pci1b96,0@0/blkdev@w0014EE81000BC786,0
Specify disk (enter its number): ^C
```

Then, create zpools with your desired serial names, e.g.
```
zpool create -f -o ashift=12 -O atime=off -m /pool/disk0 cru0 c1t00A0750130082207d0
```

On each zpool, create a directory where the crucible downstairs will live.
In our case, the pools are mounted at `/pool/disk0`, `/pool/disk1`, and
`/pool/disk2`.

```
mkdir /pool/disk0/region
mkdir /pool/disk1/region
mkdir /pool/disk2/region
```

### Create the crucible downstairs regions

With three pools created, you can now create the three downstairs crucible
regions where your data will live.  The values you specify here for
extent_size, extent_count, and block_size determine how big your region
is, and can also change the performance characteristics of the region.

Omicron's current defaults are to have a 64 MiB extent size:
```
pub const EXTENT_SIZE: u64 = 64_u64 << 20;
```

Which ends up like this:
For 512 byte blocks, 131072 is the extent size.
For 4096 byte blocks, 16384 is the extent size.

```
./target/release/dsc create \
  --ds-bin ./target/release/crucible-downstairs \
  --cleanup \
  --extent-size 16384 \
  --extent-count 512 \
  --block-size 4096 \
  --encrypted \
  --region-dir /pool/disk0/region \
  --region-dir /pool/disk1/region \
  --region-dir /pool/disk2/region
```

(modify the `dsc` and `crucible-downstairs` paths based on where you put those
binaries)

### Run the three downstairs

Once the regions are created, you can start the three downstairs using the
`dsc` command.

```
./target/release/dsc start \
  --ds-bin ./target/release/crucible-downstairs \
  --region-dir /pool/disk0/region \
  --region-dir /pool/disk1/region \
  --region-dir /pool/disk2/region
```

### Start `propolis-standalone`

To start `propolis-standalone`, you'll need a configuration file.  The specifics
will depend on file paths, image type, etc.

Here's an example TOML file, assuming you have used the above settings for block
size, extent size, and extent count.

```toml
[main]
name = "testvm"
cpus = 4
bootrom = "/tmp/OVMF_CODE.fd"
memory = 2048

[block_dev.ubuntu]
type = "file"
path = "/tmp/large-focal.raw"

[dev.block0]
driver = "pci-nvme"
block_dev = "ubuntu"
pci-path = "0.4.0"

[block_dev.my_crucible]
type = "crucible"
# these MUST match the region configuration downstairs
block_size = 4096
blocks_per_extent = 16384
extent_count = 512
targets = [
  "127.0.0.1:8810",
  "127.0.0.1:8820",
  "127.0.0.1:8830",
]
generation = 5
upstairs_id = "e4396bd0-ede1-48d7-ac14-3d2094dfba5b"

# Create your own key (openssl rand -base64 32) Or use this.
encryption_key = "tCw7zw0hAsPuxMOTWwnPEFYjBK9qJRtYyGdEXKEnrg0="

[dev.block1]
driver = "pci-nvme"
block_dev = "my_crucible"
pci-path = "0.5.0"

[dev.net0]
driver = "pci-virtio-viona"
vnic = "vnic_prop0"
pci-path = "0.6.0"
```

Start propolis-standalone like this:

```
propolis-standalone standalone.toml
```


================================================
FILE: lib/propolis/Cargo.toml
================================================
[package]
name = "propolis"
version = "0.1.0"
license = "MPL-2.0"
edition = "2021"
rust-version = "1.90"

[dependencies]
libc.workspace = true
bit_field.workspace = true
bitflags = { workspace = true, features = ["serde"] }
bitstruct.workspace = true
byteorder.workspace = true
dice-verifier.workspace = true
lazy_static.workspace = true
thiserror.workspace = true
bhyve_api.workspace = true
cpuid_utils.workspace = true
dladm.workspace = true
viona_api.workspace = true
propolis_types.workspace = true
usdt = { workspace = true, features = ["asm"] }
tokio = { workspace = true, features = ["full"] }
futures.workspace = true
paste.workspace = true
pin-project-lite.workspace = true
anyhow.workspace = true
rgb_frame.workspace = true
rfb.workspace = true
slog.workspace = true
serde.workspace = true
serde_arrays.workspace = true
erased-serde.workspace = true
serde_json.workspace = true
sha2.workspace = true
strum = { workspace = true, features = ["derive"] }
uuid.workspace = true
zerocopy = { workspace = true, features = ["derive"] }
crucible-client-types = { workspace = true, optional = true }
crucible = { workspace = true, optional = true }
oximeter = { workspace = true, optional = true }
nexus-client = { workspace = true, optional = true }
async-trait.workspace = true
iddqd.workspace = true
nix.workspace = true
vm-attest.workspace = true
itertools.workspace = true

# falcon
libloading = { workspace = true, optional = true }
p9ds = { workspace = true, optional = true }
ispf = { workspace = true, optional = true }
rand = { workspace = true, optional = true }
softnpu = { workspace = true, optional = true }
dlpi = { workspace = true, optional = true }
static_assertions = "1.1.0"

[dev-dependencies]
crossbeam-channel.workspace = true
tempfile.workspace = true
slog-term.workspace = true
slog-async.workspace = true
rand.workspace = true

[features]
default = []
crucible-full = ["crucible", "crucible-client-types", "oximeter", "nexus-client"]
falcon = ["libloading", "p9ds", "dlpi", "ispf", "rand", "softnpu", "viona_api/falcon"]

# TODO until crucible#1280 is addressed, enabling Nexus notifications is done
# through a feature flag.
omicron-build = ["crucible/notify-nexus"]


================================================
FILE: lib/propolis/src/accessors.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Hierarchical access control for emulated resources.
//!
//! The structures in this module are designed to support some of our current
//! needs, but also aspirationally for anticipated needs.
//!
//! First and foremost, device emulation logic requires access to resources
//! which may be subsequently moderated by intervening parts of the emulation.
//! Acquisition of the underlying resource is fallible for this reason.
//!
//! For example: A PCI device performs DMA to guest memory.  If bus-mastering is
//! disabled on the device, or any parent bridge in its bus hierarchy, then its
//! contained emulation should fail any DMA accesses.
//!
//! This also motivates the tree-like structure of the accessor.  Keeping the
//! PCI bus mastering example, an individual endpoint can be allowed to perform
//! bus mastering if Bus Master Enable is set.  Additionally, a PCI-PCI bridge
//! has a Bus Master Enable bit with a similar semantic for all devices behind
//! that bridge.
//!
//! There is not yet any support for bus mastering bits, but it's expected this
//! should be straightforward on top of `Node` or `NodeEntry`.
//!
//! Secondly, and more relevant to how Accessor is used in Propolis today, an
//! accessor tree provides a mechanism to provide or remove a reference to the
//! protected resource from an entire device or machine. While the accessor tree
//! is at heart a fancy `Arc<Mutex<Arc<T>>>`, an `Arc<T>` is never exposed in
//! the accessor's API; only a wrapper that derefs as `T`.
//!
//! Accessor structures being the sole access mechanism to a guarded resource
//! ensures that the resource can be added or removed *almost*[1] arbitrarily.
//! [`MsiAccessor`] is an example of double-duty here; on one hand, a PCI bridge
//! can have MSI enabled or disabled, as well as the functions behind that
//! bridge. On the other hand, the MSI accessor is mostly just an `Arc<VmmHdl>`,
//! and it would be unfortunate to have stray `Arc<VmmHdl>` littered across
//! device emulation[2].
//!
//! 1: A user of Propolis should only change the guarded resource for devices that
//! are in the initial (pre-run) state, paused, or halted.  Removing a guarded
//! resource during arbitrary device operation could, at worst, look to a device
//! like it was the bus master while also losing its ownership of the bus!
//! There is no expectation of correct operation in such a bogus state.
//!
//! 2: `Arc<VmmHdl>` has since found its way into device emulation in different
//! ways, though the ownership model is simple enough there is little risk of
//! cyclic references keeping a `VmmHdl` alive overly-long.

use std::collections::{BTreeMap, BTreeSet, VecDeque};
use std::marker::PhantomData;
use std::ptr::NonNull;
use std::sync::{Arc, Mutex, MutexGuard, Weak};

use crate::vmm::VmmHdl;

/// Describes a resource whose access is mediated by an accessor hierarchy.
///
/// A hierarchy holds at most one [`Self::Root`] value, and each node in it
/// caches a [`Self::Leaf`] derived from that root.
pub trait AccessedResource {
    /// Value held by the hierarchy itself, from which leaves are derived.
    type Root;
    /// Per-node value produced by [`Self::derive()`].  It must be [Clone],
    /// as a copy is handed out with every [`Guard`].
    type Leaf: Clone;
    /// Type which consumers of the accessor ultimately get a reference to.
    type Target;

    /// Produce the leaf value for a node from the root resource.
    fn derive(root: &Self::Root) -> Self::Leaf;
    /// Borrow the target out of a leaf value.
    fn deref(leaf: &Self::Leaf) -> &Self::Target;
}

/// Key type for identifying nodes referenced by `Tree`.
///
/// The key is the address of the node's `Arc` allocation, with the resource
/// type parameter erased (by casting to `Node<NodeKeyNull>`) so that the key
/// itself does not need to be generic.
#[derive(Ord, PartialOrd, Eq, PartialEq, Debug, Copy, Clone)]
pub struct NodeKey(NonNull<Node<NodeKeyNull>>);
impl<T: AccessedResource> From<&Arc<Node<T>>> for NodeKey {
    /// Derive the key for a node from the address its `Arc` points at.
    fn from(value: &Arc<Node<T>>) -> Self {
        // Erase `T` from the pointer type; only the address matters.
        let raw = Arc::as_ptr(value) as *const Node<NodeKeyNull>;
        // SAFETY: `Arc::as_ptr()` on a live `Arc` yields the address of its
        // allocation, which is never null.
        let inner =
            unsafe { NonNull::new_unchecked(raw as *mut Node<NodeKeyNull>) };
        NodeKey(inner)
    }
}
impl std::fmt::Display for NodeKey {
    /// Render the key as a pointer value.
    ///
    /// The nested `write!()` uses its own `{:p}` format spec, so flags given
    /// by the caller (such as `#`) are not applied to the pointer.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:p}", self.0.as_ptr())
    }
}
// Safety: While the key uses a pointer (!Send) type internally, it is for
// unique identification purposes only, and is never meant to be dereferenced,
// copied from, or transformed into a reference of any kind.
unsafe impl Send for NodeKey {}

/// Uninhabited stand-in resource type.
///
/// [NodeKey] stores its pointer as `Node<NodeKeyNull>` so that the key type
/// does not need to be generic over the resource.  No value of this type can
/// exist, and the trait methods below are not expected to ever be called.
enum NodeKeyNull {}
impl AccessedResource for NodeKeyNull {
    type Root = ();
    type Leaf = ();
    type Target = ();
    fn derive(_root: &Self::Root) -> Self::Leaf {
        unreachable!()
    }
    // Takes the leaf (not the root), matching the trait declaration.  This
    // only compiled with `&Self::Root` because both happen to be `()`.
    fn deref(_leaf: &Self::Leaf) -> &Self::Target {
        unreachable!()
    }
}

/// Bookkeeping record which a [Tree] keeps for each of its member nodes.
struct TreeNode<T: AccessedResource> {
    /// Key of this node's parent, or [None] when the node is the [Tree] root
    parent_key: Option<NodeKey>,
    /// Non-owning ([Weak]) handle back to the node, so the [Tree] can reach
    /// it when the node is moved into a different [Tree] (adoption)
    node_ref: Weak<Node<T>>,
    /// Keys of the children (if any) of this node
    children: BTreeSet<NodeKey>,
    /// Name shown for this node by [Tree::print()]
    name: Option<String>,
}
impl<T: AccessedResource> TreeNode<T> {
    /// Build the record for a childless node located beneath `parent_key`.
    fn new(
        parent_key: NodeKey,
        node_ref: Weak<Node<T>>,
        name: Option<String>,
    ) -> Self {
        // Same as a root record, apart from having a parent and a name
        Self { parent_key: Some(parent_key), name, ..Self::new_root(node_ref) }
    }
    /// Build the record for a root node: no parent, no children, no name.
    fn new_root(node_ref: Weak<Node<T>>) -> Self {
        Self {
            node_ref,
            parent_key: None,
            name: None,
            children: BTreeSet::new(),
        }
    }
}

/// Shared state for one accessor hierarchy: the root resource (if any) and
/// the parent/child relationships between all nodes in the hierarchy.
///
/// Nodes refer to their tree through an `Arc<Mutex<Tree<T>>>`, while the tree
/// refers back to its nodes only through [Weak] references.
struct Tree<T: AccessedResource> {
    /// Root resource (if any) that this hierarchy is granting access to
    res_root: Option<T::Root>,

    /// Key of the root node of this hierarchy
    ///
    /// Only when the tree is being initialized, should `root_key` be [None]
    root_key: Option<NodeKey>,

    /// Nodes within this hierarchy
    nodes: BTreeMap<NodeKey, TreeNode<T>>,

    /// Weak self-reference, used when building [TreeNode] entries as nodes are
    /// added to the tree.  Held as a convenience, instead of requiring it to be
    /// passed in by the caller.
    self_weak: Weak<Mutex<Tree<T>>>,
}
impl<T: AccessedResource> Tree<T> {
    /// Create a new node beneath `parent` and record it in the tree.
    ///
    /// The new node starts out with a leaf derived from the root resource,
    /// if this tree currently holds one.
    ///
    /// # Panics
    ///
    /// If the tree's self-reference is no longer live, if the new node's key
    /// is already present, or if `parent` is not a member of this tree.
    fn add_child(
        &mut self,
        parent: NodeKey,
        name: Option<String>,
    ) -> Arc<Node<T>> {
        let tree = self.self_weak.upgrade().expect("tree ref still live");
        let res_leaf = self.res_root.as_ref().map(T::derive);
        let node = Arc::new(Node(Mutex::new(NodeEntry { tree, res_leaf })));
        let key = NodeKey::from(&node);

        // Register the node itself ...
        let record = TreeNode::new(parent, Arc::downgrade(&node), name);
        let previous = self.nodes.insert(key, record);
        assert!(
            previous.is_none(),
            "new child should not conflict with existing node"
        );

        // ... and then list it among the children of its parent
        let parent_record =
            self.nodes.get_mut(&parent).expect("parent node must exist");
        parent_record.children.insert(key);

        node
    }

    /// Adopt the root node and all its descendants into our tree, under the
    /// node specified by `parent_key`
    ///
    /// Entries are moved out of `adopt_tree` breadth-first.  Each node which
    /// is still live is pointed at this tree and given a leaf derived from
    /// this tree's root resource (if it has one).  An entry whose node can no
    /// longer be upgraded is discarded, and its children are not visited.
    ///
    /// # Panics
    ///
    /// If `adopt_tree` has no root key, if this tree's self-reference is no
    /// longer live, if `parent_key` is not a member of this tree, or if an
    /// adopted key is already present in this tree.
    fn adopt(&mut self, parent_key: NodeKey, adopt_tree: &mut Tree<T>) {
        debug_assert!(
            self.nodes
                .get(&parent_key)
                .and_then(|node| Weak::upgrade(&node.node_ref))
                .is_some(),
            "leaf target for re-parenting missing"
        );

        let child_key = adopt_tree.root_key();
        let tree_ref = self.self_weak.upgrade().unwrap();

        // Work queue of keys still to be moved, seeded with the adopted root
        let mut queue = VecDeque::new();
        queue.push_back(child_key);
        while let Some(adopt_key) = queue.pop_front() {
            if let Some(mut tnode) = adopt_tree.nodes.remove(&adopt_key) {
                let node = match Weak::upgrade(&tnode.node_ref) {
                    Some(nr) => nr,
                    None => {
                        continue;
                    }
                };

                // Associate the node with this tree and resource
                {
                    let mut ent = node.0.lock().unwrap();
                    ent.tree = Arc::clone(&tree_ref);
                    ent.res_leaf = self.res_root.as_ref().map(T::derive);
                }

                if adopt_key == child_key {
                    // The root of the adopted tree needs its parent set (and
                    // to be added to the children list of said parent).
                    //
                    // All of the descendant nodes will have those relationships
                    // properly established when they are copied over.
                    tnode.parent_key = Some(parent_key);
                    let parent_node = self
                        .nodes
                        .get_mut(&parent_key)
                        .expect("parent node is present");
                    parent_node.children.insert(adopt_key);
                }

                queue.extend(tnode.children.iter());

                let _conflict = self.nodes.insert(adopt_key, tnode);
                assert!(_conflict.is_none());
            }
        }
        debug_assert!(adopt_tree.nodes.is_empty());
    }

    /// Erase a node, which is in the process of being dropped, from the tree.
    ///
    /// The node is unlisted from its parent, and each of its children is
    /// split off into an orphan tree of its own.
    ///
    /// # Panics
    ///
    /// If `key` (or the parent it names) is not present in the tree, if the
    /// parent does not list the node as a child, or if a parent-less node is
    /// not the root of the tree.
    fn remove_dead_node(&mut self, key: NodeKey) {
        let dead =
            self.nodes.remove(&key).expect("tree node should be present");

        match dead.parent_key {
            Some(pkey) => {
                let parent =
                    self.nodes.get_mut(&pkey).expect("parent for node exists");
                let was_removed = parent.children.remove(&key);
                assert!(was_removed, "parent should list node as child");
            }
            None => {
                assert_eq!(
                    Some(key),
                    self.root_key,
                    "node without parent must be tree root"
                );
            }
        }

        // With their parent gone, the children each become an orphan root
        for child in dead.children {
            self.orphan_node(child);
        }
    }

    /// Remove a node from this Tree into a new empty tree, with all of its
    /// descendants in tow.
    ///
    /// The new tree has no root resource, and every moved node has its cached
    /// leaf cleared.  The caller is responsible for having already removed
    /// `key` from the children of its former parent; that is not done here.
    ///
    /// NOTE(review): the node is pointed at the new tree before that tree is
    /// locked and populated.  It looks like another thread locking the tree
    /// through this node in that window would observe it without a root key
    /// or any nodes -- confirm whether callers can actually race here.
    ///
    /// # Panics
    ///
    /// If `key` or one of its recorded descendants is not present in this
    /// tree, or if the node for `key` is no longer live.
    fn orphan_node(&mut self, key: NodeKey) {
        let mut tnode =
            self.nodes.remove(&key).expect("node-to-orphan is present in tree");

        let orphan_tree = Self::new_empty(None);

        // This node now becomes the root of the orphaned tree
        {
            let node =
                tnode.node_ref.upgrade().expect("node-to-orphan is still live");
            let mut guard = node.0.lock().unwrap();
            guard.tree = orphan_tree.clone();
            guard.res_leaf.take();
        }
        tnode.parent_key = None;

        // Work queue of descendant keys still to be moved to the new tree
        let mut needs_moved = VecDeque::new();
        needs_moved.extend(tnode.children.iter());

        let mut tguard = orphan_tree.lock().unwrap();
        tguard.root_key = Some(key);
        tguard.nodes.insert(key, tnode);

        while let Some(move_key) = needs_moved.pop_front() {
            let tnode = self
                .nodes
                .remove(&move_key)
                .expect("child tree node is present");

            // Progeny of the orphaned node which are still "live" need to be
            // associated with the new tree.  Anything which happens to be
            // "dead" will clean itself from the existing tree and orphan its
            // subsequent progeny when given access to the tree lock.
            if let Some(node) = tnode.node_ref.upgrade() {
                let mut ent = node.0.lock().unwrap();
                ent.tree = orphan_tree.clone();
                ent.res_leaf = None;

                needs_moved.extend(tnode.children.iter());

                tguard.nodes.insert(move_key, tnode);
            }
        }
    }

    /// Replace the display name of the node identified by `key`.
    ///
    /// Does nothing if `key` is not a member of this tree.
    fn rename_node(&mut self, key: NodeKey, name: Option<String>) {
        let Some(record) = self.nodes.get_mut(&key) else {
            return;
        };
        record.name = name;
    }

    /// Returns `true` if a given `node` is the root of this tree
    fn node_is_root(&self, node: &Arc<Node<T>>) -> bool {
        self.root_key() == node.into()
    }

    /// Install `new_root` as the root resource, returning the previous one.
    ///
    /// The cached leaf of every live node is cleared, so that none of them
    /// continue to hold something derived from the old resource.
    fn set_root_resource(
        &mut self,
        new_root: Option<T::Root>,
    ) -> Option<T::Root> {
        let previous = std::mem::replace(&mut self.res_root, new_root);

        // Invalidate whatever each (still live) node had cached
        let live_nodes =
            self.nodes.values().filter_map(|tnode| tnode.node_ref.upgrade());
        for node in live_nodes {
            node.0.lock().unwrap().res_leaf = None;
        }

        previous
    }

    /// How many nodes exist in this tree hierarchy?
    ///
    /// This is the number of entries recorded in the tree, root included.
    fn node_count(&self) -> usize {
        self.nodes.len()
    }

    /// Walk the tree depth-first, handing each node to `print_fn` in an order
    /// suited to printing: every node is directly followed by its subtree.
    ///
    /// Panics if called before the tree is initialized.
    fn print(&self, print_fn: impl Fn(PrintNode)) {
        let root_key = self.root_key();

        // Each stack entry is a depth, paired with the nodes at that depth
        // which are still waiting to be printed.  Start with just the root.
        let mut stack = vec![(0, BTreeSet::from([root_key]))];

        while let Some((depth, mut pending)) = stack.pop() {
            // An exhausted set is simply discarded
            let Some(key) = pending.pop_first() else {
                continue;
            };
            // The remaining siblings are revisited once this subtree is done
            stack.push((depth, pending));

            let Some(tnode) = self.nodes.get(&key) else {
                continue;
            };
            print_fn(PrintNode {
                depth,
                key,
                is_root: key == root_key,
                name: tnode.name.as_deref(),
            });
            if !tnode.children.is_empty() {
                stack.push((depth + 1, tnode.children.clone()));
            }
        }
    }

    /// Get the [NodeKey] of the tree root
    ///
    /// # Panics
    ///
    /// If called before the tree is initialized (while `root_key` is still
    /// [None]).
    fn root_key(&self) -> NodeKey {
        self.root_key.expect("root_key is non-None once tree is initialized")
    }

    /// Create a [Tree] with no nodes (not even a root)
    fn new_empty(primary: Option<T::Root>) -> Arc<Mutex<Tree<T>>> {
        Arc::new_cyclic(|self_weak| {
            Mutex::new(Tree {
                res_root: primary,
                nodes: BTreeMap::new(),
                root_key: None,
                self_weak: self_weak.clone(),
            })
        })
    }

    /// Create a [Tree] around `res_root`, returning its root node.
    ///
    /// The root node starts out with a leaf derived from `res_root`, if one
    /// was provided.
    fn new(res_root: Option<T::Root>) -> Arc<Node<T>> {
        let tree = Self::new_empty(res_root);
        let root = Node::new_root(Arc::clone(&tree));
        let key = NodeKey::from(&root);

        let mut tree_guard = tree.lock().unwrap();

        // Prime the leaf of the root node from the resource (if any)
        let leaf = tree_guard.res_root.as_ref().map(T::derive);
        root.0.lock().unwrap().res_leaf = leaf;

        // Register the node as the root of the tree
        tree_guard.root_key = Some(key);
        tree_guard
            .nodes
            .insert(key, TreeNode::new_root(Arc::downgrade(&root)));

        root
    }
}

/// Data provided to `print_fn` callback as part of `Tree::print()`
pub struct PrintNode<'a> {
    /// Distance of the node from the root of the tree (the root is at 0)
    pub depth: usize,
    /// Key identifying the node
    pub key: NodeKey,
    /// Whether the node is the root of the tree
    pub is_root: bool,
    /// Display name of the node, if it has one
    pub name: Option<&'a str>,
}

/// Build printing function for [`Tree::print()`] which outputs a list format.
fn print_basic(match_node: Option<NodeKey>) -> impl Fn(PrintNode) {
    move |node| {
        let key = node.key;
        let pad = "  ".repeat(node.depth);
        let highlight = if Some(key) == match_node { " ***" } else { "" };
        let namestr = match node.name {
            None if node.is_root => "'ROOT'".to_string(),
            None => "<unnamed>".to_string(),
            Some(s) => format!("'{s}'"),
        };

        println!("{pad}- {{ id: {key:#}, name: {namestr} }}{highlight}");
    }
}

type TreeBackref<T> = Arc<Mutex<Tree<T>>>;

/// Mutex-protected state of a [Node].
struct NodeEntry<T: AccessedResource> {
    /// The tree which this node is currently a member of.
    ///
    /// This is re-pointed when the node is adopted into, or orphaned from, a
    /// tree.
    tree: TreeBackref<T>,
    /// Leaf resource for this node in the tree.
    ///
    /// The contents of the leaf resource may differ between nodes, as it is
    /// effectively a cache of the [AccessedResource::derive()] output, when not
    /// cleared as part of invalidation from the root.
    res_leaf: Option<T::Leaf>,
    // TODO: store enable/disable state here for evaluation and propagation
}
/// A member of an accessor hierarchy: a [NodeEntry] behind its own mutex.
struct Node<T: AccessedResource>(Mutex<NodeEntry<T>>);
impl<T: AccessedResource> Node<T> {
    /// Lock tree and entry (in that order, as required), and check if the tree
    /// we locked is the one this node is associated with.
    ///
    /// If the tree references match, the two guards are returned. If not, the
    /// tree to which we are now associated is returned instead.
    ///
    /// This is purely a helper function to make lifetimes clearer for
    /// [`Self::lock_tree()`]
    ///
    /// # Panics
    ///
    /// If either mutex is poisoned.
    #[allow(clippy::type_complexity)]
    fn try_lock_tree<'node, 'guard>(
        &'node self,
        tree_ref: &'guard TreeBackref<T>,
    ) -> Result<
        (MutexGuard<'guard, Tree<T>>, MutexGuard<'node, NodeEntry<T>>),
        TreeBackref<T>,
    > {
        let guard = tree_ref.lock().unwrap();
        let node_guard = self.0.lock().unwrap();
        if Arc::ptr_eq(tree_ref, &node_guard.tree) {
            Ok((guard, node_guard))
        } else {
            // The node belongs to some other tree: report which one.  Both
            // guards are released as this function returns.
            Err(node_guard.tree.clone())
        }
    }

    /// Safely acquire the lock to this entry, as well as the containing tree,
    /// respecting the ordering requirements.
    fn lock_tree<'node, F, R>(&'node self, f: F) -> R
    where
        F: for<'guard> FnOnce(
            MutexGuard<'guard, Tree<T>>,
            MutexGuard<'node, NodeEntry<T>>,
        ) -> R,
    {
        let mut tree = self.0.lock().unwrap().tree.clone();
        let (guard, self_guard) = loop {
            let new_tree = match self.try_lock_tree(&tree) {
                Ok(guards) => break guards,
                Err(nt) => nt,
            };
            let _ = std::mem::replace(&mut tree, new_tree);
        };
        f(guard, self_guard)
    }

    /// Create a node associated with `tree`, with no leaf resource cached.
    ///
    /// Recording the node in `tree` is left to the caller.
    fn new_root(tree: Arc<Mutex<Tree<T>>>) -> Arc<Node<T>> {
        let entry = NodeEntry { tree, res_leaf: None };
        Arc::new(Node(Mutex::new(entry)))
    }
    fn new_child(self: &Arc<Node<T>>, name: Option<String>) -> Arc<Node<T>> {
        self.lock_tree(|mut guard, _| guard.add_child(self.into(), name))
    }

    /// Acquire a reference to the accessed resource, if permitted.
    ///
    /// The returned [Guard] carries its own clone of this node's leaf, and
    /// the node is unlocked again before this returns.
    ///
    /// TODO: The guarded resource can be replaced while this guard is held.
    /// There is no synchronization between a potential disabling of access and
    /// outstanding guards that would be forbidden by that disablement.
    fn guard(&self) -> Option<Guard<'_, T>> {
        let entry = self.guard_borrow()?;
        let inner = entry
            .res_leaf
            .clone()
            .expect("guard_borrow() only returns Some if res_leaf is Some");
        Some(Guard { inner, _pd: PhantomData })
    }

    /// Lock this node's reference to the resource, if permitted.
    ///
    /// When the node has no leaf cached, an attempt is made to derive one
    /// from the root resource of the containing tree.  [None] is returned if
    /// that tree holds no root resource.
    ///
    /// Take care: this returns the mutex guard, keeping this node locked.
    /// Concurrent accesses to this node will block, and attempts to update the
    /// resource in this tree will be blocked.
    fn guard_borrow(&self) -> Option<MutexGuard<'_, NodeEntry<T>>> {
        let local = self.0.lock().unwrap();
        if local.res_leaf.is_some() {
            return Some(local);
        }

        // The tree must be locked before the node, so let go of the node
        // prior to going through `lock_tree()`.
        drop(local);

        // Attempt to (re)derive leaf resource from root
        self.lock_tree(|tree, mut local| {
            let root = tree.res_root.as_ref()?;
            // The derived leaf is moved straight into the cache; nothing
            // else uses it here, so there is no need to clone it.
            local.res_leaf = Some(T::derive(root));
            Some(local)
        })
    }

    /// Remove this node from its containing tree, ahead of the node itself
    /// being dropped.
    ///
    /// The cached leaf is released first, and then the tree forgets about the
    /// node (see `Tree::remove_dead_node()`).
    fn drop_from_tree(self: &mut Arc<Node<T>>) {
        let key = NodeKey::from(&*self);
        self.lock_tree(|mut guard, mut local| {
            // drop any lingering access to the resource immediately
            let _ = local.res_leaf.take();

            // Since we hold the Tree lock (thus eliminating the chance of any
            // racing adopt/orphan activity to be manipulating the refcount on
            // the `Arc<Node<T>>`), we expect that its strong count is
            // exactly 1.
            //
            // We, the holder (as part of Accessor::drop()) are the only one
            // with a strong reference, which should be released momentarily.
            debug_assert_eq!(Arc::strong_count(self), 1);

            guard.remove_dead_node(key);
        });
    }
}

/// Handle to an accessed resource, dereferencing to
/// [`AccessedResource::Target`].
///
/// A guard holds its own copy of the leaf it was created from, and does not
/// keep the accessor node locked.
pub struct Guard<'a, T: AccessedResource> {
    /// Copy of the leaf resource of the node which issued this guard
    inner: T::Leaf,
    /// Carries the `'a` lifetime; no actual `&'a T` is stored
    _pd: PhantomData<&'a T>,
}
impl<T: AccessedResource> std::ops::Deref for Guard<'_, T> {
    type Target = T::Target;
    fn deref(&self) -> &Self::Target {
        T::deref(&self.inner)
    }
}

/// View of an accessed resource which keeps the accessor node it came from
/// locked for as long as the view exists.
///
/// Obtained through [`Accessor::access_locked()`].
pub struct LockedView<'node, T: AccessedResource> {
    /// Lock on the node entry whose leaf is being viewed
    guard: MutexGuard<'node, NodeEntry<T>>,
}

impl<T: AccessedResource> LockedView<'_, T> {
    /// Borrow the underlying resource.
    pub fn view(&self) -> &T::Target {
        let leaf = self
            .guard
            .res_leaf
            .as_ref()
            .expect("LockedView is returned only when res_leaf is Some()");
        // `leaf` is already a reference; pass it on as-is
        T::deref(leaf)
    }
}

/// A node in a hierarchy which mediates access to a resource.
///
/// Dropping an accessor removes its node from the hierarchy.
pub struct Accessor<T: AccessedResource>(Arc<Node<T>>);
impl<T: AccessedResource> Accessor<T> {
    /// Create a new accessor hierarchy, mediating access to `resource`.
    ///
    /// The returned accessor is the root node of that hierarchy.
    pub fn new(resource: T::Root) -> Self {
        Self(Tree::new(Some(resource)))
    }

    /// Create a new orphaned accessor hierarchy, bearing no existing resource.
    /// The hierarchy can gain access to a valid resource by being
    /// [adopted][`Self::adopt()`].
    ///
    /// The returned accessor is the root node of that hierarchy.
    pub fn new_orphan() -> Self {
        Self(Tree::new(None))
    }

    /// Create a child of this node.
    ///
    /// The optional `name` is what the child is labelled with when the
    /// hierarchy is [printed][`Self::print()`].
    pub fn child(&self, name: Option<String>) -> Self {
        Self(self.0.new_child(name))
    }

    /// Adopt an orphan node and its descendants.
    ///
    /// The adopted node is placed beneath this node, with `name` replacing
    /// whatever display name it previously had.
    ///
    /// # Panics
    ///
    /// If the node to be adopted is not the root of an orphan tree, or if it
    /// is already part of the same hierarchy as this node (which includes
    /// being this very node).
    pub fn adopt(&self, child: &Self, name: Option<String>) {
        let parent_key = NodeKey::from(&self.0);
        let child_key = NodeKey::from(&child.0);

        assert_ne!(parent_key, child_key, "cannot adopt self");

        self.0.lock_tree(|mut parent_guard, node_guard| {
            drop(node_guard);

            // A node which already lives in our own tree can never be a valid
            // adoption target.  This must be caught before locking the tree
            // of the child: that would be the very mutex which is already
            // held here, and attempting to lock it again from this thread
            // would never return normally.
            if parent_guard.nodes.contains_key(&child_key) {
                let child_is_root = parent_guard.root_key == Some(child_key);
                // As below, drop the guard prior to the panic so that the
                // mutex is not left poisoned.
                drop(parent_guard);
                if child_is_root {
                    panic!("adopting the root of one's own tree not allowed");
                }
                panic!("adopting of non-roots not allowed");
            }

            child.0.lock_tree(|mut child_guard, node_guard| {
                drop(node_guard);
                if !child_guard.node_is_root(&child.0) {
                    // Drop all mutex guards prior to panic in order to allow
                    // unwinder to do its job, rather than getting tripped up by
                    // poisoned mutexes.  This allows the unit tests to exercise
                    // this panic condition.
                    drop(child_guard);
                    drop(parent_guard);
                    panic!("adopting of non-roots not allowed");
                }
                // Apply the chosen name to the root prior to its adoption
                child_guard.rename_node(child_key, name);
                parent_guard.adopt(parent_key, &mut child_guard);
            });
        });
    }

    /// Take the underlying resource away from the hierarchy, returning it.
    ///
    /// This gives the holder of the root node the means to promptly cut off
    /// access to the resource during events such as tear-down.
    ///
    /// # Panics
    ///
    /// If this is called on a non-root node.
    pub fn remove_resource(&self) -> Option<T::Root> {
        self.0.lock_tree(|mut tree, entry| {
            // Clearing the resource locks every node in turn (this one
            // included), so the lock held on this node must go first.
            drop(entry);

            if tree.node_is_root(&self.0) {
                return tree.set_root_resource(None);
            }

            // Release the tree lock before panicking
            drop(tree);
            panic!("removal of root resource only allowed at root node");
        })
    }

    /// Attempt to gain access to the underlying resource.
    ///
    /// Will return [None] if any ancestor node disables access, or if the node
    /// is not attached to a hierarchy containing a valid resource.
    ///
    /// The returned [Guard] does not keep this node locked.
    ///
    /// TODO: an outstanding `Guard` does not synchronize with changes to the
    /// underlying resource; the resource could be changed, the tree adopted,
    /// access disallowed, etc, but an existing `Guard` will still reference the
    /// resource as it was at the point the access was allowed.
    pub fn access(&self) -> Option<Guard<'_, T>> {
        self.0.guard()
    }

    /// Attempt to get a reference to the underlying resource.
    ///
    /// Will return [None] if any ancestor node disables access, or if the node
    /// is not attached to a hierarchy containing a valid resource.
    ///
    /// Unlike [`Accessor::access()`], this returns a wrapped MutexGuard for
    /// this accessor node; callers must carefully consider lock ordering when
    /// holding this guard across other operations.  As with any other mutex,
    /// prefer holding this guard for as small a window as permitted.
    ///
    /// This function exists solely to support very hot code accessing the same
    /// resource across many processors.  When the underlying resource is an
    /// `Arc`, `access()` implies an `Arc::clone`, which would contentiously
    /// modify the reference count to disastrous effect. `access_locked()` only
    /// involves this (hopefully-uncontended!) node, at the cost of a more
    /// error-prone API.  If `lock incq/lock decq` aren't in your profile, this
    /// probably isn't helpful!
    ///
    /// Some examples of the added consideration with this function: holding
    /// this guard will block other calls to this node's `access()` or
    /// `access_locked()`, and *may* block attempts to `access()` or
    /// `access_locked()` a child of this node. Holding this guard will block
    /// removal of the underlying resource, potentially blocking VM teardown.
    pub fn access_locked(&self) -> Option<LockedView<'_, T>> {
        match self.0.guard_borrow() {
            // Only hand out a view if there is a leaf for it to show
            Some(guard) if guard.res_leaf.is_some() => {
                Some(LockedView { guard })
            }
            _ => None,
        }
    }

    /// How many nodes exist in this Accessor hierarchy
    ///
    /// The count covers the whole hierarchy which this node is a member of,
    /// not just the node and its descendants.
    pub fn node_count(&self) -> usize {
        self.0.lock_tree(|guard, _| guard.node_count())
    }

    /// Print (to stdout) the hierarchy that this node is a member of.
    ///
    /// With `highlight_self` set, the line for this node is marked.
    pub fn print(&self, highlight_self: bool) {
        let highlight = if highlight_self {
            Some(NodeKey::from(&self.0))
        } else {
            None
        };
        let printer = print_basic(highlight);
        self.0.lock_tree(|tree, _| tree.print(printer));
    }
}
impl<T: AccessedResource> Drop for Accessor<T> {
    /// Perform necessary `Node` clean-up in the containing tree during drop of
    /// the Accessor.
    ///
    /// On first glance it would seem like this logic belongs in the [Drop] impl
    /// for `Node`, rather than the [Accessor].  Doing it that way mostly works,
    /// but poses a challenge: When the `Tree` is performing node adoption or
    /// orphaning, it must reach back out into the Nodes it owns via
    /// `TreeNode::node_ref`.  This poses a race for when the last
    /// `Arc<Node<T>>` is dropped, either by the `Accessor`, or the logic in the
    /// `Tree`.  When the latter wins, it still holds the tree lock, posing a
    /// deadlock situation which is otherwise impossible to prevent.
    ///
    /// As such it is expected that the Accessor will perform the tree
    /// de-registration of the node when it is being dropped.  No other
    /// structures should hold the `Arc<Node<T>>`.
    fn drop(&mut self) {
        // The `Arc<Node<T>>` itself is released only after this returns, so
        // the node is still live while the tree forgets about it.
        self.0.drop_from_tree();
    }
}

pub type MemAccessor = Accessor<crate::vmm::mem::MemAccessed>;

/// Resource description behind [`MsiAccessor`]: a shared [`VmmHdl`], with
/// every node holding a reference-counted handle of its own.
enum MsiAccessed {}
impl AccessedResource for MsiAccessed {
    type Root = Arc<VmmHdl>;
    type Leaf = Arc<VmmHdl>;
    type Target = VmmHdl;

    /// A leaf is simply another handle to the same `VmmHdl`
    fn derive(root: &Self::Root) -> Self::Leaf {
        Arc::clone(root)
    }
    fn deref(leaf: &Self::Leaf) -> &Self::Target {
        &**leaf
    }
}

// Keep the rest of VmmHdl hidden for the MSI accessor
pub struct MsiAccessor(Accessor<MsiAccessed>);
impl MsiAccessor {
    /// See: [`Accessor::new()`]
    pub fn new(hdl: Arc<VmmHdl>) -> Self {
        Self(Accessor::new(hdl))
    }
    /// See: [`Accessor::new_orphan()`]
    pub fn new_orphan() -> Self {
        Self(Accessor::new_orphan())
    }
    /// See: [`Accessor::child()`]
    pub fn child(&self, name: Option<String>) -> Self {
        Self(self.0.child(name))
    }
    /// See: [`Accessor::adopt()`]
    pub fn adopt(&self, child: &Self, name: Option<String>) {
        self.0.adopt(&child.0, name)
    }
    /// See: [Accessor::remove_resource()]
    pub fn remove_resource(&self) -> Option<Arc<VmmHdl>> {
        self.0.remove_resource()
    }

    /// Attempt to send an MSI with the resource held by this accessor
    /// hierarchy.  Returns [`Ok`] if valid access to the resource exists,
    /// otherwise [`Err`].
    pub fn send(&self, addr: u64, msg: u64) -> Result<(), ()> {
        if let Some(guard) = self.0.access() {
            guard.lapic_msi(addr, msg).expect("lapic_msi() should succeed");
            Ok(())
        } else {
            Err(())
        }
    }
}
impl std::fmt::Debug for MsiAccessor {
    /// Emit only the type name; the wrapped accessor is not printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("MsiAccessor")
    }
}

#[cfg(test)]
mod test {
    //! Note regarding unwinding for `should_panic` tests:
    //!
    //! If any mutexes are held when the code under test panics, the poisoned
    //! mutex will prevent the unwinder from functioning properly when the
    //! [MutexGuard]s are dropped.  You will see several checks in the above
    //! logic which eschew [assert_eq] for a manual check which drops any held
    //! mutexes before issuing a [panic].

    use super::*;

    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::sync::Arc;

    /// Test resource: a shared `AtomicUsize`, so that stores made through one
    /// accessor can be observed through another.
    enum AtomicRes {}
    impl AccessedResource for AtomicRes {
        type Root = Arc<AtomicUsize>;
        type Leaf = Arc<AtomicUsize>;
        type Target = AtomicUsize;

        fn derive(root: &Self::Root) -> Self::Leaf {
            root.clone()
        }
        fn deref(leaf: &Self::Leaf) -> &Self::Target {
            leaf
        }
    }

    // Helpers:

    /// Root accessor holding a fresh counter initialized to 0.
    fn new_root() -> Accessor<AtomicRes> {
        Accessor::new(Arc::new(AtomicUsize::new(0)))
    }
    /// Accessor which is not attached to any tree.
    fn new_orphan() -> Accessor<AtomicRes> {
        Accessor::new_orphan()
    }
    /// Build a root plus a linear chain of `depth` descendants: `children[0]`
    /// is a child of the root, and `children[n]` is a child of
    /// `children[n - 1]`.
    fn new_depth(
        depth: usize,
    ) -> (Accessor<AtomicRes>, Vec<Accessor<AtomicRes>>) {
        let root = new_root();
        let mut children: Vec<Accessor<AtomicRes>> = Vec::with_capacity(depth);

        for idx in 0..depth {
            let next_child = match idx {
                0 => root.child(None),
                n => children[n - 1].child(None),
            };
            children.push(next_child);
        }
        (root, children)
    }

    /// The root can access its resource until the resource is removed.
    #[test]
    fn tree_root() {
        let root = new_root();

        let guard =
            root.access().expect("root should have access to its resource");
        drop(guard);

        let res = root.remove_resource();
        assert!(res.is_some());

        assert!(root.access().is_none())
    }

    /// An orphan has no access until it is adopted by a node which does.
    #[test]
    fn simple_orphan() {
        let root = new_root();
        let orphan = new_orphan();

        assert!(root.access().is_some());
        assert!(orphan.access().is_none());

        root.adopt(&orphan, None);
        assert!(orphan.access().is_some());
    }

    /// Calling `remove_resource()` on a non-root node is expected to panic.
    #[test]
    #[should_panic]
    fn only_root_can_remove_resource() {
        let root = new_root();
        let child = root.child(None);

        assert!(root.access().is_some());
        assert!(child.access().is_some());

        child.remove_resource();
    }

    /// A node adopting itself is expected to panic.
    #[test]
    #[should_panic]
    fn adopt_self() {
        let root = new_root();
        root.adopt(&root, None);
    }

    /// Adopting a node which already has a parent is expected to panic.
    #[test]
    #[should_panic]
    fn adopt_nonroot() {
        let root = new_root();
        let child = new_orphan();
        let grandchild = child.child(None);

        root.adopt(&grandchild, None);
    }

    /// Every node in a chain observes the same resource, and loses access
    /// once the root removes it.
    #[test]
    fn simple_depth() {
        let depth = 4;
        let (root, children) = new_depth(depth);

        // update the inner resource, checking that it's the same at all depths
        let tval = 1;
        root.access().unwrap().store(tval, Ordering::Relaxed);
        for child in children.iter() {
            assert_eq!(child.access().unwrap().load(Ordering::Relaxed), tval);
        }

        root.remove_resource();
        for node in children.iter() {
            assert!(node.access().is_none());
        }
    }

    /// Dropping a node in the middle of a chain orphans its descendants; they
    /// regain access once the chain is re-adopted.
    #[test]
    fn orphan_split() {
        let (root, children) = new_depth(5);

        // Wrap the children in Option, so we can drop one from the middle to
        // orphan its descendants
        let mut children =
            children.into_iter().map(Some).collect::<Vec<Option<_>>>();

        // Drop the middle node, causing its children to become orphaned
        children[2] = None;

        // Children above the "split" should be fine
        for child in children[0..2].iter().map(|c| c.as_ref().unwrap()) {
            assert!(child.access().is_some());
        }

        // Those below should be orphaned, with no access to the resource
        for child in children[3..].iter().map(|c| c.as_ref().unwrap()) {
            assert!(child.access().is_none());
        }

        let tval = 1;
        root.access().unwrap().store(tval, Ordering::Relaxed);

        // Closest available child will adopt the orphan chain
        children[1]
            .as_ref()
            .unwrap()
            .adopt(children[3].as_ref().unwrap(), None);

        // The adopted nodes should have access (and see the updated val)
        for child in children[3..].iter().map(|c| c.as_ref().unwrap()) {
            let guard = child.access().expect("resource is accessible");
            assert_eq!(guard.load(Ordering::Relaxed), tval);
        }
    }

    /// Orphaning one branch under the root does not affect a sibling branch.
    #[test]
    fn orphan_sibling() {
        let (root, mut children) = new_depth(2);

        let sib = root.child(Some("sibling".to_string()));
        let sib_child = sib.child(Some("sibling child".to_string()));

        // Check that both siblings, and their progeny, can access the resource
        assert!(sib.access().is_some());
        assert!(children[0].access().is_some());
        assert!(sib_child.access().is_some());
        assert!(children[1].access().is_some());

        // ... and that after orphaning one of them, that the other sibling
        // still has access
        let _ = children.remove(0);
        assert!(children[0].access().is_none());
        assert!(sib.access().is_some());
        assert!(sib_child.access().is_some());
    }

    /// Exercise `print()` on a named, multi-level hierarchy.  Nothing is
    /// asserted here; the test only checks that printing completes.
    #[test]
    fn print_names() {
        let root = new_root();

        // build up an arbitrary hierarchy to print out
        let left = root.child(Some("left".to_string()));
        let right = root.child(Some("right".to_string()));
        let mut sub = Vec::new();
        for n in 0..4 {
            let lsub = left.child(Some(format!("sub {n}")));
            let rsub = right.child(Some(format!("sub {n}")));

            match n {
                3 => {
                    sub.push(lsub.child(Some("deep".to_string())));
                }
                2 => {
                    sub.push(rsub.child(Some("deep".to_string())));
                }
                _ => {}
            }
            sub.push(lsub);
            sub.push(rsub);
        }

        right.print(true);
    }
}


================================================
FILE: lib/propolis/src/api_version.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

/// Errors which can arise while checking a component's API version.
#[derive(Debug, thiserror::Error)]
pub enum Error {
    /// An I/O error; convertible from [`std::io::Error`] via `#[from]`.
    #[error("IO Error")]
    Io(#[from] std::io::Error),

    // Newer APIs are backwards compatible with older (for now?), so don't
    // bother trying to express "we need a version before .." or any of that.
    //
    // Also assume that the component being versioned is either part of the OS
    // or that the OS version is a decent proxy for it. Not necessarily true in
    // general, but true for viona and bhyve.
    /// The API version found (`have`) is below the one required (`want`).
    #[error("API version {have} is not at or above {want}. OS is too old?")]
    TooLow { have: u32, want: u32 },
}

/// An [`Error`] annotated with the component whose API version was being
/// checked when it occurred.
#[derive(Debug, thiserror::Error)]
#[error("checking version of {component}")]
pub struct VersionCheckError {
    /// Short name of the component; this is what the `Display` message shows.
    pub component: &'static str,
    /// Path associated with the component.  Not part of the `Display`
    /// message.
    pub path: &'static str,
    /// The underlying failure, exposed as this error's source.
    #[source]
    pub err: Error,
}

impl VersionCheckError {
    /// Wrap `err` as a failure to check the version of the "vmm" component,
    /// recording `bhyve_api::VMM_CTL_PATH` as its path.
    fn vmm(err: Error) -> Self {
        let path = bhyve_api::VMM_CTL_PATH;
        Self { err, path, component: "vmm" }
    }

    /// Wrap `err` as a failure to check the version of the "viona" component,
    /// recording `viona_api::VIONA_DEV_PATH` as its path.
    fn viona(err: Error) -> Self {
        let path = viona_api::VIONA_DEV_PATH;
        Self { err, path, component: "viona" }
    }
}

/// Check the API versions of the vmm and viona components, in that order.
///
/// # Errors
///
/// Returns the first failure encountered, tagged with the component that
/// produced it.
pub fn check() -> Result<(), VersionCheckError> {
    if let Err(e) = crate::vmm::check_api_version() {
        return Err(VersionCheckError::vmm(e));
    }
    if let Err(e) = crate::hw::virtio::viona::check_api_version() {
        return Err(VersionCheckError::viona(e));
    }

    Ok(())
}


================================================
FILE: lib/propolis/src/attestation/boot_digest/crucible.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use crucible::BlockIO;
use crucible::BlockIndex;
use crucible::Buffer;

use vm_attest::Measurement;

use anyhow::{anyhow, Result};
use sha2::{Digest, Sha256};
use slog::{error, info, o, Logger};
use std::time::{Duration, Instant};

/// Find the SHA256 sum of a crucible volume. This should be from a read-only
/// disk; otherwise, this isn't a reliable hash.
///
/// The volume is read sequentially from block 0 in chunks of up to 128 KiB
/// (always at least one block), and every chunk is fed to the hasher in order.
///
/// # Errors
///
/// Returns an error if the volume reports a block size of zero, or if any
/// single chunk cannot be read after the allowed number of attempts.
///
/// # Panics
///
/// Panics if the volume's UUID, total size, or block size cannot be queried.
pub async fn boot_disk_digest(
    vol: crucible::Volume,
    log: &Logger,
) -> Result<Measurement> {
    let vol_uuid = vol.get_uuid().await.expect("could not get volume UUID");
    let vol_size = vol.total_size().await.expect("could not get volume size");
    let block_size =
        vol.get_block_size().await.expect("could not get volume block size");

    // A zero block size would otherwise surface as a division-by-zero panic
    // in the arithmetic below.
    if block_size == 0 {
        return Err(anyhow!("volume {vol_uuid} reports a block size of zero"));
    }

    let end_block = vol_size / block_size;
    let hash_start = Instant::now();

    let log = log.new(o!("volume_id" => vol_uuid.to_string()));

    // XXX(jph): This was copied from the crucible scrub code, so that we can
    // read 128KiB of data on each read, regardless of block size.
    //
    // Always read at least one block per iteration: with a block size larger
    // than 128KiB the division yields zero, and the loop below would never
    // advance `offset`.
    let block_count = (131072 / block_size).max(1);

    info!(
        log,
        "starting hash of volume";
        "volume_size" => vol_size,
        "block_size" => block_size,
        "end_block" => end_block,
        "block_count" => block_count,
    );

    let mut hasher = Sha256::new();
    let mut offset = 0;
    while offset < end_block {
        // The final chunk may be shorter than `block_count` blocks.
        let remaining_blocks = end_block - offset;
        let this_block_count = block_count.min(remaining_blocks);
        if this_block_count != block_count {
            info!(
                log,
                "adjusting block_count to {} at offset {}",
                this_block_count,
                offset
            );
        }
        assert!(
            offset + this_block_count <= end_block,
            "offset={}, block_count={}, end={}",
            offset,
            this_block_count,
            end_block
        );

        let block = BlockIndex(offset);
        let mut buffer =
            Buffer::new(this_block_count as usize, block_size as usize);

        // Read the whole disk and hash it.
        //
        // If an individual read call fails, we'll retry some number of times,
        // but if that fails, just return an error to the attestation server.
        // If reads are failing on the boot disk, it's unlikely the instance is
        // doing well anyway, so there's not much to do here.
        let retry_count = 5;
        let mut n_retries = 0;
        loop {
            if n_retries >= retry_count {
                error!(
                    log,
                    "failed to read boot disk in {n_retries} tries \
                        aborting hash of boot digest"
                );

                return Err(anyhow!("could not hash boot disk digest"));
            }

            let res = vol.read(block, &mut buffer).await;

            if let Err(e) = res {
                error!(log,
                    "read failed: {e:?}";
                    "retry_count" => retry_count,
                    "io_offset" => offset,
                    "this_block_count" => this_block_count,
                    "block_size" => block_size,
                    "end_block" => end_block,
                );
                let delay = 1;
                error!(log, "will retry in {delay} secs");

                n_retries += 1;
                tokio::time::sleep(Duration::from_secs(delay)).await;
            } else {
                break;
            }
        }

        hasher.update(&*buffer);
        offset += this_block_count;
    }

    let elapsed = hash_start.elapsed();
    info!(log, "hash of volume took {:?} ms", elapsed.as_millis());

    Ok(Measurement::Sha256(hasher.finalize().into()))
}


================================================
FILE: lib/propolis/src/attestation/boot_digest/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use vm_attest::Measurement;

use anyhow::Result;
use slog::Logger;

#[cfg(feature = "crucible")]
mod crucible;

/// The storage backend from which a boot disk digest is computed.
#[derive(Debug)]
pub enum Backend {
    /// A Crucible volume.  Only present when the `crucible` feature is
    /// enabled.
    #[cfg(feature = "crucible")]
    Crucible(::crucible::Volume),
}

/// Compute the boot disk digest for `backend`, logging to `log`.
///
/// For a Crucible backend this waits a fixed ten seconds before starting to
/// hash the volume (see the comment in the body for why).
pub async fn compute(backend: Backend, log: &Logger) -> Result<Measurement> {
    slog::info!(log, "computing disk digest for {backend:?}");

    match backend {
        #[cfg(feature = "crucible")]
        Backend::Crucible(volume) => {
            // TODO: load-bearing sleep: we have a Crucible volume, but we can
            // be here and chomping at the bit to get a digest calculation
            // started well before the volume has been activated; in
            // `propolis-server` we need to wait for at least a subsequent
            // instance start. Similar to the scrub task for Crucible disks,
            // delay some number of seconds in the hopes that activation is done
            // promptly.
            //
            // This should be replaced by awaiting for some kind of actual
            // "activated" signal.
            //
            // see #1078
            let activation_delay = std::time::Duration::from_secs(10);
            tokio::time::sleep(activation_delay).await;

            crucible::boot_disk_digest(volume, log).await
        }
    }
}


================================================
FILE: lib/propolis/src/attestation/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! # RFD 605: VM Attestation
//!
//!
//! ## Instance Identity Data
//!
//! Our MVP includes the following identity data for an instance:
//!
//! * boot digest, aka SHA256 hash of the boot disk specified for the instance
//! (iff the instance has a boot disk, and that boot disk is read-only)
//! * instance UUID
//!
//! If there is no boot disk, or the boot disk is not read-only, only the
//! instance ID is used as identifying data.
//!
//! If there is a read-only boot disk, the attestation server will fail
//! challenge requests from the guest until the boot disk has been hashed.
//!
//!
//! ## High-Level Design
//!
//! The following assumes that the instance has a vsock device configured.
//! (If there is no vsock device, there will be no attestation server listening
//! there.)
//!
//!  - Guest software submits a 32-byte nonce to a known attestation port.
//!  - This port is backed by a vsock device in propolis.
//!  - When the instance is created (via `instance_ensure`), a tokio task
//!    begins to hash the boot disk of the instance (assuming that a boot disk
//!    is specified and that it is read-only.)
//!  - The attestation server waits on a tokio oneshot channel for the
//!    "VM conf", a structure containing data relevant to instance identity.
//!    This conf is sent to the attestation server once all of the VM identity
//!    data is done (so, in practice, when the boot disk is hashed).
//!  - Until the VM conf is ready, the attestation server fails challenges.
//!  - Once the VM conf is ready, these challenges are passed through to the
//!    sled-agent RoT APIs via the vm_attest crate, and those results are
//!    propagated back to the user.
//!

use std::net::{IpAddr, Ipv4Addr, SocketAddr};

pub mod boot_digest;
pub mod server;

// See: https://github.com/oxidecomputer/oana
/// Port number used for the attestation server.
pub const ATTESTATION_PORT: u16 = 605;
/// Socket address for the attestation server: the IPv4 loopback address
/// (`127.0.0.1`) on [`ATTESTATION_PORT`].
pub const ATTESTATION_ADDR: SocketAddr =
    SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), ATTESTATION_PORT);


================================================
FILE: lib/propolis/src/attestation/server.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::io;
use std::net::SocketAddrV6;
use std::sync::Arc;
use std::sync::Mutex;

use slog::{error, info, o, Logger};
use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader};
use tokio::net::{TcpListener, TcpStream};
use tokio::sync::{oneshot, Mutex as TokioMutex};
use tokio::task::JoinHandle;

use dice_verifier::sled_agent::AttestSledAgent;
use dice_verifier::Attest;

use vm_attest::VmInstanceConf;

use crate::attestation::{boot_digest, ATTESTATION_ADDR};

/// Configuration needed to stand up the attestation server.
#[derive(Clone, Copy)]
pub struct AttestationServerConfig {
    /// Address of the sled-agent.
    pub sled_agent_addr: SocketAddrV6,
}

impl AttestationServerConfig {
    /// Build a configuration from the given sled-agent address.
    pub fn new(sled_agent_addr: SocketAddrV6) -> Self {
        AttestationServerConfig { sled_agent_addr }
    }
}

/// Handle to a running attestation server.
pub struct AttestationSock {
    /// Logger tagged with the attestation server component; cloned for the
    /// initializer task.
    log: slog::Logger,
    /// Handle of the spawned task running the listener loop
    /// ([`AttestationSock::run`]).
    join_hdl: JoinHandle<()>,
    /// Used by `halt` to tell the listener loop to hang up.
    hup_send: oneshot::Sender<()>,
    /// Progress of supplying the VM configuration to the server.
    init_state: AttestationInitState,
}

/// Tracks how far along an `AttestationSock` is in being given the VM
/// configuration it needs.
#[derive(Debug)]
enum AttestationInitState {
    /// No initializer has been started yet; the sender for the VM
    /// configuration is still held here.
    Preparing {
        vm_conf_send: oneshot::Sender<VmInstanceConf>,
    },
    /// A transient state while we're getting the initializer ready, having
    /// taken `Preparing` and its `vm_conf_send`, but before we've got a
    /// `JoinHandle` to track as running.
    Initializing,
    /// The initializer task has been spawned; `init_task` is its handle.
    Running {
        init_task: JoinHandle<()>,
    },
}

/// This struct manages providing the requisite data for a corresponding
/// `AttestationSock` to become fully functional.
pub struct AttestationSockInit {
    /// Logger used while computing the boot digest and reporting failures.
    log: slog::Logger,
    /// Channel over which the finished VM configuration is sent.
    vm_conf_send: oneshot::Sender<VmInstanceConf>,
    /// UUID placed in the VM configuration.
    uuid: uuid::Uuid,
    /// Backend to hash for the boot digest; `None` means no digest is
    /// computed.
    boot_backend_ref: Option<boot_digest::Backend>,
}

impl AttestationSockInit {
    /// Do any remaining work of collecting VM RoT measurements in support
    /// of this VM's attestation server.
    ///
    /// If a boot digest backend was provided, its digest is computed and
    /// stored in the VM configuration; otherwise the configuration carries no
    /// boot digest.  The configuration is then sent over `vm_conf_send`.
    ///
    /// # Panics
    ///
    /// Panics if the boot disk digest cannot be computed.
    pub async fn run(self) {
        let AttestationSockInit { log, vm_conf_send, uuid, boot_backend_ref } =
            self;

        let mut vm_conf = vm_attest::VmInstanceConf { uuid, boot_digest: None };

        if let Some(digest_backend) = boot_backend_ref {
            let boot_digest = match crate::attestation::boot_digest::compute(
                digest_backend,
                &log,
            )
            .await
            {
                Ok(digest) => digest,
                Err(e) => {
                    // a panic here is unfortunate, but helps us debug for
                    // now; if the digest calculation fails it may be some
                    // retryable issue that a guest OS would survive. but
                    // panicking here means we've stopped Propolis at the
                    // actual error, rather than noticing the
                    // `vm_conf_sender` having dropped elsewhere.
                    panic!("failed to compute boot disk digest: {e:?}");
                }
            };

            vm_conf.boot_digest = Some(boot_digest);
        } else {
            slog::warn!(log, "not computing boot disk digest");
        }

        // A failed send means the receiving half has already been dropped.
        if vm_conf_send.send(vm_conf).is_err() {
            slog::error!(
                log,
                "attestation server is not listening for its config?"
            );
        }
    }
}

impl AttestationSock {
    /// Bind the attestation listener and spawn the task which serves it.
    ///
    /// The returned `AttestationSock` starts in the `Preparing` state, holding
    /// the sender through which the VM configuration will later be supplied.
    ///
    /// # Errors
    ///
    /// Returns an error if binding `ATTESTATION_ADDR` fails.
    pub async fn new(log: Logger, sa_addr: SocketAddrV6) -> io::Result<Self> {
        info!(
            log,
            "attestation server created (sled-agent addr {:?})", sa_addr
        );

        let listener = TcpListener::bind(ATTESTATION_ADDR).await?;
        let (vm_conf_send, vm_conf_recv) =
            oneshot::channel::<vm_attest::VmInstanceConf>();
        let (hup_send, hup_recv) = oneshot::channel::<()>();

        let attest_init_log = log.new(o!("component" => "attestation-server"));
        let attest_log_clone = attest_init_log.clone();
        let join_hdl = tokio::spawn(async move {
            Self::run(
                attest_log_clone,
                listener,
                vm_conf_recv,
                hup_recv,
                sa_addr,
            )
            .await;
        });
        let attestation_sock = Self {
            log: attest_init_log,
            join_hdl,
            hup_send,
            init_state: AttestationInitState::Preparing { vm_conf_send },
        };
        Ok(attestation_sock)
    }

    /// Stop the attestation server and abort in-flight initialization, if any
    /// is in progress.
    ///
    /// We don't worry about stopping any related `handle_conn` because they
    /// will discover that one or both ends of the connection are gone soon; we
    /// are closing our end, and the guest's side will close when the
    /// corresponding virtio-socket device is stopped.
    pub async fn halt(self) {
        let Self { join_hdl, hup_send, init_state, log: _ } = self;

        // Ask the listener task to hang up and wait for it to finish.  Errors
        // from either step are ignored.
        let _ = hup_send.send(());
        let _ = join_hdl.await;

        // Only a spawned initializer needs to be cancelled.
        match init_state {
            AttestationInitState::Running { init_task } => init_task.abort(),
            AttestationInitState::Preparing { .. }
            | AttestationInitState::Initializing => {}
        }
    }

    /// Handle an incoming connection to the attestation port.
    ///
    /// Any error from servicing the connection is logged rather than
    /// returned.
    async fn handle_conn(
        log: Logger,
        rot: Arc<TokioMutex<vm_attest::VmInstanceRot>>,
        vm_conf: Arc<Mutex<Option<vm_attest::VmInstanceConf>>>,
        conn: TcpStream,
    ) {
        let outcome = Self::handle_conn_inner(&log, rot, vm_conf, conn).await;
        match outcome {
            Ok(()) => {}
            Err(e) => {
                slog::error!(
                    log,
                    "error handling attestation server connection: {e}"
                );
            }
        }
    }

    /// The actual work of handling an incoming connection. This should only be
    /// called from `handle_conn`, and is distinct only for `?`/`Result`
    /// ergonomics.
    async fn handle_conn_inner(
        log: &Logger,
        rot: Arc<TokioMutex<vm_attest::VmInstanceRot>>,
        vm_conf: Arc<Mutex<Option<vm_attest::VmInstanceConf>>>,
        conn: TcpStream,
    ) -> anyhow::Result<()> {
        info!(log, "handling attestation request");

        let mut msg = String::new();

        const MAX_LINE_LENGTH: usize = 1024;
        let (reader, mut writer) = tokio::io::split(conn);
        let mut reader = BufReader::with_capacity(MAX_LINE_LENGTH, reader);

        loop {
            let bytes_read = reader.read_line(&mut msg).await?;
            if bytes_read == 0 {
                break;
            }

            // Check if the limit was hit and a newline wasn't found
            if bytes_read == MAX_LINE_LENGTH && !msg.ends_with('\n') {
                slog::warn!(
                    log,
                    "Line length exceeded the limit of {} bytes.",
                    MAX_LINE_LENGTH
                );
                let response =
                    vm_attest::Response::Error("Request too long".to_string());
                let mut response = serde_json::to_string(&response)?;
                response.push('\n');
                slog::info!(log, "sending error response: {response}");
                writer.write_all(response.as_bytes()).await?;
                break;
            }

            slog::debug!(log, "JSON received: {msg}");

            let result: Result<vm_attest::Request, serde_json::Error> =
                serde_json::from_str(&msg);
            let request = match result {
                Ok(q) => q,
                Err(e) => {
                    let response = vm_attest::Response::Error(e.to_string());
                    let mut response = serde_json::to_string(&response)?;
                    response.push('\n');
                    slog::info!(log, "sending error response: {response}");
                    writer.write_all(response.as_bytes()).await?;
                    break;
                }
            };

            let response = match request {
                vm_attest::Request::Attest(q) => {
                    slog::debug!(log, "qualifying data received: {q:?}");

                    let conf = {
                        let guard = vm_conf.lock().unwrap();
                        guard.to_owned()
                    };

                    match conf {
                        Some(conf) => {
                            info!(log, "vm conf is ready = {:?}", conf);

                            let rot_guard = rot.lock().await;

                            match rot_guard.attest(&conf, &q).await {
                                Ok(a) => vm_attest::Response::Attest(a),
                                Err(e) => {
                                    vm_attest::Response::Error(e.to_string())
                                }
                            }
                        }

                        // The VM conf isn't ready yet.
                        None => {
                            info!(log, "vm conf is NOT ready");
                            let response = vm_attest::Response::Error(
                                "VmInstanceConf not ready".to_string(),
                            );
                            response
                        }
                    }
                }
            };

            let mut response = serde_json::to_string(&response)?;
            response.push('\n');

            slog::debug!(log, "sending response: {response}");
            writer.write_all(response.as_bytes()).await?;
            msg.clear();
        }

        info!(log, "attestation request completed");
        Ok(())
    }

    /// Spawn the task which gathers the VM configuration (instance `uuid` and,
    /// if `boot_backend_ref` is given, the boot disk digest) and hands it to
    /// the attestation server.
    ///
    /// On return the state is `Running`, tracking the spawned task.
    ///
    /// # Panics
    ///
    /// Panics if the state is anything other than `Preparing`, i.e. if this
    /// has already been called.
    pub fn prepare_instance_conf(
        &mut self,
        uuid: uuid::Uuid,
        boot_backend_ref: Option<boot_digest::Backend>,
    ) {
        // Park the state in `Initializing` while we own the sender.
        let vm_conf_send = match std::mem::replace(
            &mut self.init_state,
            AttestationInitState::Initializing,
        ) {
            AttestationInitState::Preparing { vm_conf_send } => vm_conf_send,
            other => panic!(
                "VM RoT used incorrectly: prepare_instance_conf called \
                        more than once. current state {other:?}"
            ),
        };

        let log = self.log.clone();
        let init =
            AttestationSockInit { log, uuid, boot_backend_ref, vm_conf_send };
        self.init_state = AttestationInitState::Running {
            init_task: tokio::spawn(init.run()),
        };
    }

    /// Listener loop of the attestation server.
    ///
    /// Spawns a helper task which waits for the VM configuration on
    /// `vm_conf_recv` and publishes it for connection handlers, then accepts
    /// connections on `listener`, spawning a handler task for each one.
    /// Returns once `hup_recv` resolves; that branch is polled first on every
    /// iteration.
    pub async fn run(
        log: Logger,
        listener: TcpListener,
        vm_conf_recv: oneshot::Receiver<vm_attest::VmInstanceConf>,
        mut hup_recv: oneshot::Receiver<()>,
        sa_addr: SocketAddrV6,
    ) {
        info!(log, "attestation server running");

        // Attestation requests get to the RoT via sled-agent API endpoints.
        let ox_attest: Box<dyn Attest + Send + Sync> =
            Box::new(AttestSledAgent::new(sa_addr, &log));
        let rot =
            Arc::new(TokioMutex::new(vm_attest::VmInstanceRot::new(ox_attest)));

        // Shared slot for the VM configuration; `None` until it arrives.
        let vm_conf = Arc::new(Mutex::new(None));

        let log_ref = log.clone();
        let vm_conf_cloned = vm_conf.clone();
        tokio::spawn(async move {
            match vm_conf_recv.await {
                Ok(conf) => {
                    *vm_conf_cloned.lock().unwrap() = Some(conf);
                }
                Err(_e) => {
                    slog::warn!(
                        log_ref,
                        "lost boot digest sender, \
                        hopefully Propolis is stopping"
                    );
                }
            }
        });

        loop {
            tokio::select! {
                biased;

                _ = &mut hup_recv => {
                    return;
                },

                sock_res = listener.accept() => {
                    match sock_res {
                        Ok((sock, _addr)) => {
                            // Only report a client once accept() has actually
                            // produced a connection.
                            info!(log, "new attestation client connected");

                            let rot = rot.clone();
                            let log = log.clone();
                            let vm_conf = vm_conf.clone();

                            let handler = Self::handle_conn(log, rot, vm_conf,
                                sock);
                            tokio::spawn(handler);

                        }
                        Err(e) => {
                            error!(log, "attestation TCP listener error: {:?}", e);
                        }
                    }
                },
            };
        }
    }
}


================================================
FILE: lib/propolis/src/block/attachment.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Block "attachments" provide the plumbing between emulated devices and the
//! backends which execute the IO requests from said devices.
//!
//! Each emulated block device will contain a [DeviceAttachment] to which it
//! will associate one or more [DeviceQueue] instances.  The queue(s) is the
//! source of [super::Request]s, which are to be processed by an attached
//! backend.
//!
//! Block backends will each contain a [BackendAttachment] which they will
//! request worker contexts from ([SyncWorkerCtx] or [AsyncWorkerCtx]).  It is
//! through the worker context that the backend will fetch [super::Request]s
//! from the associated device in order to process them.

use std::collections::BTreeMap;
use std::future::Future;
use std::marker::PhantomPinned;
use std::num::NonZeroUsize;
use std::pin::Pin;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Condvar, Mutex, MutexGuard, Weak};
use std::task::{Context, Poll};

use super::minder::{NoneInFlight, QueueMinder};
use super::{
    devq_id, probes, BackendId, DeviceId, DeviceInfo, DeviceQueue,
    DeviceRequest, MetricConsumer, QueueId, WorkerId,
};
use crate::accessors::MemAccessor;
use crate::block;

use futures::stream::FuturesUnordered;
use futures::Stream;
use pin_project_lite::pin_project;
use strum::IntoStaticStr;
use thiserror::Error;
use tokio::sync::futures::Notified;
use tokio::sync::Notify;

/// Worker limit of 64.  `QueueSlot::request_notify` uses it both as the
/// default wake count when no hint is given and as the cap on a hinted count.
pub const MAX_WORKERS: NonZeroUsize = NonZeroUsize::new(64).unwrap();

/// Optional, non-zero hint about how many workers should be woken.  `None`
/// means no hint was given.
pub type ReqCountHint = Option<NonZeroUsize>;

/// Mutex-protected state of a [`QueueSlot`].
#[derive(Default)]
struct QueueSlotState {
    /// The minder for this slot's queue, if one is associated; `None` by
    /// default.
    minder: Option<Arc<QueueMinder>>,
}

/// Per-queue bookkeeping, identified by `queue_id`.
struct QueueSlot {
    /// Queue association state (see [`QueueSlotState`]).
    state: Mutex<QueueSlotState>,
    /// The worker collection to notify, if any.
    workers: Mutex<Option<Arc<WorkerCollection>>>,
    /// Number of worker wake-ups requested via `request_notify` and not yet
    /// consumed by `flush_notifications`.
    notify_count: AtomicUsize,
    /// Identifier of the queue this slot belongs to.
    queue_id: QueueId,
}
impl QueueSlot {
    /// Create an empty slot for `queue_id`: no minder, no attached workers, and
    /// no pending notifications.
    fn new(queue_id: QueueId) -> Self {
        Self {
            state: Mutex::new(Default::default()),
            workers: Mutex::new(None),
            notify_count: AtomicUsize::new(0),
            queue_id,
        }
    }
    /// Record that workers should be woken for this queue.
    ///
    /// With a `hint`, the pending count grows by the hinted amount (capped at
    /// [MAX_WORKERS]); without one it grows by [MAX_WORKERS].  Nothing is
    /// added if the pending count was already observed to be at or above
    /// [MAX_WORKERS].  No worker is woken here; that happens in
    /// [Self::flush_notifications].
    fn request_notify(&self, hint: ReqCountHint) {
        let existing = self.notify_count.load(Ordering::Acquire);
        if existing < MAX_WORKERS.get() {
            // wake everyone if we weren't given a hint
            let count = MAX_WORKERS.min(hint.unwrap_or(MAX_WORKERS));
            let _ = self.notify_count.fetch_add(count.get(), Ordering::Release);
        }
    }
    /// Wake workers for the notifications accumulated via
    /// [Self::request_notify].
    ///
    /// If no workers are attached, or no queue is associated with this slot,
    /// this returns early and leaves the pending count untouched.  Worker IDs
    /// taken from the minder but not consumed by the wake are handed back to
    /// the minder (if the queue still has one).
    fn flush_notifications(&self) {
        // The `workers` lock is held for the duration of this function.
        let guard = self.workers.lock().unwrap();
        let Some(workers) = guard.as_ref() else {
            return;
        };

        let state = self.state.lock().unwrap();
        let Some(minder) = state.minder.as_ref() else {
            // The queue isn't associated with anything yet, so there are no
            // interested workers to wake.
            return;
        };

        // Claim all pending notifications, resetting the count to zero.
        let pending = self.notify_count.swap(0, Ordering::AcqRel);

        let Some(pending) = NonZeroUsize::new(pending) else {
            // We have not been asked to wake any workers since the last
            // `flush_notifications`. This is relatively unlikely but
            // legitimate, such as if this `QueueSlot` paused and resumed (as
            // for migrations) repeatedly.
            return;
        };

        // Take the full set of workers that may be idle and interested in this
        // queue. At this point we are responsible for either waking workers
        // here, or returning the idle-and-interested bit to `minder`.
        let Some(wake_wids) = minder.take_notifications() else {
            // `notify_count` was non-zero, but between checking the notify
            // count and getting idle workers, we started pausing devices.
            // Bummer. Request notification of as many workers as we were
            // going to, and let a future `flush_notifications()` take care of
            // it.
            self.request_notify(Some(pending));
            return;
        };
        // The slot state lock is not held across the wake call below.
        drop(state);

        let remaining_wids =
            workers.wake(wake_wids, pending, Some(self.queue_id));

        if !remaining_wids.is_empty() {
            // Re-acquire the slot state, since the minder may have changed
            // while the lock was released.
            let state = self.state.lock().unwrap();
            let Some(minder) = state.minder.as_ref() else {
                // The queue no longer has a minder. This is unfortunate, but it
                // is at least OK to discard `remaining_wids` here: if this
                // queue is reassociated later, updating the queue collection's
                // associations will wake all queues.
                return;
            };

            minder.add_notifications(remaining_wids);
        }
    }
}

/// Device-wide queue state, protected by the `state` mutex of a
/// [QueueCollection].
#[derive(Default, Clone)]
struct QueueColState {
    /// Versioned set of queue IDs which currently have a queue associated.
    associated_qids: Versioned<Bitmap>,
    /// Whether the device is currently paused.
    paused: bool,
    /// Metric consumer registered for the device (if any).  It is passed on to
    /// the minder of each queue as it is associated.
    metric_consumer: Option<Arc<dyn MetricConsumer>>,
}
impl QueueColState {
    /// Mark `qid` as associated, returning the resulting (versioned) set of
    /// associated queue IDs.
    fn queue_associate(&mut self, qid: QueueId) -> Versioned<Bitmap> {
        let Self { associated_qids, .. } = self;
        associated_qids.update().set(qid.into());
        *associated_qids
    }
    /// Mark `qid` as no longer associated, returning the resulting (versioned)
    /// set of associated queue IDs.
    fn queue_dissociate(&mut self, qid: QueueId) -> Versioned<Bitmap> {
        let Self { associated_qids, .. } = self;
        associated_qids.update().unset(qid.into());
        *associated_qids
    }
}
/// The fixed set of queue slots belonging to one device, together with the
/// device-wide queue state.
struct QueueCollection {
    /// One slot per possible queue, indexed by queue ID.
    queues: Vec<QueueSlot>,
    /// Device-wide state: queue associations, pause flag, metric consumer.
    state: Mutex<QueueColState>,
    /// ID of the device to which these queues belong.
    pub devid: DeviceId,
}
impl QueueCollection {
    /// Create a collection of `max_queues` empty slots for device `devid`.
    ///
    /// # Panics
    ///
    /// If `max_queues` exceeds [MAX_WORKERS].
    fn new(max_queues: NonZeroUsize, devid: DeviceId) -> Arc<Self> {
        let count = max_queues.get();
        assert!(count <= MAX_WORKERS.get());
        let queues =
            (0..count).map(|n| QueueSlot::new(QueueId::from(n))).collect();

        Arc::new(Self { queues, devid, state: Default::default() })
    }
    /// Give every slot a reference to the `workers` of a newly attached
    /// backend.
    ///
    /// # Panics
    ///
    /// If any slot already holds a workers reference.
    fn attach(&self, workers: &Arc<WorkerCollection>) {
        for slot in self.queues.iter() {
            let old = slot.workers.lock().unwrap().replace(workers.clone());
            assert!(old.is_none(), "workers ref should not have been attached");
        }
    }
    /// Drop the workers reference held by every slot.
    ///
    /// # Panics
    ///
    /// If any slot does not hold a workers reference.
    fn detach(&self) {
        for slot in self.queues.iter() {
            let old = slot.workers.lock().unwrap().take();
            assert!(old.is_some(), "workers ref should have been attached");
        }
    }
    /// Look up the slot for `queue_id`.
    ///
    /// # Panics
    ///
    /// If `queue_id` is outside the range of slots in this collection.
    fn slot(&self, queue_id: QueueId) -> &QueueSlot {
        self.queues.get(usize::from(queue_id)).expect("queue id within range")
    }
    /// Request that workers be woken for `queue_id` (sized by `hint`), and
    /// then attempt to deliver those wake-ups immediately.
    fn notify(&self, queue_id: QueueId, hint: ReqCountHint) {
        let slot = self.slot(queue_id);
        slot.request_notify(hint);
        slot.flush_notifications();
    }
    /// Register `consumer` with the minder of every currently associated
    /// queue, and retain it so it can be applied to queues associated later.
    fn set_metric_consumer(&self, consumer: Arc<dyn MetricConsumer>) {
        let mut state = self.state.lock().unwrap();
        for queue in self.queues.iter() {
            if let Some(minder) = queue.state.lock().unwrap().minder.as_ref() {
                minder.set_metric_consumer(consumer.clone());
            }
        }
        state.metric_consumer = Some(consumer);
    }
    /// Get a copy of the (versioned) set of associated queue IDs.
    fn associated_qids(&self) -> Versioned<Bitmap> {
        self.state.lock().unwrap().associated_qids
    }
    /// Mark the collection as paused and pause the minder of every associated
    /// queue.
    ///
    /// # Panics
    ///
    /// If the collection is already paused.
    fn pause(&self) {
        let mut state = self.state.lock().unwrap();
        assert!(!state.paused);

        state.paused = true;
        for slot in self.queues.iter() {
            if let Some(minder) = slot.state.lock().unwrap().minder.as_ref() {
                minder.pause();
            }
        }
    }
    /// Clear the paused mark, resume the minder of every associated queue, and
    /// flush any notifications which accumulated for it.
    ///
    /// # Panics
    ///
    /// If the collection is not paused.
    fn resume(&self) {
        let mut state = self.state.lock().unwrap();
        assert!(state.paused);

        state.paused = false;
        for slot in self.queues.iter() {
            let state = slot.state.lock().unwrap();
            let Some(minder) = state.minder.as_ref() else {
                continue;
            };
            minder.resume();
            // `flush_notifications` takes the slot state lock itself, so it
            // must be released first.
            drop(state);

            slot.flush_notifications();
        }
    }
    /// Build a [NoneProcessing] future over the minders of all queues which
    /// are associated at the time of this call.
    fn none_processing(&self) -> NoneProcessing {
        let minders = self
            .queues
            .iter()
            .filter_map(|slot| {
                let state = slot.state.lock().unwrap();
                state.minder.clone()
            })
            .collect::<Vec<_>>();
        NoneProcessing {
            minders: MinderRefs { values: minders, _pinned: PhantomPinned },
            unordered: FuturesUnordered::new(),
            loaded: false,
        }
    }

    /// Attempt to fetch a request from the queue in `slot` on behalf of worker
    /// `wid`, firing the `block_poll` probe with the outcome.
    ///
    /// Returns `None`, without firing the probe, if no queue is associated
    /// with the slot.
    fn poll_slot(
        &self,
        slot: &QueueSlot,
        wid: WorkerId,
    ) -> Option<DeviceRequest> {
        let guard = slot.state.lock().unwrap();
        let minder = guard.minder.as_ref()?;
        let result = minder.next_req(wid);

        probes::block_poll!(|| {
            (
                devq_id(self.devid, slot.queue_id),
                wid as u64,
                result.is_some() as u8,
            )
        });
        result
    }

    /// Attempt to fetch a request for worker `wid` from the single queue
    /// identified by `queue_select`.
    ///
    /// Returns `None` if `queue_select` is out of range, if no queue is
    /// associated with that slot, or if the queue yielded no request.
    fn next_req(
        &self,
        queue_select: QueueId,
        wid: WorkerId,
    ) -> Option<DeviceRequest> {
        let slot = self.queues.get(usize::from(queue_select))?;
        self.poll_slot(slot, wid)
    }

    /// Attempt to fetch a request for worker `wid` from any queue.
    ///
    /// Slots are visited in order starting from the one indicated by `cursor`,
    /// wrapping around to cover all of them.  On success, `cursor` is advanced
    /// to the slot following the one which produced the request.  If no queue
    /// yields a request, `cursor` is left unchanged.
    ///
    /// # Panics
    ///
    /// If `cursor` refers to a slot outside this collection.
    fn next_req_any(
        &self,
        cursor: &mut PollCursor,
        wid: WorkerId,
    ) -> Option<DeviceRequest> {
        let idx = usize::from(cursor.0 .0);
        assert!(idx < self.queues.len());
        let (front, back) = self.queues.split_at(idx);

        let (hit_qid, dreq) =
            back.iter().chain(front.iter()).find_map(|slot| {
                let req = self.poll_slot(slot, wid)?;
                Some((slot.queue_id, req))
            })?;

        // Which slot should the caller start with next time?
        cursor.0 = hit_qid.next(self.queues.len());

        Some(dreq)
    }
}

/// Owned references to the minders being awaited by a [NoneProcessing] future.
///
/// The [PhantomPinned] marker keeps this type (and therefore the future which
/// contains it) from being `Unpin`.
struct MinderRefs {
    /// The minders themselves.
    values: Vec<Arc<QueueMinder>>,
    /// Marker opting this type out of `Unpin`.
    _pinned: PhantomPinned,
}
pin_project! {
    /// Future which resolves once the [NoneInFlight] future of every minder
    /// captured at its creation has completed.
    pub struct NoneProcessing {
        // Minders captured when this future was created.  The futures held in
        // `unordered` borrow from these.
        #[pin]
        minders: MinderRefs,
        // Per-minder futures, populated on first poll.
        #[pin]
        unordered: FuturesUnordered<NoneInFlight<'static>>,
        // Whether `unordered` has been populated yet.
        loaded: bool,
    }
    impl PinnedDrop for NoneProcessing {
        fn drop(this: Pin<&mut Self>) {
            let mut this = this.project();

            // Ensure that all references into `minders` held by NoneInFlight
            // futures are dropped before the `minders` contents themselves.
            this.unordered.clear();
        }
    }
}
impl Future for NoneProcessing {
    type Output = ();

    /// On first poll, create one [NoneInFlight] future per captured minder.
    /// Then drive those futures, resolving once all of them have completed
    /// (immediately, if no minders were captured).
    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
        let mut this = self.project();
        if !*this.loaded {
            for minder in this.minders.values.iter().map(Arc::as_ref) {
                // SAFETY:
                //
                // With the Vec<Arc<QueueMinder>> pinned (and barred via marker
                // from Unpin), it should not be possible to remove them for the
                // lifetime of this future.  With that promised to us, we can
                // extend the lifetime of the QueueMinder references long enough
                // to run the NoneInFlight futures.
                //
                // The contents of `minders` will remain pinned and untouched
                // until NoneProcessing is dropped.  At that point, any
                // lingering references held by the FuturesUnordered will be
                // explicitly released in PinnedDrop::drop(), ensuring they do
                // not outlive MinderRefs.
                let extended: &'static QueueMinder =
                    unsafe { std::mem::transmute(minder) };

                this.unordered.push(extended.none_in_flight());
            }
            *this.loaded = true;
        }
        // Drain completed futures until the set is either empty (everything
        // has finished) or has nothing more ready right now.
        loop {
            match Stream::poll_next(this.unordered.as_mut(), cx) {
                Poll::Ready(None) => {
                    return Poll::Ready(());
                }
                Poll::Ready(Some(_)) => {
                    continue;
                }
                Poll::Pending => {
                    return Poll::Pending;
                }
            }
        }
    }
}

/// A pair of weak references to inner state of a device and backend which are
/// attached to one another.
///
/// Attachment and detachment require taking locks in both the device and the
/// backend.  Those locks are always taken in that order (device first, then
/// backend) to avoid deadlock.
#[derive(Clone)]
struct AttachPair {
    /// The device side of the attachment.
    dev_attach: Weak<DeviceAttachInner>,
    /// The backend side of the attachment.
    backend_attach: Weak<BackendAttachInner>,
}
impl AttachPair {
    /// Attach device `dev` and backend `be` to one another.
    ///
    /// On success, the backend workers and device queues are given references
    /// to each other, both sides record the new [AttachPair], and the device's
    /// `on_attach` callback (if registered) is called with the backend's info.
    ///
    /// # Errors
    ///
    /// [AttachError::DeviceAttached] if the device already has an attachment,
    /// or [AttachError::BackendAttached] if the backend does.
    fn attach(
        dev: &DeviceAttachment,
        be: &BackendAttachment,
    ) -> Result<(), AttachError> {
        // Lock order: device first, then backend.
        let mut dev_att_state = dev.0.att_state.lock().unwrap();
        let mut be_att_state = be.0.att_state.lock().unwrap();

        if dev_att_state.is_some() {
            return Err(AttachError::DeviceAttached);
        }
        if be_att_state.is_some() {
            return Err(AttachError::BackendAttached);
        }

        probes::block_attach!(|| (dev.device_id().0, be.backend_id().0));
        // TODO: name the accessor child?
        let be_acc_mem = dev.0.acc_mem.child(None);
        be.0.workers.attach(&be_acc_mem, &dev.0.queues);
        dev.0.queues.attach(&be.0.workers);

        let shared = AttachPair {
            dev_attach: Arc::downgrade(&dev.0),
            backend_attach: Arc::downgrade(&be.0),
        };
        *dev_att_state = Some(shared.clone());
        *be_att_state = Some((shared, be_acc_mem));

        // Both attachment locks are released before the callback is invoked.
        drop(dev_att_state);
        drop(be_att_state);

        let dev_state = dev.0.dev_state.lock().unwrap();
        if let Some(on_attach) = dev_state.on_attach.as_ref() {
            on_attach(be.info())
        }

        Ok(())
    }

    /// Detach the device and backend referenced by this pair from one another.
    ///
    /// This is a no-op if either side can no longer be upgraded, or if the
    /// attachment currently recorded by the device and backend is not this
    /// pair.
    fn detach(self) {
        let (Some(dev), Some(be)) =
            (self.dev_attach.upgrade(), self.backend_attach.upgrade())
        else {
            // If the drop handler has run for the device or backend, resulting
            // in its Weak pointer being unable to upgrade, then a detach is
            // already in progress, and we can let that run to completion.
            return;
        };

        // Lock order: device first, then backend.
        let mut dev_state = dev.att_state.lock().unwrap();
        let mut be_state = be.att_state.lock().unwrap();
        match (dev_state.as_ref(), be_state.as_ref()) {
            (Some(ds), Some((bs, _))) if self.eq(ds) && self.eq(bs) => {
                // Device and backend agree about mutual attachment
            }
            _ => {
                // It is possible for this to race with some other thread which
                // is performing detach and attach operations.  If the frontend
                // and/or backend does not match up with what we have in this
                // AttachPair, that is indicative of such a race.  Bailing out
                // here is safe, since said racing operation(s) would have
                // resulted in subsequent AttachPair-ings which maintain proper
                // references to the involved attachments.
                return;
            }
        }
        probes::block_detach!(|| (dev.device_id.0, be.backend_id.0));
        *dev_state = None;
        *be_state = None;

        // TODO: ensure workers have no in-flight requests
        be.workers.detach();
        dev.queues.detach();
    }
}
impl PartialEq for AttachPair {
    /// Two pairs are equal when they point at the very same device and the
    /// very same backend (pointer identity, not value comparison).
    fn eq(&self, other: &Self) -> bool {
        let same_dev = Weak::ptr_eq(&self.dev_attach, &other.dev_attach);
        let same_backend =
            Weak::ptr_eq(&self.backend_attach, &other.backend_attach);
        same_dev && same_backend
    }
}

/// Callback which a device may register to be called, with the [DeviceInfo] of
/// the backend, when the device becomes attached to that backend.
pub type OnAttachFn = Box<dyn Fn(DeviceInfo) + Send + Sync + 'static>;

/// Device-side state which is independent of any particular attachment.
#[derive(Default)]
struct DeviceState {
    /// Callback to run when the device becomes attached to a backend, if one
    /// has been registered.
    on_attach: Option<OnAttachFn>,
}

/// Shared inner state behind a [DeviceAttachment].
struct DeviceAttachInner {
    /// The current attachment, or `None` while the device is detached.
    att_state: Mutex<Option<AttachPair>>,
    /// State not tied to a specific attachment (such as the attach callback).
    dev_state: Mutex<DeviceState>,
    /// Queue slots for this device.
    queues: Arc<QueueCollection>,
    /// Memory accessor from which a child is created for an attaching backend.
    acc_mem: MemAccessor,
    /// ID assigned to this device when the attachment was created.
    device_id: block::DeviceId,
}

/// Main "attachment point" for a block device.
pub struct DeviceAttachment(Arc<DeviceAttachInner>);
impl DeviceAttachment {
    /// Create a [DeviceAttachment] for a given device.  The maximum number of
    /// queues which the device will ever expose is set via `max_queues`.  DMA
    /// done by attached backend workers will be through the provided `acc_mem`.
    pub fn new(max_queues: NonZeroUsize, acc_mem: MemAccessor) -> Self {
        let device_id = DeviceId::new();
        let queues = QueueCollection::new(max_queues, device_id);
        Self(Arc::new(DeviceAttachInner {
            att_state: Mutex::new(None),
            dev_state: Mutex::new(DeviceState::default()),
            queues,
            acc_mem,
            device_id,
        }))
    }

    /// If a backend is attached to this device, notify it that the queue
    /// associations for this device have changed.
    fn queues_update_assoc(&self, queues_associated: Versioned<Bitmap>) {
        let guard = self.0.att_state.lock().unwrap();
        if let Some(att_state) = guard.as_ref() {
            if let Some(backend) = Weak::upgrade(&att_state.backend_attach) {
                drop(guard);
                backend.workers.update_queue_associations(queues_associated);
            }
        }
    }

    /// Associate a [DeviceQueue] with this device.
    ///
    /// Once associated, any attached backend will process requests emitted from
    /// that queue.
    ///
    /// # Panics
    ///
    /// If `queue_id` is >= the max queues specified for this device, or if
    /// an existing queue is associated with that ID.
    pub fn queue_associate(
        &self,
        queue_id: QueueId,
        queue: Arc<impl DeviceQueue>,
    ) {
        let minder = QueueMinder::new(queue, self.0.queues.devid, queue_id);

        let mut state = self.0.queues.state.lock().unwrap();
        let slot = self.0.queues.slot(queue_id);
        let mut slot_state = slot.state.lock().unwrap();
        assert!(
            slot_state.minder.is_none(),
            "queue slot should not be occupied"
        );

        if state.paused {
            // Propagate any pause state of the device into any newly
            // associating queues while in such a pause.
            minder.pause();
        }
        if let Some(consumer) = state.metric_consumer.as_ref() {
            // Propagate any metric consumer already registered with
            // this device to the newly-associating queue.
            minder.set_metric_consumer(consumer.clone());
        }
        slot_state.minder = Some(minder);
        drop(slot_state);

        let associated = state.queue_associate(queue_id);
        drop(state);

        self.queues_update_assoc(associated);
    }

    /// Dissociate a [DeviceQueue] from this device
    ///
    /// After dissociation, any attached backend will cease processing requests
    /// from that queue.
    ///
    /// # Panics
    ///
    /// if `queue_id` is >= the max queues specified for this device, or if
    /// there is not queue associated with that ID.
    pub fn queue_dissociate(&self, queue_id: QueueId) {
        let mut state = self.0.queues.state.lock().unwrap();
        let slot = self.0.queues.slot(queue_id);
        let mut slot_state = slot.state.lock().unwrap();

        let minder =
            slot_state.minder.take().expect("queue slot should be occupied");
        minder.destroy();
        drop(slot_state);

        let associated = state.queue_dissociate(queue_id);
        drop(state);

        self.queues_update_assoc(associated);
    }

    /// Notify attached backend (if any) that `queue_id` may have new IO
    /// requests to process.  If the number of available requests is known, it
    /// can be communicated via `hint` in order to optimize worker waking.
    pub fn notify(&self, queue_id: QueueId, hint: ReqCountHint) {
        self.0.queues.notify(queue_id, hint);
    }

    pub fn device_id(&self) -> DeviceId {
        self.0.device_id
    }

    /// Get the maximum queues configured for this device.
    pub fn max_queues(&self) -> NonZeroUsize {
        NonZeroUsize::new(self.0.queues.queues.len())
            .expect("non-zero queue count")
    }
    /// Get the [DeviceInfo] of the attached backend (if any)
    pub fn info(&self) -> Option<DeviceInfo> {
        let state = self.0.att_state.lock().unwrap();
        let backend = Weak::upgrade(&state.as_ref()?.backend_attach)?;
        Some(backend.info)
    }

    /// Detach the device from attached backend (if any)
    pub fn detach(&self) {
        let guard = self.0.att_state.lock().unwrap();
        if let Some(att_state) = guard.as_ref().map(Clone::clone) {
            drop(guard);
            att_state.detach();
        }
    }

    /// Pause the device, preventing workers from an attached backend (if any)
    /// from fetching new IO requests to process.  Outstanding requests will
    /// proceed as normal.
    pub fn pause(&self) {
        self.0.queues.pause();
    }

    /// Resume the device, allowing workers from an attached backend (if any) to
    /// once again fetch new IO requests to process.
    pub fn resume(&self) {
        self.0.queues.resume();
    }

    /// Emit a [Future] which will resolve when there are no request being
    /// actively processed by an attached backend.
    pub fn none_processing(&self) -> NoneProcessing {
        self.0.queues.none_processing()
    }

    /// Set the [MetricConsumer] to be informed of all request completions
    /// processed by this device.
    pub fn set_metric_consumer(&self, consumer: Arc<dyn MetricConsumer>) {
        self.0.queues.set_metric_consumer(consumer);
    }

    /// Register a function to be called when this device becomes attached to a
    /// backend.  Intended for tasks such as querying the [DeviceInfo] for
    /// presentation to the guest.
    pub fn on_attach(&self, cb: OnAttachFn) {
        self.0.dev_state.lock().unwrap().on_attach = Some(cb);
    }
}
impl Drop for DeviceAttachment {
    fn drop(&mut self) {
        self.detach();
    }
}

/// The queue at which a worker should begin the next time it polls across all
/// of a device's queues.
#[derive(Copy, Clone, Default)]
struct PollCursor(QueueId);
impl PollCursor {
    /// Point the cursor at `queue_id`, suggesting that the worker using it
    /// poll that queue next.
    fn suggest(&mut self, queue_id: QueueId) {
        let Self(next_queue) = self;
        *next_queue = queue_id;
    }
}

/// Mutable state of a [WorkerSlot], protected by the slot's `state` mutex.
#[derive(Default)]
struct WorkerState {
    /// Has the worker associated with this slot indicated that it is active?
    active_type: Option<WorkerType>,
    /// Has the absence of work caused this worker to sleep?
    sleeping_on: Option<DeviceId>,

    /// Versioned strategy of the assignment most recently applied to this
    /// worker.  Used to ignore assignments older than the one already held.
    assign_strat: Versioned<Strategy>,
    /// Which queue(s), if any, this worker has been told to poll.
    assign_poll: PollAssignment,

    /// Where to begin polling when assigned to poll any queue.
    cursor: PollCursor,

    /// Queues which this worker polls for requests.  While `None`, polling
    /// reports the worker as detached.
    queues: Option<Arc<QueueCollection>>,
}

/// Per-worker bookkeeping held by a [WorkerCollection].
pub(crate) struct WorkerSlot {
    /// Activity, assignment, and polling state for the worker.
    state: Mutex<WorkerState>,
    /// Memory accessor for this worker.  It is created as an orphan.
    acc_mem: MemAccessor,
    /// Used to wake a sleeping synchronous worker.
    cv: Condvar,
    /// Used to wake a sleeping asynchronous worker.
    notify: Notify,
    /// ID of the worker which this slot represents.
    id: WorkerId,
}
impl WorkerSlot {
    /// Create the slot for worker `id`, with default state and an orphan
    /// memory accessor.
    fn new(id: WorkerId) -> Self {
        Self {
            state: Mutex::new(Default::default()),
            acc_mem: MemAccessor::new_orphan(),
            cv: Condvar::new(),
            notify: Notify::new(),
            id,
        }
    }
    /// Block the calling thread until a request is available for this worker.
    ///
    /// Returns `None` if the worker is detached or has been told to halt.
    ///
    /// # Panics
    ///
    /// If the worker has not been marked active.
    fn block_for_req(&self) -> Option<DeviceRequest> {
        let mut state = self.state.lock().unwrap();
        assert!(state.active_type.is_some());

        loop {
            let devid = match self.next_req(&mut state) {
                PollResult::Ok(device_request) => {
                    return Some(device_request);
                }
                PollResult::Detached | PollResult::Halted => {
                    return None;
                }
                PollResult::WaitFor(devid) => devid,
            };

            // Nothing to do right now: record that we are sleeping, and wait
            // on the condvar.  Polling is retried after every wake-up.
            state.sleeping_on = Some(devid);
            probes::block_sleep!(|| { (devid.0, self.id as u64) });
            state = self.cv.wait(state).unwrap();
            probes::block_wake!(|| { (devid.0, self.id as u64) });
            state.sleeping_on = None;
        }
    }

    /// Poll once for a request, according to the worker's current assignment.
    ///
    /// The caller passes in the held lock on this slot's state.
    ///
    /// # Panics
    ///
    /// If the worker has not been marked active.
    fn next_req(&self, state: &mut MutexGuard<WorkerState>) -> PollResult {
        assert!(state.active_type.is_some());

        let Some(queues) = state.queues.as_ref() else {
            return PollResult::Detached;
        };
        let devid = queues.devid;
        let result = match state.assign_poll {
            PollAssignment::Halt => {
                return PollResult::Halted;
            }
            PollAssignment::Idle => None,
            PollAssignment::Fixed(queue_id) => {
                queues.next_req(queue_id, self.id)
            }
            PollAssignment::Any => {
                // Work on a copy of the cursor, since `queues` is still
                // borrowed from `state` and the borrow checker will not accept
                // a simultaneous mutable borrow of `state.cursor`.
                let mut cursor = state.cursor;
                let result = queues.next_req_any(&mut cursor, self.id);
                state.cursor = cursor;
                result
            }
        };
        match result {
            Some(req) => PollResult::Ok(req),
            None => PollResult::WaitFor(devid),
        }
    }

    /// Record that this (async) worker is going to sleep waiting on device
    /// `devid`.  The state lock passed in is released on return.
    fn async_start_sleep(
        &self,
        mut state: MutexGuard<WorkerState>,
        devid: DeviceId,
    ) {
        state.sleeping_on = Some(devid);
        probes::block_sleep!(|| { (devid.0, self.id as u64) });
    }

    /// Clear the sleeping mark for this (async) worker, if it was set.
    fn async_stop_sleep(&self) {
        let mut state = self.state.lock().unwrap();
        if let Some(devid) = state.sleeping_on.take() {
            probes::block_wake!(|| { (devid.0, self.id as u64) });
        }
    }

    /// Create a [WaitForReq] bound to this slot.
    fn wait_for_req(&self) -> WaitForReq<'_> {
        WaitForReq::new(self)
    }

    /// Apply `assign` to this worker, unless the worker already holds an
    /// assignment with a newer strategy version.  The worker is woken (if it
    /// is sleeping) after the assignment is applied.
    fn update_assignment(&self, assign: &Assignment) {
        let mut state = self.state.lock().unwrap();
        if state.assign_strat.newer_than(&assign.strategy) {
            // We already have a newer assignment
            return;
        }
        state.assign_strat = assign.strategy;
        if assign.should_halt {
            state.assign_poll = PollAssignment::Halt;
        } else {
            // With per-worker assignments present, a worker absent from the
            // map polls any queue.  With none present, the worker idles.
            state.assign_poll =
                if let Some(poll_assign) = assign.poll_assignments.as_ref() {
                    *poll_assign.get(&self.id).unwrap_or(&PollAssignment::Any)
                } else {
                    PollAssignment::Idle
                };
        }
        self.wake(Some(state), None);
    }

    /// Wake this worker if it is active and currently sleeping.
    ///
    /// The caller may pass in an already-held lock on the slot state via
    /// `state`; otherwise the lock is taken here.  If `qid_hint` is provided
    /// and the worker is woken, its poll cursor is pointed at that queue.
    ///
    /// Returns `true` if a wake-up was issued.
    fn wake(
        &self,
        state: Option<MutexGuard<WorkerState>>,
        qid_hint: Option<QueueId>,
    ) -> bool {
        let mut state = state.unwrap_or_else(|| self.state.lock().unwrap());
        if let Some(wtype) = state.active_type {
            if state.sleeping_on.is_some() {
                if let Some(qid) = qid_hint {
                    state.cursor.suggest(qid);
                }
                match wtype {
                    WorkerType::Sync => self.cv.notify_one(),
                    WorkerType::Async => self.notify.notify_one(),
                }
                return true;
            }
        }

        false
    }
}

/// Which device queue(s), if any, a worker is assigned to poll.
#[derive(Clone, Copy, Default)]
enum PollAssignment {
    /// End polling immediately since backend is halted
    Halt,
    /// Poll no queue(s) as worker is in idle state
    Idle,
    /// Fixed queue specified by [QueueId]
    Fixed(QueueId),
    /// Poll any queue(s).  This is the default assignment.
    #[default]
    Any,
}

/// Collection-wide worker state, protected by the `state` mutex of a
/// [WorkerCollection].
#[derive(Default)]
struct WorkerColState {
    /// Is the backend running?  While it is not, generated assignments
    /// instruct workers to halt.
    backend_running: bool,

    /// Versioned dispatch strategy most recently selected.
    strategy: Versioned<Strategy>,

    /// Set of worker IDs which are currently active.
    workers_active: Bitmap,

    /// Versioned set of device queue IDs which have a queue associated.
    associated_qids: Versioned<Bitmap>,

    /// Device ID reported (when present) alongside strategy changes.
    device_id: Option<DeviceId>,
}
impl WorkerColState {
    /// Record in the active-worker set whether worker `wid` is active.
    fn set_worker_state(&mut self, wid: WorkerId, is_active: bool) {
        match is_active {
            true => {
                self.workers_active.set(wid);
            }
            false => {
                self.workers_active.unset(wid);
            }
        }
    }
    /// Select a dispatch strategy suited to the current active workers and
    /// associated queues, and produce the worker->queue assignments (where
    /// the selected strategy calls for them).
    ///
    /// While the backend is not running, the strategy is [Strategy::Idle] and
    /// the resulting assignment tells workers to halt.
    fn generate_assignments(&mut self) -> Assignment {
        if !self.backend_running {
            self.strategy.replace(Strategy::Idle);
            return Assignment {
                strategy: self.strategy,
                poll_assignments: None,
                should_halt: true,
            };
        }

        // Pick a (potentially) new strategy in the face of updated state
        let worker_count = self.workers_active.count();
        let queue_count = self.associated_qids.get().count();
        self.strategy.replace(Strategy::choose(worker_count, queue_count));

        let poll_assignments = match self.strategy.get() {
            Strategy::Idle => None,
            Strategy::Single => {
                assert_eq!(queue_count, 1);
                let only_queue: QueueId =
                    self.associated_qids.get().iter().next().unwrap().into();
                let fixed = PollAssignment::Fixed(only_queue);

                // Every active worker services the one and only queue
                Some(
                    self.workers_active
                        .iter()
                        .map(|wid| (wid, fixed))
                        .collect::<BTreeMap<_, _>>(),
                )
            }
            Strategy::Static => {
                assert!(
                    worker_count >= queue_count,
                    "workers should >= queues when {:?} is chosen",
                    self.strategy.get()
                );
                // Number of workers which can be spread evenly over the queues
                let evenly_spread = (worker_count / queue_count) * queue_count;
                let mut queue_loop = self.associated_qids.get().looping_iter();
                let mut workers = self.workers_active.iter();

                let mut assigned: BTreeMap<WorkerId, PollAssignment> =
                    BTreeMap::new();
                for wid in workers.by_ref().take(evenly_spread) {
                    let qid = queue_loop
                        .next()
                        .expect("looping queue iter should emit results");
                    assigned.insert(wid, PollAssignment::Fixed(qid.into()));
                }
                // Remaining workers will be idled
                for wid in workers {
                    assigned.insert(wid, PollAssignment::Idle);
                }

                Some(assigned)
            }
            Strategy::FreeForAll => Some(
                self.workers_active
                    .iter()
                    .map(|wid| (wid, PollAssignment::Any))
                    .collect::<BTreeMap<_, _>>(),
            ),
        };

        Assignment {
            strategy: self.strategy,
            poll_assignments,
            should_halt: false,
        }
    }
}

/// Strategy by which workers are dispatched to service device queues.
#[derive(Default, Copy, Clone, PartialEq, Eq, Debug, IntoStaticStr)]
pub enum Strategy {
    /// An explicitly stopped backend or lack of workers and/or queues means
    /// there is no dispatching to do
    #[default]
    Idle,

    /// All workers servicing single queue
    Single,

    /// Workers are statically assigned to queues in an even distribution.
    Static,

    /// Workers will round-robin through all queues, attempting to pick up
    /// requests from any they can.
    FreeForAll,
}
impl Strategy {
    /// Select the strategy appropriate for the given number of active workers
    /// and associated queues.
    pub fn choose(worker_count: usize, queue_count: usize) -> Self {
        match (worker_count, queue_count) {
            // Without workers or without queues, there is nothing to dispatch
            (0, _) | (_, 0) => Strategy::Idle,
            (_, 1) => Strategy::Single,
            (workers, queues) if workers >= queues => Strategy::Static,
            // Unfortunate, but better than leaving requests to linger in a
            // queue which lacks any assigned workers
            _ => Strategy::FreeForAll,
        }
    }
}

/// The result of generating assignments: instructions to be applied to each
/// worker slot.
struct Assignment {
    /// Versioned strategy under which this assignment was generated.
    strategy: Versioned<Strategy>,
    /// Per-worker poll assignments, or `None` when the strategy makes none.
    poll_assignments: Option<BTreeMap<WorkerId, PollAssignment>>,
    /// When set, workers are told to halt regardless of `poll_assignments`.
    should_halt: bool,
}

/// The fixed set of worker slots belonging to a backend, together with the
/// collection-wide worker state.
pub(crate) struct WorkerCollection {
    /// One slot per possible worker, indexed by worker ID.
    workers: Vec<WorkerSlot>,
    /// Collection-wide state: active workers, strategy, associated queues.
    state: Mutex<WorkerColState>,
}
impl WorkerCollection {
    fn new(max_workers: NonZeroUsize) -> Arc<Self> {
        let max_workers = max_workers.get();
        assert!(max_workers <= MAX_WORKERS.get());
        let workers: Vec<_> = (0..max_workers)
            .map(|id| WorkerSlot::new(WorkerId::from(id)))
            .collect();
        Arc::new(Self { workers, state: Default::default() })
    }
    fn set_active(&self, id: WorkerId, new_type: Option<WorkerType>) -> bool {
        if let Some(slot) = self.workers.get(id) {
            let refresh_guard = {
                let mut wstate = slot.state.lock().unwrap();
                if wstate.active_type.is_some() != new_type.is_some() {
                    let mut cstate = self.state.lock().unwrap();
                    cstate.set_worker_state(id, new_type.is_some());
                    wstate.active_type = new_type;
                    Some(cstate)
                } else {
                    None
                }
            };

            if let Some(guard) = refresh_guard {
                self.assignments_refresh(guard);
                return true;
            }
        }
        false
    }
    fn assignments_refresh(&self, mut state: MutexGuard<WorkerColState>) {
        let assign = state.generate_assignments();
        let devid = state.device_id.unwrap_or(block::DeviceId::INVALID);
        drop(state);

        super::probes::block_strategy!(|| {
            let assign_name: &'static str = assign.strategy.get().into();
            let generation = assign.strategy.generation() as u64;
            (devid.0, assign_name, generation)
        });
        for slot in self.workers.iter() {
            slot.update_assignment(&assign);
        }
    }
    fn slot(&self, id: WorkerId) -> &WorkerSlot {
        self.workers.get(id).expect("valid worker id for slot")
    }
    fn attach(&self, parent_mem: &MemAccessor, queues: &Arc<QueueCollection>) {
        for (idx, slot) in self.workers.iter().enumerate() {
            parent_mem.adopt(&slot.acc_mem, Some(format!("worker-{idx}")));
            let mut state = slot.state.lock().unwrap();
            let old = state.queues.replace(queues.clone());
            assert!(old.is_none(), "worker slot not already attached");
        }

        let mut state = self.state.lock().unwrap();
        state.device_id = Some(queues.devid);
        state.associated_qids = queues.associated_qids();
    }
    fn detach(&self) {
        for slot in self.workers.iter() {
            let mut state = slot.state.lock().unwrap();
            let old = state.queues.take();
            assert!(old.is_some(), "worker slot should have been attached");
        }
        let mut state = self.state.lock().unwrap();
        state.strategy.replace(Strategy::Idle);
        // With no device attached, the queues information should be cleared
        state.associated_qids = Versioned::default();
        state.device_id = None;
    }
    fn wake(
        &self,
        wake_wids: Bitmap,
        limit: NonZeroUsize,
        qid_hint: Option<QueueId>,
    ) -> Bitmap {
        probes::block_worker_collection_wake!(|| (wake_wids.0, limit.get()));

        let mut num_woken = 0;
        let mut idle_wids = wake_wids.iter();

        for wid in &mut idle_wids {
            let Some(slot) = self.workers.get(wid) else {
                continue;
            };

            if slot.wake(None, qid_hint) {
                num_woken += 1;
            }

            if num_woken == limit.get() {
                break;
            }
        }

        let remainder = idle_wids.remainder();

        probes::block_worker_collection_woken!(|| (remainder.0, num_woken));

        remainder
    }
    fn update_queue_associations(&self, queues_associated: Versioned<Bitmap>) {
        let mut state = self.state.lock().unwrap();
        state.associated_qids.replace_if_newer(&queues_associated);
        self.assignments_refresh(state);
    }
    fn start(&self) {
        let mut state = self.state.lock().unwrap();
        state.backend_running = true;
        self.assignments_refresh(state);
    }
    fn stop(&self) {
        let mut state = self.state.lock().unwrap();
        state.backend_running = false;
        self.assignments_refresh(state);
    }
}

/// Mode of operation chosen when a worker is activated.
#[derive(Copy, Clone)]
pub enum WorkerType {
    /// Worker activated via [`InactiveWorkerCtx::activate_sync`]
    Sync,
    /// Worker activated via [`InactiveWorkerCtx::activate_async`]
    Async,
}

/// Handle to a worker slot which has not yet been activated.
///
/// Activation consumes the handle and yields a context of the chosen flavor.
pub struct InactiveWorkerCtx {
    workers: Arc<WorkerCollection>,
    id: WorkerId,
}
impl InactiveWorkerCtx {
    /// Activate this worker for synchronous operation.
    ///
    /// Returns [None] if there is already an active worker in the slot
    /// associated with this [WorkerId].
    pub fn activate_sync(self) -> Option<SyncWorkerCtx> {
        let activated =
            self.workers.set_active(self.id, Some(WorkerType::Sync));
        activated.then(|| SyncWorkerCtx(self.into()))
    }

    /// Activate this worker for asynchronous operation.
    ///
    /// Returns [None] if there is already an active worker in the slot
    /// associated with this [WorkerId].
    pub fn activate_async(self) -> Option<AsyncWorkerCtx> {
        let activated =
            self.workers.set_active(self.id, Some(WorkerType::Async));
        activated.then(|| AsyncWorkerCtx(self.into()))
    }
}

/// Context through which a synchronous (blocking) worker obtains requests.
///
/// Note: Dropping the context vacates the slot for this [WorkerId], making it
/// available to be activated again.
pub struct SyncWorkerCtx(WorkerCtxInner);
impl SyncWorkerCtx {
    /// Block (synchronously) until the next [request](DeviceRequest) is
    /// available from the device.
    ///
    /// Returns [None], rather than blocking, if no device is attached or the
    /// backend is stopped.
    pub fn block_for_req(&self) -> Option<DeviceRequest> {
        let Self(inner) = self;
        let slot = inner.workers.slot(inner.id);
        slot.block_for_req()
    }
    /// Get the [MemAccessor] required to do DMA for request processing
    pub fn acc_mem(&self) -> &MemAccessor {
        let Self(inner) = self;
        inner.acc_mem()
    }
}

/// Context through which an asynchronous worker obtains requests.
///
/// Note: Dropping the context vacates the slot for this [WorkerId], making it
/// available to be activated again.
pub struct AsyncWorkerCtx(WorkerCtxInner);
impl AsyncWorkerCtx {
    /// Get a [Future] which resolves once a [request](DeviceRequest) is
    /// available from an attached device.
    pub fn wait_for_req(&self) -> WaitForReq<'_> {
        let Self(inner) = self;
        let slot = inner.workers.slot(inner.id);
        slot.wait_for_req()
    }
    /// Get the [MemAccessor] required to do DMA for request processing
    pub fn acc_mem(&self) -> &MemAccessor {
        let Self(inner) = self;
        inner.acc_mem()
    }
}

struct WorkerCtxInner {
    workers: Arc<WorkerCollection>,
    id: WorkerId,
}
impl From<InactiveWorkerCtx> for WorkerCtxInner {
    fn from(value: InactiveWorkerCtx) -> Self {
        let InactiveWorkerCtx { workers, id } = value;
        WorkerCtxInner { workers, id }
    }
}
impl WorkerCtxInner {
    fn acc_mem(&self) -> &MemAccessor {
        &self.workers.slot(self.id).acc_mem
    }
}
impl Drop for WorkerCtxInner {
    /// Deactivate the worker when it is dropped
    fn drop(&mut self) {
        assert!(
            self.workers.set_active(self.id, None),
            "active worker is valid during deactivation"
        );
    }
}

/// Shared state behind a [`BackendAttachment`].
struct BackendAttachInner {
    /// Attachment state, initialized to `None` when the backend attachment is
    /// created.
    // NOTE(review): presumably populated by `AttachPair::attach` - confirm.
    att_state: Mutex<Option<(AttachPair, MemAccessor)>>,
    /// Worker slots belonging to this backend
    workers: Arc<WorkerCollection>,
    /// Device information reported by the backend
    info: DeviceInfo,
    /// Identifier allocated for this backend at creation
    backend_id: BackendId,
}

/// Main "attachment point" for a block backend.
///
/// Dropping the attachment detaches it from its device (if any).
pub struct BackendAttachment(Arc<BackendAttachInner>);
impl BackendAttachment {
    /// Create an (unattached) attachment point with `max_workers` worker
    /// slots, describing itself to devices with `info`.
    pub fn new(max_workers: NonZeroUsize, info: DeviceInfo) -> Self {
        let inner = BackendAttachInner {
            att_state: Mutex::new(None),
            workers: WorkerCollection::new(max_workers),
            info,
            backend_id: BackendId::new(),
        };
        Self(Arc::new(inner))
    }
    /// Get an (inactive) [context](InactiveWorkerCtx) for a given [WorkerId].
    ///
    /// Panics if `id` is not below the number of worker slots.
    pub fn worker(&self, id: WorkerId) -> InactiveWorkerCtx {
        assert!(id < self.0.workers.workers.len());
        let workers = Arc::clone(&self.0.workers);
        InactiveWorkerCtx { workers, id }
    }

    /// Number of worker slots this attachment was created with.
    pub fn max_workers(&self) -> NonZeroUsize {
        let slot_count = self.0.workers.workers.len();
        NonZeroUsize::new(slot_count)
            .expect("WorkerCollection correctly initialized")
    }

    /// Device information supplied when this attachment was created.
    pub fn info(&self) -> DeviceInfo {
        let BackendAttachInner { info, .. } = &*self.0;
        *info
    }

    /// Identifier allocated for this backend.
    pub fn backend_id(&self) -> BackendId {
        let BackendAttachInner { backend_id, .. } = &*self.0;
        *backend_id
    }

    /// Permit workers to pull requests from the attached device (if any) for
    /// processing.
    pub fn start(&self) {
        let collection = &self.0.workers;
        collection.start()
    }

    /// Remove access to pull requests from the attached device (if any) from
    /// workers, causing them to halt processing once they have completed any
    /// in-flight work.
    pub fn stop(&self) {
        let collection = &self.0.workers;
        collection.stop()
    }

    /// Detach this backend from the device (if any)
    pub fn detach(&self) {
        // Clone the attachment handle out from under the lock, so the lock is
        // no longer held when the detach itself is performed.
        let attached = {
            let guard = self.0.att_state.lock().unwrap();
            guard.as_ref().map(|(pair, _)| pair.clone())
        };
        if let Some(pair) = attached {
            pair.detach();
        }
    }
}
impl Drop for BackendAttachment {
    /// Ensure the backend is detached from its device before it goes away
    fn drop(&mut self) {
        self.detach()
    }
}

/// Attach a [device](DeviceAttachment) to a [backend](BackendAttachment).
///
/// # Errors
///
/// Returns an [`AttachError`] when the attachment cannot be made.
// NOTE(review): the variants suggest failure when either side is already
// attached - confirm against `AttachPair::attach`.
pub fn attach(
    device: &DeviceAttachment,
    backend: &BackendAttachment,
) -> Result<(), AttachError> {
    AttachPair::attach(device, backend)
}

pin_project! {
    /// [Future] returned from [`AsyncWorkerCtx::wait_for_req()`]
    ///
    /// Resolves to the next [request](DeviceRequest) for the worker, or to
    /// [None] if the backend is detached or halted.
    pub struct WaitForReq<'a> {
        // Slot of the worker which is waiting for a request
        slot: &'a WorkerSlot,
        // Device this worker has registered itself as sleeping on, if any
        sleeping_on: Option<DeviceId>,
        #[pin]
        wait: Notified<'a>
    }

    impl PinnedDrop for WaitForReq<'_> {
        fn drop(this: Pin<&mut Self>) {
            let this = this.project();
            // A future dropped while pending must not leave its worker
            // recorded as sleeping.
            if this.sleeping_on.take().is_some() {
                this.slot.async_stop_sleep();
            }
        }
    }
}

impl WaitForReq<'_> {
    /// Build a future waiting on `slot`, which starts out not recorded as
    /// sleeping on any device.
    fn new<'a>(slot: &'a WorkerSlot) -> WaitForReq<'a> {
        WaitForReq { slot, sleeping_on: None, wait: slot.notify.notified() }
    }
}

impl Future for WaitForReq<'_> {
    type Output = Option<DeviceRequest>;

    /// Attempt to fetch the next request for the worker.
    ///
    /// Yields `Some(request)` when one is available, `None` when the backend
    /// is detached or halted, and otherwise registers the worker as sleeping
    /// and returns [`Poll::Pending`] until it is notified.
    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
        let mut this = self.project();

        // Being polled again means the worker is awake: clear any sleep
        // registration left behind by a previous poll.
        if this.sleeping_on.take().is_some() {
            this.slot.async_stop_sleep();
        }

        loop {
            let mut state = this.slot.state.lock().unwrap();
            match this.slot.next_req(&mut state) {
                PollResult::Ok(dreq) => {
                    return Poll::Ready(Some(dreq));
                }
                PollResult::WaitFor(devid) => {
                    // Record that this worker is going to sleep
                    *this.sleeping_on = Some(devid);
                    this.slot.async_start_sleep(state, devid);

                    if Notified::poll(this.wait.as_mut(), cx).is_ready() {
                        // The `Notified` future is fused, so we must "refresh"
                        // prior to any subsequent attempts to poll it after it
                        // emits `Ready`
                        this.wait.set(this.slot.notify.notified());

                        // Take another lap if woken by the notifier to check
                        // for a pending request
                        continue;
                    }
                    return Poll::Pending;
                }
                PollResult::Detached | PollResult::Halted => {
                    return Poll::Ready(None);
                }
            }
        }
    }
}

/// Outcome of a worker's attempt to fetch its next request.
enum PollResult {
    /// Accepted request from device queue
    Ok(DeviceRequest),
    /// Worker has been idled, likely due to empty device queue(s).  Carries
    /// the ID of the device on which the worker should wait.
    WaitFor(DeviceId),
    /// Backend is not attached to any device
    Detached,
    /// Backend is halting workers
    Halted,
}

/// Errors reported when attaching a device to a backend.
#[derive(Error, Debug)]
pub enum AttachError {
    /// The backend already has an attachment
    #[error("backend already attached")]
    BackendAttached,
    /// The device already has an attachment
    #[error("device already attached")]
    DeviceAttached,
}

/// A value paired with a generation counter which is bumped on every update,
/// so that two copies of the value can be ordered by recency.
#[derive(Copy, Clone)]
struct Versioned<T: Copy + Clone> {
    generation: usize,
    item: T,
}
impl<T: Copy + Clone> Versioned<T> {
    /// Wrap `item` at generation zero
    fn new(item: T) -> Self {
        Versioned { item, generation: 0 }
    }
    /// Does this resource carry a strictly later generation than `compare`?
    fn newer_than(&self, compare: &Self) -> bool {
        compare.generation < self.generation
    }
    /// Advance the generation and hand out mutable access to the resource
    fn update(&mut self) -> &mut T {
        let Self { generation, item } = self;
        *generation += 1;
        item
    }
    /// Store a new resource, advancing the generation
    fn replace(&mut self, item: T) {
        self.generation += 1;
        self.item = item;
    }
    /// Adopt `compare` (value and generation) only if it is strictly newer
    /// than what is held
    fn replace_if_newer(&mut self, compare: &Self) {
        if self.generation < compare.generation {
            *self = *compare;
        }
    }
    /// Copy of the contained resource
    fn get(&self) -> T {
        self.item
    }
    /// Current generation of the resource
    fn generation(&self) -> usize {
        self.generation
    }
}
impl<T: Copy + Clone + Default> Default for Versioned<T> {
    /// Default resource at generation zero
    fn default() -> Self {
        Self::new(T::default())
    }
}

/// Simple 64-bit bitmap which facilitates iteration over bits which are
/// asserted
#[derive(Copy, Clone, Default)]
pub(crate) struct Bitmap(u64);
impl Bitmap {
    /// Number of bits held by the map; valid indices are below this
    const TOP_BIT: usize = u64::BITS as usize;

    /// Map with every bit asserted
    pub const ALL: Self = Self(u64::MAX);

    /// Assert the bit at `idx`.
    ///
    /// Panics if `idx` is not a valid bit index.
    pub fn set(&mut self, idx: usize) {
        assert!(idx < Self::TOP_BIT);
        self.0 |= 1u64 << idx;
    }
    /// Clear the bit at `idx`.
    ///
    /// Panics if `idx` is not a valid bit index.
    pub fn unset(&mut self, idx: usize) {
        assert!(idx < Self::TOP_BIT);
        self.0 &= !(1u64 << idx);
    }
    /// Assert every bit which is asserted in `other`
    pub fn set_all(&mut self, other: Bitmap) {
        self.0 |= other.0;
    }
    /// Index of the lowest asserted bit, or [None] if the map is empty
    pub fn lowest_set(&self) -> Option<usize> {
        (self.0 != 0).then(|| self.0.trailing_zeros() as usize)
    }
    /// Number of asserted bits
    pub fn count(&self) -> usize {
        self.0.count_ones() as usize
    }
    /// Are no bits asserted?
    pub fn is_empty(&self) -> bool {
        self.0 == 0
    }
    /// Return the current contents, leaving the map empty
    pub fn take(&mut self) -> Self {
        std::mem::take(self)
    }
    /// Get iterator which emits indices of bits which are set in this map.
    pub fn iter(&self) -> BitIter {
        BitIter(*self)
    }
    /// Get iterator which emits indices of bits which are set in this map.
    /// It will infinitely loop back to the first bit whenever the last bit is
    /// reached.
    pub fn looping_iter(&self) -> LoopIter {
        LoopIter { orig: *self, cur: *self }
    }
}

/// Iterator over the indices of asserted bits, lowest first.
pub struct BitIter(Bitmap);
impl Iterator for BitIter {
    type Item = usize;

    fn next(&mut self) -> Option<Self::Item> {
        let idx = self.0.lowest_set()?;
        self.0.unset(idx);
        Some(idx)
    }
}
impl BitIter {
    /// Consume the iterator, returning the bits it has not yet emitted
    fn remainder(self) -> Bitmap {
        self.0
    }
}
/// Iterator over the indices of asserted bits which restarts from the lowest
/// bit after emitting the highest.  It only terminates (immediately) when the
/// source map is empty.
pub struct LoopIter {
    /// Bits remaining in the current lap
    cur: Bitmap,
    /// Bits with which every lap starts
    orig: Bitmap,
}
impl Iterator for LoopIter {
    type Item = usize;

    fn next(&mut self) -> Option<Self::Item> {
        if self.cur.is_empty() {
            // Start a new lap.  If the source map is empty this changes
            // nothing, and the lookup below ends the iteration.
            self.cur = self.orig;
        }
        let idx = self.cur.lowest_set()?;
        self.cur.unset(idx);
        Some(idx)
    }
}


================================================
FILE: lib/propolis/src/block/crucible.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Implement a virtual block device backed by Crucible

use std::io;
use std::num::NonZeroUsize;
use std::ops::Deref;
use std::sync::Arc;

use crate::block;
use crate::tasks::TaskGroup;
use crate::vmm::MemCtx;

use crucible::{
    BlockIO, Buffer, CrucibleError, ReplaceResult, SnapshotDetails, Volume,
    VolumeBuilder,
};
use crucible_client_types::VolumeConstructionRequest;
use crucible_client_types::VolumeInfo;
use oximeter::types::ProducerRegistry;
use slog::{error, info};
use thiserror::Error;
use uuid::Uuid;

pub use nexus_client::Client as NexusClient;

/// Number of worker slots created for every [`CrucibleBackend`].
// TODO: Make this a runtime tunable?
const WORKER_COUNT: NonZeroUsize = NonZeroUsize::new(8).unwrap();

/// Block backend which services requests from a Crucible [`Volume`].
pub struct CrucibleBackend {
    /// Attachment point through which a device reaches this backend
    block_attach: block::BackendAttachment,
    /// State shared with every worker task
    state: Arc<WorkerState>,
    /// Worker tasks spawned when the backend is started
    workers: TaskGroup,
}
/// State shared between all worker tasks of a [`CrucibleBackend`].
struct WorkerState {
    /// Volume against which requests are issued
    volume: Volume,
    /// Device information (block size, read-only flag, ...) for the volume
    info: block::DeviceInfo,
    /// When set, flush requests are completed without being sent to Crucible
    skip_flush: bool,
}
impl WorkerState {
    /// Main loop of a worker task: wait for requests from the device, process
    /// each against the volume, and complete it with the result.
    ///
    /// Returns once the worker context stops yielding requests.
    async fn process_loop(&self, wctx: block::AsyncWorkerCtx) {
        // Start with a read buffer of a single block
        // It will be resized larger (and remain so) if subsequent read
        // operations required additional space.
        let mut readbuf = Buffer::new(1, self.info.block_size as usize);
        loop {
            let Some(dreq) = wctx.wait_for_req().await else {
                break;
            };

            // Without access to guest memory the request cannot be serviced
            let Some(memctx) = wctx.acc_mem().access() else {
                dreq.complete(block::Result::Failure);
                continue;
            };

            let res = match self
                .process_request(
                    self.volume.deref(),
                    dreq.req(),
                    &mut readbuf,
                    &memctx,
                )
                .await
            {
                Ok(_) => block::Result::Success,
                Err(e) => {
                    let mapped = block::Result::from(e);
                    assert!(mapped.is_err());
                    mapped
                }
            };

            dreq.complete(res);
        }
    }

    /// Perform a single request against `block`, copying data to or from the
    /// guest memory regions of the request via `mem`.
    ///
    /// `readbuf` is scratch space for read operations, reused across calls.
    ///
    /// # Errors
    ///
    /// - [`Error::BlocksizeMismatch`] if the offset or length of a read or
    ///   write is not a multiple of the block size
    /// - [`Error::BadGuestRegion`] if the guest regions cannot be mapped
    /// - [`Error::ReadOnly`] for a write to a read-only backend
    /// - [`Error::CopyError`] if the amount of data copied to or from guest
    ///   memory does not match the length of the request
    /// - errors from the guest memory copies or from Crucible itself
    async fn process_request(
        &self,
        block: &(dyn BlockIO + Send + Sync),
        req: &block::Request,
        readbuf: &mut Buffer,
        mem: &MemCtx,
    ) -> Result<(), Error> {
        let block_size = self.info.block_size as usize;

        match req.op {
            block::Operation::Read(off, len) => {
                let (off_blocks, len_blocks) =
                    block_offset_count(off, len, block_size)?;

                let maps =
                    req.mappings(mem).ok_or_else(|| Error::BadGuestRegion)?;

                // Perform one large read from crucible, and write from data into
                // mappings
                readbuf.reset(len_blocks, block_size);
                let _ = block.read(off_blocks, readbuf).await?;

                // Each mapping receives the next `mapping.len()` bytes of
                // the buffer, starting where the previous write left off.
                let mut nwritten = 0;
                for mapping in maps {
                    nwritten += mapping.write_bytes(
                        &readbuf[nwritten..(nwritten + mapping.len())],
                    )?;
                }

                if nwritten != len {
                    return Err(Error::CopyError(nwritten, len));
                }
            }
            block::Operation::Write(off, len) => {
                if self.info.read_only {
                    return Err(Error::ReadOnly);
                }

                let (off_blocks, _len_blocks) =
                    block_offset_count(off, len, block_size)?;

                // Read from all the mappings into vec, and perform one large write
                // to crucible
                let maps =
                    req.mappings(mem).ok_or_else(|| Error::BadGuestRegion)?;
                let mut data = crucible::BytesMut::with_capacity(len);
                let mut nread = 0;
                for mapping in maps {
                    let n = mapping.read_bytes_uninit(
                        &mut data.spare_capacity_mut()[..mapping.len()],
                    )?;
                    // `read_bytes_uninit` returns the number of bytes written,
                    // so we can expand our initialized area by this amount.
                    //
                    // SAFETY: relies on the first `n` bytes of the spare
                    // capacity having been initialized by the call above.
                    // NOTE(review): confirm against the contract of
                    // `read_bytes_uninit`.
                    unsafe {
                        data.set_len(data.len() + n);
                    }
                    nread += n;
                }
                if nread != len {
                    return Err(Error::CopyError(nread, len));
                }

                let _ = block.write(off_blocks, data).await?;
            }
            block::Operation::Flush => {
                if !self.skip_flush {
                    // Send flush to crucible
                    let _ = block.flush(None).await?;
                }
            }
            block::Operation::Discard => {
                // Crucible does not support discard operations for now, so we implement this as
                // a no-op (which technically is a valid implementation of discard, just one that
                // doesn't actually free any space).
                return Ok(());
            }
        }
        Ok(())
    }
}

impl CrucibleBackend {
    /// Create a Crucible backend from a volume construction request.
    ///
    /// If the backend is writable and the volume has a read-only parent, a
    /// background task is spawned to scrub the volume.  When the scrub
    /// succeeds and `nexus_client` was provided, Nexus is asked to remove the
    /// read-only parent.
    ///
    /// # Errors
    ///
    /// Fails if the volume cannot be constructed, or if its block size or
    /// total size cannot be queried.
    pub async fn create(
        request: VolumeConstructionRequest,
        opts: block::BackendOpts,
        producer_registry: Option<ProducerRegistry>,
        nexus_client: Option<NexusClient>,
        log: slog::Logger,
    ) -> io::Result<Arc<Self>> {
        // Construct the volume.
        let volume = Volume::construct(request, producer_registry, log.clone())
            .await
            .map_err(|e| io::Error::from(CrucibleError::from(e)))?;

        // Decide if we need to scrub this volume or not.
        //
        // We should not scrub read-only volumes, as we cannot write back to
        // them, due to...you know, being read-only. So just don't do that.
        if !opts.is_read_only() && volume.has_read_only_parent() {
            let vclone = volume.clone();
            tokio::spawn(async move {
                // Failing to look up the volume ID is reported rather than
                // panicking inside a detached task, where the failure would
                // otherwise go unnoticed.
                let volume_id = match vclone.get_uuid().await {
                    Ok(id) => id,
                    Err(e) => {
                        error!(
                            log,
                            "Could not get volume ID, skipping scrub: {}", e
                        );
                        return;
                    }
                };

                // This does the actual scrub.
                match vclone.scrub(Some(120), Some(25)).await {
                    Ok(()) => {
                        if let Some(nexus_client) = nexus_client {
                            info!(
                                log,
                                "Scrub of volume {} completed, remove parent",
                                volume_id
                            );

                            Self::remove_read_only_parent(
                                &volume_id,
                                nexus_client,
                                log,
                            )
                            .await;
                        } else {
                            // No nexus contact was provided, so just log
                            // a message.
                            info!(
                                log,
                                "Scrub of volume {} completed", volume_id
                            );
                        }
                    }
                    Err(e) => {
                        error!(
                            log,
                            "Scrub of volume {} failed: {}", volume_id, e
                        );
                        // TODO: Report error to nexus that scrub failed
                    }
                }
            });
        }

        // After active negotiation, set sizes
        let block_size = volume.get_block_size().await?;
        let total_size = volume.total_size().await?;
        let sectors = total_size / block_size;

        let info = block::DeviceInfo {
            block_size: block_size as u32,
            total_size: sectors,
            read_only: opts.is_read_only(),
            supports_discard: false,
        };

        Ok(Arc::new(Self {
            block_attach: block::BackendAttachment::new(WORKER_COUNT, info),
            state: Arc::new(WorkerState {
                volume,
                info,
                skip_flush: opts.skip_flush.unwrap_or(false),
            }),
            workers: TaskGroup::new(),
        }))
    }

    /// Query the volume for its block size.
    ///
    /// Returns [None] if the query fails, or if the reported size does not
    /// fit in a `u32`.
    pub async fn block_size(&self) -> Option<u32> {
        let size = self.state.volume.get_block_size().await.ok()?;
        u32::try_from(size).ok()
    }

    /// Create Crucible backend using the in-memory volume backend, rather than
    /// "real" Crucible downstairs instances.
    ///
    /// `size` is the size of the disk in bytes.
    ///
    /// # Errors
    ///
    /// Fails if `opts` carries no block size or a block size of zero, or if
    /// the in-memory disk cannot be added to the volume.
    pub async fn create_mem(
        size: u64,
        opts: block::BackendOpts,
        log: slog::Logger,
    ) -> io::Result<Arc<Self>> {
        let block_size = u64::from(opts.block_size.ok_or_else(|| {
            CrucibleError::GenericError(
                "block_size is required parameter".into(),
            )
        })?);
        // The size of the device in blocks is computed by division below, so
        // a zero block size must be turned away rather than allowed to panic.
        if block_size == 0 {
            return Err(CrucibleError::GenericError(
                "block_size must be non-zero".into(),
            )
            .into());
        }
        // Allocate and construct the volume.
        let mem_disk = Arc::new(crucible::InMemoryBlockIO::new(
            Uuid::new_v4(),
            block_size,
            size as usize,
        ));
        let mut builder = VolumeBuilder::new(block_size, log);
        builder
            .add_subvolume(mem_disk)
            .await
            .map_err(std::io::Error::from)?;

        let info = block::DeviceInfo {
            block_size: block_size as u32,
            total_size: size / block_size,
            read_only: opts.read_only.unwrap_or(false),
            supports_discard: false,
        };

        Ok(Arc::new(CrucibleBackend {
            block_attach: block::BackendAttachment::new(WORKER_COUNT, info),
            state: Arc::new(WorkerState {
                volume: builder.into(),
                info,
                skip_flush: opts.skip_flush.unwrap_or(false),
            }),
            workers: TaskGroup::new(),
        }))
    }

    // Communicate to Nexus that we can remove the read only parent for
    // the given volume id.
    async fn remove_read_only_parent(
        volume_id: &Uuid,
        nexus_client: NexusClient,
        log: slog::Logger,
    ) {
        // Notify Nexus of the state change.
        match nexus_client.cpapi_disk_remove_read_only_parent(&volume_id).await
        {
            Ok(_) => {
                info!(
                    log,
                    "Submitted removal for read only parent on {}", volume_id,
                );
            }
            Err(e) => {
                // We finished the scrub, but can't tell Nexus to remove
                // the read only parent. While this is not ideal, as it
                // means we will re-do a scrub the next time this
                // volume is attached, it won't result in any harm to
                // the volume or data.
                error!(log, "Failed removal of read only parent: {}", e,);
            }
        }
    }

    /// Ask the volume for the UUID which identifies this Crucible backend.
    ///
    /// A Crucible error from the query is converted into an [`io::Error`].
    pub async fn get_uuid(&self) -> io::Result<uuid::Uuid> {
        let id = self.state.volume.get_uuid().await?;
        Ok(id)
    }

    /// Request a snapshot of the volume, named after `snapshot_id`.
    ///
    /// The request is issued as a flush carrying the snapshot details.
    pub async fn snapshot(&self, snapshot_id: Uuid) -> io::Result<()> {
        let details =
            SnapshotDetails { snapshot_name: snapshot_id.to_string() };
        let flushed = self.state.volume.flush(Some(details)).await;
        flushed.map_err(io::Error::from)
    }

    /// Replace the volume construction request of the volume.
    ///
    /// Both the old and new requests are supplied as JSON.
    ///
    /// # Errors
    ///
    /// Fails if either document cannot be deserialized, or if Crucible
    /// rejects the replacement.
    pub async fn vcr_replace(
        &self,
        old_vcr_json: &str,
        new_vcr_json: &str,
    ) -> io::Result<ReplaceResult> {
        let old_vcr = serde_json::from_str(old_vcr_json)?;
        let new_vcr = serde_json::from_str(new_vcr_json)?;
        let outcome =
            self.state.volume.target_replace(old_vcr, new_vcr).await?;
        Ok(outcome)
    }

    fn spawn_workers(&self) {
        let max_workers = self.block_attach.max_workers().get();
        self.workers.extend((0..max_workers).map(|n| {
            let worker_state = self.state.clone();
            let wctx = self.block_attach.worker(n);

            tokio::spawn(async move {
                let Some(wctx) = wctx.activate_async() else {
                    return;
                };
                worker_state.process_loop(wctx).await
            })
        }))
    }

    /// Query the volume for whether it is active.
    pub async fn volume_is_active(&self) -> Result<bool, CrucibleError> {
        self.state.volume.query_is_active().await
    }

    /// Get a clone of the [`Volume`] handle used by this backend.
    pub fn clone_volume(&self) -> Volume {
        self.state.volume.clone()
    }

    /// Was this backend created as read-only?
    pub fn is_read_only(&self) -> bool {
        self.state.info.read_only
    }

    /// Query the volume for its [`VolumeInfo`].
    pub async fn query_volume_info(&self) -> Result<VolumeInfo, CrucibleError> {
        self.state.volume.query_volume_info().await
    }
}

#[async_trait::async_trait]
impl block::Backend for CrucibleBackend {
    /// Attachment point through which a device is connected to this backend
    fn attachment(&self) -> &block::BackendAttachment {
        &self.block_attach
    }
    /// Activate the volume, permit workers to pull requests, and spawn the
    /// worker tasks.
    ///
    /// If activation fails the error is returned and no workers are started.
    async fn start(&self) -> anyhow::Result<()> {
        self.state.volume.activate().await?;
        self.block_attach.start();
        self.spawn_workers();
        Ok(())
    }
    /// Stop workers from pulling further requests, then wait for all of the
    /// worker tasks to finish.
    async fn stop(&self) -> () {
        self.block_attach.stop();
        self.workers.join_all().await;
    }
    /// Expose the backend as [`Any`](std::any::Any) to permit downcasting
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}

/// Errors which can occur while processing a request against Crucible.
#[derive(Debug, Error)]
pub enum Error {
    /// The guest memory regions of the request could not be mapped
    #[error("invalid guest memory region")]
    BadGuestRegion,
    /// A write was attempted against a read-only backend
    #[error("backend is read-only")]
    ReadOnly,
    /// The requested operation is not supported
    #[error("operation not supported")]
    Unsupported,

    /// The offset or length of the request is not block-aligned
    #[error("offset or length not multiple of blocksize")]
    BlocksizeMismatch,

    /// The number of bytes copied to/from guest memory (first field) did not
    /// match the length of the request (second field)
    #[error("copied length {0} did not match expectation {1}")]
    CopyError(usize, usize),

    /// An I/O error, converted via `From<io::Error>`
    #[error("IO Error")]
    Io(#[from] io::Error),

    /// An error reported by Crucible
    #[error("Crucible Error: {0}")]
    Crucible(#[from] CrucibleError),
}
impl From<Error> for block::Result {
    fn from(value: Error) -> Self {
        match value {
            Error::ReadOnly => block::Result::ReadOnly,
            Error::Unsupported => block::Result::Unsupported,
            _ => block::Result::Failure,
        }
    }
}

/// Calculate offset (in crucible::Block form) and length in blocksize
fn block_offset_count(
    off_bytes: usize,
    len_bytes: usize,
    block_size: usize,
) -> Result<(crucible::BlockIndex, usize), Error> {
    if off_bytes.is_multiple_of(block_size)
        && len_bytes.is_multiple_of(block_size)
    {
        Ok((
            crucible::BlockIndex((off_bytes / block_size) as u64),
            len_bytes / block_size,
        ))
    } else {
        Err(Error::BlocksizeMismatch)
    }
}

/// Tests for the byte-to-block conversion performed by `block_offset_count`.
#[cfg(test)]
mod test {
    use super::block_offset_count;

    /// Offsets just below and just above a block boundary are rejected
    #[test]
    fn err_on_bad_offset() {
        let bs = 512;
        assert!(block_offset_count(bs - 1, bs * 2, bs).is_err());
        assert!(block_offset_count(bs + 1, bs * 2, bs).is_err());
    }

    /// Lengths just above and just below a whole block are rejected
    #[test]
    fn err_on_bad_size() {
        let bs = 512;
        assert!(block_offset_count(0, bs + 1, bs).is_err());
        assert!(block_offset_count(0, bs - 1, bs).is_err());
    }

    /// Block-aligned offsets and lengths are accepted
    #[test]
    fn ok_for_valid() {
        let bs = 512;
        assert!(block_offset_count(0, bs, bs).is_ok());
        assert!(block_offset_count(bs * 3, bs * 4, bs).is_ok());
    }

    /// The byte offset is converted to a block index
    #[test]
    fn block_calc_ok() {
        let bs = 512;
        let off = bs * 4;
        let (block, _len) = block_offset_count(off, 0, bs).unwrap();

        assert_eq!(block.0, 4);
    }
}


================================================
FILE: lib/propolis/src/block/file.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::fs::{metadata, File, OpenOptions};
use std::io::{Error, ErrorKind, Result};
use std::num::NonZeroUsize;
use std::os::unix::io::AsRawFd;
use std::path::Path;
use std::sync::{Arc, Mutex};

use crate::block::{self, SyncWorkerCtx, WorkerId};
use crate::tasks::ThreadGroup;
use crate::vmm::{MappingExt, MemCtx};
use slog::warn;

use anyhow::Context;

/// Block backend which uses a [`File`] as its backing store.
pub struct FileBackend {
    /// State shared with the worker threads
    state: Arc<SharedState>,
    /// Attachment point through which a device reaches this backend
    block_attach: block::BackendAttachment,

    /// NOTE(review): presumably the number of worker threads to spawn -
    /// confirm against the code which starts the backend
    worker_count: NonZeroUsize,
    /// Group holding the worker threads of this backend
    workers: ThreadGroup,
}
/// State shared between a [`FileBackend`] and its worker threads.
struct SharedState {
    /// Open handle to the backing file or device
    fp: File,

    /// Write-Cache-Enable state (if supported) of the underlying device
    wce_state: Mutex<Option<WceState>>,
    /// Mechanism used to service discard requests.  When `None`, discard
    /// requests are completed as unsupported.
    discard_mech: Option<dkioc::DiscardMech>,

    /// Properties of the backend (block size, size, read-only, etc.)
    info: block::DeviceInfo,
    /// When set, flush requests succeed without syncing `fp`
    skip_flush: bool,
    /// Logger used to report unexpected conditions while processing requests
    log: slog::Logger,
}
/// Tracking for the Write-Cache-Enable setting of the underlying device.
struct WceState {
    /// Setting found on the device when it was opened; restored when the
    /// owning `SharedState` is dropped.
    initial: bool,
    /// Setting most recently applied to (or read from) the device
    current: bool,
}
impl SharedState {
    /// Bundles the opened file and its discovered properties into the state
    /// shared by all workers, then makes a best-effort attempt to enable the
    /// write cache on the underlying device.
    fn new(
        fp: File,
        info: block::DeviceInfo,
        skip_flush: bool,
        wce_state: Option<WceState>,
        discard_mech: Option<dkioc::DiscardMech>,
        log: slog::Logger,
    ) -> Arc<Self> {
        let state = SharedState {
            fp,
            wce_state: Mutex::new(wce_state),
            discard_mech,
            skip_flush,
            info,
            log,
        };

        // Attempt to enable write caching if underlying resource supports it
        state.set_wce(true);

        Arc::new(state)
    }

    /// Worker loop: takes requests from `wctx` until none are offered,
    /// completing each with the outcome of processing it.
    fn processing_loop(&self, wctx: SyncWorkerCtx) {
        while let Some(dreq) = wctx.block_for_req() {
            let req = dreq.req();
            // Turn away requests this backend can never service before
            // acquiring access to guest memory.
            if self.info.read_only && req.op.is_write() {
                dreq.complete(block::Result::ReadOnly);
                continue;
            }
            if self.discard_mech.is_none() && req.op.is_discard() {
                dreq.complete(block::Result::Unsupported);
                continue;
            }

            let Some(mem) = wctx.acc_mem().access() else {
                dreq.complete(block::Result::Failure);
                continue;
            };
            let res = match self.process_request(&req, &mem) {
                Ok(_) => block::Result::Success,
                Err(_) => block::Result::Failure,
            };
            dreq.complete(res);
        }
    }

    /// Performs the I/O described by `req` against the backing file.
    ///
    /// Returns a short static description of the failure on error.  Reads and
    /// writes which transfer a different number of bytes than requested are
    /// treated as failures.
    fn process_request(
        &self,
        req: &block::Request,
        mem: &MemCtx,
    ) -> std::result::Result<(), &'static str> {
        match req.op {
            block::Operation::Read(off, len) => {
                let maps = req.mappings(mem).ok_or("mapping unavailable")?;

                let nbytes = maps
                    .preadv(self.fp.as_raw_fd(), off as i64)
                    .map_err(|_| "io error")?;
                if nbytes != len {
                    return Err("bad read length");
                }
            }
            block::Operation::Write(off, len) => {
                let maps = req.mappings(mem).ok_or("bad guest region")?;

                let nbytes = maps
                    .pwritev(self.fp.as_raw_fd(), off as i64)
                    .map_err(|_| "io error")?;
                if nbytes != len {
                    return Err("bad write length");
                }
            }
            block::Operation::Flush => {
                if !self.skip_flush {
                    self.fp.sync_data().map_err(|_| "io error")?;
                }
            }
            block::Operation::Discard => {
                let Some(mech) = self.discard_mech else {
                    unreachable!("handled above in processing_loop()");
                };
                for &(off, len) in &req.ranges {
                    // There might be some performance benefits to combining
                    // the ranges into one DKIOCFREE call, but ZFS will only
                    // issue one range to the underlying disk at a time, so we
                    // expect the benefit to be minimal in practice.
                    let Err(e) =
                        dkioc::do_discard(&self.fp, mech, off as u64, len as u64)
                    else {
                        continue;
                    };
                    if e.kind() != ErrorKind::Unsupported {
                        return Err(
                            "io error while attempting to free block(s)",
                        );
                    }
                    // If the discard mechanism is unsupported, we should not
                    // have advertised support for discard in the first place.
                    // However, if this happens, it likely means we're running
                    // on older ZFS bits that don't support DKIOCFREE on raw
                    // zvols.  Since this is not a supported configuration,
                    // but developer machines might be in this state, we
                    // swallow errors from the ioctl rather than failing the
                    // command.
                    warn!(self.log, "discard at offset {off} length {len} is unsupported; check ZFS version");
                }
            }
        }
        Ok(())
    }

    /// Best-effort attempt to set the write-cache-enable state of the
    /// underlying device to `enabled`.
    ///
    /// Does nothing for read-only backends, for devices without WCE state, or
    /// when the device is already in the requested state.  A failure to apply
    /// the setting is ignored, leaving the tracked state unchanged.
    fn set_wce(&self, enabled: bool) {
        if self.info.read_only {
            // Do not needlessly toggle the cache on a read-only disk
            return;
        }
        if let Some(state) = self.wce_state.lock().unwrap().as_mut() {
            if state.current != enabled {
                if let Ok(new_wce) = dkioc::set_wce(&self.fp, enabled) {
                    state.current = new_wce;
                }
            }
        }
    }
}
impl Drop for SharedState {
    /// Puts the device's write-cache setting back the way it was found, if
    /// this backend changed it.
    fn drop(&mut self) {
        // Only devices with tracked WCE state whose setting has drifted from
        // the value seen at open time need to be touched.
        let restore_to = self
            .wce_state
            .get_mut()
            .unwrap()
            .as_ref()
            .filter(|wce| wce.current != wce.initial)
            .map(|wce| wce.initial);

        if let Some(initial) = restore_to {
            // Nothing useful can be done about a failure while dropping.
            let _ = dkioc::set_wce(&self.fp, initial);
        }
    }
}

impl FileBackend {
    /// Creates a new block device from a device at `path`.
    ///
    /// The backend is read-only if either `opts.read_only` requests it or the
    /// file's permissions are read-only.  `worker_count` worker threads are
    /// spawned once the backend is started.
    ///
    /// # Errors
    ///
    /// Fails if the file's metadata cannot be queried, if the file cannot be
    /// opened, if a writeable backend is explicitly requested for a read-only
    /// file, or if the configured block size is zero.
    pub fn create(
        path: impl AsRef<Path>,
        opts: block::BackendOpts,
        worker_count: NonZeroUsize,
        log: slog::Logger,
    ) -> Result<Arc<Self>> {
        let p: &Path = path.as_ref();

        let meta = metadata(p)?;
        let read_only = match (opts.read_only, meta.permissions().readonly()) {
            (Some(false), true) => Err(Error::new(
                ErrorKind::Other,
                "writeable backend with read-only file not allowed",
            )),
            (Some(ro), false) => Ok(ro),
            (_, file_ro) => Ok(file_ro),
        }?;

        let fp = OpenOptions::new().read(true).write(!read_only).open(p)?;
        // Querying metadata is I/O and can fail; report that to the caller
        // rather than panicking.
        let len = fp.metadata()?.len();
        let disk_info = dkioc::disk_info(&fp);

        // Do not use the device-queried block size for now. Guests get upset if
        // this changes, and it is likely different than the old default of 512B
        let block_size = opts.block_size.unwrap_or(block::DEFAULT_BLOCK_SIZE);
        if block_size == 0 {
            // A zero block size would otherwise cause a divide-by-zero panic
            // when computing the device size below.
            return Err(Error::new(
                ErrorKind::InvalidInput,
                "block size cannot be 0",
            ));
        }

        let info = block::DeviceInfo {
            block_size,
            total_size: len / u64::from(block_size),
            read_only,
            supports_discard: disk_info.discard_mech.is_some(),
        };
        let skip_flush = opts.skip_flush.unwrap_or(false);
        // WCE is only tracked (and later toggled) for writeable backends
        let wce_state = if !read_only {
            disk_info
                .wce_state
                .map(|initial| WceState { initial, current: initial })
        } else {
            None
        };
        let block_attach = block::BackendAttachment::new(worker_count, info);
        Ok(Arc::new(Self {
            state: SharedState::new(
                fp,
                info,
                skip_flush,
                wce_state,
                disk_info.discard_mech,
                log,
            ),
            block_attach,
            worker_count,
            workers: ThreadGroup::new(),
        }))
    }

    /// Spawns one named thread per configured worker, each running the shared
    /// processing loop, and hands the spawn results to the thread group.
    fn spawn_workers(&self) -> std::io::Result<()> {
        let backend_id = self.block_attach.backend_id().0;
        let spawn_results = (0..self.worker_count.get())
            .map(|n| {
                let shared_state = self.state.clone();
                let wctx = self.block_attach.worker(n as WorkerId);

                std::thread::Builder::new()
                    .name(format!("file backend {backend_id}/worker {n}"))
                    .spawn(move || {
                        let wctx = wctx
                            .activate_sync()
                            .expect("worker slot is uncontended");
                        shared_state.processing_loop(wctx);
                    })
            })
            .collect::<Vec<_>>();

        self.workers.extend(spawn_results.into_iter())
    }
}

#[async_trait::async_trait]
impl block::Backend for FileBackend {
    /// Attachment through which this backend is connected to a device.
    fn attachment(&self) -> &block::BackendAttachment {
        &self.block_attach
    }

    /// Starts the attachment and spawns the worker threads.
    ///
    /// If spawning fails, the attachment is stopped again and any workers
    /// which did start are joined before the error is returned.
    async fn start(&self) -> anyhow::Result<()> {
        self.block_attach.start();
        match self.spawn_workers() {
            Ok(()) => Ok(()),
            Err(spawn_err) => {
                self.block_attach.stop();
                self.workers.block_until_joined();
                Err(spawn_err).context("failure while spawning workers")
            }
        }
    }

    /// Stops the attachment and waits for every worker thread to exit.
    async fn stop(&self) -> () {
        self.block_attach.stop();
        self.workers.block_until_joined();
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}

/// Wrappers around the disk-related ioctl/fcntl operations used by the file
/// backend: querying and setting write-cache-enable (WCE), querying media
/// info, and discarding (freeing) ranges of the backing store.
mod dkioc {
    #![allow(non_camel_case_types)]

    use std::fs::File;
    use std::io::Result;
    use std::os::raw::{c_int, c_longlong, c_uint};
    use std::os::unix::fs::FileTypeExt;
    use std::os::unix::io::AsRawFd;

    use crate::block::DEFAULT_BLOCK_SIZE;
    use crate::util::ioctl;

    // ioctl command numbers.
    // NOTE(review): presumably these mirror the illumos `sys/dkio.h`
    // definitions -- verify against the header before changing them.
    const DKIOC: i32 = 0x04 << 8;
    const DKIOCGETWCE: i32 = DKIOC | 36;
    const DKIOCSETWCE: i32 = DKIOC | 37;
    const DKIOCGMEDIAINFOEXT: i32 = DKIOC | 48;
    const DKIOCFREE: i32 = DKIOC | 50;
    const DKIOC_CANFREE: i32 = DKIOC | 60;

    /// Mechanism by which a discard is issued to the backing resource
    #[derive(Copy, Clone)]
    pub(crate) enum DiscardMech {
        /// Discard via `ioctl(DKIOCFREE)`
        DkiocFree,
        /// Discard via `fcntl(F_FREESP)`
        FnctlFreesp,
    }

    /// Properties discovered about an opened file or device
    #[derive(Copy, Clone)]
    #[allow(unused, dead_code)]
    pub(crate) struct DiskInfo {
        /// WCE state (if any) for disk
        ///
        /// Block devices (including "real" disks and zvols) will regard all writes as
        /// synchronous when performed via the /dev/rdsk endpoint when WCE is not
        /// enabled.  With WCE enabled, writes can be cached on the device, to be
        /// flushed later via fsync().
        pub wce_state: Option<bool>,
        /// Block size of disk
        pub block_size: u32,
        /// Does the disk support use of DKIOCFREE or F_FREESP?
        pub discard_mech: Option<DiscardMech>,
    }
    impl Default for DiskInfo {
        /// No WCE state, the default block size, and no discard support
        fn default() -> Self {
            Self {
                wce_state: None,
                block_size: DEFAULT_BLOCK_SIZE,
                discard_mech: None,
            }
        }
    }

    /// Query the properties of the resource behind `fp`.
    ///
    /// Character devices are probed with the DKIOC ioctls.  Regular files are
    /// assumed to support `fcntl(F_FREESP)` and otherwise get default values.
    /// Anything else (including a failure to query the file type) yields
    /// [`DiskInfo::default()`].
    pub(crate) fn disk_info(fp: &File) -> DiskInfo {
        match fp.metadata() {
            Ok(ft) if ft.file_type().is_char_device() => {
                // Continue on to attempt DKIOC lookups on raw disk devices
            }
            Ok(ft) if ft.file_type().is_file() => {
                // Assume fcntl(F_FREESP) support for files
                return DiskInfo {
                    discard_mech: Some(DiscardMech::FnctlFreesp),
                    ..Default::default()
                };
            }
            _ => {
                return DiskInfo::default();
            }
        }

        // Each query below hands the ioctl a pointer to a local which remains
        // live for the duration of the call.  A failed ioctl is treated as
        // "not supported" rather than as an error.
        // NOTE(review): assumes each ioctl writes no more than the pointed-to
        // type -- confirm against the kernel interface.
        let wce_state = unsafe {
            let mut res: c_int = 0;

            ioctl(fp.as_raw_fd(), DKIOCGETWCE, &mut res as *mut c_int as _)
                .ok()
                .map(|_| res != 0)
        };

        let can_free = unsafe {
            let mut res: c_int = 0;
            ioctl(fp.as_raw_fd(), DKIOC_CANFREE, &mut res as *mut c_int as _)
                .ok()
                .map(|_| res != 0)
                .unwrap_or(false)
        };

        let block_size = unsafe {
            let mut info = dk_minfo_ext::default();
            ioctl(
                fp.as_raw_fd(),
                DKIOCGMEDIAINFOEXT,
                &mut info as *mut dk_minfo_ext as _,
            )
            .ok()
            .map(|_| info.dki_pbsize)
            .unwrap_or(DEFAULT_BLOCK_SIZE)
        };

        DiskInfo {
            wce_state,
            block_size,
            discard_mech: can_free.then_some(DiscardMech::DkiocFree),
        }
    }

    /// Attempt to set the Write-Cache-Enable state for a given open device
    ///
    /// On success, returns the state which was requested.
    pub(crate) fn set_wce(fp: &File, enabled: bool) -> Result<bool> {
        let mut flag: c_int = enabled.into();
        unsafe {
            ioctl(fp.as_raw_fd(), DKIOCSETWCE, &mut flag as *mut c_int as _)
                .map(|_| enabled)
        }
    }

    /// Discard (free) `len` bytes starting at byte offset `off` of `fp`,
    /// using the mechanism selected by `mech`.
    pub(crate) fn do_discard(
        fp: &File,
        mech: DiscardMech,
        off: u64,
        len: u64,
    ) -> Result<()> {
        match mech {
            DiscardMech::DkiocFree => {
                // A free list carrying exactly one extent
                let mut req = dkioc_free_list {
                    dfl_flags: 0,
                    dfl_num_exts: 1,
                    dfl_offset: 0,
                    dfl_exts: [dkioc_free_list_ext {
                        dfle_start: off,
                        dfle_length: len,
                    }],
                };
                unsafe {
                    ioctl(
                        fp.as_raw_fd(),
                        DKIOCFREE,
                        &mut req as *mut dkioc_free_list as _,
                    )?;
                };
                Ok(())
            }
            DiscardMech::FnctlFreesp => {
                // If the target platform doesn't define F_WRLCK to be a type
                // equivalent to i16, we have to cast it. But if it's already an
                // i16 the cast is linted for being pointless. cfg() it to only
                // exist when needed.
                #[cfg(target_os = "linux")]
                let l_type = libc::F_WRLCK as i16;
                #[cfg(not(target_os = "linux"))]
                let l_type = libc::F_WRLCK;

                let mut fl = libc::flock {
                    l_type,
                    l_whence: 0,
                    l_start: off as i64,
                    l_len: len as i64,
                    // Ugly hack to zero out struct members we do not care about
                    ..unsafe { std::mem::MaybeUninit::zeroed().assume_init() }
                };

                // Make this buildable on non-illumos, despite the F_FREESP
                // fcntl command being unavailable elsewhere.
                #[cfg(target_os = "illumos")]
                let fcntl_cmd = libc::F_FREESP;
                #[cfg(not(target_os = "illumos"))]
                let fcntl_cmd = -1;

                let res = unsafe {
                    libc::fcntl(
                        fp.as_raw_fd(),
                        fcntl_cmd,
                        &mut fl as *mut libc::flock as *mut libc::c_void,
                    )
                };
                // Any non-zero result is reported using the current OS error
                if res != 0 {
                    Err(std::io::Error::last_os_error())
                } else {
                    Ok(())
                }
            }
        }
    }

    /// A single extent (start and length) within a [`dkioc_free_list`]
    #[derive(Copy, Clone, Default)]
    #[repr(C)]
    struct dkioc_free_list_ext {
        dfle_start: u64,
        dfle_length: u64,
    }

    /// Argument passed to `ioctl(DKIOCFREE)`, sized here to hold one extent
    #[derive(Copy, Clone, Default)]
    #[repr(C)]
    struct dkioc_free_list {
        dfl_flags: u64,
        dfl_num_exts: u64,
        dfl_offset: u64,
        dfl_exts: [dkioc_free_list_ext; 1],
    }

    /// Argument passed to `ioctl(DKIOCGMEDIAINFOEXT)`.  Only `dki_pbsize` is
    /// consumed by this module.
    #[derive(Copy, Clone, Default)]
    #[repr(C)]
    struct dk_minfo_ext {
        dki_media_type: c_uint,
        dki_lbsize: c_uint,
        dki_capacity: c_longlong,
        dki_pbsize: c_uint,
    }
}


================================================
FILE: lib/propolis/src/block/id.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Runtime identifiers for block devices and their backends.
//!
//! Devices that support block backends and the block backends themselves are
//! identified independently, and the two kinds of identifier only become
//! related when an item of each type is connected via [`block::attach`].
//!
//! Devices in particular may have multiple identifiers, some from this module
//! and some from others. As one example, [`propolis::hw::nvme::NvmeCtrl`] has a
//! `device_id` distinguishing *instances of the NVMe controller* across a VM,
//! while the `PciNvme` which has an NVMe controller also has `block_attach`
//! with a `device_id` distinguishing *instances of block devices* across a VM.
//!
//! ## Limitations
//!
//! A consumer of `propolis` is free to construct devices supporting block
//! backends in any order, and may construct the block backends themselves in
//! a different, equally arbitrary order. Attaching the two kinds of item
//! together is also up to the consumer of `propolis`, and there is no
//! requirement that a particular block backend be connected to a particular
//! device.
//!
//! Consequently, these identifiers are not stable for use in migration of a VM,
//! and must not be used in a way visible to a VM. They are unsuitable for
//! emulated device serial numbers, model numbers, etc. The destination
//! `propolis` may construct the same set of devices in a different order,
//! resulting in different run-time identifiers for a device at the same
//! location.

use crate::util::id::define_id;

define_id! {
    /// Run-time identifier for a device which supports block backends.
    ///
    /// Numbering across block devices means that a block `DeviceId` and the
    /// queue ID in a block attachment are unique across a VM.
    #[derive(Copy, Clone)]
    pub struct DeviceId(pub(crate) u32);
}

define_id! {
    /// Run-time identifier for a block backend.
    ///
    /// Block backends are numbered distinctly across a VM, but may not
    /// be created in the same order as devices. The `block_attach` probe fires
    /// when a `DeviceId` and `BackendId` become associated.
    #[derive(Copy, Clone)]
    pub struct BackendId(pub(crate) u32);
}


================================================
FILE: lib/propolis/src/block/in_memory.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::io::{Error, ErrorKind, Result};
use std::num::NonZeroUsize;
use std::sync::{Arc, Mutex};

use crate::block;
use crate::common::Lifecycle;
use crate::migrate::{
    MigrateCtx, MigrateSingle, MigrateStateError, Migrator, PayloadOffer,
    PayloadOutput,
};
use crate::tasks::ThreadGroup;
use crate::vmm::{MemCtx, SubMapping};

use anyhow::Context;

/// Block backend whose contents are held in a byte vector in memory and
/// serviced by a group of worker threads.
pub struct InMemoryBackend {
    /// State shared with every worker thread
    shared_state: Arc<SharedState>,
    /// Attachment from which workers obtain the requests they process
    block_attach: block::BackendAttachment,

    /// The spawned worker threads, joined when the backend is stopped
    workers: ThreadGroup,
}
/// State shared between an [`InMemoryBackend`] and its worker threads.
struct SharedState {
    /// Contents of the device, guarded so workers access it one at a time
    bytes: Mutex<Vec<u8>>,
    /// Properties of the backend (block size, size, read-only, etc.)
    info: block::DeviceInfo,
}
impl SharedState {
    /// Worker loop: takes requests from `wctx` until none are offered,
    /// completing each with the outcome of processing it.
    fn processing_loop(&self, wctx: block::SyncWorkerCtx) {
        while let Some(dreq) = wctx.block_for_req() {
            let req = dreq.req();
            if self.info.read_only && req.op.is_write() {
                dreq.complete(block::Result::ReadOnly);
                continue;
            }
            if req.op.is_discard() {
                // Punt on discard support
                dreq.complete(block::Result::Unsupported);
                continue;
            }

            // Success requires both access to guest memory and an error-free
            // trip through process_request(); anything else is a failure.
            let outcome = match wctx.acc_mem().access() {
                Some(mem) if self.process_request(&req, &mem).is_ok() => {
                    block::Result::Success
                }
                _ => block::Result::Failure,
            };

            dreq.complete(outcome);
        }
    }

    /// Services a single read, write, or flush against the in-memory bytes.
    ///
    /// Discards must be filtered out by the caller.
    fn process_request(
        &self,
        req: &block::Request,
        mem: &MemCtx,
    ) -> Result<()> {
        let bad_region = || Error::new(ErrorKind::Other, "bad guest region");

        match req.op {
            block::Operation::Read(off, len) => {
                let maps = req.mappings(mem).ok_or_else(bad_region)?;

                process_read_request(
                    &self.bytes.lock().unwrap(),
                    off as u64,
                    len,
                    &maps,
                )?;
            }
            block::Operation::Write(off, len) => {
                if self.info.read_only {
                    return Err(Error::new(
                        ErrorKind::PermissionDenied,
                        "backend is read-only",
                    ));
                }

                let maps = req.mappings(mem).ok_or_else(bad_region)?;

                process_write_request(
                    &mut self.bytes.lock().unwrap(),
                    off as u64,
                    len,
                    &maps,
                )?;
            }
            block::Operation::Flush => {
                // nothing to do
            }
            block::Operation::Discard => {
                unreachable!("handled in processing_loop()");
            }
        }

        Ok(())
    }
}

impl InMemoryBackend {
    pub fn create(
        bytes: Vec<u8>,
        opts: block::BackendOpts,
        worker_count: NonZeroUsize,
    ) -> Result<Arc<Self>> {
        let block_size = opts.block_size.unwrap_or(block::DEFAULT_BLOCK_SIZE);

        let len = bytes.len();
        if len == 0 {
            return Err(Error::new(ErrorKind::Other, "size cannot be 0"));
        } else if !len.is_multiple_of(block_size as usize) {
            return Err(Error::new(
                ErrorKind::Other,
                format!(
                    "size {} not multiple of block size {}!",
                    len, block_size,
                ),
            ));
        }

        let info = block::DeviceInfo {
            block_size,
            total_size: len as u64 / u64::from(block_size),
            read_only: opts.read_only.unwrap_or(false),
            supports_discard: false,
        };
        let bytes = Mutex::new(bytes);
        let block_attach = block::BackendAttachment::new(worker_count, info);

        Ok(Arc::new(Self {
            shared_state: Arc::new(SharedState { bytes, info }),
            block_attach,

            workers: ThreadGroup::new(),
        }))
    }
    fn spawn_workers(&self) -> Result<()> {
        let count = self.block_attach.max_workers().get();
        let spawn_results = (0..count).map(|n| {
            let shared_state = self.shared_state.clone();
            let wctx = self.block_attach.worker(n);
            std::thread::Builder::new()
                .name(format!("in-memory worker {n}"))
                .spawn(move || {
                    let wctx = wctx
                        .activate_sync()
                        .expect("worker slot is uncontended");
                    shared_state.processing_loop(wctx);
                })
        });

        self.workers.extend(spawn_results.into_iter())
    }
}

#[async_trait::async_trait]
impl block::Backend for InMemoryBackend {
    /// Attachment through which this backend is connected to a device.
    fn attachment(&self) -> &block::BackendAttachment {
        &self.block_attach
    }

    /// Starts the attachment and spawns the worker threads.
    ///
    /// If spawning fails, the attachment is stopped again and any workers
    /// which did start are joined before the error is returned.
    async fn start(&self) -> anyhow::Result<()> {
        self.block_attach.start();
        match self.spawn_workers() {
            Ok(()) => Ok(()),
            Err(spawn_err) => {
                self.block_attach.stop();
                self.workers.block_until_joined();
                Err(spawn_err).context("failure while spawning workers")
            }
        }
    }

    /// Stops the attachment and waits for every worker thread to exit.
    async fn stop(&self) -> () {
        self.block_attach.stop();
        self.workers.block_until_joined();
    }

    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}

/// Read from bytes into guest memory
///
/// Copies the `len` bytes starting at `offset` within `bytes` out to the
/// guest buffers described by `mappings`, filling them in order.
///
/// # Errors
///
/// Returns [`ErrorKind::InvalidInput`] if the requested range does not lie
/// within `bytes` (including when `offset + len` would overflow), or if the
/// mappings cover more than `len` bytes.  Errors from writing to a mapping
/// are propagated.
fn process_read_request(
    bytes: &[u8],
    offset: u64,
    len: usize,
    mappings: &[SubMapping],
) -> Result<()> {
    // Use checked arithmetic so that an absurd offset/len pair is reported as
    // an error rather than overflowing.
    let range = usize::try_from(offset)
        .ok()
        .and_then(|start| Some(start..start.checked_add(len)?))
        .filter(|r| r.start < bytes.len() && r.end <= bytes.len());
    let Some(range) = range else {
        return Err(std::io::Error::new(
            ErrorKind::InvalidInput,
            format!(
                "invalid offset {} and len {} when bytes len is {}",
                offset,
                len,
                bytes.len(),
            ),
        ));
    };

    let data = &bytes[range];

    let mut nwritten = 0usize;
    for mapping in mappings {
        // Mappings which extend beyond the requested length are an error,
        // not a reason to panic on an out-of-range slice.
        let chunk = nwritten
            .checked_add(mapping.len())
            .and_then(|chunk_end| data.get(nwritten..chunk_end))
            .ok_or_else(|| {
                std::io::Error::new(
                    ErrorKind::InvalidInput,
                    "guest mappings exceed request length",
                )
            })?;
        nwritten += mapping.write_bytes(chunk)?;
    }

    Ok(())
}

/// Write from guest memory into bytes
///
/// Fills the `len` bytes starting at `offset` within `bytes` from the guest
/// buffers described by `mappings`, consuming them in order.
///
/// # Errors
///
/// Returns [`ErrorKind::InvalidInput`] if the requested range does not lie
/// within `bytes` (including when `offset + len` would overflow), or if the
/// mappings cover more than `len` bytes.  Errors from reading a mapping are
/// propagated.
fn process_write_request(
    bytes: &mut [u8],
    offset: u64,
    len: usize,
    mappings: &[SubMapping],
) -> Result<()> {
    // Use checked arithmetic so that an absurd offset/len pair is reported as
    // an error rather than overflowing.
    let range = usize::try_from(offset)
        .ok()
        .and_then(|start| Some(start..start.checked_add(len)?))
        .filter(|r| r.start < bytes.len() && r.end <= bytes.len());
    let Some(range) = range else {
        return Err(std::io::Error::new(
            ErrorKind::InvalidInput,
            format!(
                "invalid offset {} and len {} when bytes len is {}",
                offset,
                len,
                bytes.len(),
            ),
        ));
    };

    let data = &mut bytes[range];

    let mut nread = 0usize;
    for mapping in mappings {
        // Mappings which extend beyond the requested length are an error,
        // not a reason to panic on an out-of-range slice.
        let chunk = nread
            .checked_add(mapping.len())
            .and_then(|chunk_end| data.get_mut(nread..chunk_end))
            .ok_or_else(|| {
                std::io::Error::new(
                    ErrorKind::InvalidInput,
                    "guest mappings exceed request length",
                )
            })?;
        nread += mapping.read_bytes(chunk)?;
    }

    Ok(())
}

impl Lifecycle for InMemoryBackend {
    /// Name under which this backend type is reported
    fn type_name(&self) -> &'static str {
        "in-memory-storage"
    }

    /// The backend migrates as a single payload; see the `MigrateSingle`
    /// implementation for this type.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Single(self)
    }
}

impl MigrateSingle for InMemoryBackend {
    /// Exports a copy of the backend's entire contents.
    fn export(
        &self,
        _ctx: &MigrateCtx,
    ) -> std::result::Result<PayloadOutput, MigrateStateError> {
        let contents = self.shared_state.bytes.lock().unwrap();
        let payload =
            migrate::InMemoryBlockBackendV1 { bytes: contents.clone() };
        Ok(payload.into())
    }

    /// Replaces the backend's contents with the offered payload.
    ///
    /// The import is refused if the payload's length differs from the length
    /// the backend currently has.
    fn import(
        &self,
        mut offer: PayloadOffer,
        _ctx: &MigrateCtx,
    ) -> std::result::Result<(), MigrateStateError> {
        let incoming: migrate::InMemoryBlockBackendV1 = offer.parse()?;
        let mut current = self.shared_state.bytes.lock().unwrap();

        let (incoming_len, current_len) = (incoming.bytes.len(), current.len());
        if incoming_len == current_len {
            *current = incoming.bytes;
            return Ok(());
        }

        Err(MigrateStateError::ImportFailed(format!(
            "imported in-memory block backend data has length {}, \
                        but backend's original length was {}",
            incoming_len, current_len
        )))
    }
}

/// Migration payload definitions for the in-memory block backend.
mod migrate {
    use serde::{Deserialize, Serialize};

    use crate::migrate::{Schema, SchemaId};

    /// Version 1 of the migration payload: the backend's entire contents.
    #[derive(Serialize, Deserialize)]
    pub struct InMemoryBlockBackendV1 {
        pub(super) bytes: Vec<u8>,
    }

    impl std::fmt::Debug for InMemoryBlockBackendV1 {
        /// Formats the payload without exposing the contents of `bytes`.
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            // A string slice formats identically to a `String` under `Debug`,
            // so there is no need to allocate one just for the placeholder.
            f.debug_struct("InMemoryBlockBackendV1")
                .field("bytes", &"<redacted>")
                .finish()
        }
    }

    impl Schema<'_> for InMemoryBlockBackendV1 {
        fn id() -> SchemaId {
            ("in-memory-block-backend", 1)
        }
    }
}


================================================
FILE: lib/propolis/src/block/mem_async.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::io::{Error, ErrorKind, Result};
use std::num::NonZeroUsize;
use std::ptr::NonNull;
use std::sync::Arc;

use crate::block;
use crate::tasks::TaskGroup;
use crate::vmm::MemCtx;

/// Block device backend which uses anonymous memory as its storage.
///
/// While not useful for actually storing data beyond the life of an instance,
/// this backend can be used for measuring how other parts of the emulation
/// stack perform.
pub struct MemAsyncBackend {
    /// State shared with every worker task
    shared_state: Arc<SharedState>,
    /// Attachment from which workers obtain the requests they process
    block_attach: block::BackendAttachment,

    /// The spawned worker tasks, joined when the backend is stopped
    workers: TaskGroup,
}
/// State shared between a [`MemAsyncBackend`] and its worker tasks.
struct SharedState {
    /// Anonymous memory mapping holding the contents of the device
    seg: MmapSeg,
    /// Properties of the backend (block size, size, read-only, etc.)
    info: block::DeviceInfo,
}
impl SharedState {
    /// Worker loop: takes requests from `wctx` until none are offered,
    /// completing each with the outcome of processing it.
    async fn processing_loop(&self, wctx: block::AsyncWorkerCtx) {
        while let Some(dreq) = wctx.wait_for_req().await {
            let req = dreq.req();
            if self.info.read_only && req.op.is_write() {
                dreq.complete(block::Result::ReadOnly);
                continue;
            }
            if req.op.is_discard() {
                dreq.complete(block::Result::Unsupported);
                continue;
            }

            let res = match wctx
                .acc_mem()
                .access()
                .and_then(|mem| self.process_request(&req, &mem).ok())
            {
                Some(_) => block::Result::Success,
                None => block::Result::Failure,
            };
            dreq.complete(res);
        }
    }

    /// Services a single read, write, or flush against the memory segment.
    ///
    /// Reads and writes walk the guest regions of the request in order,
    /// copying each to or from the segment at the running offset.  Discards
    /// must be filtered out by the caller.
    fn process_request(
        &self,
        req: &block::Request,
        mem: &MemCtx,
    ) -> std::result::Result<(), &'static str> {
        let seg = &self.seg;
        match req.op {
            block::Operation::Read(off, _len) => {
                req.regions
                    .iter()
                    .try_fold(0usize, |nread, region| {
                        let map = mem.writable_region(region)?;
                        // Fail the request, rather than overflow, if the
                        // running offset cannot be represented.
                        let seg_off = off.checked_add(nread)?;
                        unsafe {
                            let read_ptr = map.raw_writable()?;
                            let len = map.len();
                            seg.read(seg_off, read_ptr, len)
                                .then_some(nread + len)
                        }
                    })
                    .ok_or("read failure")?;
            }
            block::Operation::Write(off, _len) => {
                req.regions
                    .iter()
                    .try_fold(0usize, |nwritten, region| {
                        let map = mem.readable_region(region)?;
                        // Fail the request, rather than overflow, if the
                        // running offset cannot be represented.
                        let seg_off = off.checked_add(nwritten)?;
                        unsafe {
                            let write_ptr = map.raw_readable()?;
                            let len = map.len();
                            seg.write(seg_off, write_ptr, len)
                                .then_some(nwritten + len)
                        }
                    })
                    .ok_or("write failure")?;
            }
            block::Operation::Flush => {
                // nothing to do
            }
            block::Operation::Discard => {
                unreachable!("handled in processing_loop()")
            }
        }

        Ok(())
    }
}

impl MemAsyncBackend {
    pub fn create(
        size: u64,
        opts: block::BackendOpts,
        worker_count: NonZeroUsize,
    ) -> Result<Arc<Self>> {
        let block_size = opts.block_size.unwrap_or(block::DEFAULT_BLOCK_SIZE);

        if size == 0 {
            return Err(Error::new(ErrorKind::Other, "size cannot be 0"));
        } else if !size.is_multiple_of(u64::from(block_size)) {
            return Err(Error::new(
                ErrorKind::Other,
                format!(
                    "size {} not multiple of block size {}!",
                    size, block_size,
                ),
            ));
        }

        let info = block::DeviceInfo {
            block_size,
            total_size: size / u64::from(block_size),
            read_only: opts.read_only.unwrap_or(false),
            supports_discard: false,
        };
        let seg = MmapSeg::new(size as usize)?;
        let block_attach = block::BackendAttachment::new(worker_count, info);

        Ok(Arc::new(Self {
            shared_state: Arc::new(SharedState { info, seg }),
            block_attach,

            workers: TaskGroup::new(),
        }))
    }

    fn spawn_workers(&self) {
        let count = self.block_attach.max_workers().get();
        self.workers.extend((0..count).map(|n| {
            let shared_state = self.shared_state.clone();
            let wctx = self.block_attach.worker(n);
            tokio::spawn(async move {
                let wctx =
                    wctx.activate_async().expect("worker slot is uncontended");
                shared_state.processing_loop(wctx).await
            })
        }))
    }
}

#[async_trait::async_trait]
impl block::Backend for MemAsyncBackend {
    /// Attachment through which this backend is connected to a device.
    fn attachment(&self) -> &block::BackendAttachment {
        &self.block_attach
    }
    /// Starts the attachment and spawns the worker tasks.
    async fn start(&self) -> anyhow::Result<()> {
        self.block_attach.start();
        self.spawn_workers();
        Ok(())
    }
    /// Stops the attachment and waits for every worker task to finish.
    async fn stop(&self) -> () {
        self.block_attach.stop();
        self.workers.join_all().await;
    }
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
}

/// An anonymous, private, read/write memory mapping of fixed length.
///
/// Field `0` is the base address of the mapping and field `1` is its length in
/// bytes.  The mapping is released when the `MmapSeg` is dropped.
struct MmapSeg(NonNull<u8>, usize);
impl MmapSeg {
    /// Map `size` bytes of anonymous memory.
    ///
    /// # Errors
    ///
    /// Returns the OS error if the `mmap` call fails.
    fn new(size: usize) -> Result<Self> {
        // SAFETY: a fresh anonymous mapping is requested at a kernel-chosen
        // address (null hint, no fd), so no existing memory is affected.
        let ptr = unsafe {
            libc::mmap(
                core::ptr::null_mut(),
                size,
                libc::PROT_READ | libc::PROT_WRITE,
                libc::MAP_PRIVATE | libc::MAP_ANON,
                -1,
                0,
            )
        };

        if ptr == libc::MAP_FAILED {
            return Err(Error::last_os_error());
        }
        Ok(Self(NonNull::new(ptr as *mut u8).unwrap(), size))
    }

    /// Does the byte range `[off, off + sz)` lie entirely within the mapping?
    ///
    /// The end of the range is computed with overflow checking: a wrapped sum
    /// must never be mistaken for an in-bounds access.
    fn in_bounds(&self, off: usize, sz: usize) -> bool {
        off.checked_add(sz).is_some_and(|end| end <= self.1)
    }

    /// Copy `sz` bytes from `data` into the mapping at offset `off`.
    ///
    /// Returns `false` (copying nothing) if the range falls outside of the
    /// mapping.
    ///
    /// # Safety
    ///
    /// `data` must be valid for reads of `sz` bytes and must not overlap the
    /// destination range.  The caller is responsible for avoiding data races
    /// on the mapped bytes.
    unsafe fn write(&self, off: usize, data: *const u8, sz: usize) -> bool {
        if !self.in_bounds(off, sz) {
            return false;
        }

        self.0.as_ptr().add(off).copy_from_nonoverlapping(data, sz);
        true
    }

    /// Copy `sz` bytes from the mapping at offset `off` out to `data`.
    ///
    /// Returns `false` (copying nothing) if the range falls outside of the
    /// mapping.
    ///
    /// # Safety
    ///
    /// `data` must be valid for writes of `sz` bytes and must not overlap the
    /// source range.  The caller is responsible for avoiding data races on the
    /// mapped bytes.
    unsafe fn read(&self, off: usize, data: *mut u8, sz: usize) -> bool {
        if !self.in_bounds(off, sz) {
            return false;
        }

        self.0.as_ptr().add(off).copy_to_nonoverlapping(data, sz);
        true
    }
}
// Release the mapping when the segment goes away.
impl Drop for MmapSeg {
    fn drop(&mut self) {
        // SAFETY: `self.0` and `self.1` are the base address and length of the
        // mapping established in `MmapSeg::new`, and nothing else unmaps it.
        // The return value is ignored: there is no way to report a failure
        // from `drop`.
        unsafe {
            libc::munmap(self.0.as_ptr() as *mut libc::c_void, self.1);
        }
    }
}
// Safety: The consumer is allowed to make their own pointer mistakes.
//
// `MmapSeg` holds a raw pointer (hence no automatic `Send`/`Sync`), but the
// mapped bytes are only touched through its `unsafe` `read()`/`write()`
// methods, so responsibility for avoiding data races rests with the callers of
// those methods.
unsafe impl Send for MmapSeg {}
unsafe impl Sync for MmapSeg {}


================================================
FILE: lib/propolis/src/block/minder.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Mechanisms required to implement a block device

use std::any::Any;
use std::borrow::Borrow;
use std::collections::BTreeMap;
use std::future::Future;
use std::pin::Pin;
use std::sync::{Arc, Mutex, Weak};
use std::task::{Context, Poll};
use std::time::Instant;

use pin_project_lite::pin_project;
use tokio::sync::futures::Notified;
use tokio::sync::Notify;

use crate::block::attachment::Bitmap;
use crate::block::{self, devq_id, probes, Operation, Request};
use crate::block::{DeviceId, MetricConsumer, QueueId, WorkerId};

/// Each emulated block device will have one or more [DeviceQueue]s which can be
/// polled through [next_req()](DeviceQueue::next_req()) to emit IO requests.
/// The completions for those requests are then processed through
/// [complete()](DeviceQueue::complete()) calls.
pub trait DeviceQueue: Send + Sync + 'static {
    /// Requests emitted from a [DeviceQueue] may require some associated state
    /// in order to communicate their completion to the guest.  The `Token` type
    /// represents that state.
    type Token: Send + Sync + 'static;

    /// Get the next [Request] (if any) from this queue.  Supporting data
    /// included with the request consists of the necessary [Self::Token] as
    /// well as an optional [queued-time](Instant).
    fn next_req(&self) -> Option<(Request, Self::Token, Option<Instant>)>;

    /// Emit a completion for a processed request, identified by its
    /// [token](Self::Token).
    fn complete(
        &self,
        op: block::Operation,
        result: block::Result,
        token: Self::Token,
    );

    /// Explicitly abandon a queue token, never to be used for an I/O
    /// completion.
    ///
    /// A token's typical lifecycle is to be produced by
    /// [`DeviceQueue::next_req`], operated on, and completed with a result by
    /// [`DeviceQueue::complete`]. If the device's queues are dissociated, such
    /// as by a reset of the device, we may want to shortcut this lifecycle and
    /// destroy the token immediately.
    ///
    /// `DeviceQueue` implementations may use `Token`s that panic on `Drop`, to
    /// flag erroneous discards of request tokens without completing them.
    /// `abandon`, instead, is an escape hatch for the case where one genuinely
    /// must discard an I/O token without fulfilling the operation.
    fn abandon(&self, token: Self::Token);
}

/// An IO [Request] paired with the bookkeeping needed to report its completion
/// back to the [queue](DeviceQueue) which emitted it.
///
/// Dropping a `DeviceRequest` without first calling
/// [complete()](DeviceRequest::complete()) will panic.
pub struct DeviceRequest {
    req: Request,
    id: ReqId,
    source: Weak<QueueMinder>,
    _nodrop: NoDropDevReq,
}
impl DeviceRequest {
    /// Wrap `req`, tagging it with its `id` and a weak reference to the
    /// minder (`source`) which will receive its completion.
    fn new(id: ReqId, req: Request, source: Weak<QueueMinder>) -> Self {
        let _nodrop = NoDropDevReq;
        Self { req, id, source, _nodrop }
    }

    /// Borrow the wrapped block [Request].
    pub fn req(&self) -> &Request {
        &self.req
    }

    /// Report `result` as the outcome of this [Request].
    ///
    /// If the originating minder no longer exists, the completion is
    /// silently discarded.
    pub fn complete(self, result: super::Result) {
        let Self { id, source, _nodrop: guard, .. } = self;
        // Defuse the drop-guard: this request is being completed properly.
        std::mem::forget(guard);

        let Some(minder) = source.upgrade() else {
            return;
        };
        minder.complete(id, result);
    }
}

/// Marker struct to ensure that [DeviceRequest] consumers call
/// [complete()](DeviceRequest::complete()), rather than silently dropping it.
///
/// Its `Drop` implementation panics unconditionally, so the only way to
/// dispose of it without a panic is to `std::mem::forget` it, as
/// [DeviceRequest::complete()] does.
struct NoDropDevReq;
impl Drop for NoDropDevReq {
    fn drop(&mut self) {
        panic!("DeviceRequest should be complete()-ed before drop");
    }
}

/// Closure to permit [QueueMinder] to type-erase the calling of
/// [DeviceQueue::next_req()].
///
/// The queue's [DeviceQueue::Token] is returned boxed as `dyn Any`, to be
/// downcast back to its concrete type when it is completed or abandoned.
type NextReqFn = Box<
    dyn Fn() -> Option<(
            block::Request,
            Box<dyn Any + Send + Sync>,
            Option<Instant>,
        )> + Send
        + Sync,
>;

/// Closure to permit [QueueMinder] to type-erase the calling of
/// [DeviceQueue::complete()].
///
/// The boxed token must be one previously produced by the matching
/// [NextReqFn].
type CompleteReqFn = Box<
    dyn Fn(Operation, block::Result, Box<dyn Any + Send + Sync>) + Send + Sync,
>;

/// Closure to permit [QueueMinder] to type-erase the calling of
/// [DeviceQueue::abandon()].
///
/// The boxed token must be one previously produced by the matching
/// [NextReqFn].
type AbandonReqFn = Box<dyn Fn(Box<dyn Any + Send + Sync>) + Send + Sync>;

/// State retained by [QueueMinder] for each request which has been emitted to
/// a worker but not yet completed.
struct QmEntry {
    /// Type-erased [DeviceQueue::Token] for the request
    token: Box<dyn Any + Send + Sync>,
    /// Operation of the request, kept for probes and metrics at completion
    op: Operation,
    /// When the request was queued, as reported by the device queue.  Falls
    /// back to `when_started` if the queue reported no such time.
    when_queued: Instant,
    /// When the request was handed to a worker via [QueueMinder::next_req()]
    when_started: Instant,
}

/// Mutable state of a [QueueMinder], protected by its `state` mutex.
struct QmInner {
    /// ID to assign to the next request emitted from this queue
    next_id: ReqId,
    /// Map of [WorkerId]s which we emitted [None] to via
    /// [QueueMinder::next_req()] and which are likely candidates to notify when
    /// this queue has new entries.
    notify_workers: Bitmap,
    /// While set, [QueueMinder::next_req()] emits no requests
    paused: bool,
    /// Has this QueueMinder been destroyed? Since `QueueMinder` is typically in
    /// an Arc, `destroy` may be called while there are other references
    /// outstanding - concurrent completions that have just upgraded their
    /// weak refs, for example. When the minder has been "destroyed", those I/Os
    /// should gracefully abort.
    destroyed: bool,
    /// Requests which have been emitted to workers and not yet completed
    in_flight: BTreeMap<ReqId, QmEntry>,
    /// Consumer (if any) to be informed of each request completion
    metric_consumer: Option<Arc<dyn MetricConsumer>>,
    /// Number of [Request] completions which are currently being processed by
    /// the device.  This is tracked only for requests which are the last entry
    /// removed from `in_flight`, as a means of providing accurate results from
    /// [NoneInFlight].
    processing_last: usize,
}
impl Default for QmInner {
    fn default() -> Self {
        Self {
            next_id: ReqId::START,
            notify_workers: Bitmap::default(),
            paused: false,
            destroyed: false,
            processing_last: 0,
            in_flight: BTreeMap::new(),
            metric_consumer: None,
        }
    }
}

/// Tracks the requests emitted from a single [DeviceQueue] and routes their
/// completions back to it.
///
/// The concrete queue type is hidden behind the type-erased `*_fn` closures
/// built in [QueueMinder::new()].
pub(super) struct QueueMinder {
    pub queue_id: QueueId,
    pub device_id: DeviceId,
    /// Mutable tracking state (in-flight requests, pause/destroy flags, etc.)
    state: Mutex<QmInner>,
    /// Weak self-reference, handed to each emitted [DeviceRequest] so that it
    /// can find its way back here for completion
    self_ref: Weak<Self>,
    /// Used to wake [NoneInFlight] futures when relevant state changes
    notify: Notify,
    /// Type-erased wrapper function for [DeviceQueue::next_req()]
    next_req_fn: NextReqFn,
    /// Type-erased wrapper function for [DeviceQueue::complete()]
    complete_req_fn: CompleteReqFn,
    /// Type-erased wrapper function for [DeviceQueue::abandon()]
    abandon_req_fn: AbandonReqFn,
}

impl QueueMinder {
    pub fn destroy(self: Arc<Self>) {
        // Up-front, it would be nice to assert that we have the last strong ref
        // on this `QueueMinder`. We might not actually though: the controller
        // may be reset at the same time we're completing I/Os, and those
        // completions have upgraded their ref back to the minder.
        //
        // So, do *not* `assert_eq!(Arc::strong_count(&self), 1);`.

        let mut state = self.state.lock().unwrap();

        // A minder can only be destroyed once. To destroy it more than once
        // would imply it was dissociated from a queue a second time, and for
        // that to happen the destroyed minder would have had to be associated
        // to a queue. Nonsense!
        assert!(!state.destroyed);
        state.destroyed = true;

        if state.in_flight.len() > 0 {
            let old = std::mem::replace(&mut state.in_flight, BTreeMap::new());
            for (_, QmEntry { token, .. }) in old.into_iter() {
                (self.abandon_req_fn)(token);
            }
        }
        assert_eq!(state.in_flight.len(), 0);
    }

    pub fn new<DQ: DeviceQueue>(
        queue: Arc<DQ>,
        device_id: DeviceId,
        queue_id: QueueId,
    ) -> Arc<Self> {
        let device_queue_ref = queue.clone();
        let next_req_fn: NextReqFn = Box::new(move || {
            let (req, token, when_queued) = device_queue_ref.next_req()?;
            Some((
                req,
                Box::new(token) as Box<dyn Any + Send + Sync>,
                when_queued,
            ))
        });

        let device_queue_ref = queue.clone();
        let complete_req_fn: CompleteReqFn =
            Box::new(move |op, result, token| {
                let token = token
                    .downcast::<DQ::Token>()
                    .expect("token type unchanged");
                let token = *token;
                device_queue_ref.complete(op, result, token);
            });

        let abandon_req_fn: AbandonReqFn = Box::new(move |token| {
            let token =
                token.downcast::<DQ::Token>().expect("token type unchanged");
            let token = *token;
            queue.abandon(token);
        });

        Arc::new_cyclic(|self_ref| Self {
            queue_id,
            device_id,
            state: Mutex::new(QmInner::default()),
            self_ref: self_ref.clone(),
            notify: Notify::new(),
            next_req_fn,
            complete_req_fn,
            abandon_req_fn,
        })
    }

    /// Try to pull the next IO request from this queue on behalf of a worker.
    ///
    /// When nothing is emitted because the queue is paused or empty, the
    /// worker (`wid`) is remembered so that it can be notified once the queue
    /// may have requests available again.  A destroyed minder emits nothing
    /// and records nothing.
    pub fn next_req(&self, wid: WorkerId) -> Option<DeviceRequest> {
        let mut state = self.state.lock().unwrap();
        if state.destroyed {
            return None;
        }

        // A paused queue is not polled at all.
        let polled = if state.paused { None } else { (self.next_req_fn)() };
        let Some((req, token, when_queued)) = polled else {
            state.notify_workers.set(wid);
            return None;
        };

        let id = state.next_id;
        state.next_id.advance();

        let devqid = devq_id(self.device_id, self.queue_id);
        match req.op {
            Operation::Read(off, len) => {
                probes::block_begin_read!(|| {
                    (devqid, id, off as u64, len as u64)
                });
            }
            Operation::Write(off, len) => {
                probes::block_begin_write!(|| {
                    (devqid, id, off as u64, len as u64)
                });
            }
            Operation::Flush => {
                probes::block_begin_flush!(|| { (devqid, id) });
            }
            Operation::Discard => {
                probes::block_begin_discard!(|| {
                    (devqid, id, req.ranges.len() as u64)
                });
            }
        }

        let when_started = Instant::now();
        let entry = QmEntry {
            token,
            op: req.op,
            when_queued: when_queued.unwrap_or(when_started),
            when_started,
        };
        let prior = state.in_flight.insert(id, entry);
        assert!(prior.is_none(), "request IDs should not overlap");

        Some(DeviceRequest::new(id, req, self.self_ref.clone()))
    }

    /// Process a completion for an in-flight IO request on this queue.
    ///
    /// The request's entry is removed from `in_flight` under the state lock,
    /// but probes, the device completion callback, and metrics reporting all
    /// run with the lock released.  If the entry is already gone (which is
    /// only expected after [`QueueMinder::destroy`]), the completion is
    /// dropped.
    pub fn complete(&self, id: ReqId, result: block::Result) {
        let mut state = self.state.lock().unwrap();
        let Some(ent) = state.in_flight.remove(&id) else {
            // If we lost state for this I/O, we better have gotten here because
            // the controller was reset and dissociated all queues. In that case
            // we should have destroyed the `QueueMinder`s, so assert that is
            // the case.
            assert!(state.destroyed);

            // One must imagine the guest would be happy to know the I/O *was*
            // completed after all, but we can no longer do anything about it.
            // We don't even know when it started anymore, so we can't report
            // meaningful metrics about it.
            return;
        };
        let metric_consumer = state.metric_consumer.as_ref().map(Arc::clone);
        let is_last_req = state.in_flight.is_empty();
        if is_last_req {
            state.processing_last += 1;
        }
        // Release the lock before calling out to the device and consumers.
        drop(state);

        let when_done = Instant::now();
        let time_queued = ent.when_started.duration_since(ent.when_queued);
        let time_processed = when_done.duration_since(ent.when_started);

        let ns_queued = time_queued.as_nanos() as u64;
        let ns_processed = time_processed.as_nanos() as u64;
        let rescode = result as u8;
        let devqid = devq_id(self.device_id, self.queue_id);
        match ent.op {
            Operation::Read(..) => {
                probes::block_complete_read!(|| {
                    (devqid, id, rescode, ns_processed, ns_queued)
                });
            }
            Operation::Write(..) => {
                probes::block_complete_write!(|| {
                    (devqid, id, rescode, ns_processed, ns_queued)
                });
            }
            Operation::Flush => {
                probes::block_complete_flush!(|| {
                    (devqid, id, rescode, ns_processed, ns_queued)
                });
            }
            Operation::Discard => {
                probes::block_complete_discard!(|| {
                    (devqid, id, rescode, ns_processed, ns_queued)
                });
            }
        }

        (self.complete_req_fn)(ent.op, result, ent.token);

        probes::block_completion_sent!(|| {
            (devqid, id, when_done.elapsed().as_nanos() as u64)
        });

        // Report the completion to the metrics consumer, if one exists
        if let Some(consumer) = metric_consumer {
            consumer.request_completed(
                self.queue_id,
                ent.op,
                result,
                time_queued,
                time_processed,
            );
        }

        // We must track how many completions are being processed by the device,
        // since they are done outside the state lock, in order to present a
        // reliably accurate signal of when the device has no more in-flight
        // requests.
        //
        // NOTE(review): only the completion which emptied `in_flight` is
        // counted.  An earlier completion still running its (unlocked)
        // callback when a later one finishes here would not hold this signal
        // back - confirm that is acceptable to `NoneInFlight` consumers.
        if is_last_req {
            let mut state = self.state.lock().unwrap();
            state.processing_last -= 1;
            if state.in_flight.is_empty() && state.processing_last == 0 {
                self.notify.notify_waiters();
            }
        }
    }

    /// Take the bitmap of workers which should be told that this queue may
    /// now have requests available.
    ///
    /// Every bit in the returned map names a worker that must either be
    /// [`WorkerSlot::wake`]'d or handed back to this `QueueMinder` via
    /// [`add_notifications`].  Failure to do so will result in idle workers
    /// never being woken for future work.
    ///
    /// While the queue is paused nothing is handed out: `None` is returned
    /// and every worker is marked as wanting a notification instead.
    pub(in crate::block) fn take_notifications(&self) -> Option<Bitmap> {
        let mut state = self.state.lock().unwrap();
        if !state.paused {
            return Some(state.notify_workers.take());
        }
        state.notify_workers = Bitmap::ALL;
        None
    }

    /// Record additional workers to notify when this queue may have requests
    /// available.
    ///
    /// This should only be called with the remaining parts of a bitmap
    /// obtained from an earlier [`take_notifications`].  Using other bit
    /// patterns may result in wakeups to out-of-range worker IDs and
    /// subsequent panic.
    pub(in crate::block) fn add_notifications(&self, worker_ids: Bitmap) {
        self.state.lock().unwrap().notify_workers.set_all(worker_ids);
    }

    /// Attach a [MetricConsumer] to this queue, replacing any previous one.
    ///
    /// The consumer is informed of every IO completion as it occurs.
    pub(crate) fn set_metric_consumer(
        &self,
        consumer: Arc<dyn MetricConsumer>,
    ) {
        let mut state = self.state.lock().unwrap();
        state.metric_consumer = Some(consumer);
    }

    /// Mark this queue as paused.
    ///
    /// While paused, [`QueueMinder::next_req`] emits no requests.  Requests
    /// which are already in flight are unaffected.
    pub(crate) fn pause(&self) {
        let mut state = self.state.lock().unwrap();
        state.paused = true;
        // Wake any pending `NoneInFlight` futures so they re-check the state.
        self.notify.notify_waiters();
    }

    /// Clear the paused state set by [`QueueMinder::pause`], allowing
    /// [`QueueMinder::next_req`] to emit requests again.
    ///
    /// Workers recorded for notification are not woken here.
    pub(crate) fn resume(&self) {
        let mut state = self.state.lock().unwrap();
        state.paused = false;
        // Wake any pending `NoneInFlight` futures so they re-check the state.
        self.notify.notify_waiters();
    }

    /// Get a [NoneInFlight] future which resolves once this queue has no
    /// in-flight requests and no last-request completion still being
    /// processed.
    pub(crate) fn none_in_flight(&self) -> NoneInFlight<'_> {
        NoneInFlight { minder: self, wait: self.notify.notified() }
    }
}

pin_project! {
    /// A [Future] which resolves to [Ready](Poll::Ready) when there are no
    /// requests being processed by an attached backend.
    pub(crate) struct NoneInFlight<'a> {
        // Minder whose in-flight state is being observed
        minder: &'a QueueMinder,
        // Pending notification from the minder's `Notify`; replaced with a
        // fresh one each time it fires without the condition being met
        #[pin]
        wait: Notified<'a>
    }
}
impl Future for NoneInFlight<'_> {
    type Output = ();

    /// Resolve once the minder reports nothing in flight, otherwise park on
    /// the minder's `Notify` until its state changes.
    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
        let mut this = self.project();

        loop {
            // The minder `state` lock stays held for the whole iteration,
            // including the poll of the Notified instance below.  While it may
            // not be strictly necessary, it matches the conventions we expect
            // from similar sync primitives such as CVs.
            let state = this.minder.state.lock().unwrap();
            let idle =
                state.in_flight.is_empty() && state.processing_last == 0;
            if idle {
                return Poll::Ready(());
            }

            if Notified::poll(this.wait.as_mut(), cx).is_pending() {
                return Poll::Pending;
            }
            // The notification fired but the condition is not (yet) met:
            // refresh the fused future from Notify and check again.
            this.wait.set(this.minder.notify.notified());
        }
    }
}

/// Identifier which is unique to each block [Request] emitted by a queue.
#[derive(Copy, Clone, PartialEq, PartialOrd, Eq, Ord)]
pub struct ReqId(u64);
impl ReqId {
    /// First ID handed out.
    const START: Self = Self(0);

    /// Step to the next sequential ID.
    fn advance(&mut self) {
        let Self(raw) = self;
        *raw += 1;
    }
}
impl Borrow<u64> for ReqId {
    fn borrow(&self) -> &u64 {
        let Self(raw) = self;
        raw
    }
}
impl From<ReqId> for u64 {
    fn from(value: ReqId) -> Self {
        let ReqId(raw) = value;
        raw
    }
}


================================================
FILE: lib/propolis/src/block/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Implements an interface to virtualized block devices.

use std::time::Duration;

use crate::common::*;
use crate::vmm::{MemCtx, SubMapping};

mod id;
pub use id::{BackendId, DeviceId};

mod file;
pub use file::FileBackend;

#[cfg(feature = "crucible")]
mod crucible;
#[cfg(feature = "crucible")]
pub use self::crucible::CrucibleBackend;

mod in_memory;
pub use in_memory::InMemoryBackend;

mod mem_async;
pub use mem_async::MemAsyncBackend;

pub mod attachment;
pub mod minder;

pub use attachment::{
    attach, AsyncWorkerCtx, AttachError, BackendAttachment, DeviceAttachment,
    SyncWorkerCtx,
};
pub use minder::{DeviceQueue, DeviceRequest};

/// Offset (in bytes) into a block device
pub type ByteOffset = usize;
/// Length (in bytes) of a region of a block device
pub type ByteLen = usize;

/// When `block_size` is not specified in [BackendOpts], and the backend itself
/// is not choosing a block size, a default of 512B is used.
pub const DEFAULT_BLOCK_SIZE: u32 = 512;

// USDT probe definitions for the block subsystem.  The function bodies are
// intentionally empty: only the signatures matter, as they declare the
// arguments of each probe.
#[usdt::provider(provider = "propolis")]
mod probes {
    fn block_attach(dev_id: u32, backend_id: u32) {}
    fn block_detach(dev_id: u32, backend_id: u32) {}

    // Fired as a request of the given type is handed out to a worker.
    fn block_begin_read(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
    fn block_begin_write(devq_id: u64, req_id: u64, offset: u64, len: u64) {}
    fn block_begin_flush(devq_id: u64, req_id: u64) {}
    fn block_begin_discard(devq_id: u64, req_id: u64, nr: u64) {}

    // Fired as a request of the given type is completed, prior to the
    // completion being delivered to the device queue.  `proc_ns` is the time
    // spent processing the request and `queue_ns` the time it spent queued.
    fn block_complete_read(
        devq_id: u64,
        req_id: u64,
        result: u8,
        proc_ns: u64,
        queue_ns: u64,
    ) {
    }
    fn block_complete_write(
        devq_id: u64,
        req_id: u64,
        result: u8,
        proc_ns: u64,
        queue_ns: u64,
    ) {
    }
    fn block_complete_flush(
        devq_id: u64,
        req_id: u64,
        result: u8,
        proc_ns: u64,
        queue_ns: u64,
    ) {
    }
    fn block_complete_discard(
        devq_id: u64,
        req_id: u64,
        result: u8,
        proc_ns: u64,
        queue_ns: u64,
    ) {
    }

    // Fired after the completion has been delivered to the device queue.
    // `complete_ns` is the time which that delivery took.
    fn block_completion_sent(devq_id: u64, req_id: u64, complete_ns: u64) {}

    fn block_poll(devq_id: u64, worker_id: u64, emit_req: u8) {}
    fn block_sleep(dev_id: u32, worker_id: u64) {}
    fn block_wake(dev_id: u32, worker_id: u64) {}
    fn block_notify(devq_id: u64) {}
    fn block_strategy(dev_id: u32, strat: String, generation: u64) {}

    fn block_worker_collection_wake(wake_wids: u64, limit: usize) {}
    fn block_worker_collection_woken(remaining_wids: u64, num_woken: usize) {}
}

/// The kinds of operation which may be issued to a virtual block device.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum Operation {
    /// Read from `offset` for `len`
    Read(ByteOffset, ByteLen),
    /// Write to `offset` for `len`
    Write(ByteOffset, ByteLen),
    /// Flush buffer(s)
    Flush,
    /// Discard/UNMAP/deallocate some ranges, which are specified in
    /// Request::ranges
    Discard,
}
impl Operation {
    /// Is this a read?
    pub const fn is_read(&self) -> bool {
        match self {
            Self::Read(..) => true,
            Self::Write(..) | Self::Flush | Self::Discard => false,
        }
    }
    /// Is this a write?
    pub const fn is_write(&self) -> bool {
        match self {
            Self::Write(..) => true,
            Self::Read(..) | Self::Flush | Self::Discard => false,
        }
    }
    /// Is this a flush?
    pub const fn is_flush(&self) -> bool {
        match self {
            Self::Flush => true,
            Self::Read(..) | Self::Write(..) | Self::Discard => false,
        }
    }
    /// Is this a discard?
    pub const fn is_discard(&self) -> bool {
        match self {
            Self::Discard => true,
            Self::Read(..) | Self::Write(..) | Self::Flush => false,
        }
    }
}

/// Outcome of a block [`Request`]
#[derive(Copy, Clone, Debug)]
pub enum Result {
    /// Request succeeded
    Success = 0,
    /// Backend indicated failure for operation
    Failure,
    /// Underlying backend is read-only
    ReadOnly,
    /// Operation not supported by backend
    Unsupported,
}
impl Result {
    /// Is this anything other than [`Result::Success`]?
    pub const fn is_err(&self) -> bool {
        match self {
            Self::Success => false,
            Self::Failure | Self::ReadOnly | Self::Unsupported => true,
        }
    }
}

/// Identifier of a queue belonging to a block device.
#[derive(Copy, Clone, Eq, PartialEq, Debug, Default)]
pub struct QueueId(u8);
impl QueueId {
    /// Arbitrary limit for per-device queues.
    /// Sized to match [attachment::Bitmap] capacity
    pub const MAX_QUEUES: usize = 64;

    pub const MAX: Self = Self(Self::MAX_QUEUES as u8);

    /// Get the next sequential QueueId, wrapping back to zero once `max` is
    /// reached.
    ///
    /// Panics if `max` is zero or exceeds [`Self::MAX`].
    fn next(self, max: usize) -> Self {
        let max: u8 = max.try_into().expect("max should be in-range");
        assert!(max != 0 && max <= Self::MAX.0);

        match self.0.wrapping_add(1) {
            id if id < max => Self(id),
            _ => Self(0),
        }
    }
}
impl From<usize> for QueueId {
    /// Panics if `value` is not below [`QueueId::MAX_QUEUES`].
    fn from(value: usize) -> Self {
        assert!(value < Self::MAX_QUEUES);
        Self(value as u8)
    }
}
impl From<QueueId> for usize {
    fn from(value: QueueId) -> Self {
        let QueueId(raw) = value;
        usize::from(raw)
    }
}
impl From<u16> for QueueId {
    /// Panics if `value` is not below [`QueueId::MAX_QUEUES`].
    fn from(value: u16) -> Self {
        assert!(value < (Self::MAX_QUEUES as u16));
        Self(value as u8)
    }
}
impl From<QueueId> for u16 {
    fn from(value: QueueId) -> Self {
        let QueueId(raw) = value;
        u16::from(raw)
    }
}

/// Identifier of a backend worker which polls device queues for requests
pub type WorkerId = usize;

/// Pack a device ID and queue ID into one u64 for use in probes: the queue ID
/// occupies the low 8 bits, with the device ID shifted above it.
pub(crate) fn devq_id(dev: DeviceId, queue: QueueId) -> u64 {
    let dev_bits = (dev.0 as u64) << 8;
    let queue_bits = u64::from(queue.0);
    dev_bits | queue_bits
}

/// A request for an operation to be performed against a block device
#[derive(Clone)]
pub struct Request {
    /// The type of operation requested by the block device
    pub op: Operation,

    /// A list of regions of guest memory to read/write into as part of the I/O
    /// request
    pub regions: Vec<GuestRegion>,

    /// A list of byte ranges to discard as part of the I/O request.  This is only
    /// relevant for discard operations, and is expected to be empty otherwise.
    pub ranges: Vec<(ByteOffset, ByteLen)>,
}
impl Request {
    /// Build a read of `len` bytes at `off`, with `regions` as the guest
    /// memory which is to receive the data.
    pub fn new_read(
        off: ByteOffset,
        len: ByteLen,
        regions: Vec<GuestRegion>,
    ) -> Self {
        let op = Operation::Read(off, len);
        Self { op, regions, ranges: Vec::new() }
    }

    /// Build a write of `len` bytes at `off`, with `regions` as the guest
    /// memory which holds the data.
    pub fn new_write(
        off: ByteOffset,
        len: ByteLen,
        regions: Vec<GuestRegion>,
    ) -> Self {
        let op = Operation::Write(off, len);
        Self { op, regions, ranges: Vec::new() }
    }

    /// Build a flush request, which carries neither regions nor ranges.
    pub fn new_flush() -> Self {
        Self { op: Operation::Flush, regions: Vec::new(), ranges: Vec::new() }
    }

    /// Build a discard of the given byte `ranges`.
    pub fn new_discard(ranges: Vec<(ByteOffset, ByteLen)>) -> Self {
        Self { op: Operation::Discard, regions: Vec::new(), ranges }
    }

    /// Map the guest memory regions of this request through `mem`.
    ///
    /// Reads need writable mappings (data flows into the guest) while writes
    /// need readable ones.  Yields `None` if any region cannot be mapped, and
    /// for flush and discard requests.
    pub fn mappings<'a>(&self, mem: &'a MemCtx) -> Option<Vec<SubMapping<'a>>> {
        let regions = self.regions.iter();
        match self.op {
            Operation::Flush | Operation::Discard => None,
            Operation::Read(..) => {
                regions.map(|region| mem.writable_region(region)).collect()
            }
            Operation::Write(..) => {
                regions.map(|region| mem.readable_region(region)).collect()
            }
        }
    }
}

/// Metadata regarding a virtualized block device.
#[derive(Default, Debug, Copy, Clone)]
pub struct DeviceInfo {
    /// Size (in bytes) per block
    pub block_size: u32,
    /// Device size, expressed as a count of `block_size`-sized blocks (not
    /// bytes)
    pub total_size: u64,
    /// Is the device read-only
    pub read_only: bool,
    /// Does the device support discard/UNMAP
    pub supports_discard: bool,
}

/// Options which control the behavior of a block backend.
///
/// Values for omitted fields will be determined by the backend, likely by
/// querying the underlying resource.  If values provided conflict with said
/// resource, the backend may fail its initialization with an error.
#[derive(Default, Copy, Clone)]
pub struct BackendOpts {
    /// Size (in bytes) per block
    pub block_size: Option<u32>,

    /// Disallow writes (returning errors if attempted) and report a
    /// non-writable device (if frontend is capable)
    pub read_only: Option<bool>,

    /// Force flush requests to be skipped (turned into no-op)
    pub skip_flush: Option<bool>,
}

impl BackendOpts {
    /// Return `true` if and only if this backend is configured to be
    /// read-only.  An unspecified `read_only` option counts as writable.
    pub fn is_read_only(&self) -> bool {
        matches!(self.read_only, Some(true))
    }
}

/// Top-level trait for block devices (frontends) to translate guest block IO
/// requests into [Request]s for the attached [Backend]
pub trait Device: Send + Sync + 'static {
    /// Access to the [DeviceAttachment] representing this device.
    ///
    /// The returned reference borrows from the device itself.
    fn attachment(&self) -> &DeviceAttachment;
}

/// Top-level trait for block backends which will attach to [Device]s in order
/// to process [Request]s posted by the guest.
#[async_trait::async_trait]
pub trait Backend: Send + Sync + 'static {
    /// Access to the [BackendAttachment] representing this backend.
    fn attachment(&self) -> &BackendAttachment;

    /// Start attempting to process [Request]s from [Device] (if attached)
    ///
    /// Spawning of any tasks required to do such request processing can be done
    /// as part of this start-up.
    ///
    /// This operation will be invoked only once per backend (when its VM
    /// starts). Block backends are not explicitly resumed during VM lifecycle
    /// events; instead, their corresponding devices will stop issuing new
    /// requests while paused and resume issuing them when they are resumed.
    ///
    /// WARNING: The caller may abort VM startup and cancel the future created
    /// by this routine. In this case the caller may not call [`Self::stop()`]
    /// prior to dropping the backend. This routine is, however, guaranteed to
    /// be called before the VM's vCPUs are started.
    ///
    async fn start(&self) -> anyhow::Result<()>;

    /// Stop attempting to process new [Request]s from [Device] (if attached)
    ///
    /// Any in-flight processing of requests should be concluded before this
    /// call returns.
    ///
    /// If any tasks were spawned as part of [Backend::start()], they should be
    /// brought to rest as part of this call.
    ///
    /// This operation will be invoked only once per backend (when its VM
    /// stops). Block backends are not explicitly paused during VM lifecycle
    /// events; instead, their corresponding devices will stop issuing new
    /// requests when they are told to pause (and will only report they are
    /// fully paused when all their in-flight requests have completed).
    async fn stop(&self);

    /// Expose this backend as a [`std::any::Any`] reference.
    ///
    /// This allows a holder of a `dyn Backend` to attempt a downcast to the
    /// concrete backend type.  Implementations are expected to simply return
    /// `self`.
    fn as_any(&self) -> &dyn std::any::Any;
}

/// Consumer of per-[Request] metrics
pub trait MetricConsumer: Send + Sync + 'static {
    /// Called upon the completion of each block [Request] when a MetricConsumer
    /// has been set for a given [DeviceAttachment].
    ///
    /// - `queue_id`: queue from which the request was issued
    /// - `op`: operation which the request performed
    /// - `result`: outcome of the request
    /// - `time_queued`: how long the request waited before being picked up for
    ///   processing
    /// - `time_processed`: how long the request took to process once it was
    ///   picked up
    fn request_completed(
        &self,
        queue_id: QueueId,
        op: Operation,
        result: Result,
        time_queued: Duration,
        time_processed: Duration,
    );
}


================================================
FILE: lib/propolis/src/chardev/file_out.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::fs::File as FsFile;
use std::num::NonZeroUsize;
use std::sync::{Arc, Mutex};
use std::time::Duration;

use crate::chardev::pollers;
use crate::chardev::BlockingSource;

use tokio::fs::File;
use tokio::io::AsyncWriteExt;

/// Mutable state of a [BlockingFileOutput].
struct Inner {
    /// Output file, held here until `attach()` takes it (leaving `None`)
    fp: Option<FsFile>,
}

/// Writes the data read from an attached [BlockingSource] out to a file.
pub struct BlockingFileOutput {
    /// Buffer through which data from the attached source is read
    poller: Arc<pollers::BlockingSourceBuffer>,
    /// Holds the output file until a source is attached
    inner: Mutex<Inner>,
}

/// Size (in bytes) of both the poller buffer and the buffer used to copy data
/// out to the file.
const BUF_SIZE: usize = 256;

impl BlockingFileOutput {
    /// Create an output which will write to `fp` once a source is attached.
    ///
    /// The underlying poller uses a 10ms poll interval, a miss threshold of 5
    /// and a buffer of `BUF_SIZE` bytes.
    pub fn new(fp: FsFile) -> Arc<Self> {
        let poller =
            pollers::BlockingSourceBuffer::new(pollers::BlockingParams {
                poll_interval: Duration::from_millis(10),
                poll_miss_thresh: 5,
                buf_size: NonZeroUsize::new(BUF_SIZE).unwrap(),
            });
        let inner = Mutex::new(Inner { fp: Some(fp) });

        Arc::new(Self { poller, inner })
    }

    /// Register the poller as the consumer of `source` and spawn the task
    /// which writes the buffered bytes to the output file.
    ///
    /// # Panics
    ///
    /// Panics if the output file has already been taken, i.e. if `attach` is
    /// called more than once.
    pub fn attach(&self, source: Arc<dyn BlockingSource>) {
        // The guard is held for the remainder of this function.
        let mut state = self.inner.lock().unwrap();
        let file = state.fp.take().unwrap();

        self.poller.attach(source.as_ref());

        let buffer = Arc::clone(&self.poller);
        let _task = tokio::spawn(async move {
            // Conversion to an async file happens inside the task.
            let async_file = File::from_std(file);
            let _ = Self::run(buffer, async_file).await;
            todo!("get async task hdl");
        });
    }

    /// Writer loop: repeatedly read a chunk from `poller` and write all of it
    /// to `fp`.  This loop never exits normally.
    ///
    /// # Panics
    ///
    /// Panics if a write to the file fails.
    async fn run(poller: Arc<pollers::BlockingSourceBuffer>, mut fp: File) {
        let mut chunk = [0u8; BUF_SIZE];
        loop {
            let outcome = poller.read(&mut chunk).await;
            if let Some(len) = outcome {
                let pending = &chunk[..len];
                fp.write_all(pending).await.unwrap();
            }
        }
    }
}


================================================
FILE: lib/propolis/src/chardev/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Mutex;

mod file_out;
pub mod pollers;
mod sock;

pub use file_out::BlockingFileOutput;
pub use sock::UDSock;

/// Callback registered through [Sink::set_notifier]; it is handed the [Sink]
/// it was registered on.
pub type SinkNotifier = Box<dyn Fn(&dyn Sink) + Send + Sync + 'static>;
/// Callback registered through [Source::set_notifier]; it is handed the
/// [Source] it was registered on.
pub type SourceNotifier = Box<dyn Fn(&dyn Source) + Send + Sync + 'static>;
/// Callback registered through [BlockingSource::set_consumer]; it is handed a
/// slice of bytes to consume.
pub type BlockingSourceConsumer = Box<dyn Fn(&[u8]) + Send + Sync + 'static>;

/// Device which accepts bytes, one at a time.
pub trait Sink: Send + Sync + 'static {
    // XXX: make this slice based
    /// Attempt to write a single byte.  Callers in this crate treat a `true`
    /// result as "the byte was accepted" and `false` as "try again later".
    fn write(&self, data: u8) -> bool;

    /// Set notifier callback for when sink becomes writable.  If that callback
    /// acquires any exclusion resources (locks, etc), they must not be held
    /// while setting the notifier.
    fn set_notifier(&self, f: Option<SinkNotifier>);
}

/// Device which produces bytes, one at a time.
pub trait Source: Send + Sync + 'static {
    // XXX: make this slice based
    /// Attempt to read a single byte, yielding `None` when no byte is
    /// available.
    fn read(&self) -> Option<u8>;

    /// NOTE(review): presumably discards up to `count` pending bytes and
    /// returns how many were actually discarded -- confirm with implementors.
    fn discard(&self, count: usize) -> usize;
    /// Enable or disable automatic discarding.
    ///
    /// NOTE(review): presumably, while active, the source drops data that no
    /// one is reading -- confirm with implementors.
    fn set_autodiscard(&self, active: bool);
    /// Set notifier callback for when source becomes readable.  If that
    /// callback acquires any exclusion resources (locks, etc), they must not
    /// be held while setting the notifier.
    fn set_notifier(&self, f: Option<SourceNotifier>);
}

/// Device which is a source of bytes which must be processed synchronously,
/// lest they be lost in subsequent operations
pub trait BlockingSource: Send + Sync + 'static {
    /// Install (`Some`) or clear (`None`) the callback which is handed the
    /// bytes produced by this source.
    fn set_consumer(&self, f: Option<BlockingSourceConsumer>);
}

/// Signature of the callback stored inside a [NotifierCell].
type NotifierFn<T> = dyn Fn(&T) + Send + Sync + 'static;

/// Storage slot for an optional notifier callback taking a `&T`.
pub struct NotifierCell<T: ?Sized> {
    /// Tracks whether `notifier` currently holds a callback, so that the
    /// notify path can skip taking the lock when nothing is registered.
    is_set: AtomicBool,
    /// The registered callback, if any.
    notifier: Mutex<Option<Box<NotifierFn<T>>>>,
}
impl<T: ?Sized> NotifierCell<T> {
    /// Create an empty cell with no callback registered.
    pub fn new() -> Self {
        Self { is_set: AtomicBool::new(false), notifier: Mutex::new(None) }
    }
}
impl<T: ?Sized> Default for NotifierCell<T> {
    /// Equivalent to [NotifierCell::new]: an empty cell.
    fn default() -> Self {
        Self::new()
    }
}
impl NotifierCell<dyn Sink> {
    /// Install (`Some`) or clear (`None`) the sink notifier.
    ///
    /// The `is_set` flag is updated while the notifier lock is held.
    pub fn set(&self, f: Option<SinkNotifier>) {
        let mut slot = self.notifier.lock().unwrap();
        let registered = f.is_some();
        self.is_set.store(registered, Ordering::Release);
        *slot = f;
    }
    /// Invoke the registered notifier (if any) with `sink`.
    ///
    /// The callback runs while the notifier lock is held.
    pub fn notify(&self, sink: &dyn Sink) {
        // Fast path: nothing registered, so avoid the lock entirely.
        if !self.is_set.load(Ordering::Acquire) {
            return;
        }
        let slot = self.notifier.lock().unwrap();
        if let Some(callback) = &*slot {
            callback(sink);
        }
    }
}
impl NotifierCell<dyn Source> {
    /// Install (`Some`) or clear (`None`) the source notifier.
    ///
    /// The `is_set` flag is updated while the notifier lock is held.
    pub fn set(&self, f: Option<SourceNotifier>) {
        let mut slot = self.notifier.lock().unwrap();
        let registered = f.is_some();
        self.is_set.store(registered, Ordering::Release);
        *slot = f;
    }
    /// Invoke the registered notifier (if any) with `source`.
    ///
    /// The callback runs while the notifier lock is held.
    pub fn notify(&self, source: &dyn Source) {
        // Fast path: nothing registered, so avoid the lock entirely.
        if !self.is_set.load(Ordering::Acquire) {
            return;
        }
        let slot = self.notifier.lock().unwrap();
        if let Some(callback) = &*slot {
            callback(source);
        }
    }
}

/// Storage slot for an optional [BlockingSourceConsumer] callback.
pub struct ConsumerCell {
    /// Tracks whether `consumer` currently holds a callback, so that
    /// [ConsumerCell::consume] can skip taking the lock when nothing is
    /// registered.
    is_set: AtomicBool,
    /// The registered consumer callback, if any.
    consumer: Mutex<Option<BlockingSourceConsumer>>,
}
impl ConsumerCell {
    /// Create an empty cell with no consumer registered.
    pub fn new() -> Self {
        Self { is_set: AtomicBool::new(false), consumer: Mutex::new(None) }
    }
    /// Install (`Some`) or clear (`None`) the consumer callback.
    ///
    /// The `is_set` flag is updated while the consumer lock is held.
    pub fn set(&self, f: Option<BlockingSourceConsumer>) {
        let mut guard = self.consumer.lock().unwrap();
        self.is_set.store(f.is_some(), Ordering::Release);
        *guard = f;
    }
    /// Pass `data` to the registered consumer, if there is one.
    ///
    /// The callback runs while the consumer lock is held.  When no consumer
    /// is registered the data is silently dropped.
    pub fn consume(&self, data: &[u8]) {
        if self.is_set.load(Ordering::Acquire) {
            let guard = self.consumer.lock().unwrap();
            if let Some(f) = guard.as_ref() {
                f(data);
            }
        }
    }
}
impl Default for ConsumerCell {
    /// Equivalent to [ConsumerCell::new]: an empty cell.
    fn default() -> Self {
        Self::new()
    }
}


================================================
FILE: lib/propolis/src/chardev/pollers.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::VecDeque;
use std::num::NonZeroUsize;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Condvar, Mutex};
use std::time::{Duration, Instant};

use crate::chardev::{BlockingSource, Sink, Source};

use tokio::sync::Notify;
use tokio::time::sleep;

/// Tunables for a [SourceBuffer].
pub struct Params {
    /// Interval between polls of the buffer while polling is active.
    pub poll_interval: Duration,
    /// Number of consecutive empty polls tolerated; once exceeded, polling is
    /// deactivated and the reader waits for an explicit notification.
    pub poll_miss_thresh: usize,
    /// Capacity requested for the internal byte buffer.
    pub buf_size: NonZeroUsize,
}
/// Lock-protected state of a [SourceBuffer].
struct SourceInner {
    /// Bytes pulled from the Source which have not yet been read out.
    buf: Vec<u8>,
    /// When the buffer was last polled, if it has been polled at all.
    last_poll: Option<Instant>,
}
impl SourceInner {
    /// Whether the buffer has used up all of its allocated capacity.
    fn is_full(&self) -> bool {
        let Self { buf, .. } = self;
        buf.capacity() == buf.len()
    }
}
/// Buffer which sits between a [Source] and an async reader of its data.
pub struct SourceBuffer {
    /// Signalled when the reader should wake up to collect data.
    data_ready: Notify,
    /// Buffered bytes and the time of the last poll.
    inner: Mutex<SourceInner>,
    /// Whether the reader is currently polling on an interval.  While set,
    /// source notifications fill the buffer and wake-ups may be elided.
    poll_active: AtomicBool,
    /// Polling and sizing parameters supplied at construction.
    params: Params,
}
impl SourceBuffer {
    /// Create a buffer whose capacity is `params.buf_size`.  Polling starts
    /// out active.
    pub fn new(params: Params) -> Arc<Self> {
        let inner = Mutex::new(SourceInner {
            buf: Vec::with_capacity(params.buf_size.get()),
            last_poll: None,
        });
        Arc::new(Self {
            data_ready: Notify::new(),
            inner,
            poll_active: AtomicBool::new(true),
            params,
        })
    }

    /// Disable autodiscard on `source` and register this buffer as its
    /// notifier.
    pub fn attach(self: &Arc<Self>, source: &dyn Source) {
        source.set_autodiscard(false);
        let buffer = Arc::clone(self);
        source.set_notifier(Some(Box::new(move |src| {
            buffer.notify(src);
        })));
    }

    /// Read data from Source and/or its associated buffer.
    ///
    /// Returns `Some(0)` immediately for an empty `buf`; otherwise waits until
    /// at least one byte can be returned.
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.  It can be used in `tokio::select!` and if
    /// cancelled, it is guaranteed that no data will have been read.
    pub async fn read(
        &self,
        buf: &mut [u8],
        source: &dyn Source,
    ) -> Option<usize> {
        if buf.is_empty() {
            return Some(0);
        }
        if self.poll_active.load(Ordering::Acquire) {
            let _ = self.leading_delay().await;
        }
        loop {
            match self.read_data(buf, source) {
                0 => self.wait().await,
                nread => return Some(nread),
            }
        }
    }

    /// If we are polling this Source and the buffer is not full, we may want to
    /// wait to let it fill further so we make fewer trips back and forth.
    ///
    /// Returns the delay which was requested, or `None` if no delay was
    /// needed.
    async fn leading_delay(&self) -> Option<Duration> {
        let last_poll = {
            let inner = self.inner.lock().unwrap();
            // No delay if already full
            if inner.is_full() {
                return None;
            }
            inner.last_poll
        };

        let last = last_poll?;
        let since = Instant::now().duration_since(last);
        // Only delay for whatever remains of the current poll interval.
        let diff = self.params.poll_interval.checked_sub(since)?;
        tokio::select! {
            _ = sleep(diff) => {},
            _ = self.data_ready.notified() => {},
        };
        Some(diff)
    }

    /// Wait until there may be data to read.
    ///
    /// Polls on `poll_interval` until data shows up in the buffer or a
    /// notification arrives.  After more than `poll_miss_thresh` empty polls,
    /// polling is deactivated and only a notification ends the wait.
    async fn wait(&self) {
        self.poll_active.store(true, Ordering::Release);
        let mut misses = 0;
        loop {
            tokio::select! {
                _ = sleep(self.params.poll_interval) => {
                    let mut inner = self.inner.lock().unwrap();
                    if !inner.buf.is_empty() {
                        return;
                    }
                    inner.last_poll = Some(Instant::now());
                    misses += 1;
                    if misses > self.params.poll_miss_thresh {
                        self.poll_active.store(false, Ordering::Release);
                        break;
                    }
                },
                _ = self.data_ready.notified() => {
                    return;
                },
            };
        }

        // We have exceeded the miss threshold
        self.data_ready.notified().await;
    }

    /// Copy buffered bytes into `buf` and, if room remains, attempt to read
    /// one more byte directly from `source`.  Returns the number of bytes
    /// placed in `buf` and records the time of this poll.
    pub fn read_data(&self, buf: &mut [u8], source: &dyn Source) -> usize {
        let mut inner = self.inner.lock().unwrap();
        let mut total = copy_and_consume(&mut inner.buf, buf);
        // Can also attempt to read direct from the Source
        if let Some(slot) = buf.get_mut(total) {
            if let Some(byte) = source.read() {
                *slot = byte;
                total += 1;
            }
        }
        inner.last_poll = Some(Instant::now());
        total
    }

    /// Notifier callback registered on the Source.
    ///
    /// While polling is active, a byte is pulled into the buffer (if there is
    /// room) and the wake-up is skipped unless the buffer became full or
    /// polling was deactivated in the meantime.
    fn notify(&self, source: &dyn Source) {
        if self.poll_active.load(Ordering::Acquire) {
            let mut inner = self.inner.lock().unwrap();
            let had_room = !inner.is_full();
            if had_room {
                if let Some(byte) = source.read() {
                    inner.buf.push(byte);
                }
            }
            // If the buffer is not full and polling is still active, elide
            // the notification to the Source consumer.
            if had_room
                && !inner.is_full()
                && self.poll_active.load(Ordering::Acquire)
            {
                return;
            }
        }
        self.data_ready.notify_one();
    }
}

/// Lock-protected state of a [SinkBuffer].
struct SinkInner {
    /// Bytes accepted from the writer which the Sink has not yet taken.
    buf: VecDeque<u8>,
    /// Set when a task is waiting on the buffer and wants to be woken once it
    /// has been emptied.
    wait_empty: bool,
}
impl SinkInner {
    /// Whether the buffer has used up all of its allocated capacity.
    fn is_full(&self) -> bool {
        let Self { buf, .. } = self;
        buf.capacity() == buf.len()
    }
}

/// Buffer which sits between an async writer and a [Sink].
pub struct SinkBuffer {
    /// Signalled from the Sink's notifier callback to wake waiting writers.
    notify: Notify,
    /// Buffered bytes and the wait-for-empty flag.
    inner: Mutex<SinkInner>,
}

impl SinkBuffer {
    /// Create a buffer with the requested capacity of `size` bytes.
    pub fn new(size: NonZeroUsize) -> Arc<Self> {
        let inner = SinkInner {
            buf: VecDeque::with_capacity(size.get()),
            wait_empty: false,
        };
        Arc::new(Self { notify: Notify::new(), inner: Mutex::new(inner) })
    }

    /// Register this buffer as the notifier of `sink`.
    pub fn attach(self: &Arc<Self>, sink: &dyn Sink) {
        let buffer = Arc::clone(self);
        sink.set_notifier(Some(Box::new(move |snk| {
            buffer.notify(snk);
        })));
    }

    /// Wait until every buffered byte has been handed to the Sink.
    pub async fn wait_empty(&self) {
        loop {
            {
                let mut inner = self.inner.lock().unwrap();
                let drained = inner.buf.is_empty();
                // Record whether we are (still) waiting for the drain.
                inner.wait_empty = !drained;
                if drained {
                    return;
                }
            }
            self.notify.notified().await;
        }
    }

    /// Write data into the Sink and/or its associated buffer.
    ///
    /// Returns `Some(0)` immediately for empty `data`; otherwise waits until
    /// at least one byte has been accepted and returns how many were.
    ///
    /// # Cancel safety
    ///
    /// This method is cancel safe.  It can be used in `tokio::select!` and if
    /// cancelled, it is guaranteed that no data will have been written.
    pub async fn write(
        &self,
        mut data: &[u8],
        sink: &dyn Sink,
    ) -> Option<usize> {
        if data.is_empty() {
            return Some(0);
        }
        loop {
            {
                let mut inner = self.inner.lock().unwrap();
                let mut nwritten = 0;

                // If the buffer started empty, try to kick the sink into
                // accepting data.
                if inner.buf.is_empty() {
                    let direct = data
                        .iter()
                        .take_while(|&&byte| sink.write(byte))
                        .count();
                    data = &data[direct..];
                    nwritten += direct;
                }

                // Push whatever is left into the buffer
                while let Some((&byte, rest)) = data.split_first() {
                    if inner.is_full() {
                        break;
                    }
                    inner.buf.push_back(byte);
                    data = rest;
                    nwritten += 1;
                }

                if nwritten > 0 {
                    return Some(nwritten);
                }
                inner.wait_empty = true;
            }

            self.notify.notified().await;
        }
    }

    /// Notifier callback registered on the Sink: drain as much of the buffer
    /// into `sink` as it will accept, then wake a waiter if the buffer is now
    /// empty or nobody is waiting specifically for it to empty.
    fn notify(&self, sink: &dyn Sink) {
        let mut inner = self.inner.lock().unwrap();
        while let Some(&next) = inner.buf.front() {
            if !sink.write(next) {
                // The sink is full again; leave the byte at the front.
                break;
            }
            inner.buf.pop_front();
        }
        if inner.buf.is_empty() || !inner.wait_empty {
            self.notify.notify_one();
        }
    }
}

/// Tunables for a [BlockingSourceBuffer].
pub struct BlockingParams {
    /// Interval between polls of the buffer while polling is active.
    pub poll_interval: Duration,
    /// Number of consecutive empty polls tolerated; once exceeded, polling is
    /// deactivated and the reader waits for an explicit notification.
    pub poll_miss_thresh: usize,
    /// Capacity requested for the internal byte buffer.
    pub buf_size: NonZeroUsize,
}
/// Lock-protected state of a [BlockingSourceBuffer].
struct BlockingSourceInner {
    /// Bytes received from the source which have not yet been read out.
    buf: Vec<u8>,
    /// When the buffer was last polled, if it has been polled at all.
    last_poll: Option<Instant>,
}
impl BlockingSourceInner {
    /// Whether the buffer has used up all of its allocated capacity.
    fn is_full(&self) -> bool {
        let Self { buf, .. } = self;
        buf.capacity() == buf.len()
    }
}
/// Buffer which sits between a [BlockingSource] and an async reader of its
/// data.
pub struct BlockingSourceBuffer {
    /// Signalled when the reader should wake up to collect data.
    data_ready: Notify,
    /// Waited on by the producer while the buffer is full; signalled by the
    /// reader after it has copied data out.
    consume_cv: Condvar,
    /// Buffered bytes and the time of the last poll.
    inner: Mutex<BlockingSourceInner>,
    /// Whether the reader is currently polling on an interval.  While set,
    /// wake-ups for a not-yet-full buffer are elided.
    poll_active: AtomicBool,
    /// Polling and sizing parameters supplied at construction.
    params: BlockingParams,
}
impl BlockingSourceBuffer {
    /// Create a buffer whose capacity is `params.buf_size`.  Polling starts
    /// out inactive.
    pub fn new(params: BlockingParams) -> Arc<Self> {
        let inner = Mutex::new(BlockingSourceInner {
            buf: Vec::with_capacity(params.buf_size.get()),
            last_poll: None,
        });
        Arc::new(Self {
            data_ready: Notify::new(),
            consume_cv: Condvar::new(),
            inner,
            poll_active: AtomicBool::new(false),
            params,
        })
    }

    /// Register this buffer as the consumer of `source`.
    pub fn attach(self: &Arc<Self>, source: &dyn BlockingSource) {
        let buffer = Arc::clone(self);
        source.set_consumer(Some(Box::new(move |bytes| {
            buffer.consume(bytes);
        })));
    }

    /// Read buffered data into `buf`.
    ///
    /// Returns `Some(0)` immediately for an empty `buf`; otherwise waits until
    /// at least one byte can be returned.
    pub async fn read(&self, buf: &mut [u8]) -> Option<usize> {
        if buf.is_empty() {
            return Some(0);
        }
        if self.poll_active.load(Ordering::Relaxed) {
            let _ = self.leading_delay().await;
        }
        loop {
            match self.take_buffered(buf) {
                0 => self.wait_for_data().await,
                nread => return Some(nread),
            }
        }
    }

    /// Record the poll time, move buffered bytes into `buf`, and signal the
    /// producer (which may be blocked on a full buffer).  Returns the number
    /// of bytes copied.
    fn take_buffered(&self, buf: &mut [u8]) -> usize {
        let mut inner = self.inner.lock().unwrap();
        inner.last_poll = Some(Instant::now());
        let nread = copy_and_consume(&mut inner.buf, buf);
        self.consume_cv.notify_one();
        nread
    }

    /// If we are polling this Source and the buffer is not full, we may want to
    /// wait to let it fill further so we make fewer trips back and forth.
    ///
    /// Returns the delay which was requested, or `None` if no delay was
    /// needed.
    async fn leading_delay(&self) -> Option<Duration> {
        let last_poll = {
            let inner = self.inner.lock().unwrap();
            // No delay if already full
            if inner.is_full() {
                return None;
            }
            inner.last_poll
        };

        let last = last_poll?;
        let since = Instant::now().duration_since(last);
        // Only delay for whatever remains of the current poll interval.
        let diff = self.params.poll_interval.checked_sub(since)?;
        tokio::select! {
            _ = sleep(diff) => {},
            _ = self.data_ready.notified() => {},
        };
        Some(diff)
    }

    /// Wait until there may be data to read.
    ///
    /// Polls on `poll_interval` until data shows up in the buffer or a
    /// notification arrives.  After more than `poll_miss_thresh` empty polls,
    /// polling is deactivated and only a notification ends the wait.
    async fn wait_for_data(&self) {
        self.poll_active.store(true, Ordering::Release);
        let mut misses = 0;
        loop {
            tokio::select! {
                _ = sleep(self.params.poll_interval) => {
                    let mut inner = self.inner.lock().unwrap();
                    if !inner.buf.is_empty() {
                        return;
                    }
                    inner.last_poll = Some(Instant::now());
                    misses += 1;
                    if misses > self.params.poll_miss_thresh {
                        self.poll_active.store(false, Ordering::Release);
                        break;
                    }
                },
                _ = self.data_ready.notified() => {
                    return;
                },
            };
        }

        // We have exceeded the miss threshold
        self.data_ready.notified().await;
    }

    /// Consumer callback registered on the source: append all of `data` to
    /// the buffer, blocking the calling thread whenever the buffer is full
    /// until the reader has made room.
    fn consume(&self, mut data: &[u8]) {
        let mut inner = self.inner.lock().unwrap();
        while !data.is_empty() {
            if inner.is_full() {
                self.data_ready.notify_one();
                // TODO: What guarantees do we want make about the poller
                // vacating space in the buffer in a timely fashion?  This is
                // particularly relevant during operations like quiesce.
                inner =
                    self.consume_cv.wait_while(inner, |i| i.is_full()).unwrap();
            }
            // Copy only as much as fits without growing the buffer.
            let room = inner.buf.capacity() - inner.buf.len();
            let (chunk, rest) = data.split_at(room.min(data.len()));
            inner.buf.extend_from_slice(chunk);
            data = rest;
        }

        // The buffer is not full and we are being polled, so elide the
        // wake-up for now.
        let elide_wakeup =
            !inner.is_full() && self.poll_active.load(Ordering::Acquire);
        if elide_wakeup {
            return;
        }
        drop(inner);
        self.data_ready.notify_one();
    }
}

/// Move as many bytes as will fit from the front of `src` into the front of
/// `dest`, returning how many were moved.
///
/// The moved bytes are removed from `src`; whatever remains is shifted to the
/// front of the Vec.  The allocated capacity of `src` is left untouched.
fn copy_and_consume(src: &mut Vec<u8>, dest: &mut [u8]) -> usize {
    let count = src.len().min(dest.len());
    if count == 0 {
        // Either there is nothing to copy or nowhere to put it.
        return 0;
    }
    dest[..count].copy_from_slice(&src[..count]);
    // Dropping the drain removes the copied prefix and shifts the tail down.
    src.drain(..count);
    count
}

// Drains a 17-byte source through an 8-byte output in several steps, checking
// the copied bytes, the remaining source and the unchanged capacity each time.
#[test]
fn test_copy_and_consume_1() {
    // Test copy_and_consume behaviour:
    // - source is copied to dest, and number of u8 copied is returned
    // - source is truncated without altering capacity

    let mut buf = vec![
        108, 111, 99, 97, 108, 104, 111, 115, 116, 58, 126, 35, 32, 27, 91, 54,
        110,
    ];
    let mut output = [0u8; 8];

    // before anything, assert len and capacity
    assert_eq!(buf.len(), 17);
    assert_eq!(buf.capacity(), 17);

    let n = copy_and_consume(&mut buf, &mut output[..]);

    // assert copy_and_consume fills output
    assert_eq!(n, 8);

    // assert capacity has not changed
    assert_eq!(buf.capacity(), 17);

    // assert copy_and_consume modifies both of its arguments.
    assert_eq!(output[..n], vec![108, 111, 99, 97, 108, 104, 111, 115]);
    assert_eq!(buf, vec![116, 58, 126, 35, 32, 27, 91, 54, 110]);

    let n = copy_and_consume(&mut buf, &mut output[..]);

    // assert copy_and_consume fills output
    assert_eq!(n, 8);

    // assert capacity has not changed
    assert_eq!(buf.capacity(), 17);

    // assert further argument modification
    assert_eq!(output[..n], vec![116, 58, 126, 35, 32, 27, 91, 54]);
    assert_eq!(buf, vec![110]);

    let n = copy_and_consume(&mut buf, &mut output[..]);

    // assert copy_and_consume cannot fill output this time
    assert_eq!(n, 1);

    // assert capacity has not changed
    assert_eq!(buf.capacity(), 17);

    // assert further argument modification
    assert_eq!(output[..n], vec![110]);
    assert!(buf.is_empty());

    // assert that when copy_and_consume's source is empty, it does nothing
    let n = copy_and_consume(&mut buf, &mut output[..]);
    assert_eq!(n, 0);

    // assert that the output of copy_and_consume is consistent with it doing
    // nothing
    assert_eq!(buf.capacity(), 17);
    assert!(output[..n].is_empty());
    assert!(buf.is_empty());

    // assert that when it does nothing, output isn't changed
    assert_eq!(output[0..1], vec![110]);
}

// Boundary case: the source holds exactly one byte and the output is larger.
#[test]
fn test_copy_and_consume_one_u8() {
    // Test that copy_and_consume works when source is one u8.
    let mut buf = vec![108];
    let mut output = [0u8; 8];

    assert_eq!(buf.len(), 1);
    assert_eq!(buf.capacity(), 1);

    let n = copy_and_consume(&mut buf, &mut output[..]);

    // only one u8 to read from source
    assert_eq!(n, 1);

    // assert that one u8 is read, that the source is now empty, and that
    // capacity is unchanged.
    assert_eq!(output[..n], vec![108]);
    assert!(buf.is_empty());
    assert_eq!(buf.capacity(), 1);
}

#[cfg(test)]
impl Params {
    /// Parameters used by the unit tests: a 10ms poll interval, a miss
    /// threshold of 2, and a 16-byte buffer.
    pub(crate) fn test_defaults() -> Self {
        let buf_size = NonZeroUsize::new(16).unwrap();
        let poll_interval = Duration::from_millis(10);
        Self { poll_interval, poll_miss_thresh: 2, buf_size }
    }
}

// Unit tests for SourceBuffer and SinkBuffer, driven by an in-memory TestUart
// which implements both Sink and Source.
#[cfg(test)]
mod test {
    use super::*;
    use crate::chardev::*;

    use futures::FutureExt;

    // Reading into an empty output buffer completes immediately with 0 bytes.
    #[tokio::test]
    async fn read_empty_returns_zero_bytes() {
        let uart = Arc::new(TestUart::new(4, 4));
        let rpoll = SourceBuffer::new(Params::test_defaults());
        rpoll.attach(uart.as_ref());

        let mut output = [];
        let res = rpoll.read(&mut output, uart.as_ref()).await.unwrap();
        assert_eq!(res, 0);
    }

    // Writing an empty input completes immediately with 0 bytes.
    #[tokio::test]
    async fn write_empty_fills_zero_bytes() {
        let uart = Arc::new(TestUart::new(4, 4));
        let wpoll = SinkBuffer::new(NonZeroUsize::new(16).unwrap());
        wpoll.attach(uart.as_ref());

        let input = [];
        let res = wpoll.write(&input, uart.as_ref()).await.unwrap();
        assert_eq!(res, 0);
    }

    #[tokio::test]
    async fn read_byte() {
        let uart = Arc::new(TestUart::new(4, 4));
        let rpoll = SourceBuffer::new(Params::test_defaults());
        rpoll.attach(uart.as_ref());

        // If the guest writes a byte...
        uart.push_source(0xFE);
        uart.notify_source().await;

        let mut output = [0u8; 16];
        // ... We can read that byte.
        assert_eq!(1, rpoll.read(&mut output, uart.as_ref()).await.unwrap());
        assert_eq!(output[0], 0xFE);
    }

    #[tokio::test]
    async fn read_bytes() {
        let uart = Arc::new(TestUart::new(2, 2));
        let rpoll = SourceBuffer::new(Params::test_defaults());
        rpoll.attach(uart.as_ref());

        // If the guest writes multiple bytes...
        uart.push_source(0x0A);
        uart.push_source(0x0B);
        uart.notify_source().await;

        let mut output = [0u8; 16];
        // ... We can read them.
        assert_eq!(2, rpoll.read(&mut output, uart.as_ref()).await.unwrap());
        assert_eq!(output[0], 0x0A);
        assert_eq!(output[1], 0x0B);
    }

    #[tokio::test]
    async fn read_bytes_blocking() {
        let uart = Arc::new(TestUart::new(4, 4));
        let rpoll = SourceBuffer::new(Params::test_defaults());
        rpoll.attach(uart.as_ref());

        let mut output = [0u8; 16];

        // Before the source has been filled, reads should not succeed.
        futures::select! {
            _ = rpoll.read(&mut output, uart.as_ref()).fuse() => {
                panic!("Shouldn't be readable")
            }
            default => {}
        }

        uart.push_source(0xFE);
        uart.notify_source().await;

        // However, once the uart has identified that it is readable, we can
        // begin reading bytes.
        assert_eq!(1, rpoll.read(&mut output, uart.as_ref()).await.unwrap());
        assert_eq!(output[0], 0xFE);
    }

    #[tokio::test]
    async fn write_byte() {
        let uart = Arc::new(TestUart::new(4, 4));
        let wpoll = SinkBuffer::new(NonZeroUsize::new(16).unwrap());
        wpoll.attach(uart.as_ref());

        let input = [0xFE];
        // If we write a byte...
        assert_eq!(1, wpoll.write(&input, uart.as_ref()).await.unwrap());

        // ... The guest can read it.
        assert_eq!(uart.pop_sink().unwrap(), 0xFE);
    }

    #[tokio::test]
    async fn write_bytes() {
        let uart = Arc::new(TestUart::new(4, 4));
        let wpoll = SinkBuffer::new(NonZeroUsize::new(16).unwrap());
        wpoll.attach(uart.as_ref());

        let input = [0x0A, 0x0B];
        // If we write multiple bytes...
        assert_eq!(2, wpoll.write(&input, uart.as_ref()).await.unwrap());

        // ... The guest can read them.
        assert_eq!(uart.pop_sink().unwrap(), 0x0A);
        assert_eq!(uart.pop_sink().unwrap(), 0x0B);
    }

    #[tokio::test]
    async fn write_bytes_beyond_internal_buffer_size() {
        let uart = Arc::new(TestUart::new(1, 1));
        let wpoll = SinkBuffer::new(NonZeroUsize::new(3).unwrap());
        wpoll.attach(uart.as_ref());
        assert_eq!(3, wpoll.inner.lock().unwrap().buf.capacity());

        // By attempting to write five bytes, we fill the following pipeline
        // in stages:
        //
        // [Client] -> [Serial Buffer] -> [UART]
        //             ^ 3 byte cap       ^ 1 byte cap
        //
        // After both the serial buffer and UART are saturated (four bytes
        // total) the write future will no longer complete successfully.
        //
        // Once this occurs, the UART will need to pop data from the
        // incoming sink to make space for subsequent writes.
        let input = [0x0A, 0x0B, 0x0C, 0x0D, 0x0E];
        assert_eq!(4, wpoll.write(&input, uart.as_ref()).await.unwrap());

        futures::select! {
            _ = wpoll.write(&input[4..], uart.as_ref()).fuse() => {
                panic!("Shouldn't be writable")
            }
            default => {}
        }

        assert_eq!(uart.pop_sink().unwrap(), 0x0A);
        uart.notify_sink().await;

        // After a byte is popped, the last byte becomes writable.
        assert_eq!(1, wpoll.write(&input[4..], uart.as_ref()).await.unwrap());

        assert_eq!(uart.pop_sink().unwrap(), 0x0B);
        uart.notify_sink().await;
        assert_eq!(uart.pop_sink().unwrap(), 0x0C);
        uart.notify_sink().await;
        assert_eq!(uart.pop_sink().unwrap(), 0x0D);
        uart.notify_sink().await;
        assert_eq!(uart.pop_sink().unwrap(), 0x0E);
    }

    // In-memory stand-in for a UART: two bounded byte queues (one per
    // direction) plus the notifier cells which the pollers register with.
    struct TestUart {
        // The "capacity" fields here are a little redundant with the underlying
        // VecDeque capacities, but those values may get rounded up.
        //
        // To be more precise with "blocking-on-buffer-full" tests, we preserve
        // the original requested capacity value, which may be smaller.
        sink_cap: usize,
        sink: Mutex<VecDeque<u8>>,
        source_cap: usize,
        source: Mutex<VecDeque<u8>>,
        sink_notifier: NotifierCell<dyn Sink>,
        source_notifier: NotifierCell<dyn Source>,
        auto_discard: AtomicBool,
    }

    impl TestUart {
        fn new(sink_size: usize, source_size: usize) -> Self {
            TestUart {
                sink_cap: sink_size,
                sink: Mutex::new(VecDeque::with_capacity(sink_size)),
                source_cap: source_size,
                source: Mutex::new(VecDeque::with_capacity(source_size)),
                sink_notifier: NotifierCell::new(),
                source_notifier: NotifierCell::new(),
                auto_discard: AtomicBool::new(true),
            }
        }

        // Add a byte which can later get popped out of the source.
        // Asserts that the source queue is below its requested capacity.
        fn push_source(&self, byte: u8) {
            let mut source = self.source.lock().unwrap();
            assert!(source.len() < self.source_cap);
            source.push_back(byte);
        }

        // Pop a byte out of the sink.
        fn pop_sink(&self) -> Option<u8> {
            let mut sink = self.sink.lock().unwrap();
            sink.pop_front()
        }

        // Invoke the registered source notifier, as a device would when data
        // becomes readable.
        async fn notify_source(&self) {
            self.source_notifier.notify(self);
        }

        // Invoke the registered sink notifier, as a device would when space
        // becomes available.
        async fn notify_sink(&self) {
            self.sink_notifier.notify(self);
        }
    }

    impl Sink for TestUart {
        // Accept the byte only while the sink queue is below `sink_cap`.
        fn write(&self, data: u8) -> bool {
            let mut sink = self.sink.lock().unwrap();
            if sink.len() < self.sink_cap {
                sink.push_back(data);
                true
            } else {
                false
            }
        }
        fn set_notifier(&self, f: Option<SinkNotifier>) {
            self.sink_notifier.set(f);
        }
    }

    impl Source for TestUart {
        fn read(&self) -> Option<u8> {
            let mut source = self.source.lock().unwrap();
            source.pop_front()
        }
        // Not expected to be called by these tests.
        fn discard(&self, _count: usize) -> usize {
            panic!();
        }
        fn set_autodiscard(&self, active: bool) {
            self.auto_discard.store(active, Ordering::SeqCst);
        }
        fn set_notifier(&self, f: Option<SourceNotifier>) {
            self.source_notifier.set(f);
        }
    }
}


================================================
FILE: lib/propolis/src/chardev/sock.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::fs;
use std::io::{ErrorKind, Result};
use std::num::NonZeroUsize;
use std::os::unix::net::UnixListener as StdUnixListener;
use std::path::Path;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Condvar, Mutex};
use std::time::Duration;

use crate::chardev::{pollers, Sink, Source};

use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::unix::{OwnedReadHalf, OwnedWriteHalf, SocketAddr};
use tokio::net::UnixListener;

/// Capacity, in bytes, requested for both the sink and source poller buffers.
const BUF_SIZE: usize = 512;
/// Poll interval, in milliseconds, given to the source poller.
const POLL_INTERVAL_MS: usize = 10;
/// Miss threshold (`poll_miss_thresh`) given to the source poller.
const POLL_MISS_THRESH: usize = 5;

/// Mutex-protected state of a [UDSock].
struct Inner {
    /// Listening socket created by [UDSock::bind].
    ///
    /// NOTE(review): presumably taken (leaving `None`) by the async task when
    /// it converts the listener for use with tokio -- confirm in `run`.
    std_sock: Option<StdUnixListener>,
    /// Address of the currently connected client, as last reported through
    /// `notify_connected`; `None` when no client is connected.
    client: Option<SocketAddr>,
}

/// Unix domain socket which is bridged to a [Sink]/[Source] pair.
pub struct UDSock {
    /// Listener and client connection state.
    inner: Mutex<Inner>,
    /// Notified whenever the client connection state changes.
    cv: Condvar,
    /// Flag consulted by threads waiting for a client to connect, allowing
    /// that wait to be abandoned.
    abort: AtomicBool,
    /// Buffer for bytes headed to the attached [Sink].
    sink_buf: Arc<pollers::SinkBuffer>,
    /// Buffer for bytes coming from the attached [Source].
    source_buf: Arc<pollers::SourceBuffer>,
}
impl UDSock {
    /// Binds a listening Unix domain socket at `path`.
    ///
    /// If the address is already in use, whatever exists at `path` is removed
    /// (without checking what it is) and the bind is retried once. The
    /// listener is switched to non-blocking mode so that [`Self::run`] can
    /// later hand it to tokio.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while binding, removing the stale entry,
    /// or switching the listener to non-blocking mode.
    pub fn bind(path: &Path) -> Result<Arc<Self>> {
        let lsock = match StdUnixListener::bind(path) {
            Ok(sock) => sock,
            Err(e) if e.kind() == ErrorKind::AddrInUse => {
                // XXX just blindly do remove
                fs::remove_file(path)?;
                StdUnixListener::bind(path)?
            }
            Err(e) => return Err(e),
        };
        lsock.set_nonblocking(true)?;

        let this = Arc::new(Self {
            inner: Mutex::new(Inner { std_sock: Some(lsock), client: None }),
            cv: Condvar::new(),
            abort: AtomicBool::new(false),
            sink_buf: pollers::SinkBuffer::new(
                NonZeroUsize::new(BUF_SIZE).unwrap(),
            ),
            source_buf: pollers::SourceBuffer::new(pollers::Params {
                poll_interval: Duration::from_millis(POLL_INTERVAL_MS as u64),
                poll_miss_thresh: POLL_MISS_THRESH,
                buf_size: NonZeroUsize::new(BUF_SIZE).unwrap(),
            }),
        });

        Ok(this)
    }

    /// Attaches the poller buffers to `sink` and `source`, then spawns a tokio
    /// task which drives [`Self::run`].
    ///
    /// The task handle is currently discarded, and the task ends in a
    /// `todo!()`, so it panics should `run` ever return.
    pub fn spawn(
        self: &Arc<Self>,
        sink: Arc<dyn Sink>,
        source: Arc<dyn Source>,
    ) {
        self.sink_buf.attach(sink.as_ref());
        self.source_buf.attach(source.as_ref());

        let this = Arc::clone(self);
        let _task = tokio::spawn(async move {
            let _ = this.run(sink, source).await;
            todo!("get async task hdl");
        });
    }

    /// Records the connected client (or `None` on disconnect) and wakes every
    /// thread waiting on the connection state.
    fn notify_connected(&self, addr: Option<SocketAddr>) {
        let mut inner = self.inner.lock().unwrap();
        inner.client = addr;
        self.cv.notify_all();
    }

    /// Blocks the calling thread until a client is connected or
    /// [`Self::shutdown`] is called.
    ///
    /// Returns `true` if a client is connected when the wait ends, `false` if
    /// the wait was aborted by a shutdown.
    pub fn wait_for_connect(&self) -> bool {
        let inner = self.inner.lock().unwrap();
        // `wait_while` evaluates the condition before blocking, so an
        // already-connected client (or an earlier shutdown) returns at once.
        let inner = self
            .cv
            .wait_while(inner, |i| {
                let abort = self.abort.load(Ordering::Relaxed);
                !abort && i.client.is_none()
            })
            .unwrap();
        inner.client.is_some()
    }

    /// Blocks the calling thread until no client is connected.
    #[cfg(test)]
    pub fn wait_for_disconnect(&self) {
        let inner = self.inner.lock().unwrap();
        // Returns immediately when there is no client to begin with.
        let _inner = self.cv.wait_while(inner, |i| i.client.is_some());
    }

    /// Requests that any thread blocked in [`Self::wait_for_connect`] give up
    /// waiting.
    pub fn shutdown(&self) {
        // Waiters evaluate their wait condition while holding `inner`. Setting
        // the flag and notifying under that same lock guarantees the update
        // cannot slip in between a waiter's check of `abort` and the moment it
        // actually blocks on the condvar, which would lose the wake-up and
        // leave the waiter asleep. A poisoned lock is still usable for this
        // purpose, so do not turn it into a panic here.
        let _guard = self.inner.lock().unwrap_or_else(|e| e.into_inner());
        self.abort.store(true, Ordering::Relaxed);
        self.cv.notify_all();
    }

    /// Accepts clients one at a time and relays data between each client and
    /// the given `sink`/`source` until that client goes away.
    ///
    /// The loop ends, returning `Ok(())`, as soon as accepting a connection
    /// fails.
    ///
    /// # Errors
    ///
    /// Returns an error if the listener cannot be registered with tokio.
    ///
    /// # Panics
    ///
    /// Panics if the listener has already been taken by an earlier call.
    pub async fn run(
        &self,
        sink: Arc<dyn Sink>,
        source: Arc<dyn Source>,
    ) -> Result<()> {
        // Scope the guard so it is released before the first `.await`.
        let lsock = {
            let mut inner = self.inner.lock().unwrap();
            inner.std_sock.take().unwrap()
        };
        let lsock = UnixListener::from_std(lsock)?;
        while let Ok((sock, addr)) = lsock.accept().await {
            self.notify_connected(Some(addr));
            let (readh, writeh) = sock.into_split();

            // Whichever direction finishes first (client hang-up or an I/O
            // error) ends the session; the other future is dropped.
            tokio::select! {
                _sink_done = Self::run_sink(
                    sink.as_ref(),
                    &self.sink_buf,
                    readh,
                ) => {},
                _source_done = Self::run_source(
                    source.as_ref(),
                    &self.source_buf,
                    writeh,
                ) => {},
            };

            self.notify_connected(None);
        }
        Ok(())
    }

    /// Copies bytes arriving from the client into `sink` until the client
    /// closes its end (a zero-length read) or a read fails.
    async fn run_sink(
        sink: &dyn Sink,
        sink_buf: &pollers::SinkBuffer,
        mut readh: OwnedReadHalf,
    ) -> Result<()> {
        let mut buf = [0u8; BUF_SIZE];
        loop {
            let num = readh.read(&mut buf).await?;
            if num == 0 {
                // If the client is gone, we're done here
                return Ok(());
            }
            sink_buf.write(&buf[..num], sink).await;
        }
    }

    /// Copies bytes produced by `source` out to the client until a write
    /// fails.
    async fn run_source(
        source: &dyn Source,
        source_buf: &pollers::SourceBuffer,
        mut writeh: OwnedWriteHalf,
    ) -> Result<()> {
        let mut buf = [0u8; BUF_SIZE];
        loop {
            if let Some(n) = source_buf.read(&mut buf, source).await {
                writeh.write_all(&buf[..n]).await?;
            }
        }
    }
}

#[cfg(test)]
mod test {
    use std::os::unix::net::UnixStream;
    use std::time::Duration;

    use super::*;
    use crate::chardev;

    use tempfile::NamedTempFile;

    /// Minimal character device: accepts every written byte and never has any
    /// data to read.
    struct TestChardev {
        sink_notify: chardev::NotifierCell<dyn Sink>,
        source_notify: chardev::NotifierCell<dyn Source>,
    }
    impl TestChardev {
        fn new() -> Self {
            Self {
                sink_notify: chardev::NotifierCell::new(),
                source_notify: chardev::NotifierCell::new(),
            }
        }
    }

    impl chardev::Sink for TestChardev {
        fn write(&self, _data: u8) -> bool {
            // Accept all writes
            true
        }

        fn set_notifier(&self, f: Option<chardev::SinkNotifier>) {
            self.sink_notify.set(f);
        }
    }
    impl chardev::Source for TestChardev {
        fn read(&self) -> Option<u8> {
            None
        }

        fn discard(&self, count: usize) -> usize {
            count
        }

        fn set_autodiscard(&self, _active: bool) {}

        fn set_notifier(&self, f: Option<chardev::SourceNotifier>) {
            self.source_notify.set(f);
        }
    }

    /// Runs the blocking `wait_for_connect` from async code by wrapping it in
    /// `block_in_place` on a spawned task, and returns its result.
    async fn wait_connected(sock: &Arc<UDSock>) -> bool {
        let wsock = sock.clone();
        tokio::spawn(async move {
            tokio::task::block_in_place(|| wsock.wait_for_connect())
        })
        .await
        .expect("failed to join on wait_for_connect")
    }
    /// Async counterpart of `wait_for_disconnect`, built the same way as
    /// `wait_connected`. A failure to join the task is ignored.
    async fn wait_disconnected(sock: &Arc<UDSock>) {
        let wsock = sock.clone();
        let _ = tokio::spawn(async move {
            tokio::task::block_in_place(|| wsock.wait_for_disconnect())
        })
        .await;
    }

    /// A client can connect, disconnect, and then connect again.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn bail_on_shutdown_sock() {
        // Use a temp file only to obtain a unique path; the file itself is
        // unlinked below so the socket can be bound there.
        let tempf = NamedTempFile::new().expect("can create tempfile");
        let sockpath = tempf.into_temp_path();

        let testdev = Arc::new(TestChardev::new());

        std::fs::remove_file(&sockpath)
            .expect("can unlink tempfile prior to sock bind");
        let sock = UDSock::bind(&sockpath).expect("socket bind succeeds");

        sock.spawn(testdev.clone(), testdev.clone());

        // Make sure that a client can successfully connect and disconnect

        let csock = UnixStream::connect(&sockpath)
            .expect("can connect to chardev sock");
        assert!(wait_connected(&sock).await);
        drop(csock);

        tokio::time::timeout(Duration::from_secs(1), wait_disconnected(&sock))
            .await
            .expect("socket transitions to disconnected within arb. timeout");

        // A second client must be accepted once the first one is gone.
        let csock = UnixStream::connect(&sockpath)
            .expect("can connect to chardev sock");
        assert!(wait_connected(&sock).await);
        drop(csock);
    }

    /// `shutdown` releases a pending `wait_for_connect`, which then reports
    /// that no client connected.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn abort_wait_for_connect() {
        let tempf = NamedTempFile::new().expect("can create tempfile");
        let sockpath = tempf.into_temp_path();

        let testdev = Arc::new(TestChardev::new());

        std::fs::remove_file(&sockpath)
            .expect("can unlink tempfile prior to sock bind");
        let sock = UDSock::bind(&sockpath).expect("socket bind succeeds");

        sock.spawn(testdev.clone(), testdev.clone());

        // Spawn a task to wait for a connection
        let wait_sock = sock.clone();
        let wait_task =
            tokio::spawn(async move { wait_connected(&wait_sock).await });

        // Now let's try to have it abort waiting for a connection
        sock.shutdown();

        let connected = wait_task.await.expect("failed to join wait_task");
        assert!(!connected);
    }
}


================================================
FILE: lib/propolis/src/common.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::ops::{Add, BitAnd};
use std::ops::{Bound::*, RangeBounds};
use std::slice::SliceIndex;
use std::sync::atomic::{AtomicBool, Ordering};

use crate::vmm::SubMapping;

/// A vCPU number.
#[derive(Clone, Copy, Debug)]
pub struct VcpuId(u32);

impl From<u32> for VcpuId {
    fn from(value: u32) -> Self {
        Self(value)
    }
}

impl From<i32> for VcpuId {
    /// Converts a signed 32-bit value into a CPU identifier.
    ///
    /// # Panics
    ///
    /// Panics if `value` cannot be converted into a `u32`. This should
    /// generally not be possible because bhyve uses non-negative (though
    /// signed) CPU identifiers.
    fn from(value: i32) -> Self {
        match u32::try_from(value) {
            Ok(id) => Self(id),
            // `expect` takes a plain string, so the offending value has to be
            // formatted into the message explicitly.
            Err(_) => panic!("vCPU number {} should fit in a u32", value),
        }
    }
}

// Implemented as `From` rather than `Into`: the standard blanket impl then
// provides `Into<u32>` for callers, and `u32::from(id)` works as well.
impl From<VcpuId> for u32 {
    fn from(id: VcpuId) -> Self {
        id.0
    }
}

impl From<VcpuId> for i32 {
    /// Converts a CPU identifier into a signed 32-bit value.
    ///
    /// # Panics
    ///
    /// Panics if the inner value cannot be converted to an `i32`. This should
    /// generally not be possible because Propolis limits the maximum number of
    /// CPUs a VM can have to a number well below `i32::MAX`.
    fn from(id: VcpuId) -> Self {
        match i32::try_from(id.0) {
            Ok(value) => value,
            Err(_) => panic!("vCPU number {} should fit in an i32", id.0),
        }
    }
}

/// Controls whether items wrapped in a [`GuestData`] are displayed or redacted
/// when the wrappers are printed via their `Display` or `Debug` impls.
//
// The Propolis server binary should only link the Propolis lib once (any
// structure that links the lib multiple times means something is very odd about
// its dependency graph), so there should never be any ambiguity about what
// `DISPLAY_GUEST_DATA` refers to when linking. But to be maximally cautious,
// label this static as `no_mangle` so that pulling in multiple Propolis
// libraries will break the build instead of possibly resolving ambiguously.
#[no_mangle]
pub static DISPLAY_GUEST_DATA: AtomicBool = AtomicBool::new(false);

/// A wrapper type denoting that the contained `T` was obtained from the guest
/// (e.g. by reading the guest's memory). This type implements various traits
/// (`Deref`, `DerefMut`, and `Borrow`) that allow it to be treated in most
/// cases as just another instance of a `T`. The main difference is that this
/// wrapper has custom `Display` and `Debug` implementations that redact the
/// wrapped value unless the program has set the [`DISPLAY_GUEST_DATA`] flag.
///
/// When the value is displayed, the caller's formatting options (width, fill,
/// precision, alternate form) are passed through to the wrapped value.
///
/// NOTE: This wrapper type is not airtight: owners of a wrapper can always
/// dereference it and invoke the Display/Debug impls directly on the resulting
/// reference to the wrapped value. If `T` is `Clone`, they can also clone the
/// dereferenced value and display the clone. (This comes with the territory
/// here: users need to be able to get at the wrapped value to be able to do
/// anything useful with it!)
///
/// NOTE: This type does not provide any other security guarantees (e.g. it does
/// not ensure that the wrapped memory will be zeroed on drop).
#[derive(Clone, Copy)]
#[repr(transparent)]
pub struct GuestData<T: ?Sized>(T);

impl<T: std::fmt::Display + ?Sized> std::fmt::Display for GuestData<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if DISPLAY_GUEST_DATA.load(Ordering::Relaxed) {
            // Delegate directly so the formatter's flags reach the value.
            std::fmt::Display::fmt(&self.0, f)
        } else {
            f.write_str("<guest data redacted>")
        }
    }
}

impl<T: std::fmt::Debug + ?Sized> std::fmt::Debug for GuestData<T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if DISPLAY_GUEST_DATA.load(Ordering::Relaxed) {
            // Delegate directly so the formatter's flags reach the value.
            std::fmt::Debug::fmt(&self.0, f)
        } else {
            f.write_str("<guest data redacted>")
        }
    }
}

impl<T: ?Sized> std::ops::Deref for GuestData<T> {
    type Target = T;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl<T: ?Sized> std::ops::DerefMut for GuestData<T> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}

impl<T> From<T> for GuestData<T> {
    fn from(value: T) -> Self {
        Self(value)
    }
}

impl<T: ?Sized> std::borrow::Borrow<T> for GuestData<T> {
    fn borrow(&self) -> &T {
        &self.0
    }
}

/// Resolves `bound` against a container of `len` elements, producing the
/// equivalent half-open `(start, end)` index pair.
///
/// # Panics
///
/// Panics if the range does not fit within `len`, if its start lies beyond
/// its end, or if it uses an excluded start bound (which is not supported).
fn numeric_bounds(
    bound: impl RangeBounds<usize>,
    len: usize,
) -> (usize, usize) {
    match (bound.start_bound(), bound.end_bound()) {
        (Unbounded, Unbounded) => (0, len),
        (Unbounded, Included(i)) => {
            assert!(*i < len);
            (0, i.checked_add(1).unwrap())
        }
        (Unbounded, Excluded(e)) => {
            // An exclusive end may equal `len`: `..len` covers the whole
            // container, exactly like `0..len` below.
            assert!(*e <= len);
            (0, *e)
        }

        (Included(i), Unbounded) => {
            assert!(*i < len);
            (*i, len)
        }
        (Included(si), Included(ei)) => {
            assert!(*si < len);
            assert!(*ei < len);
            assert!(*si <= *ei);
            (*si, ei.checked_add(1).unwrap())
        }
        (Included(si), Excluded(ee)) => {
            assert!(*si < len);
            assert!(*ee <= len);
            assert!(*si <= *ee);
            (*si, *ee)
        }
        (Excluded(_), _) => {
            panic!("Exclude start_bound not supported");
        }
    }
}

/// Destination into which a [`ReadOp`] delivers its data.
enum ROInner<'a> {
    /// A caller-provided byte buffer.
    Buf(&'a mut [u8]),
    /// A [`SubMapping`].
    Map(SubMapping<'a>),
}

/// Represents an abstract requested read operation.
///
/// Exposes an API with various "write" methods, which fulfill the request.
pub struct ReadOp<'a> {
    /// Where the data for this read is delivered.
    inner: ROInner<'a>,
    /// Auxiliary offset supplied by the creator of the operation.
    offset: usize,
    /// Number of bytes written into the operation so far.
    write_offset: usize,
}

impl<'a> ReadOp<'a> {
    /// Creates a read operation whose data is delivered into a mapping.
    ///
    /// # Arguments
    ///
    /// - `op_offset`: An auxiliary offset carried by the operation, naming the
    ///   region which should be accessed in order to populate `mapping`.
    /// - `mapping`: The mapping acting as the "sink" of the read operation.
    pub fn from_mapping(op_offset: usize, mapping: SubMapping<'a>) -> Self {
        let inner = ROInner::Map(mapping);
        Self { inner, offset: op_offset, write_offset: 0 }
    }

    /// Creates a read operation whose data is delivered into a buffer.
    ///
    /// # Arguments
    ///
    /// - `op_offset`: An auxiliary offset carried by the operation, naming the
    ///   region which should be accessed in order to populate `buffer`.
    /// - `buffer`: The buffer acting as the "sink" of the read operation.
    pub fn from_buf(op_offset: usize, buffer: &'a mut [u8]) -> Self {
        let inner = ROInner::Buf(buffer);
        Self { inner, offset: op_offset, write_offset: 0 }
    }

    /// Carves a child read operation out of an existing one.
    ///
    /// The child starts with nothing written, regardless of how much has been
    /// written to `parent`.
    ///
    /// # Arguments
    ///
    /// - `op_offset`: Offset of the child operation. It does not need to
    ///   correlate with the `parent` operation's offset.
    /// - `parent`: The operation from which the child is being split.
    /// - `range`: The portion of the parent handed over to the child.
    pub fn new_child<'b, R>(
        op_offset: usize,
        parent: &'a mut ReadOp,
        range: R,
    ) -> ReadOp<'b>
    where
        'a: 'b,
        R: RangeBounds<usize> + SliceIndex<[u8], Output = [u8]>,
    {
        let inner = match &mut parent.inner {
            ROInner::Buf(b) => ROInner::Buf(&mut b[range]),
            ROInner::Map(m) => {
                let (start, end) = numeric_bounds(range, m.len());
                ROInner::Map(m.subregion(start, end - start).unwrap())
            }
        };
        ReadOp { inner, offset: op_offset, write_offset: 0 }
    }

    /// Total size of the operation.
    pub fn len(&self) -> usize {
        match &self.inner {
            ROInner::Buf(b) => b.len(),
            ROInner::Map(m) => m.len(),
        }
    }
    /// Space left in the operation which has not been written yet.
    pub fn avail(&self) -> usize {
        let total = self.len();
        total.checked_sub(self.write_offset).unwrap()
    }
    /// The auxiliary offset supplied when the operation was created.
    pub fn offset(&self) -> usize {
        self.offset
    }
    /// Amount of data written to the operation so far.
    pub fn bytes_written(&self) -> usize {
        self.write_offset
    }

    /// Writes a single byte.
    pub fn write_u8(&mut self, val: u8) {
        self.write_bytes(&val.to_le_bytes());
    }
    /// Writes `val` in little-endian byte order.
    pub fn write_u16(&mut self, val: u16) {
        self.write_bytes(&val.to_le_bytes());
    }
    /// Writes `val` in little-endian byte order.
    pub fn write_u32(&mut self, val: u32) {
        self.write_bytes(&val.to_le_bytes());
    }
    /// Writes `val` in little-endian byte order.
    pub fn write_u64(&mut self, val: u64) {
        self.write_bytes(&val.to_le_bytes());
    }
    /// Writes all of `data` and advances the write position by its length.
    ///
    /// # Panics
    ///
    /// Panics if `data` is larger than the space still available.
    pub fn write_bytes(&mut self, data: &[u8]) {
        let start = self.write_offset;
        let copy_len = data.len();
        let remaining = self.len().checked_sub(start).unwrap();
        assert!(copy_len <= remaining);

        match &mut self.inner {
            ROInner::Buf(b) => {
                let end = start + copy_len;
                b[start..end].copy_from_slice(data);
            }
            ROInner::Map(m) => {
                // FIXME: this does not properly apply write offsetting
                m.write_bytes(data).unwrap();
            }
        }
        self.write_offset = start + copy_len;
    }
    /// Fills whatever has not been written yet with `val` and marks the
    /// operation as completely written.
    pub fn fill(&mut self, val: u8) {
        let written = self.write_offset;
        match &mut self.inner {
            ROInner::Buf(buf) => buf[written..].fill(val),
            ROInner::Map(m) => {
                let count = m.len() - written;
                m.write_byte(val, count).unwrap();
            }
        }
        self.write_offset = self.len();
    }
}

/// Source from which a [`WriteOp`] obtains its data.
enum WOInner<'a> {
    /// A caller-provided byte buffer.
    Buf(&'a [u8]),
    /// A [`SubMapping`].
    Map(SubMapping<'a>),
}

/// Represents an abstract requested write operation.
///
/// Exposes an API with various "read" methods, which fulfill the request.
pub struct WriteOp<'a> {
    /// Where the data for this write comes from.
    inner: WOInner<'a>,
    /// Auxiliary offset supplied by the creator of the operation.
    offset: usize,
    /// Number of bytes read out of the operation so far.
    read_offset: usize,
}
impl<'a> WriteOp<'a> {
    /// Creates a write operation whose data comes from a mapping.
    ///
    /// # Arguments
    ///
    /// - `op_offset`: An auxiliary offset carried by the operation, naming the
    ///   region within the emulated resource where `mapping` should be
    ///   stored.
    /// - `mapping`: The mapping acting as the "source" of the write operation.
    pub fn from_mapping(op_offset: usize, mapping: SubMapping<'a>) -> Self {
        let inner = WOInner::Map(mapping);
        Self { inner, offset: op_offset, read_offset: 0 }
    }

    /// Creates a write operation whose data comes from a buffer.
    ///
    /// # Arguments
    ///
    /// - `op_offset`: An auxiliary offset carried by the operation, naming the
    ///   region within the emulated resource where `buf` should be stored.
    /// - `buf`: The buffer acting as the "source" of the write operation.
    pub fn from_buf(op_offset: usize, buf: &'a [u8]) -> Self {
        let inner = WOInner::Buf(buf);
        Self { inner, offset: op_offset, read_offset: 0 }
    }

    /// Carves a child write operation out of an existing one.
    ///
    /// The child starts with nothing read, regardless of how much has been
    /// read from `parent`.
    ///
    /// # Arguments
    ///
    /// - `op_offset`: Offset of the child operation. It does not need to
    ///   correlate with the `parent` operation's offset.
    /// - `parent`: The operation from which the child is being split.
    /// - `range`: The portion of the parent handed over to the child.
    pub fn new_child<'b, R>(
        op_offset: usize,
        parent: &'a mut WriteOp,
        range: R,
    ) -> WriteOp<'b>
    where
        'a: 'b,
        R: RangeBounds<usize> + SliceIndex<[u8], Output = [u8]>,
    {
        let inner = match &mut parent.inner {
            WOInner::Buf(b) => WOInner::Buf(&b[range]),
            WOInner::Map(m) => {
                let (start, end) = numeric_bounds(range, m.len());
                WOInner::Map(m.subregion(start, end - start).unwrap())
            }
        };
        WriteOp { inner, offset: op_offset, read_offset: 0 }
    }

    /// Total size of the operation.
    pub fn len(&self) -> usize {
        match &self.inner {
            WOInner::Buf(b) => b.len(),
            WOInner::Map(m) => m.len(),
        }
    }
    /// Amount of data in the operation which has not been read yet.
    pub fn avail(&self) -> usize {
        let total = self.len();
        total.checked_sub(self.read_offset).unwrap()
    }
    /// The auxiliary offset supplied when the operation was created.
    pub fn offset(&self) -> usize {
        self.offset
    }
    /// Amount of data read from the operation so far.
    pub fn bytes_read(&self) -> usize {
        self.read_offset
    }

    /// Reads exactly `COUNT` bytes into a fresh array.
    fn read_val<const COUNT: usize>(&mut self) -> [u8; COUNT] {
        let mut raw = [0u8; COUNT];
        self.read_bytes(&mut raw);
        raw
    }
    /// Reads a single byte.
    pub fn read_u8(&mut self) -> u8 {
        u8::from_le_bytes(self.read_val::<1>())
    }
    /// Reads a little-endian `u16`.
    pub fn read_u16(&mut self) -> u16 {
        u16::from_le_bytes(self.read_val::<2>())
    }
    /// Reads a little-endian `u32`.
    pub fn read_u32(&mut self) -> u32 {
        u32::from_le_bytes(self.read_val::<4>())
    }
    /// Reads a little-endian `u64`.
    pub fn read_u64(&mut self) -> u64 {
        u64::from_le_bytes(self.read_val::<8>())
    }
    /// Fills all of `data` from the operation and advances the read position
    /// by its length. An empty `data` is a no-op.
    ///
    /// # Panics
    ///
    /// Panics if `data` is larger than the amount still available.
    pub fn read_bytes(&mut self, data: &mut [u8]) {
        if data.is_empty() {
            return;
        }
        let start = self.read_offset;
        let copy_len = data.len();
        let remaining = self.len().checked_sub(start).unwrap();
        assert!(copy_len <= remaining);
        match &mut self.inner {
            WOInner::Buf(b) => {
                let end = start + copy_len;
                data.copy_from_slice(&b[start..end]);
            }
            WOInner::Map(m) => {
                // FIXME: this does not properly apply read offsetting
                m.read_bytes(data).unwrap();
            }
        }
        self.read_offset = start + copy_len;
    }
}

/// Either a read or a write operation, borrowed mutably.
pub enum RWOp<'a, 'b> {
    Read(&'a mut ReadOp<'b>),
    Write(&'a mut WriteOp<'b>),
}
impl RWOp<'_, '_> {
    /// The auxiliary offset of the wrapped operation.
    pub fn offset(&self) -> usize {
        match self {
            Self::Read(op) => op.offset(),
            Self::Write(op) => op.offset(),
        }
    }
    /// The total size of the wrapped operation.
    pub fn len(&self) -> usize {
        match self {
            Self::Read(op) => op.len(),
            Self::Write(op) => op.len(),
        }
    }
    /// Whether this wraps a [`ReadOp`].
    pub fn is_read(&self) -> bool {
        match self {
            Self::Read(_) => true,
            Self::Write(_) => false,
        }
    }
    /// Whether this wraps a [`WriteOp`].
    pub fn is_write(&self) -> bool {
        match self {
            Self::Read(_) => false,
            Self::Write(_) => true,
        }
    }
}

/// An address within a guest VM.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct GuestAddr(pub u64);

impl GuestAddr {
    /// Returns the address lying `count` items of type `T` past this one.
    pub fn offset<T: Sized>(&self, count: usize) -> Self {
        let byte_len = count * std::mem::size_of::<T>();
        Self(self.0 + byte_len as u64)
    }
}

/// A region of memory within a guest VM.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct GuestRegion(pub GuestAddr, pub usize);

impl Add<usize> for GuestAddr {
    type Output = Self;

    /// Advances the address by `rhs`.
    fn add(self, rhs: usize) -> Self::Output {
        let GuestAddr(base) = self;
        GuestAddr(base + rhs as u64)
    }
}
impl BitAnd<usize> for GuestAddr {
    type Output = Self;

    /// Masks the address with `rhs`.
    fn bitand(self, rhs: usize) -> Self::Output {
        let mask = rhs as u64;
        GuestAddr(self.0 & mask)
    }
}

pub use crate::lifecycle::Lifecycle;

/// Size of a page: `1 << PAGE_SHIFT`.
pub const PAGE_SIZE: usize = 0x1000;
/// Mask selecting the offset-within-page bits of a value.
pub const PAGE_OFFSET: usize = 0xfff;
/// Mask clearing the offset-within-page bits of a value (the complement of
/// [`PAGE_OFFSET`]).
pub const PAGE_MASK: usize = usize::MAX - PAGE_OFFSET;
/// Base-2 logarithm of [`PAGE_SIZE`].
pub const PAGE_SHIFT: usize = 12;

/// Rounds `val` up to the next multiple of `to`, which must be a power of two.
///
/// # Panics
///
/// Panics if `to` is not a power of two (zero included), or if rounding up
/// would overflow a `usize`.
pub fn round_up_p2(val: usize, to: usize) -> usize {
    // Zero is not a power of two, so this also rejects `to == 0`.
    assert!(to.is_power_of_two());

    let low_bits = to - 1;
    let bumped = val.checked_add(low_bits).unwrap();
    bumped & !low_bits
}

/// Bytes per KiB
pub const KB: usize = 1 << 10;
/// Bytes per MiB
pub const MB: usize = 1024 * KB;
/// Bytes per GiB
pub const GB: usize = 1024 * MB;
/// Bytes per TiB
pub const TB: usize = 1024 * GB;

#[cfg(test)]
mod test {
    use super::*;

    /// Each fixed-width `write_*` helper stores its value little-endian at the
    /// start of a buffer-backed `ReadOp`.
    #[test]
    fn readop_base_size() {
        let mut buf = [0u8; 8];
        let mut ro8 = ReadOp::from_buf(0, &mut buf[0..1]);
        ro8.write_u8(1);
        assert_eq!(buf, [1, 0, 0, 0, 0, 0, 0, 0]);

        let mut ro16 = ReadOp::from_buf(0, &mut buf[0..2]);
        ro16.write_u16(0x2000);
        assert_eq!(buf, [0, 0x20, 0, 0, 0, 0, 0, 0]);

        let mut ro32 = ReadOp::from_buf(0, &mut buf[0..4]);
        ro32.write_u32(0x4000_0000);
        assert_eq!(buf, [0, 0, 0, 0x40, 0, 0, 0, 0]);

        let mut ro64 = ReadOp::from_buf(0, &mut buf);
        ro64.write_u64(0x8000_0000_0000_0000);
        assert_eq!(buf, [0, 0, 0, 0, 0, 0, 0, 0x80]);
    }

    /// Each fixed-width `read_*` helper decodes a little-endian value from the
    /// start of a buffer-backed `WriteOp`.
    #[test]
    fn writeop_base_size() {
        let buf = [0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, 0x80];
        let mut wo8 = WriteOp::from_buf(0, &buf[0..1]);
        assert_eq!(wo8.read_u8(), 0x10);

        let mut wo16 = WriteOp::from_buf(0, &buf[0..2]);
        assert_eq!(wo16.read_u16(), 0x2010);

        let mut wo32 = WriteOp::from_buf(0, &buf[0..4]);
        assert_eq!(wo32.read_u32(), 0x40302010);

        let mut wo64 = WriteOp::from_buf(0, &buf);
        assert_eq!(wo64.read_u64(), 0x8070605040302010);
    }

    /// Writing more than the operation can hold panics.
    #[test]
    #[should_panic]
    fn readop_oversize() {
        let mut buf = [0u8];
        let mut ro8 = ReadOp::from_buf(0, &mut buf);
        ro8.write_u16(0x1000);
    }

    /// Reading more than the operation holds panics.
    #[test]
    #[should_panic]
    fn writeop_oversize() {
        let buf = [0u8];
        let mut wo8 = WriteOp::from_buf(0, &buf);
        let _ = wo8.read_u16();
    }

    /// Consecutive short writes land one after another in the buffer.
    #[test]
    fn readop_short() {
        let mut buf = [0u8, 0u8];
        let mut ro = ReadOp::from_buf(0, &mut buf);
        ro.write_u8(0x10);
        ro.write_u8(0x20);
        assert_eq!(buf, [0x10, 0x20]);
    }

    /// Consecutive short reads consume the buffer front to back.
    #[test]
    fn writeop_short() {
        let buf = [0x10, 0x20];
        let mut wo = WriteOp::from_buf(0, &buf);
        assert_eq!(wo.read_u8(), 0x10);
        assert_eq!(wo.read_u8(), 0x20);
    }
}


================================================
FILE: lib/propolis/src/cpuid.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![allow(unused)]

use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::num::NonZeroU8;
use std::ops::Bound;

use bhyve_api::vcpu_cpuid_entry;
use cpuid_utils::{CpuidIdent, CpuidMap, CpuidSet, CpuidValues, CpuidVendor};

/// Splits a raw [vcpu_cpuid_entry] into the ([CpuidIdent], [CpuidValues])
/// pair used when inserting into a [CpuidSet].
///
/// The subleaf of the identifier is populated only when the entry carries the
/// `VCE_FLAG_MATCH_INDEX` flag.
///
/// This would be implemented as a [From] trait if rust let us.
pub fn from_raw(
    value: bhyve_api::vcpu_cpuid_entry,
) -> (CpuidIdent, CpuidValues) {
    let matches_index =
        (value.vce_flags & bhyve_api::VCE_FLAG_MATCH_INDEX) != 0;
    let subleaf = if matches_index { Some(value.vce_index) } else { None };

    let ident = CpuidIdent { leaf: value.vce_function, subleaf };
    let values = CpuidValues {
        eax: value.vce_eax,
        ebx: value.vce_ebx,
        ecx: value.vce_ecx,
        edx: value.vce_edx,
    };
    (ident, values)
}

/// Errors which can be reported while specializing a set of CPUID leaves.
#[derive(Debug, thiserror::Error)]
pub enum SpecializeError {
    #[error("unsupported cache level")]
    UnsupportedCacheLevel,
    #[error("missing vcpu count")]
    MissingVcpuCount,
    #[error("missing vcpu id")]
    MissingVcpuId,
    // NOTE: the rendered message does not include any of the fields below;
    // they are only available through the `Debug` output or by matching.
    #[error("unable to specialize leaf")]
    IncompatibleTopology { leaf: u32, num_vcpu: u32, why: Option<&'static str> },
}

/// Specialize a set of cpuid leafs for provided attributes.
///
/// This includes things such as a CPU topology (cores/threads/etc), a given
/// vCPU ID (APIC, core/thread ID, etc), or other info tidbits.
#[derive(Default)]
pub struct Specializer {
    /// Whether SMT is enabled; set together with `num_vcpu`.
    has_smt: bool,
    /// Number of vCPUs in the instance, if specified.
    num_vcpu: Option<NonZeroU8>,
    /// ID of the vCPU to specialize for, if specified.
    vcpuid: Option<i32>,
    /// Topology kinds requested for population.
    cpu_topo_populate: BTreeSet<TopoKind>,
    /// Topology kinds requested for clearing.
    cpu_topo_clear: BTreeSet<TopoKind>,
    /// Whether cache topology information should be updated.
    do_cache_topo: bool,
}
impl Specializer {
    pub fn new() -> Self {
        Self::default()
    }

    /// Record how many vCPUs the instance has, and whether the guest should
    /// see SMT as enabled.
    pub fn with_vcpu_count(mut self, count: NonZeroU8, has_smt: bool) -> Self {
        self.num_vcpu = Some(count);
        self.has_smt = has_smt;
        self
    }

    /// Record the ID of the vCPU that the CPUID data is specialized for.
    ///
    /// # Panics
    ///
    /// Panics unless `vcpuid`, converted to `usize`, is below
    /// `crate::vcpu::MAXCPU`. Negative IDs wrap to very large values in that
    /// conversion and are therefore rejected too.
    pub fn with_vcpuid(mut self, vcpuid: i32) -> Self {
        assert!((vcpuid as usize) < crate::vcpu::MAXCPU);
        self.vcpuid = Some(vcpuid);
        self
    }

    /// Specify CPU topology types to render into the specialized [CpuidSet]
    ///
    /// Without basic information such as the number of vCPUs (set by
    /// [`Self::with_vcpu_count()`]), population of the requested topology
    /// information may be incomplete.
    pub fn with_cpu_topo(
        self,
        populate: impl Iterator<Item = TopoKind>,
    ) -> Self {
        let mut cpu_topo_populate = BTreeSet::new();

        for t in populate {
            cpu_topo_populate.insert(t);
        }

        Self { cpu_topo_populate, ..self }
    }

    /// Specify CPU topology types to clear from the specialized [CpuidSet]
    ///
    /// Some leafs in the provided set may not match expectations for the given
    /// CPU vendor.  Without populating it with generated data (via
    /// [`Self::with_cpu_topo()`]), those leafs can be cleared out.
    pub fn clear_cpu_topo(self, clear: impl Iterator<Item = TopoKind>) -> Self {
        let mut cpu_topo_clear = BTreeSet::new();
        for t in clear {
            cpu_topo_clear.insert(t);
        }

        Self { cpu_topo_clear, ..self }
    }

    /// Request that cache topology information be rewritten to match the
    /// configured vCPU count and SMT capabilities.
    pub fn with_cache_topo(mut self) -> Self {
        self.do_cache_topo = true;
        self
    }

    /// Given the attributes and modifiers specified in this [Specializer],
    /// render an updated [CpuidSet] reflecting those data.
    pub fn execute(
        self,
        mut set: CpuidSet,
    ) -> Result<CpuidSet, SpecializeError> {
        match set.vendor() {
            CpuidVendor::Amd => {
                if self.do_cache_topo && self.num_vcpu.is_some() {
                    self.fix_amd_cache_topo(&mut set)?;
                }
            }
            _ => {}
        }

        // apply any requested topo info fixups
        self.fix_cpu_topo(&mut set)?;

        // APIC ID based on vcpuid
        if let Some(vcpuid) = self.vcpuid.as_ref() {
            if let Some(ent) = set.get_mut(CpuidIdent::leaf(1)) {
                // bits 31:24 contain initial APIC ID
                ent.ebx &= !0xff000000;
                ent.ebx |= ((*vcpuid as u32) & 0xff) << 24;
            }
        }

        // logical CPU count (if SMT is enabled)
        if let Some(num_vcpu) = self.num_vcpu.as_ref() {
            if self.has_smt {
                if let Some(ent) = set.get_mut(CpuidIdent::leaf(1)) {
                    ent.edx |= (0x1 << 28);
                    // bits 23:16 contain max IDs for logical CPUs in package
                    ent.ebx &= !0xff0000;
                    ent.ebx |= u32::from(num_vcpu.get()) << 16;
                }
            }
        }

        Ok(set)
    }

    fn fix_amd_cache_topo(
        &self,
        set: &mut CpuidSet,
    ) -> Result<(), SpecializeError> {
        assert!(self.do_cache_topo);
        let num = self.num_vcpu.unwrap().get();
        for ecx in 0..u32::MAX {
            match set.get_mut(CpuidIdent::subleaf(0x8000001d, ecx)) {
                None => break,
                Some(vals) => {
                    // bits 7:5 hold the cache level
                    let visible_count = match ((vals.eax & 0b11100000) >> 5) {
                        0b001 | 0b010 => {
                            // L1/L2 shared by SMT siblings
                            if self.has_smt {
                                2
                            } else {
                                1
                            }
                        }
                        0b011 => {
                            // L3 shared by all vCPUs
                            // TODO: segregate by sockets, if configured
                            u32::from(num)
                        }
                        _ => {
                            // unceremonious handling of unexpected cache levels
                            return Err(SpecializeError::UnsupportedCacheLevel);
                        }
                    };
                    // the number of logical CPUs (minus 1) sharing this cache
                    // is stored in bits 25:14
                    vals.eax &= !(0xfff << 14);
                    vals.eax |= (visible_count - 1) << 14;
                }
            }
        }
        Ok(())
    }

    /// Apply the requested CPU topology fixups to `set`.
    ///
    /// Every [TopoKind] named via [`Self::with_cpu_topo()`] or
    /// [`Self::clear_cpu_topo()`] is visited once:
    ///
    /// - A kind that is only in the "clear" collection has its leaf removed
    ///   from `set`. For [`TopoKind::Ext1E`] the topology-extensions feature
    ///   bit in leaf 0x8000_0001 is cleared as well.
    /// - A kind to populate is skipped if `set` does not already contain its
    ///   leaf, and is otherwise rewritten from the vCPU count, SMT setting
    ///   and vCPU ID held by this [Specializer].
    ///
    /// # Errors
    ///
    /// - [`SpecializeError::MissingVcpuCount`] if no vCPU count was provided.
    ///   This is checked up front, even when no topology kinds were requested.
    /// - [`SpecializeError::MissingVcpuId`] if leaf 0xB is populated without a
    ///   vCPU ID.
    /// - [`SpecializeError::IncompatibleTopology`] if the topology cannot be
    ///   encoded in the leaf being populated.
    fn fix_cpu_topo(&self, set: &mut CpuidSet) -> Result<(), SpecializeError> {
        // The number of logical threads available to the guest.
        let num_vcpu = self
            .num_vcpu
            .ok_or(SpecializeError::MissingVcpuCount)
            .map(|n| u32::from(n.get()))?;

        // The number of logical processors the guest should see. If we
        // indicate that SMT is enabled, then vCPUs are presented as pairs
        // of sibling threads on vproc-many processors.
        let num_vproc = if self.has_smt {
            // If vCPUs are not even but we're asked to indicate SMT, we'll have
            // one leftover core which won't have an SMT sibling. We should
            // reject this situation, but we're in a bit of a pickle: guests are
            // set up with `has_smt: true`, with no even-vCPUs constraint, which
            // seems to have been OK in practice due to #940.
            //
            // One remaining question here is: what topology information is
            // communicated for a single-processor system?  There have not been
            // single-processor x86 systems since at *least* Zen, so there's no
            // hardware to compare against here.
            if num_vcpu == 1 {
                // Round up to one virtual processor and hope that guest OSes
                // handle this cleanly.
                1
            } else {
                num_vcpu >> 1
            }
        } else {
            num_vcpu
        };

        for topo in self.cpu_topo_populate.union(&self.cpu_topo_clear) {
            // The discriminant of each kind is the CPUID leaf it describes.
            let leaf = *topo as u32;

            if !self.cpu_topo_populate.contains(topo) {
                // We aren't fixing up this leaf, we're just asked to entirely
                // discard it.
                set.remove_leaf(leaf);

                if *topo == TopoKind::Ext1E {
                    let ext_features = CpuidIdent::leaf(0x8000_0001);
                    // If the CPUID profile defines leaf 0x8000_0001, Extended
                    // Processor and Feature Identifier, make sure the bit that
                    // would indicate leaf Ext1E support is clear. We've just
                    // discarded the leaf, so there's no meaningful data here.
                    if let Some(features) = set.get_mut(ext_features) {
                        // APM volume 3, "CPUID Fn8000_0001_ECX Feature
                        // Identifiers"
                        const TOPO_EXTENSIONS_BIT: u32 = 1 << 22;
                        features.ecx &= !TOPO_EXTENSIONS_BIT;
                    }
                }

                continue;
            }

            if !set.contains_leaf(leaf) {
                // If the leaf isn't present at all, we won't try to specialize
                // it. This lets callers request specializing any/all leaves
                // related to CPU topology without us inventing Intel-only
                // leaves on AMD or AMD-only leaves on Intel.
                continue;
            }

            match topo {
                TopoKind::Std4 => {
                    // Leaf 4 is reserved by AMD, but Intel includes some
                    // topology information here that OSes may use. From the
                    // Intel SDM vol. 2A on
                    //
                    // > Deterministic Cache Parameters Leaf
                    // > (Initial EAX Value = 04H)
                    //
                    // Bits 25-14 are the maximum number of addressable IDs for
                    // logical processors sharing this cache (e.g. "1" for L1
                    // and L2 caches, and "all" for L3)
                    //
                    // Bits 31-26 are the maximum number of addressable IDs for
                    // processor cores in the package. This is constant for all
                    // valid subleaves.

                    // If the number of vCPUs is more than bits 31-26 can
                    // represent, I don't know what to do! This is probably an
                    // all-bits-set-and-use-another-topo-method condition, but
                    // bail out here and demand someone take a look.
                    //
                    // The field stores the count minus one, so exactly 64
                    // processors (0x3f) is still representable; only counts
                    // above 64 must be rejected.
                    if num_vproc > 0b100_0000 {
                        return Err(SpecializeError::IncompatibleTopology {
                            leaf,
                            num_vcpu,
                            why: Some(
                                "Don't know how to set CPUID leaf 4 processor \
                                 count if there are more than 64 processors!",
                            ),
                        });
                    } else if num_vproc == 0 {
                        return Err(SpecializeError::IncompatibleTopology {
                            leaf,
                            num_vcpu,
                            why: Some(
                                "Cannot specialize CPUID leaf 4 \
                                 for 0 processor VM",
                            ),
                        });
                    }

                    // Cache types come in any order, but type 0 means there are
                    // no more caches, so iterate and adjust as needed until we
                    // see that.
                    for i in 0..cpuid_utils::bits::MAX_REASONABLE_SUBLEAVES {
                        let subleaf = set.get_mut(CpuidIdent::subleaf(4, i));
                        let Some(subleaf) = subleaf else {
                            // We've reached the end of provided subleaves, so
                            // we're done here.
                            break;
                        };

                        const LEAF4_EAX_CACHE_TYPE: u32 = 0x00_00_00_1f;
                        const LEAF4_EAX_CACHE_LEVEL: u32 = 0x00_00_00_e0;
                        // EAX bits 13-8 are reserved or not consulted here.
                        const LEAF4_EAX_VCPU_MASK: u32 = 0x03_ff_c0_00;
                        const LEAF4_EAX_VPROC_MASK: u32 = 0xfc_00_00_00;

                        let ty = subleaf.eax & LEAF4_EAX_CACHE_TYPE;
                        let level = (subleaf.eax & LEAF4_EAX_CACHE_LEVEL) >> 5;

                        if ty == 0 {
                            // "Null" cache. This is not a cache, and there are
                            // no more caches. We're done here.
                            break;
                        }

                        // Zero out the prior processor core count.
                        subleaf.eax &= !LEAF4_EAX_VPROC_MASK;
                        // The processor count is encoded as one less than the
                        // real count (e.g. 0x3f is 64 processors, 0x00 is 1
                        // processor)
                        subleaf.eax |= (num_vproc - 1) << 26;

                        // Present L1 and L2 caches as per-thread, L3 is across
                        // the whole VM. Either way the prior sharing count is
                        // discarded first; for L1/L2 the range is then left at
                        // 0, which means only one vCPU shares the cache.
                        subleaf.eax &= !LEAF4_EAX_VCPU_MASK;
                        if level >= 3 {
                            let shifted_vcpu = (num_vcpu - 1) << 14;
                            if shifted_vcpu & !LEAF4_EAX_VCPU_MASK != 0 {
                                return Err(
                                    SpecializeError::IncompatibleTopology {
                                        leaf,
                                        num_vcpu,
                                        why: Some("too many vCPUs"),
                                    },
                                );
                            }
                            subleaf.eax |= shifted_vcpu;
                        }
                    }
                }
                TopoKind::StdB => {
                    // Queries with invalid ecx will get all-zeroes
                    //
                    // NOTE(review): the results of the `insert` calls in this
                    // arm are discarded; confirm that is intended if `insert`
                    // can fail for a leaf that already holds subleaves.
                    set.insert(CpuidIdent::leaf(leaf), CpuidValues::default());
                    let Some(vcpuid) = self.vcpuid.map(|id| id as u32) else {
                        return Err(SpecializeError::MissingVcpuId);
                    };

                    if self.has_smt {
                        set.insert(
                            CpuidIdent::subleaf(leaf, 0),
                            CpuidValues {
                                eax: 0x1,
                                ebx: 0x2,
                                ecx: 0x100,
                                edx: vcpuid,
                            },
                        );
                    } else {
                        // We notionally want to insert a leaf like
                        // CpuidValues {
                        //     eax: 0x0,
                        //     ebx: 0x1,
                        //     ecx: 0x100,
                        //     edx: vcpuid,
                        // }
                        // here, but EAX=0 implies the leaf is invalid, rather
                        // than the desired "shift x2APIC ID right by 0 to get
                        // to the topology ID of processor cores"
                        //
                        // The question here is: what does this leaf look like
                        // with hyperthreading disabled? Does this leaf return 0
                        // in EAX implying that it is invalid, or is it 1, with
                        // x2APIC IDs skipping every other entry for the
                        // disabled SMT siblings? Whatever hardware does here is
                        // least likely to surprise guest OSes.
                        return Err(SpecializeError::IncompatibleTopology {
                            leaf,
                            num_vcpu,
                            why: Some("Leaf B.1 would have EAX=0"),
                        });
                    }
                    // TODO: Not wholly clear if we should just set this to
                    // `ceil(log2(num_vcpu))` (which should guarantee that the
                    // VM is conceptually one socket) or set this to 7 or 8 like
                    // a "normal" processor.
                    //
                    // Go with 8 for now and error if num_vcpu is above that:
                    // you should trip over this quickly if you've bumped up
                    // VM_MAXCPU and you may have hardware to compare against at
                    // that point.
                    //
                    // A shift of 8 leaves room for 256 IDs, so only counts
                    // above 256 are rejected. (With the vCPU count held in a
                    // `NonZeroU8` this cannot currently trigger.)
                    if num_vcpu > 256 {
                        return Err(SpecializeError::IncompatibleTopology {
                            leaf,
                            num_vcpu,
                            why: Some(
                                "Don't know how to specialize CPUID leaf \
                                       B for more than 256 processors!",
                            ),
                        });
                    }
                    set.insert(
                        CpuidIdent::subleaf(leaf, 1),
                        CpuidValues {
                            eax: 0x8,
                            ebx: num_vcpu,
                            ecx: 0x201,
                            edx: vcpuid,
                        },
                    );
                }
                TopoKind::Std1F => {
                    // TODO: add 0x1f topo info
                }
                TopoKind::Ext1E => {
                    // A missing vCPU ID is treated as vCPU 0 here.
                    let id = self.vcpuid.unwrap_or(0) as u32;
                    let mut ebx = id;
                    if self.has_smt {
                        // bits 15:8 hold the zero-based threads-per-compute-unit
                        ebx |= 0x100;
                    }
                    set.insert(
                        CpuidIdent::leaf(leaf),
                        CpuidValues {
                            eax: id,
                            ebx,
                            // ECX set to 0 indicates that there is one node in
                            // the virtual CPU socket, and that the current
                            // logical processor is in node 0. Until we support
                            // core scheduling and can meaningfully communicate
                            // NUMA topology to guests this is an OK default.
                            ecx: 0,
                            edx: 0,
                        },
                    );
                }
            }
        }
        Ok(())
    }
}

/// Flavors of CPU topology information
///
/// Each variant's discriminant is the number of the CPUID leaf that carries
/// the information, so `kind as u32` yields the leaf to operate on.
#[derive(
    Clone, Copy, Debug, Ord, PartialOrd, Eq, PartialEq, strum::EnumIter,
)]
pub enum TopoKind {
    /// Leaf 0x4 (legacy Intel cache topology with some CPU information)
    Std4 = 0x4,
    /// Leaf 0xB (AMD, and legacy on Intel)
    StdB = 0xb,
    /// Leaf 0x1F (Intel)
    Std1F = 0x1f,
    /// Leaf 0x8000001E (AMD)
    Ext1E = 0x8000001e,
}

impl TopoKind {
    /// Yield the topology kinds that Propolis is able to specialize, for use
    /// as a "reasonable default" when specializing a CPUID profile.
    pub fn supported() -> std::array::IntoIter<Self, 2> {
        // Topology leaves 1Fh and 8000_001E are partially or entirely TODO, so
        // they are left out of the supported list.
        const SUPPORTED: [TopoKind; 2] = [TopoKind::Std4, TopoKind::StdB];
        IntoIterator::into_iter(SUPPORTED)
    }
}

/// Decode the Processor Brand String (aka ProcName) carried by extended leafs
/// 0x8000_0002 - 0x8000_0004.
///
/// The registers of each leaf are concatenated in eax, ebx, ecx, edx order,
/// four little-endian bytes apiece. Everything from the first NUL byte onward
/// is dropped, and surrounding whitespace is trimmed from what remains.
///
/// # Errors
///
/// Returns the [std::str::Utf8Error] if the bytes preceding the first NUL are
/// not valid UTF-8.
pub fn parse_brand_string(
    leafs: [CpuidValues; 3],
) -> Result<String, std::str::Utf8Error> {
    let bytes: Vec<u8> = leafs
        .iter()
        .flat_map(|regs| [regs.eax, regs.ebx, regs.ecx, regs.edx])
        .flat_map(u32::to_le_bytes)
        .collect();

    // Only the bytes ahead of the first NUL (if any) are part of the string.
    let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len());
    let text = std::str::from_utf8(&bytes[..end])?;

    // Trim any bounding whitespace which remains.
    Ok(text.trim().to_string())
}


================================================
FILE: lib/propolis/src/enlightenment/bhyve.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Provides a bhyve-compatible guest-hypervisor interface.
//!
//! This interface supplies no special enlightenments; it merely identifies
//! itself as a bhyve hypervisor in CPUID leaf 0x4000_0000.

use std::sync::Arc;

use cpuid_utils::{
    bits::HYPERVISOR_BASE_LEAF, CpuidIdent, CpuidSet, CpuidValues,
};

use crate::{
    accessors::MemAccessor,
    common::{Lifecycle, VcpuId},
    enlightenment::{AddCpuidError, Enlightenment},
    msr::{MsrId, RdmsrOutcome, WrmsrOutcome},
    vmm::VmmHdl,
};

/// An implementation of the bhyve guest-hypervisor interface. This interface
/// exposes no special enlightenments; its only purpose is to inject the
/// appropriate hypervisor ID into CPUID leaf 0x4000_0000, since this leaf will
/// not otherwise appear in a propolis-server instance specification's CPUID
/// settings.
///
/// This is a unit struct: it carries no state of its own.
pub struct BhyveGuestInterface;

impl Lifecycle for BhyveGuestInterface {
    /// Returns the fixed name identifying this component type.
    fn type_name(&self) -> &'static str {
        const TYPE_NAME: &str = "bhyve-guest-interface";
        TYPE_NAME
    }
}

impl Enlightenment for BhyveGuestInterface {
    /// Adds the bhyve hypervisor identification leaf (0x4000_0000) to `cpuid`
    /// by way of the module-level `add_cpuid` helper.
    fn add_cpuid(&self, cpuid: &mut CpuidSet) -> Result<(), AddCpuidError> {
        // Leaf 0x4000_0000 reports the maximum hypervisor leaf in eax (the
        // base leaf itself, as no others are defined here) and the vendor ID
        // "bhyve bhyve " split across ebx/ecx/edx.
        let ident = CpuidIdent::leaf(HYPERVISOR_BASE_LEAF);
        let values = CpuidValues {
            eax: HYPERVISOR_BASE_LEAF,
            ebx: u32::from_le_bytes(*b"bhyv"),
            ecx: u32::from_le_bytes(*b"e bh"),
            edx: u32::from_le_bytes(*b"yve "),
        };

        let mut to_add = CpuidSet::new(cpuid.vendor());
        to_add.insert(ident, values).expect("the map was previously empty");

        super::add_cpuid(cpuid, to_add)
    }

    /// This interface defines no MSRs, so no read is ever handled.
    fn rdmsr(&self, _vcpu: VcpuId, _msr: MsrId) -> RdmsrOutcome {
        RdmsrOutcome::NotHandled
    }

    /// This interface defines no MSRs, so no write is ever handled.
    fn wrmsr(&self, _vcpu: VcpuId, _msr: MsrId, _value: u64) -> WrmsrOutcome {
        WrmsrOutcome::NotHandled
    }

    /// Nothing to attach: this interface uses neither guest memory nor the
    /// VMM handle.
    fn attach(&self, _parent: &MemAccessor, _vmm_hdl: Arc<VmmHdl>) {}
}


================================================
FILE: lib/propolis/src/enlightenment/hyperv/bits.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Constant definitions and flags for Hyper-V emulations. These are drawn from
//! the Hyper-V TLFS version 6.0b (referred to as "TLFS" below). See the parent
//! module documentation for more details.
//!
//! Where possible, constants in this module (such as MSR identifiers) are given
//! names that match those used in the TLFS.

use cpuid_utils::CpuidValues;

/// The highest hypervisor CPUID leaf that every Hyper-V-compatible hypervisor
/// is required to support (0x4000_0005).
pub(super) const HYPERV_MIN_REQUIRED_CPUID_LEAF: u32 = 0x4000_0005;

/// Template for CPUID leaf 0x4000_0000, the hypervisor identification leaf.
/// eax receives the highest valid CPUID leaf in the hypervisor range; ebx,
/// ecx, and edx together receive a 12-byte vendor ID.
///
/// In order to get both Linux and Windows guests to accept these
/// enlightenments, the ebx/ecx/edx ID here is set to "Microsoft Hv". Windows
/// guests will accept other vendor IDs (they look at leaf 0x4000_0001 eax to
/// identify the hypervisor interface instead of reading the vendor ID in leaf
/// 0), but Linux guests only consider the vendor ID.
const HYPERV_LEAF_0_VALUES: CpuidValues = CpuidValues {
    eax: HYPERV_MIN_REQUIRED_CPUID_LEAF,
    ebx: u32::from_le_bytes(*b"Micr"),
    ecx: u32::from_le_bytes(*b"osof"),
    edx: u32::from_le_bytes(*b"t Hv"),
};

/// Builds the values for CPUID leaf 0x4000_0000, the hypervisor identification
/// leaf: eax is set to `max_leaf`, and ebx, ecx, and edx carry the vendor ID
/// from [`HYPERV_LEAF_0_VALUES`].
///
/// `max_leaf` supplies the maximum valid CPUID leaf in the hypervisor range.
///
/// # Panics
///
/// Panics if `max_leaf` is less than [`HYPERV_MIN_REQUIRED_CPUID_LEAF`].
pub(super) fn hyperv_leaf_0_values(max_leaf: u32) -> CpuidValues {
    if max_leaf < HYPERV_MIN_REQUIRED_CPUID_LEAF {
        panic!("requested max leaf {max_leaf:#x} less than minimum required");
    }

    let mut values = HYPERV_LEAF_0_VALUES;
    values.eax = max_leaf;
    values
}

/// Hyper-V leaf 0x4000_0001 contains an (ostensibly vendor-neutral) interface
/// identifier. eax receives the four ASCII bytes "Hv#1"; the other three
/// outputs are reserved.
pub(super) const HYPERV_LEAF_1_VALUES: CpuidValues =
    CpuidValues { eax: u32::from_le_bytes(*b"Hv#1"), ebx: 0, ecx: 0, edx: 0 };

/// Hyper-V leaf 0x4000_0002 contains hypervisor version information. This is
/// always reported as 0, which avoids having to reason about what it means to
/// expose a specific hypervisor version across a live migration between
/// potentially different host and/or Propolis versions.
pub(super) const HYPERV_LEAF_2_VALUES: CpuidValues =
    CpuidValues { eax: 0, ebx: 0, ecx: 0, edx: 0 };

bitflags::bitflags! {
    /// Hyper-V leaf 0x4000_0003 eax returns synthetic MSR access rights.
    /// Only the bits actually used by this enlightenment stack are enumerated
    /// here.
    #[derive(Clone, Copy, Debug, PartialEq, Eq)]
    pub struct HyperVLeaf3Eax: u32 {
        /// Required to read [`HV_X64_MSR_TIME_REF_COUNT`].
        const PARTITION_REFERENCE_COUNTER = 1 << 1;
        /// Required to access [`HV_X64_MSR_GUEST_OS_ID`] and
        /// [`HV_X64_MSR_HYPERCALL`].
        const HYPERCALL = 1 << 5;
        /// Required to read [`HV_X64_MSR_VP_INDEX`].
        const VP_INDEX = 1 << 6;
        /// Required to access [`HV_X64_MSR_REFERENCE_TSC`].
        const PARTITION_REFERENCE_TSC = 1 << 9;

        // Bits 14-31 of this register are reserved.
    }
}

impl Default for HyperVLeaf3Eax {
    /// Grants access to the VP index and hypercall MSRs. This is the minimum
    /// set of access rights that all Hyper-V-compatible hypervisors must grant.
    fn default() -> Self {
        HyperVLeaf3Eax::VP_INDEX | HyperVLeaf3Eax::HYPERCALL
    }
}

/// Hyper-V leaf 0x4000_0004 describes behavior that the guest OS should
/// implement for optimal performance. Propolis expresses no opinion about these
/// options, with one exception: ebx is set to all ones to indicate that the
/// guest should never try to notify the hypervisor about failed spinlock
/// acquisitions.
pub(super) const HYPERV_LEAF_4_VALUES: CpuidValues =
    CpuidValues { eax: 0, ebx: u32::MAX, ecx: 0, edx: 0 };

/// Hyper-V leaf 0x4000_0005 describes the hypervisor's CPU and interrupt
/// remapping limits. Hypervisors may decline to expose these limits by
/// publishing 0s to this leaf, which is what is done here.
pub(super) const HYPERV_LEAF_5_VALUES: CpuidValues =
    CpuidValues { eax: 0, ebx: 0, ecx: 0, edx: 0 };

// Synthetic MSR identifiers. Each doc comment below notes whether the register
// is read-only or read-write, and which `HyperVLeaf3Eax` privilege it requires.

/// Allows the guest to report its type and version information. See TLFS
/// section 2.6 for details about this MSR's format.
///
/// Guest OSes are required to identify themselves via this MSR before they can
/// set the enabled bit in [`HV_X64_MSR_HYPERCALL`] or make any hypercalls.
///
/// Read-write; requires the [`HyperVLeaf3Eax::HYPERCALL`] privilege.
pub(super) const HV_X64_MSR_GUEST_OS_ID: u32 = 0x4000_0000;

/// Specifies the guest physical address at which the guest would like to place
/// the hypercall page. See TLFS section 3.13 and the [`MsrHypercallValue`]
/// struct.
///
/// Read-write; requires the [`HyperVLeaf3Eax::HYPERCALL`] privilege.
///
/// [`MsrHypercallValue`]: super::hypercall::MsrHypercallValue
pub(super) const HV_X64_MSR_HYPERCALL: u32 = 0x4000_0001;

/// Guests may read this register to obtain the index of the vCPU that read the
/// register.
///
/// Read-only; requires the [`HyperVLeaf3Eax::VP_INDEX`] privilege.
pub(super) const HV_X64_MSR_VP_INDEX: u32 = 0x4000_0002;

/// Guests may read this register to obtain the time since this VM was created,
/// in 100-nanosecond units.
///
/// Read-only; requires the [`HyperVLeaf3Eax::PARTITION_REFERENCE_COUNTER`]
/// privilege.
pub(super) const HV_X64_MSR_TIME_REF_COUNT: u32 = 0x4000_0020;

/// Specifies the guest physical address at which the guest would like to place
/// the reference TSC page. See TLFS section 12.7 and the
/// [`MsrReferenceTscValue`] struct.
///
/// Read-write; requires the [`HyperVLeaf3Eax::PARTITION_REFERENCE_TSC`]
/// privilege.
///
/// [`MsrReferenceTscValue`]: super::tsc::MsrReferenceTscValue
pub(super) const HV_X64_MSR_REFERENCE_TSC: u32 = 0x4000_0021;


================================================
FILE: lib/propolis/src/enlightenment/hyperv/hypercall.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Support for hypercalls and their related MSRs.

use crate::{
    common::{GuestAddr, PAGE_MASK, PAGE_SHIFT, PAGE_SIZE},
    vmm::Pfn,
};

/// Bit position of the "locked" flag within the hypercall MSR value.
const LOCKED_BIT: u64 = 1;
/// Mask selecting the "locked" flag (bit 1).
const LOCKED_MASK: u64 = 1 << LOCKED_BIT;
/// Bit position of the "enabled" flag within the hypercall MSR value.
const ENABLED_BIT: u64 = 0;
/// Mask selecting the "enabled" flag (bit 0).
const ENABLED_MASK: u64 = 1 << ENABLED_BIT;

/// Represents a value written to the [`HV_X64_MSR_HYPERCALL`] register.
///
/// Writing to this register enables the hypercall page. The hypervisor
/// overwrites this page with an instruction sequence that the guest should
/// execute in order to issue a call to the hypervisor. See
/// [`HYPERCALL_INSTRUCTION_SEQUENCE`].
///
/// Bit 0 is the "enabled" flag and bit 1 is the "locked" flag. Bits 11:2 of
/// this register are reserved. The TLFS specifies that the guest
/// "should ignore [them] on reads and preserve [them] on writes," but imposes
/// no particular penalties on guests that modify these bits.
///
/// The wrapped `u64` is the raw register value.
///
/// [`HV_X64_MSR_HYPERCALL`]: super::bits::HV_X64_MSR_HYPERCALL
#[derive(Clone, Copy, Default)]
pub(super) struct MsrHypercallValue(pub(super) u64);

impl std::fmt::Debug for MsrHypercallValue {
    /// Formats the raw register value alongside its decoded fields. The raw
    /// value and the guest physical address are rendered as hexadecimal
    /// strings.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let raw = format!("{:#x}", self.0);
        let gpa = format!("{:#x}", self.gpa().0);
        let locked = self.locked();
        let enabled = self.enabled();

        let mut out = f.debug_struct("MsrHypercallValue");
        out.field("raw", &raw);
        out.field("gpa", &gpa);
        out.field("locked", &locked);
        out.field("enabled", &enabled);
        out.finish()
    }
}

impl MsrHypercallValue {
    /// Yields the guest page number (the PFN) at which the guest would like the
    /// hypercall page to be placed.
    pub fn gpfn(&self) -> Pfn {
        Pfn::new(self.0 >> PAGE_SHIFT).unwrap()
    }

    /// Returns the guest physical address at which the guest would like the
    /// hypercall page to be placed.
    pub fn gpa(&self) -> GuestAddr {
        GuestAddr(self.0 & PAGE_MASK as u64)
    }

    /// Returns whether the hypercall page location is locked. Once locked, the
    /// value in `MSR_HYPERCALL` cannot change until the hypervisor resets the
    /// guest.
    pub fn locked(&self) -> bool {
        (self.0 & LOCKED_MASK) != 0
    }

    /// Indicates whether the hypercall page is enabled.
    pub fn enabled(&self) -> bool {
        (self.0 & ENABLED_MASK) != 0
    }

    /// Clears this value's enabled bit.
    pub fn clear_enabled(&mut self) {
        self.0 &= !ENABLED_MASK;
    }
}

/// The sequence of instructions to write to the hypercall page. This sequence
/// is `mov rax, 2; ret`, which returns a "not supported" status for all
/// hypercalls without actually requiring the guest to exit.
//
// Encoding: `48 c7 c0` followed by a little-endian 32-bit immediate is
// `mov rax, imm32`, and `c3` is `ret`.
//
// If and when actual hypercall support is required, this should change to
// either `0f 01 c1` (VMCALL) or `0f 01 d9` (VMMCALL), depending on whether the
// host is VMX- or SVM-based.
const HYPERCALL_INSTRUCTION_SEQUENCE: [u8; 8] =
    [0x48, 0xc7, 0xc0, 0x02, 0x00, 0x00, 0x00, 0xc3];

/// Builds a page-sized buffer holding the contents of the hypercall page:
/// [`HYPERCALL_INSTRUCTION_SEQUENCE`] at the start of the page, followed by
/// zeroes.
pub(super) fn hypercall_page_contents() -> [u8; PAGE_SIZE] {
    let code = &HYPERCALL_INSTRUCTION_SEQUENCE;
    let mut contents = [0u8; PAGE_SIZE];
    contents[..code.len()].copy_from_slice(code);
    contents
}


================================================
FILE: lib/propolis/src/enlightenment/hyperv/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Support for Microsoft Hyper-V emulation.
//!
//! Windows guests and many Linux guests can interoperate with hypervisors that
//! implement the hypervisor described in Microsoft's Hypervisor Top-Level
//! Functional Specification (TLFS). The behavior in this module is based on
//! version 6.0b of the TLFS, which is available on GitHub:
//! <https://github.com/MicrosoftDocs/Virtualization-Documentation/blob/main/tlfs/Hypervisor%20Top%20Level%20Functional%20Specification%20v6.0b.pdf>
//!
//! Microsoft also maintains a list of minimum requirements for any hypervisor
//! that intends to implement a Hyper-V-compatible interface:
//! <https://github.com/MicrosoftDocs/Virtualization-Documentation/blob/main/tlfs/Requirements%20for%20Implementing%20the%20Microsoft%20Hypervisor%20Interface.pdf>

use std::sync::{Arc, Mutex, OnceLock};

use cpuid_utils::{CpuidIdent, CpuidSet, CpuidValues};
use overlay::{OverlayError, OverlayKind, OverlayManager, OverlayPage};
use slog::info;

use crate::{
    accessors::MemAccessor,
    common::{Lifecycle, VcpuId},
    enlightenment::{
        hyperv::{
            bits::*,
            hypercall::MsrHypercallValue,
            tsc::{MsrReferenceTscValue, ReferenceTsc},
        },
        AddCpuidError,
    },
    migrate::{
        MigrateCtx, MigrateSingle, MigrateStateError, Migrator, PayloadOffer,
        PayloadOutput,
    },
    msr::{MsrId, RdmsrOutcome, WrmsrOutcome},
    vmm::{self, VmmHdl},
};

mod bits;
mod hypercall;
mod overlay;
mod tsc;

// USDT probes for the Hyper-V enlightenment, published under the `propolis`
// provider. The functions below only declare probe names and argument types;
// each probe is fired from the MSR handlers in this module via the
// correspondingly named `probes::<name>!` macro.
#[usdt::provider(provider = "propolis")]
mod probes {
    // Fired on a guest write to the guest OS ID MSR. `val` is the raw value
    // written.
    fn hyperv_wrmsr_guest_os_id(val: u64) {}
    // Fired on a guest write to the hypercall MSR. `val` is the raw value
    // written; the remaining arguments are the GPA, Locked bit, and Enabled
    // bit decoded from it.
    fn hyperv_wrmsr_hypercall(val: u64, gpa: u64, locked: bool, enabled: bool) {
    }
    // Fired on a guest write to the reference TSC MSR (when that enlightenment
    // is present). `val` is the raw value written; `gpa` and `enabled` are
    // decoded from it.
    fn hyperv_wrmsr_reference_tsc(val: u64, gpa: u64, enabled: bool) {}
    // NOTE(review): no call site for this probe is visible in this file;
    // presumably it reports a hypercall page GPA that could not be used.
    fn hyperv_wrmsr_hypercall_bad_gpa(gpa: u64) {}
    // Fired on a guest read of the reference time counter MSR. `time_units` is
    // the value returned to the guest, in 100 ns units.
    fn hyperv_rdmsr_reference_time(time_units: u64) {}
}

/// The name of this component. It is reported by `Lifecycle::type_name`, used
/// to name the enlightenment's memory accessor when it is adopted in `attach`,
/// and serves as the name portion of the migration payload's schema ID.
const TYPE_NAME: &str = "guest-hyperv-interface";

/// A set of features that can be enabled for a given Hyper-V instance.
///
/// The default value has every optional feature turned off.
#[derive(Clone, Copy, Debug, Default)]
pub struct Features {
    /// Enables the reference time MSR and the reference TSC page.
    ///
    /// When set, the stack advertises the partition reference counter and
    /// partition reference TSC bits in CPUID leaf `0x4000_0003`. When clear,
    /// guest accesses to the reference time and reference TSC MSRs raise a
    /// general-protection exception.
    pub reference_tsc: bool,
}

/// Wrapper around a hypercall overlay page.
///
/// This newtype distinguishes the hypercall page's handle from other overlay
/// handles at the type level. Dropping it drops the wrapped [`OverlayPage`].
struct HypercallOverlay(OverlayPage);

/// Wrapper around a TSC overlay page.
///
/// This newtype distinguishes the reference TSC page's handle from other
/// overlay handles at the type level. Dropping it drops the wrapped
/// [`OverlayPage`].
struct TscOverlay(OverlayPage);

/// A collection of overlay pages that a Hyper-V enlightenment stack might be
/// managing.
///
/// Each field is `None` when the corresponding overlay is not currently
/// established. The default value holds no overlays, so replacing an instance
/// with `OverlayPages::default()` drops every handle it held.
#[derive(Default)]
struct OverlayPages {
    /// The handle to the hypercall page overlay, if one is active.
    hypercall: Option<HypercallOverlay>,

    /// The handle to the reference TSC page overlay, if one is active.
    tsc: Option<TscOverlay>,
}

/// The mutable state of a Hyper-V enlightenment stack. [`HyperV`] keeps this
/// behind a mutex and locks it whenever it needs to read or update the
/// emulated MSR values or the set of active overlay pages.
struct Inner {
    /// This enlightenment's overlay manager.
    overlay_manager: Arc<OverlayManager>,

    /// The last value stored in the [`bits::HV_X64_MSR_GUEST_OS_ID`] MSR.
    msr_guest_os_id_value: u64,

    /// The last value stored in the [`bits::HV_X64_MSR_HYPERCALL`] MSR.
    msr_hypercall_value: MsrHypercallValue,

    /// The state of this stack's reference TSC enlightenment.
    reference_tsc: ReferenceTsc,

    /// This enlightenment's active overlay page handles.
    overlays: OverlayPages,
}

impl Inner {
    /// Builds the initial enlightenment state for a stack configured with the
    /// supplied `features`.
    ///
    /// If the reference TSC feature is requested, the TSC state starts out as
    /// [`ReferenceTsc::Uninitialized`]; otherwise it is
    /// [`ReferenceTsc::Disabled`]. All MSR values start at their defaults and
    /// no overlays are active.
    fn new(features: &Features) -> Self {
        let reference_tsc = if features.reference_tsc {
            ReferenceTsc::Uninitialized
        } else {
            ReferenceTsc::Disabled
        };

        Self {
            overlay_manager: Arc::default(),
            msr_guest_os_id_value: 0,
            msr_hypercall_value: MsrHypercallValue::default(),
            reference_tsc,
            overlays: OverlayPages::default(),
        }
    }

    /// Returns this block's volatile state (its MSR values and overlay
    /// handles) to its initial values while keeping the same overlay manager.
    fn reset(&mut self) {
        // An enabled reference TSC keeps its guest frequency across a reset;
        // only its MSR value reverts to the default. The other TSC states
        // carry no volatile data and are kept exactly as they are.
        let reference_tsc = match self.reference_tsc {
            ReferenceTsc::Enabled { guest_freq, .. } => ReferenceTsc::Enabled {
                guest_freq,
                msr_value: MsrReferenceTscValue::default(),
            },
            unchanged => unchanged,
        };

        let overlay_manager = Arc::clone(&self.overlay_manager);
        *self = Self {
            overlay_manager,
            msr_guest_os_id_value: 0,
            msr_hypercall_value: MsrHypercallValue::default(),
            reference_tsc,
            overlays: OverlayPages::default(),
        }
    }

    /// Handles a guest read of the reference TSC MSR.
    ///
    /// # Panics
    ///
    /// Panics if the reference TSC enlightenment was requested but has not
    /// been initialized yet.
    fn handle_rdmsr_reference_tsc(&self) -> RdmsrOutcome {
        match self.reference_tsc {
            ReferenceTsc::Enabled { msr_value, .. } => {
                RdmsrOutcome::Handled(msr_value.0)
            }
            ReferenceTsc::Disabled => RdmsrOutcome::GpException,
            // A well-behaved user of the enlightenment finishes initializing
            // it before letting any vCPU dispatch calls to it, so getting here
            // indicates a usage error.
            ReferenceTsc::Uninitialized => {
                panic!(
                    "reference TSC read from uninitialized enlightenment \
                    (perhaps vCPUs were started without calling attach()?)"
                )
            }
        }
    }

    /// Handles a guest write of `value` to the reference TSC MSR, updating the
    /// stored MSR value and moving, creating, or removing the TSC overlay page
    /// to match.
    fn handle_wrmsr_reference_tsc(&mut self, value: u64) -> WrmsrOutcome {
        if !self.reference_tsc.is_present() {
            return WrmsrOutcome::GpException;
        }

        let new = MsrReferenceTscValue(value);
        probes::hyperv_wrmsr_reference_tsc!(|| (
            value,
            new.gpa().0,
            new.enabled()
        ));

        // In contrast to the hypercall MSR, a write to the reference TSC MSR
        // always succeeds and never raises an exception, even when it tries to
        // enable the TSC overlay page at an invalid PFN (TLFS section 12.7.1).
        // If the overlay can't be placed, the write is still recorded and the
        // stack simply ends up without an active TSC overlay.
        let previous = self.overlays.tsc.take();
        self.reference_tsc.set_msr_value(new);
        self.overlays.tsc = match (new.enabled(), previous) {
            (false, _) => None,
            (true, Some(mut page)) => {
                if page.0.move_to(new.gpfn()).is_ok() {
                    Some(page)
                } else {
                    None
                }
            }
            (true, None) => {
                self.reference_tsc.create_overlay(&self.overlay_manager)
            }
        };

        WrmsrOutcome::Handled
    }
}

/// A Hyper-V enlightenment stack. It emulates the Hyper-V CPUID leaves and
/// MSRs implemented by this module and manages the overlay pages that guests
/// establish through those MSRs.
pub struct HyperV {
    /// This stack's logger.
    // The logger is used while constructing the stack but is not read again by
    // any code in this file, hence the `dead_code` allowance.
    #[allow(dead_code)]
    log: slog::Logger,

    /// The optional features this stack was created with.
    features: Features,

    /// The stack's mutable state (MSR values and overlay page handles).
    inner: Mutex<Inner>,

    /// The memory accessor handed to the overlay manager. It is created as an
    /// orphan and adopted into the VM's accessor hierarchy during `attach`.
    acc_mem: MemAccessor,

    /// The handle to the VMM, set exactly once during `attach`.
    vmm_hdl: OnceLock<Arc<VmmHdl>>,
}

impl HyperV {
    /// Constructs a Hyper-V enlightenment stack that exposes the supplied
    /// `features`.
    ///
    /// Construction alone does not fully initialize the stack: the caller must
    /// also call [`attach`] before starting any VM components that depend on
    /// it. A stack that is used without being attached may panic while the VM
    /// is running.
    ///
    /// [`attach`]: super::Enlightenment::attach
    pub fn new(log: &slog::Logger, features: Features) -> Self {
        let acc_mem = MemAccessor::new_orphan();
        let log = log.new(slog::o!("component" => "hyperv"));
        info!(
            log,
            "creating Hyper-V enlightenment stack";
            "features" => ?features
        );

        let inner = Mutex::new(Inner::new(&features));
        Self { log, features, inner, acc_mem, vmm_hdl: OnceLock::new() }
    }

    /// Returns a reference to this stack's VMM handle.
    ///
    /// # Panics
    ///
    /// Panics if no handle has been stored yet, which is the case until the
    /// enlightenment receives its `attach` callout.
    fn vmm_hdl(&self) -> &Arc<VmmHdl> {
        let handle = self.vmm_hdl.get();
        handle.expect(
            "a fully-initialized Hyper-V enlightenment always has a \
            VMM handle (did the library user remember to call `attach`?)",
        )
    }

    /// Handles a write of `value` to the HV_X64_MSR_GUEST_OS_ID register.
    fn handle_wrmsr_guest_os_id(&self, value: u64) -> WrmsrOutcome {
        probes::hyperv_wrmsr_guest_os_id!(|| value);
        let mut state = self.inner.lock().unwrap();
        state.msr_guest_os_id_value = value;

        // Per TLFS section 3.13, the hypercall page "becomes disabled" when
        // the guest OS ID register is cleared after the hypercall register was
        // set, and attempts to set the hypercall MSR's Enabled bit are ignored
        // while the guest OS ID is zero. Model this the same way as a guest
        // write that clears the Enabled bit: turn the bit off, release the
        // hypercall overlay, and keep the rest of the hypercall MSR value.
        if value == 0 {
            state.msr_hypercall_value.clear_enabled();
            state.overlays.hypercall = None;
        }

        WrmsrOutcome::Handled
    }

    /// Handles a write of `value` to the HV_X64_MSR_HYPERCALL register. See
    /// TLFS section 3.13 and [`MsrHypercallValue`].
    ///
    /// # Panics
    ///
    /// Panics if placing the hypercall overlay fails for any reason other than
    /// the requested address being inaccessible.
    fn handle_wrmsr_hypercall(&self, value: u64) -> WrmsrOutcome {
        let mut requested = MsrHypercallValue(value);
        probes::hyperv_wrmsr_hypercall!(|| (
            value,
            requested.gpa().0,
            requested.locked(),
            requested.enabled()
        ));

        let mut state = self.inner.lock().unwrap();

        // Once the Locked bit has been set, the register can no longer be
        // changed; further writes are accepted but have no effect.
        if state.msr_hypercall_value.locked() {
            return WrmsrOutcome::Handled;
        }

        // A write made while no guest OS ID is set succeeds, but with its
        // Enabled bit forced off.
        if state.msr_guest_os_id_value == 0 {
            requested.clear_enabled();
        }

        // With the Enabled bit clear there is no page to expose to the guest:
        // record the value and release any overlay that was active.
        if !requested.enabled() {
            state.msr_hypercall_value = requested;
            state.overlays.hypercall = None;
            return WrmsrOutcome::Handled;
        }

        // Put the overlay at the requested PFN, either by moving the overlay
        // that already exists or by creating a new one.
        let placed = match state.overlays.hypercall.as_mut() {
            Some(existing) => existing.0.move_to(requested.gpfn()),
            None => state
                .overlay_manager
                .add_overlay(
                    requested.gpfn(),
                    OverlayKind::HypercallReturnNotSupported,
                )
                .map(|page| {
                    state.overlays.hypercall = Some(HypercallOverlay(page));
                }),
        };

        match placed {
            Ok(()) => {
                state.msr_hypercall_value = requested;
                WrmsrOutcome::Handled
            }
            Err(OverlayError::AddressInaccessible(_)) => {
                WrmsrOutcome::GpException
            }
            // At most one hypercall overlay exists at any time, and guest
            // memory should be accessible while handling a VM exit. Unless
            // some other invariant has been broken, placing the overlay at a
            // valid PFN therefore can't fail in any other way.
            Err(e) => {
                panic!("unexpected error establishing hypercall overlay: {e}")
            }
        }
    }

    /// Handles a read of the `HV_X64_MSR_TIME_REF_COUNT` register. See TLFS
    /// section 12.4.
    ///
    /// # Panics
    ///
    /// Panics if the VMM's time data can't be exported, if the host reports a
    /// negative uptime, or if the guest's boot time is later than the current
    /// host time.
    fn handle_rdmsr_time_ref_count(&self) -> RdmsrOutcome {
        // Reference time is counted in units of 100 ns.
        const NS_PER_REFERENCE_UNIT: u64 = 100;

        if !self.features.reference_tsc {
            return RdmsrOutcome::GpException;
        }

        let time_data = vmm::time::export_time_data(self.vmm_hdl())
            .expect("VMM time data can always be exported");

        // The computation below uses two fields of `time_data`:
        //
        // - `hrtime`: nanoseconds since the host booted.
        // - `boot_hrtime`: the host time at which the VM booted. This may be
        //   negative when the VM started before its current host did, as can
        //   happen after the VM migrates here from a longer-lived host.
        //
        // Two assumptions are checked before using them:
        //
        // - Host uptime is never negative. (i64::MAX nanoseconds is about
        //   9.2e18 ns, so a nanosecond uptime counter needs roughly 292 years
        //   to wrap.)
        // - The guest did not boot in the future, i.e. `boot_hrtime` does not
        //   exceed the current host time. Otherwise either host time went
        //   backwards or `boot_hrtime` was mutated incorrectly, and the guest
        //   timestamp computed here would be wrong in either case.
        //
        // Neither condition is expected. If one occurs, crash the VM instead
        // of making the guest cope (possibly badly, e.g. by persisting a bogus
        // wall-clock time to disk) with reference time that runs backwards or
        // skips by a large amount.
        //
        // During a live migration, the migration protocol is expected to check
        // these conditions itself and to fail the migration if representing
        // guest time accurately would require violating either one.
        assert!(time_data.hrtime >= 0);
        assert!(time_data.hrtime >= time_data.boot_hrtime);

        // With a non-negative `hrtime` this subtraction can't underflow, but
        // it can still *overflow* when `boot_hrtime` is negative with a large
        // enough magnitude.
        //
        // The reference counter could be wrapped to represent that case, but
        // aborting is simpler, since it implies a VM uptime of more than 292
        // years. (If you are dealing with this problem from the 24th century,
        // please accept the present author's apologies!)
        let uptime_ns = time_data
            .hrtime
            .checked_sub(time_data.boot_hrtime)
            .expect("overflow while calculating reference uptime");

        // Because `hrtime >= boot_hrtime`, the uptime is non-negative and
        // converting it to a `u64` should always succeed.
        let uptime_ns = u64::try_from(uptime_ns)
            .expect("boot_hrtime should be less than host hrtime");

        let reference_uptime = uptime_ns / NS_PER_REFERENCE_UNIT;

        probes::hyperv_rdmsr_reference_time!(|| reference_uptime);
        RdmsrOutcome::Handled(reference_uptime)
    }

    /// Handles a write of `value` to the reference TSC MSR by forwarding it to
    /// the locked inner state.
    fn handle_wrmsr_reference_tsc(&self, value: u64) -> WrmsrOutcome {
        let mut state = self.inner.lock().unwrap();
        state.handle_wrmsr_reference_tsc(value)
    }
}

impl super::Enlightenment for HyperV {
    fn add_cpuid(&self, cpuid: &mut CpuidSet) -> Result<(), AddCpuidError> {
        let mut to_add = CpuidSet::new(cpuid.vendor());

        let mut add_to_set = |id, val| {
            to_add
                .insert(id, val)
                .expect("Hyper-V CPUID values don't conflict");
        };

        add_to_set(CpuidIdent::leaf(0x4000_0001), HYPERV_LEAF_1_VALUES);
        add_to_set(CpuidIdent::leaf(0x4000_0002), HYPERV_LEAF_2_VALUES);

        let mut leaf_3_eax = HyperVLeaf3Eax::default();
        if self.features.reference_tsc {
            leaf_3_eax |= HyperVLeaf3Eax::PARTITION_REFERENCE_COUNTER;
            leaf_3_eax |= HyperVLeaf3Eax::PARTITION_REFERENCE_TSC;
        }

        add_to_set(
            CpuidIdent::leaf(0x4000_0003),
            CpuidValues { eax: leaf_3_eax.bits(), ..Default::default() },
        );

        add_to_set(CpuidIdent::leaf(0x4000_0004), HYPERV_LEAF_4_VALUES);
        add_to_set(CpuidIdent::leaf(0x4000_0005), HYPERV_LEAF_5_VALUES);

        // Set the maximum available CPUID leaf to the smallest value required
        // to expose all of the enlightenment's features.
        //
        // WARNING: In at least some versions of propolis-server, the CPUID
        // configuration generated by this enlightenment is not part of the
        // instance description that the migration source sends to its target.
        // Instead, the source sends the target its *enlightenment
        // configuration* and assumes that the target will produce the same
        // CPUID settings the source produced. This includes the maximum
        // available enlightenment leaf: it should not be set to the maximum
        // leaf this version of Propolis knows about, but to the maximum leaf
        // required by the features enabled in this enlightenment stack.
        add_to_set(
            CpuidIdent::leaf(0x4000_0000),
            bits::hyperv_leaf_0_values(0x4000_0005),
        );

        super::add_cpuid(cpuid, to_add)
    }

    fn rdmsr(&self, vcpu: VcpuId, msr: MsrId) -> RdmsrOutcome {
        match msr.0 {
            HV_X64_MSR_GUEST_OS_ID => RdmsrOutcome::Handled(
                self.inner.lock().unwrap().msr_guest_os_id_value,
            ),
            HV_X64_MSR_HYPERCALL => RdmsrOutcome::Handled(
                self.inner.lock().unwrap().msr_hypercall_value.0,
            ),
            HV_X64_MSR_VP_INDEX => {
                let id: u32 = vcpu.into();
                RdmsrOutcome::Handled(id as u64)
            }
            HV_X64_MSR_TIME_REF_COUNT => self.handle_rdmsr_time_ref_count(),
            HV_X64_MSR_REFERENCE_TSC => {
                self.inner.lock().unwrap().handle_rdmsr_reference_tsc()
            }
            _ => RdmsrOutcome::NotHandled,
        }
    }

    fn wrmsr(&self, _vcpu: VcpuId, msr: MsrId, value: u64) -> WrmsrOutcome {
        match msr.0 {
            HV_X64_MSR_GUEST_OS_ID => self.handle_wrmsr_guest_os_id(value),
            HV_X64_MSR_HYPERCALL => self.handle_wrmsr_hypercall(value),
            HV_X64_MSR_REFERENCE_TSC => self.handle_wrmsr_reference_tsc(value),
            HV_X64_MSR_VP_INDEX | HV_X64_MSR_TIME_REF_COUNT => {
                WrmsrOutcome::GpException
            }
            _ => WrmsrOutcome::NotHandled,
        }
    }

    fn attach(&self, mem_acc: &MemAccessor, vmm_hdl: Arc<VmmHdl>) {
        mem_acc.adopt(&self.acc_mem, Some(TYPE_NAME.to_owned()));

        let mut inner = self.inner.lock().unwrap();
        inner.overlay_manager.attach(&self.acc_mem);

        if let ReferenceTsc::Uninitialized = inner.reference_tsc {
            let time_data = vmm::time::export_time_data(&vmm_hdl)
                .expect("VMM time data is accessible during attach");

            // N.B. This guest TSC frequency may be overwritten by a future
            // request to import state from a migration source. This is
            // intentional; the migration protocol will configure the kernel VMM
            // to apply hardware TSC scaling so that the guest observes the
            // imported frequency.
            inner.reference_tsc = ReferenceTsc::Enabled {
                guest_freq: time_data.guest_freq,
                msr_value: MsrReferenceTscValue::default(),
            }
        }

        // `attach` should only called once on each enlightenment instance.
        // `VmmHdl` doesn't implement `Debug`, so it's not possible to use
        // `unwrap` or `expect` here.
        assert!(
            self.vmm_hdl.set(vmm_hdl).is_ok(),
            "Enlightenment::attach should be called exactly once per stack"
        );
    }
}

impl Lifecycle for HyperV {
    fn type_name(&self) -> &'static str {
        TYPE_NAME
    }

    fn pause(&self) {
        let mut inner = self.inner.lock().unwrap();

        // Remove all active overlays from service. If the VM migrates, this
        // allows the original guest pages that sit underneath those overlays to
        // be transferred as part of the guest RAM transfer phase instead of
        // possibly being serialized and sent during the device state phase. Any
        // active overlays will be re-established on the target during its
        // device state import phase.
        //
        // Any guest data written to the overlay pages will be lost. That's OK
        // because all the overlays this module currently supports are
        // semantically read-only (guests should expect to take an exception if
        // they try to write to them, although today no such exception is
        // raised).
        //
        // The caller who is coordinating the "pause VM" operation is required
        // to ensure that devices are paused only if vCPUs are paused, so no
        // vCPU will be able to observe the missing overlay.
        inner.overlays = OverlayPages::default();

        assert!(inner.overlay_manager.is_empty());
    }

    fn resume(&self) {
        let mut inner = self.inner.lock().unwrap();

        assert!(inner.overlay_manager.is_empty());

        // Re-establish any overlays that were removed when the enlightenment
        // was paused.
        //
        // Writes to the hypercall MSR only persist if they specify a valid
        // overlay PFN, so adding the hypercall overlay is guaranteed to
        // succeed.
        let hypercall_overlay = inner
            .msr_hypercall_value
            .enabled()
            .then(|| {
                inner
                    .overlay_manager
                    .add_overlay(
                        inner.msr_hypercall_value.gpfn(),
                        OverlayKind::HypercallReturnNotSupported,
                    )
                    .expect("hypercall MSR is only enabled with a valid PFN")
            })
            .map(HypercallOverlay);

        let tsc_overlay = inner
            .reference_tsc
            .is_present()
            .then(|| inner.reference_tsc.create_overlay(&inner.overlay_manager))
            .flatten();

        inner.overlays =
            OverlayPages { hypercall: hypercall_overlay, tsc: tsc_overlay };
    }

    fn reset(&self) {
        let mut inner = self.inner.lock().unwrap();

        // The overlay manager shouldn't have any active overlays, because
        // `pause` drops them all, and state drivers are required to call
        // `pause` before `reset`.
        assert!(inner.overlay_manager.is_empty());

        inner.reset();
    }

    fn halt(&self) {
        let inner = self.inner.lock().unwrap();
        assert!(inner.overlay_manager.is_empty());
    }

    fn migrate(&'_ self) -> Migrator<'_> {
        Migrator::Single(self)
    }
}

impl MigrateSingle for HyperV {
    fn export(
        &self,
        _ctx: &MigrateCtx,
    ) -> Result<PayloadOutput, MigrateStateError> {
        let inner = self.inner.lock().unwrap();
        Ok(migrate::HyperVEnlightenmentV1 {
            msr_guest_os_id: inner.msr_guest_os_id_value,
            msr_hypercall: inner.msr_hypercall_value.0,
            reference_tsc: match inner.reference_tsc {
                ReferenceTsc::Disabled => None,
                ReferenceTsc::Uninitialized => {
                    return Err(MigrateStateError::NotReadyForExport);
                }
                ReferenceTsc::Enabled { msr_value, guest_freq } => {
                    Some(migrate::ReferenceTscV1 {
                        msr_value: msr_value.0,
                        guest_freq,
                    })
                }
            },
        }
        .into())
    }

    fn import(
        &self,
        mut offer: PayloadOffer,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let migrate::HyperVEnlightenmentV1 {
            msr_guest_os_id,
            msr_hypercall,
            reference_tsc,
        } = offer.parse()?;

        let mut inner = self.inner.lock().unwrap();

        // Re-establish any overlay pages that are active in the restored MSRs.
        //
        // A well-behaved source should ensure that the hypercall MSR value is
        // within the guest's PA range and that its Enabled bit agrees with the
        // value of the guest OS ID MSR. But this data was received over the
        // wire, so for safety's sake, verify it all and return a migration
        // error if anything is inconsistent.
        let msr_hypercall_value = MsrHypercallValue(msr_hypercall);
        let hypercall_overlay = if msr_hypercall_value.enabled() {
            if msr_guest_os_id == 0 {
                return Err(MigrateStateError::ImportFailed(
                    "hypercall MSR enabled but guest OS ID MSR is 0"
                        .to_string(),
                ));
            }

            match inner.overlay_manager.add_overlay(
                msr_hypercall_value.gpfn(),
                OverlayKind::HypercallReturnNotSupported,
            ) {
                Ok(overlay) => Some(HypercallOverlay(overlay)),
                Err(e) => {
                    return Err(MigrateStateError::ImportFailed(format!(
                        "failed to re-establish hypercall overlay: {e}"
                    )))
                }
            }
        } else {
            None
        };

        let (reference_tsc, tsc_overlay) = if let Some(imported_tsc) =
            reference_tsc
        {
            if !inner.reference_tsc.is_present() {
                return Err(MigrateStateError::ImportFailed(
                    "imported payload has reference TSC data, but that \
                        enlightenment is disabled"
                        .to_string(),
                ));
            }

            // Ensure that the TSC overlay exists and that it exposes the
            // correct scaling factor for the guest's nominal TSC frequency.
            // This may be different from the default scaling factor that was
            // read from the kernel VMM when the enlightenment stack was
            // initialized.
            let reference_tsc = ReferenceTsc::Enabled {
                msr_value: MsrReferenceTscValue(imported_tsc.msr_value),
                guest_freq: imported_tsc.guest_freq,
            };

            let overlay = reference_tsc.create_overlay(&inner.overlay_manager);
            (reference_tsc, overlay)
        } else {
            if inner.reference_tsc.is_present() {
                return Err(MigrateStateError::ImportFailed(
                    "imported payload has no reference TSC data, but that \
                        enlightenment is enabled"
                        .to_string(),
                ));
            }

            (ReferenceTsc::Disabled, None)
        };

        *inner = Inner {
            overlay_manager: inner.overlay_manager.clone(),
            msr_guest_os_id_value: msr_guest_os_id,
            msr_hypercall_value,
            reference_tsc,
            overlays: OverlayPages {
                hypercall: hypercall_overlay,
                tsc: tsc_overlay,
            },
        };
        Ok(())
    }
}

/// Serialized forms of the Hyper-V enlightenment's state, as exchanged between
/// a migration source and its target.
mod migrate {
    use serde::{Deserialize, Serialize};

    use crate::migrate::{Schema, SchemaId};

    /// Reference TSC enlightenment state.
    #[derive(Debug, Serialize, Deserialize)]
    pub struct ReferenceTscV1 {
        /// The value of the `HV_X64_MSR_REFERENCE_TSC` MSR.
        pub(super) msr_value: u64,

        /// The nominal TSC frequency for this VM. This is established when a VM
        /// first boots and determines the TSC scaling factor that's written to
        /// its reference TSC page.
        ///
        /// This module assumes that the guest's observed TSC frequency is
        /// invariant: when a VM migrates, the migrator is required to take
        /// steps to ensure that the guest TSC frequency on the target is the
        /// same as on the source. Migrators can use the
        /// [`crate::vmm::time::adjust_time_data`] function to compute the
        /// appropriate scaling factors to pass to bhyve to achieve this.
        pub(super) guest_freq: u64,
    }

    /// Version 1 of the Hyper-V enlightenment's migration payload.
    #[derive(Debug, Serialize, Deserialize)]
    pub struct HyperVEnlightenmentV1 {
        /// The value of the guest OS ID MSR.
        pub(super) msr_guest_os_id: u64,

        /// The raw value of the hypercall MSR.
        pub(super) msr_hypercall: u64,

        /// The reference TSC state, or `None` if the source's reference TSC
        /// enlightenment is disabled.
        pub(super) reference_tsc: Option<ReferenceTscV1>,
    }

    impl Schema<'_> for HyperVEnlightenmentV1 {
        /// Identifies this payload by the enlightenment's type name and
        /// schema version 1.
        fn id() -> SchemaId {
            (super::TYPE_NAME, 1)
        }
    }
}


================================================
FILE: lib/propolis/src/enlightenment/hyperv/overlay.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Support for hypervisor overlay pages.
//!
//! # Overview
//!
//! An _overlay page_ is a page of guest memory that "covers" a physical page in
//! the guest's normal physical address space. The physical analogue might be a
//! computer with a special one-page memory in addition to its regular RAM whose
//! memory controller can be asked to redirect accesses to one page's worth of
//! physical addresses to the auxiliary memory. The physical page at this
//! address still exists in main memory, but it can't be addressed while the
//! memory controller is applying the overlay.
//!
//! The virtualized case is similar, but there is only one physical memory (the
//! host's memory), and the host plays the role of the memory controller,
//! changing the guest's "view" of a particular GPA as overlays are applied and
//! removed. One way to do this is to change the VM's nested page table entries
//! (i.e., the translations from GPAs to HPAs) to redirect GPA accesses to a
//! different host physical page. bhyve's memory management infrastructure
//! doesn't currently support this, so this module emulates overlays in software
//! by saving and restoring the contents of the various "layers" that exist at a
//! particular GPA and making sure the active layer is present in the host
//! physical page to which bhyve has mapped the relevant GPA.
//!
//! In general, the Hyper-V stack establishes overlays in response to a guest's
//! request to place a particular overlay at a particular GPA. For example, the
//! guest writes to the `HV_X64_MSR_HYPERCALL` MSR to provide the PFN of a page
//! that the guest would like to see overlaid with a hypercall instruction page.
//! This implies that overlays don't change while the guest's CPUs are paused;
//! this has useful implications for the lifetimes of the structures in this
//! module, which are described further below.
//!
//! TLFS section 5.2.1 describes the semantics of overlay pages in more detail.
//! Importantly, the TLFS specifies that when multiple overlays exist for a
//! single GPA, their logical ordering is implementation-defined (and not, for
//! example, based on the order in which the overlays were requested).
//!
//! The TLFS doesn't explicitly specify the sizes of its overlay pages, but all
//! the overlays this module cares about are 4 KiB, because the guest specifies
//! their locations using 52-bit PFNs.
//!
//! # This module
//!
//! ## Public interface
//!
//! This module's public interface is built around two main types:
//!
//! - The [`OverlayManager`] tracks the set of active overlays in the system and
//!   allows users to create new overlays.
//! - An [`OverlayPage`] represents a single overlay and provides interfaces
//!   to move it or remove it (the latter by dropping the page). Pages hold a
//!   reference (in fact a weak reference; see below) to their managers so that
//!   they can unregister themselves when they are dropped.
//!
//! Each [`super::HyperV`] instance holds a strong reference to an
//! `OverlayManager` and creates new overlays in response to guest activity
//! (e.g., a vCPU writing a requested overlay GPA to a Hyper-V MSR). The
//! `HyperV` instance owns all of the `OverlayPage`s it creates this way and
//! drops them or calls [`OverlayPage::move_to`] on them in response to further
//! guest activity.
//!
//! ## VM shutdown
//!
//! When a VM halts, its guest memory becomes inaccessible, so future attempts
//! to manipulate overlay pages will fail. The `Drop` implementation for
//! [`OverlayPage`] assumes that removing a dropped page will always succeed. To
//! avoid panicking during shutdown, the owning Hyper-V layer must ensure that
//! all active pages are dropped no later than the end of its
//! [`Lifecycle::halt`] callout.
//!
//! [`Lifecycle::halt`]: crate::lifecycle::Lifecycle::halt

use std::{
    collections::{btree_map::Entry, BTreeMap, BTreeSet},
    sync::{Arc, Mutex},
};

use thiserror::Error;

use crate::{
    accessors::MemAccessor, common::PAGE_SIZE,
    enlightenment::hyperv::tsc::ReferenceTscPage, vmm::Pfn,
};

use self::pfn::MappedPfn;

/// An error that can be returned from an overlay page operation.
///
/// Variants that carry a `u64` report either a guest physical address or a
/// page frame number, as described on each variant.
#[derive(Debug, Error)]
pub(super) enum OverlayError {
    /// The guest memory context can't be accessed right now. Generally this
    /// means that the caller is trying to create an overlay too early (i.e.,
    /// before the overlay manager is attached to the memory hierarchy) or too
    /// late (i.e., after the VM has shut down).
    #[error("guest memory is currently inaccessible")]
    GuestMemoryInaccessible,

    /// The supplied physical address is either out of the guest's physical
    /// address range or is in a region to which the guest lacks read/write
    /// access.
    ///
    /// The payload is the guest physical address (not the PFN) that could not
    /// be mapped.
    #[error("overlay target GPA {0:#x} is inaccessible")]
    AddressInaccessible(u64),

    /// An overlay of the supplied kind already exists for the supplied PFN.
    ///
    /// The payload is the target PFN and the kind that is already registered
    /// there.
    #[error("overlay target PFN {0:#x} already has an overlay of kind {1:?}")]
    KindInUse(u64, OverlayKind),

    /// The requested operation requires an existing overlay at the supplied
    /// PFN, but that PFN has no active overlays.
    ///
    /// The payload is the PFN that was looked up.
    #[error("no overlays registered for PFN {0:#x}")]
    NoOverlaysActive(u64),

    /// The supplied PFN has overlays, but not of the kind requested by the
    /// caller.
    ///
    /// The payload is the PFN that was looked up and the kind that was not
    /// found there.
    #[error("PFN {0:#x} has no overlay of kind {1:?}")]
    NoOverlayOfKind(u64, OverlayKind),
}

/// The contents of a 4 KiB page. These are boxed so that this type can be
/// embedded in a struct that's put into a contiguous collection without putting
/// entire pages between collection members.
///
/// The single field is the heap-allocated page buffer, exactly `PAGE_SIZE`
/// bytes long.
pub(super) struct OverlayContents(pub(super) Box<[u8; PAGE_SIZE]>);

impl Default for OverlayContents {
    /// Produces a page in which every byte is zero.
    fn default() -> Self {
        let zeroed = [0u8; PAGE_SIZE];
        Self(Box::new(zeroed))
    }
}

impl std::fmt::Debug for OverlayContents {
    /// Formats a fixed placeholder in place of the page's bytes so that debug
    /// output never dumps an entire page.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut redacted = f.debug_tuple("OverlayContents");
        redacted.field(&"<page redacted>");
        redacted.finish()
    }
}

impl TryFrom<Vec<u8>> for OverlayContents {
    type Error = usize;

    fn try_from(value: Vec<u8>) -> Result<Self, Self::Error> {
        let len = value.len();
        Ok(Self(Box::new(value.try_into().map_err(|_| len)?)))
    }
}

/// A kind of overlay page, annotated with any other information that may be
/// needed to generate the page's contents.
#[derive(Clone, Copy, Debug)]
pub(super) enum OverlayKind {
    /// A hypercall page whose instruction sequence immediately returns a "not
    /// supported" error status to its caller.
    HypercallReturnNotSupported,

    /// A reference TSC page. The wrapped [`ReferenceTscPage`] is the data that
    /// is converted into the overlay's contents when the overlay is applied.
    ReferenceTsc(ReferenceTscPage),

    /// An overlay kind that exists only in test builds.
    #[cfg(test)]
    Test {
        /// An index that disambiguates different test page kinds.
        index: u8,

        /// A byte with which to fill the overlaid page when this overlay
        /// becomes active.
        fill: u8,
    },
}

impl OverlayKind {
    /// Returns the "priority" of this page relative to overlay pages of other
    /// kinds. For each PFN the page with the lowest priority value is active.
    ///
    /// WARNING: The priorities of existing overlay kinds should not be changed,
    /// since this can cause a PFN's active overlay to change if a VM is
    /// migrated from a Propolis using one ordering to a Propolis using a
    /// different ordering. Note, however, that this only occurs when multiple
    /// overlays are present at a single PFN, and stacked overlays will
    /// (hopefully) be extremely rare in practice, since a guest that asks for
    /// them will be unable to access one or more of the overlays it set up.
    fn priority(&self) -> u32 {
        let high: u16 = match self {
            Self::HypercallReturnNotSupported => 0,
            Self::ReferenceTsc(_) => 1,

            #[cfg(test)]
            Self::Test { .. } => u16::MAX,
        };

        let low: u16 = match self {
            #[cfg(test)]
            Self::Test { index, .. } => *index as u16,
            _ => 0,
        };

        ((high as u32) << 16) | (low as u32)
    }

    /// Obtains the contents this overlay should display when it is active.
    fn get_contents(&self) -> OverlayContents {
        match self {
            Self::HypercallReturnNotSupported => OverlayContents(Box::new(
                super::hypercall::hypercall_page_contents(),
            )),
            Self::ReferenceTsc(page) => OverlayContents(page.into()),
            #[cfg(test)]
            Self::Test { index: _, fill } => {
                OverlayContents(Box::new([*fill; PAGE_SIZE]))
            }
        }
    }
}

impl PartialEq for OverlayKind {
    /// Two kinds compare equal exactly when their priorities are equal. Any
    /// data a kind carries that does not feed into its priority is ignored.
    fn eq(&self, other: &Self) -> bool {
        let (mine, theirs) = (self.priority(), other.priority());
        mine == theirs
    }
}

// Equality is defined by comparing integer priorities, so it is a full
// equivalence relation.
impl Eq for OverlayKind {}

impl Ord for OverlayKind {
    /// Orders kinds by ascending priority value.
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        let (mine, theirs) = (self.priority(), other.priority());
        Ord::cmp(&mine, &theirs)
    }
}

impl PartialOrd for OverlayKind {
    /// Always yields `Some`, deferring to the total order given by the `Ord`
    /// implementation.
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(Ord::cmp(self, other))
    }
}

/// A registered overlay page, held by a user of this module. The overlay is
/// removed when this page is dropped.
pub(super) struct OverlayPage {
    /// A back reference to this page's manager, used to move or destroy the
    /// page on request. This is a shared-ownership (`Arc`) reference, so the
    /// manager stays alive for as long as any of its pages does.
    manager: Arc<OverlayManager>,

    /// The kind of overlay this is.
    kind: OverlayKind,

    /// The PFN at which this overlay is currently applied. Updated when the
    /// page is successfully moved.
    pfn: Pfn,
}

impl std::fmt::Debug for OverlayPage {
    // Manually implemented since `OverlayManager` is not `Debug`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let OverlayPage { kind, pfn, manager: _ } = self;
        f.debug_struct("OverlayPage")
            .field("kind", &kind)
            .field("pfn", &pfn)
            .finish()
    }
}

impl OverlayPage {
    /// Moves this overlay to a new PFN.
    ///
    /// The page's recorded PFN changes only if the manager reports success. On
    /// error the page keeps its previous PFN and the manager's error is
    /// returned unchanged.
    pub(super) fn move_to(&mut self, new_pfn: Pfn) -> Result<(), OverlayError> {
        let Self { manager, kind, pfn } = self;
        manager.move_overlay(*pfn, new_pfn, *kind)?;
        *pfn = new_pfn;
        Ok(())
    }
}

impl Drop for OverlayPage {
    /// Unregisters this overlay from its manager.
    ///
    /// # Panics
    ///
    /// Panics if the manager fails to remove the overlay.
    fn drop(&mut self) {
        let removal = self.manager.remove_overlay(self.pfn, self.kind);
        removal.expect("active overlay pages can always be removed");
    }
}

/// A set of overlays that apply to a particular PFN. Only one overlay of a
/// given type may be applied to a particular PFN at a particular time.
#[derive(Debug)]
struct OverlaySet {
    /// The contents of guest memory at this set's PFN at the time an overlay
    /// first became active here. These are written back to guest memory when
    /// the last overlay at this PFN is removed.
    original_contents: OverlayContents,

    /// The set of overlays that have been requested at this PFN.
    ///
    /// TLFS section 5.2.1 specifies that when there are multiple overlays for a
    /// given page, the order in which they are applied is
    /// implementation-defined. Here, the overlay with the lowest numerical
    /// priority value (as given by [`OverlayKind::priority`]) is active, and
    /// the others are pending.
    overlays: BTreeSet<OverlayKind>,
}

impl OverlaySet {
    /// Returns the overlay that is currently active at this set's PFN, i.e.
    /// the one that sorts first in the set.
    ///
    /// # Panics
    ///
    /// Panics if the set is empty.
    fn active(&self) -> &OverlayKind {
        let mut in_priority_order = self.overlays.iter();
        in_priority_order.next().expect("live overlay sets are never empty")
    }
}

/// Synchronized overlay manager state. [`OverlayManager`] keeps this behind a
/// mutex and passes its memory accessor in to each operation.
#[derive(Default)]
struct ManagerInner {
    /// Tracks the set of PFNs that have active and/or pending overlays. A PFN
    /// has an entry here only while at least one overlay is registered for it.
    overlays: BTreeMap<Pfn, OverlaySet>,
}

impl ManagerInner {
    /// Registers a new overlay of the given `kind` at `pfn`, reaching guest
    /// memory through `acc_mem`.
    ///
    /// # Errors
    ///
    /// - [`OverlayError::GuestMemoryInaccessible`] if `acc_mem` cannot
    ///   currently be accessed.
    /// - [`OverlayError::AddressInaccessible`] if the page at `pfn` cannot be
    ///   mapped read/write.
    /// - [`OverlayError::KindInUse`] if `pfn` already has an overlay of this
    ///   kind.
    fn add_overlay(
        &mut self,
        pfn: Pfn,
        kind: OverlayKind,
        acc_mem: &MemAccessor,
    ) -> Result<(), OverlayError> {
        let Some(memctx) = acc_mem.access() else {
            return Err(OverlayError::GuestMemoryInaccessible);
        };

        let mut target = MappedPfn::new(pfn, &memctx)?;
        self.add_overlay_using_mapping(&mut target, kind)
    }

    /// Applies a new overlay to guest memory using the supplied mapping.
    ///
    /// If the mapping's PFN has no overlays yet, the page's current contents
    /// are saved and the new overlay's contents are written over them. If the
    /// PFN already has overlays, the new kind joins the set and is written to
    /// guest memory only if it becomes the active overlay.
    ///
    /// # Errors
    ///
    /// Returns [`OverlayError::KindInUse`] if the PFN already has an overlay
    /// of this kind.
    fn add_overlay_using_mapping(
        &mut self,
        mapped_pfn: &mut MappedPfn,
        kind: OverlayKind,
    ) -> Result<(), OverlayError> {
        let pfn = mapped_pfn.pfn();
        match self.overlays.entry(pfn) {
            Entry::Occupied(mut slot) => {
                let set = slot.get_mut();
                if !set.overlays.insert(kind) {
                    return Err(OverlayError::KindInUse(pfn.into(), kind));
                }

                // Only rewrite guest memory if the new overlay outranks the
                // one that was active before.
                if *set.active() == kind {
                    mapped_pfn.write_page(&kind.get_contents());
                }
            }
            Entry::Vacant(slot) => {
                // First overlay at this PFN: save what the guest had there so
                // that it can be restored later.
                let mut saved = OverlayContents::default();
                mapped_pfn.read_page(&mut saved);
                mapped_pfn.write_page(&kind.get_contents());
                slot.insert(OverlaySet {
                    original_contents: saved,
                    overlays: BTreeSet::from([kind]),
                });
            }
        }

        Ok(())
    }

    /// Removes an existing overlay of the supplied kind from the supplied PFN.
    ///
    /// Removing the active overlay rewrites guest memory: the next overlay in
    /// priority order becomes visible, or, if none remain, the page's original
    /// contents are restored and the PFN's entry is deleted. Removing a
    /// pending overlay touches only the bookkeeping.
    ///
    /// # Errors
    ///
    /// - [`OverlayError::NoOverlaysActive`] if `pfn` has no overlays at all.
    /// - [`OverlayError::NoOverlayOfKind`] if `pfn` has overlays but none of
    ///   this kind.
    /// - [`OverlayError::GuestMemoryInaccessible`] if the overlay is active
    ///   and `acc_mem` cannot currently be accessed. Nothing is modified in
    ///   this case.
    ///
    /// # Panics
    ///
    /// Panics if the overlay is active but its PFN cannot be mapped.
    fn remove_overlay(
        &mut self,
        pfn: Pfn,
        kind: OverlayKind,
        acc_mem: &MemAccessor,
    ) -> Result<(), OverlayError> {
        let Entry::Occupied(mut slot) = self.overlays.entry(pfn) else {
            return Err(OverlayError::NoOverlaysActive(pfn.into()));
        };

        let is_active = *slot.get().active() == kind;
        if !is_active {
            // This overlay kind isn't active for this page. If it's pending,
            // dropping it from the set is enough, since it has had no effect
            // on guest memory. If it isn't even pending, the caller asked for
            // something that doesn't exist.
            return if slot.get_mut().overlays.remove(&kind) {
                Ok(())
            } else {
                Err(OverlayError::NoOverlayOfKind(pfn.into(), kind))
            };
        }

        // Acquire guest memory before touching the set so that a failure here
        // leaves the bookkeeping untouched.
        let memctx =
            acc_mem.access().ok_or(OverlayError::GuestMemoryInaccessible)?;

        let mut mapped = MappedPfn::new(pfn, &memctx)
            .expect("active overlay PFNs can be mapped");

        assert!(slot.get_mut().overlays.remove(&kind));

        if !slot.get().overlays.is_empty() {
            // Promote the next overlay in priority order.
            mapped.write_page(&slot.get().active().get_contents());
        } else {
            mapped.write_page(&slot.get().original_contents);
            slot.remove_entry();
        }

        Ok(())
    }

    /// Moves the overlay of the supplied `kind` from `from_pfn` to `to_pfn`.
    ///
    /// Moving an overlay onto the PFN it already occupies is a no-op.
    ///
    /// # Errors
    ///
    /// - [`OverlayError::GuestMemoryInaccessible`] if `acc_mem` cannot
    ///   currently be accessed.
    /// - [`OverlayError::AddressInaccessible`] if `to_pfn` cannot be mapped
    ///   read/write.
    /// - [`OverlayError::KindInUse`] if `to_pfn` already has an overlay of
    ///   this kind.
    /// - Any error from removing the overlay at `from_pfn`.
    fn move_overlay(
        &mut self,
        from_pfn: Pfn,
        to_pfn: Pfn,
        kind: OverlayKind,
        acc_mem: &MemAccessor,
    ) -> Result<(), OverlayError> {
        if from_pfn == to_pfn {
            return Ok(());
        }

        let Some(memctx) = acc_mem.access() else {
            return Err(OverlayError::GuestMemoryInaccessible);
        };

        // A move is "apply at the new location, then remove from the old
        // one". That is only legal if the target PFN is valid and has no
        // overlay of this kind yet.
        //
        // Validating the target first means the removal below can simply
        // propagate its own errors, and the subsequent add can be asserted to
        // succeed. The alternative (remove, fail to add, then try to put the
        // old overlay back) is harder to get right.
        let mut destination = MappedPfn::new(to_pfn, &memctx)?;
        let kind_already_at_target = matches!(
            self.overlays.get(&to_pfn),
            Some(existing) if existing.overlays.contains(&kind)
        );
        if kind_already_at_target {
            return Err(OverlayError::KindInUse(to_pfn.into(), kind));
        }

        // This can fail if the overlay doesn't exist at `from_pfn`.
        self.remove_overlay(from_pfn, kind, acc_mem)?;

        // With a valid mapping in hand, adding can only fail if the PFN
        // already has an overlay of this kind, which was ruled out above.
        self.add_overlay_using_mapping(&mut destination, kind)
            .expect("already checked new PFN for an overlay of this kind");

        Ok(())
    }
}

/// An overlay tracker that adds, removes, and moves overlays.
///
/// # Usage requirements
///
/// `HyperV` instances that own an overlay manager are expected to do the
/// following:
///
/// - After creating a manager (via its [`Default`] implementation), the
///   Hyper-V instance must call [`OverlayManager::attach`] to attach the
///   manager's memory accessor to a VM's memory hierarchy before it can add
///   any overlays.
/// - The Hyper-V instance must drop all [`OverlayPage`]s no later than the end
///   of its [`Lifecycle::halt`] callout (so that the overlay manager still has
///   access to guest memory when those pages are destroyed).
///
/// [`Lifecycle::halt`]: crate::lifecycle::Lifecycle::halt
pub(super) struct OverlayManager {
    /// The overlay bookkeeping, guarded by a mutex so that it can be updated
    /// through a shared reference to the manager.
    inner: Mutex<ManagerInner>,

    /// The accessor through which this manager reaches guest memory. It is
    /// created as an orphan and is adopted into a memory hierarchy by
    /// [`OverlayManager::attach`].
    acc_mem: MemAccessor,
}

impl Default for OverlayManager {
    /// Creates a manager with no overlays and an orphaned memory accessor.
    fn default() -> Self {
        Self {
            acc_mem: MemAccessor::new_orphan(),
            inner: Mutex::default(),
        }
    }
}

impl OverlayManager {
    /// Attaches this overlay manager to the supplied memory accessor hierarchy
    /// by having `parent_mem` adopt the manager's accessor.
    pub(super) fn attach(&self, parent_mem: &MemAccessor) {
        let name = "hyperv-overlay-manager".to_string();
        parent_mem.adopt(&self.acc_mem, Some(name));
    }

    /// Returns `true` if no PFN currently has any overlay registered with
    /// this manager.
    pub(super) fn is_empty(&self) -> bool {
        let state = self.inner.lock().unwrap();
        state.overlays.is_empty()
    }

    /// Adds an overlay of the supplied `kind` over the supplied `pfn`.
    ///
    /// On success, returns an [`OverlayPage`] that removes the overlay when it
    /// is dropped. The overlay's contents are derived from `kind`.
    pub(super) fn add_overlay(
        self: &Arc<Self>,
        pfn: Pfn,
        kind: OverlayKind,
    ) -> Result<OverlayPage, OverlayError> {
        let mut state = self.inner.lock().unwrap();
        state
            .add_overlay(pfn, kind, &self.acc_mem)
            .map(|()| OverlayPage { manager: Arc::clone(self), kind, pfn })
    }

    /// Removes an overlay of the supplied `kind` from the set at the supplied
    /// `pfn`.
    fn remove_overlay(
        &self,
        pfn: Pfn,
        kind: OverlayKind,
    ) -> Result<(), OverlayError> {
        let mut state = self.inner.lock().unwrap();
        state.remove_overlay(pfn, kind, &self.acc_mem)
    }

    /// Moves an overlay of the supplied `kind` from `from_pfn` to `to_pfn`.
    fn move_overlay(
        &self,
        from_pfn: Pfn,
        to_pfn: Pfn,
        kind: OverlayKind,
    ) -> Result<(), OverlayError> {
        let mut state = self.inner.lock().unwrap();
        state.move_overlay(from_pfn, to_pfn, kind, &self.acc_mem)
    }
}

/// Helpers for dealing with page frame numbers (guest physical page numbers, or
/// PFNs). These help to avoid page alignment checks that would otherwise be
/// necessary when dealing with full physical addresses.
mod pfn {
    use crate::{
        common::{GuestRegion, PAGE_SIZE},
        vmm::{Pfn, SubMapping},
    };

    use super::{OverlayContents, OverlayError};

    /// A mapping of a page of guest memory with a particular PFN.
    pub(super) struct MappedPfn<'a> {
        /// The PFN of the mapped page.
        pfn: Pfn,

        /// A page-sized read/write mapping of the guest memory at `pfn`.
        mapping: SubMapping<'a>,
    }

    impl<'a> MappedPfn<'a> {
        /// Creates a new page-sized mapping of the supplied PFN.
        ///
        /// # Errors
        ///
        /// Returns [`OverlayError::AddressInaccessible`], carrying the page's
        /// guest physical address, if the page cannot be mapped read/write.
        pub(super) fn new(
            pfn: Pfn,
            memctx: &'a crate::accessors::Guard<
                'a,
                crate::vmm::mem::MemAccessed,
            >,
        ) -> Result<Self, OverlayError> {
            let gpa = pfn.addr();
            let region = GuestRegion(gpa, PAGE_SIZE);
            match memctx.readwrite_region(&region) {
                Some(mapping) => Ok(Self { pfn, mapping }),
                None => Err(OverlayError::AddressInaccessible(gpa.0)),
            }
        }

        /// Yields this mapping's PFN.
        pub(super) fn pfn(&self) -> Pfn {
            self.pfn
        }

        /// Writes the supplied overlay page to this mapping's guest physical
        /// page.
        ///
        /// # Panics
        ///
        /// Panics if the mapping is inaccessible or if the page was only
        /// partially written.
        pub(super) fn write_page(&mut self, page: &OverlayContents) {
            let written = self
                .mapping
                .write_bytes(page.0.as_slice())
                .expect("PFN mappings are always accessible");

            assert_eq!(written, PAGE_SIZE);
        }

        /// Reads from the mapped page into the supplied buffer.
        ///
        /// # Panics
        ///
        /// Panics if the mapping is inaccessible or if the page was only
        /// partially read.
        pub(super) fn read_page(&mut self, page: &mut OverlayContents) {
            let read = self
                .mapping
                .read_bytes(page.0.as_mut_slice())
                .expect("PFN mappings are always accessible");

            assert_eq!(read, PAGE_SIZE);
        }
    }
}

#[cfg(test)]
mod test {
    use std::collections::VecDeque;

    use crate::vmm::{PhysMap, VmmHdl, MAX_PHYSMEM};

    use super::*;

    impl super::OverlayContents {
        /// Creates page contents in which every byte equals `fill`.
        fn filled(fill: u8) -> Self {
            let bytes = [fill; PAGE_SIZE];
            Self(Box::new(bytes))
        }
    }

    /// A context for overlay tests, which includes a fake guest memory.
    struct TestCtx {
        /// The overlay manager under test, attached to `acc_mem`.
        manager: Arc<OverlayManager>,

        /// The test VMM handle backing the fake guest memory. Never read;
        /// presumably held only to keep the handle alive for the test's
        /// duration.
        _vmm_hdl: Arc<VmmHdl>,

        /// The physical memory map that produced `acc_mem`. Never read;
        /// presumably held only to keep the map alive for the test's duration.
        _physmem: PhysMap,

        /// The memory accessor that tests use to read and write guest memory
        /// directly.
        acc_mem: MemAccessor,
    }

    impl TestCtx {
        /// Creates a new test context with 1 MiB of addressable memory (i.e., a
        /// maximum PFN of 0xFF) and an overlay manager attached to it.
        fn new() -> Self {
            let vmm_hdl = Arc::new(VmmHdl::new_test(1024 * 1024).unwrap());
            let mut physmem = PhysMap::new(MAX_PHYSMEM, vmm_hdl.clone());
            physmem
                .add_test_mem("test-ram".to_string(), 0, 1024 * 1024)
                .unwrap();
            let acc_mem = physmem.finalize();

            let manager = Arc::new(OverlayManager::default());
            manager.attach(&acc_mem);

            Self { manager, _vmm_hdl: vmm_hdl, _physmem: physmem, acc_mem }
        }

        /// Writes the supplied `contents` to the supplied `pfn` in the
        /// context's guest memory, bypassing the overlay manager.
        fn write_page(&self, pfn: Pfn, contents: &OverlayContents) {
            let guard = self.acc_mem.access().unwrap();
            MappedPfn::new(pfn, &guard).unwrap().write_page(contents);
        }

        /// Asserts that page `pfn` of the context's memory is filled with
        /// the supplied `fill` bytes.
        fn assert_pfn_has_fill(&self, pfn: Pfn, fill: u8) {
            let guard = self.acc_mem.access().unwrap();
            let mut observed = OverlayContents::default();
            MappedPfn::new(pfn, &guard).unwrap().read_page(&mut observed);

            assert_eq!(
                observed.0.as_slice(),
                [fill; PAGE_SIZE],
                "guest memory at pfn {pfn} has unexpected fill byte {:#x}",
                (observed.0)[0]
            );
        }
    }

    /// Tests that adding an overlay page causes its contents to appear in guest
    /// memory.
    #[test]
    fn basic_add() {
        let ctx = TestCtx::new();
        let target = Pfn::new_unchecked(0x10);
        let kind = OverlayKind::Test { index: 0, fill: 0xAB };

        // Keep the page alive so the overlay stays applied for the check.
        let _overlay = ctx.manager.add_overlay(target, kind).unwrap();

        ctx.assert_pfn_has_fill(target, 0xAB);
    }

    /// Tests that moving an overlay page from one PFN to another causes its
    /// contents to move from the old PFN to the new one and that the old page's
    /// contents are restored.
    #[test]
    fn basic_move() {
        let ctx = TestCtx::new();
        let source = Pfn::new_unchecked(0x10);
        let destination = Pfn::new_unchecked(0x20);
        let kind = OverlayKind::Test { index: 0, fill: 0xCD };

        let mut overlay = ctx.manager.add_overlay(source, kind).unwrap();
        ctx.assert_pfn_has_fill(source, 0xCD);

        overlay.move_to(destination).unwrap();
        ctx.assert_pfn_has_fill(source, 0);
        ctx.assert_pfn_has_fill(destination, 0xCD);

        // Dropping the page removes the overlay from its new location too.
        drop(overlay);
        ctx.assert_pfn_has_fill(destination, 0);
    }

    /// Tests that removing the last overlay for a given physical page restores
    /// its original contents.
    #[test]
    fn underlay_restored_after_drop() {
        let ctx = TestCtx::new();
        let target = Pfn::new_unchecked(0x10);

        // Give the underlying page recognizable contents before overlaying it.
        let underlay = OverlayContents::filled(0x11);
        ctx.write_page(target, &underlay);

        let kind = OverlayKind::Test { index: 0x11, fill: 0x99 };
        let overlay = ctx.manager.add_overlay(target, kind).unwrap();
        ctx.assert_pfn_has_fill(target, 0x99);

        drop(overlay);
        ctx.assert_pfn_has_fill(target, 0x11);
    }

    /// Tests that the manager rejects requests to add an overlay with a PFN
    /// outside of the guest's physical address range.
    #[test]
    fn out_of_bounds_pfn() {
        let ctx = TestCtx::new();
        let beyond_ram = Pfn::new_unchecked(0xFFFFF);
        let kind = OverlayKind::Test { index: 0, fill: 0xFF };

        let outcome = ctx.manager.add_overlay(beyond_ram, kind);
        outcome.unwrap_err();
    }

    /// Tests that the manager rejects requests to have more than one overlay of
    /// a given kind at a given PFN. (This invariant is required for overlay
    /// pages to move and remove themselves correctly.)
    #[test]
    fn duplicate_kind_at_pfn() {
        let ctx = TestCtx::new();
        let target = Pfn::new_unchecked(0x20);
        let kind = OverlayKind::Test { index: 0, fill: 0x22 };

        let first = ctx.manager.add_overlay(target, kind).unwrap();

        // A second overlay of the same kind is refused while the first lives.
        ctx.manager.add_overlay(target, kind).unwrap_err();

        // Once the first is gone, the same request succeeds.
        drop(first);
        ctx.manager.add_overlay(target, kind).unwrap();
    }

    /// Tests that a page with multiple overlays applies those overlays in the
    /// correct order and restores the original page contents when all the
    /// overlays are gone.
    #[test]
    fn multiple_overlays() {
        let ctx = TestCtx::new();
        let target = Pfn::new_unchecked(0x40);
        let fills = [1u8, 2, 3, 4];

        // Each overlay uses its fill byte as its index, so lower fills have
        // lower priority values.
        let mut pages: VecDeque<_> = VecDeque::new();
        for fill in fills {
            let kind = OverlayKind::Test { index: fill, fill };
            pages.push_back(ctx.manager.add_overlay(target, kind).unwrap());
        }

        // Removing the active overlay exposes the next one in order.
        for expected in fills {
            ctx.assert_pfn_has_fill(target, expected);
            drop(pages.pop_front());
        }

        ctx.assert_pfn_has_fill(target, 0);
    }

    /// Tests that removing a nonexistent overlay from a PFN fails without
    /// disrupting any existing overlays at that PFN.
    #[test]
    fn remove_nonexistent_overlay() {
        let ctx = TestCtx::new();
        let target = Pfn::new_unchecked(0x30);
        let present = OverlayKind::Test { index: 0, fill: 0x70 };
        let absent = OverlayKind::Test { index: 1, fill: 0x70 };

        let _overlay = ctx.manager.add_overlay(target, present).unwrap();

        let outcome = ctx.manager.remove_overlay(target, absent);
        outcome.unwrap_err();

        ctx.assert_pfn_has_fill(target, 0x70);
    }
}


================================================
FILE: lib/propolis/src/enlightenment/hyperv/tsc.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Support for the Hyper-V reference time enlightenment. See TLFS section 12.7.
//!
//! # Theory
//!
//! The x86 timestamp counter (TSC) gives system software a high-resolution
//! performance counter that increments roughly once per processor clock cycle.
//! The TSC is just a counter and does not return elapsed time in SI units;
//! instead, readers convert TSC values to elapsed time by dividing the number
//! of TSC ticks by the TSC frequency to get a number of seconds, which they can
//! then multiply by a reference frequency to get a count of reference time
//! units:
//!
//! ```text
//! elapsed reference units
//!     = elapsed seconds * (reference units / 1 sec)
//!     = TSC ticks * (1 / (TSC ticks / 1 sec)) * reference frequency
//!     = TSC ticks * (1 / TSC frequency) * reference frequency
//!     = TSC ticks * (reference frequency / TSC frequency)
//! ```
//!
//! (This calculation assumes that the TSC does in fact tick at a constant
//! frequency. This is often the case on modern processors, but it is not
//! guaranteed, and system software is expected to check CPUID to see if the CPU
//! advertises such an "invariant" TSC before doing this kind of calculation.)
//!
//! KVM and Linux use nanoseconds as the reference time unit, so their reference
//! frequency is 1e9 ticks/sec. Windows and Hyper-V use 100ns units and a
//! frequency of 1e7 ticks/sec. Because these frequencies are expressed in
//! cycles per second, using simple integer divisions to convert ticks to
//! seconds or to scale frequencies will lose all sub-second precision, which
//! defeats the purpose of having a high-resolution timekeeping facility. To
//! avoid this problem without having to use floating-point arithmetic,
//! timekeeping enlightenments usually turn to fixed-point scaling fractions.
//!
//! The idea (at least in this enlightenment) is to take a 64-bit frequency
//! value and represent it as a 128-bit integer with an implicit radix point
//! between bits 63 and 64. The upper 64 bits of the value are its integer part,
//! and the lower 64 bits are its fractional part. Doing this to the integer
//! reference frequency in the conversion above amounts to writing the
//! following:
//!
//! ```text
//! frequency multiplier
//!     = reference frequency / TSC frequency
//!     = (((reference frequency as u128) * 2^64) / TSC frequency) * (1 / 2^64)
//! ```
//!
//! The first term of this multiplication is a 128-bit scaling factor. Notice
//! that, because the TSC frequency is a 64-bit integer and therefore guaranteed
//! to be less than 2^64, the division in the first term won't truncate to 0;
//! instead, if the quotient has a fractional part, the high 64 bits of that
//! fractional part will be preserved in the low 64 bits of the integer
//! division quotient.
//!
//! The scaling factor is still an integer, so a TSC reader that wants to apply
//! it can do so with an integer multiplication followed by a shift:
//!
//! ```text
//! elapsed reference units
//!     = TSC ticks * (reference frequency / TSC frequency)
//!     = (TSC ticks * scaling factor) / 2^64
//!     = (TSC ticks * scaling factor) >> 64
//! ```
//!
//! There is one small catch: the scaling factor was computed as a 128-bit
//! value, but the x86-64 `IMUL` instruction's maximum operand size is 64 bits.
//! This enlightenment avoids this problem by observing that if the host TSC
//! frequency is greater than 10 MHz (highly likely on a platform with an
//! invariant TSC), then the scaling factor is less than 1, which means its
//! upper 64 bits are 0, which means that the scaled TSC value can be trivially
//! rewritten as the product of a 64-bit TSC value and the lower 64 bits of the
//! scaling factor. This 128-bit product can then be shifted right by 64 bits to
//! produce an elapsed time.
//!
//! If the host TSC frequency is too low for the scaling factor to fit in 64
//! bits, Hyper-V simply disables the enlightenment by writing a special value
//! to the reference page. Other hypervisors like KVM may handle the situation
//! differently, e.g. by having the guest shift its TSC readings before
//! multiplying by the scaling factor to guarantee that the product won't
//! overflow.
//!
//! Although this discussion focused on 64.64 fixed-point fractions, the same
//! principles can be applied for values of different widths and different radix
//! points. For example, Intel processors that support TSC scaling use a 64-bit
//! scaling value with 16 integer bits and 48 fractional bits.
//!
//! # Practice
//!
//! Hyper-V provides an overlay page that contains a 64-bit scaling factor and
//! an offset that a guest can use to convert a guest TSC reading to the time
//! since guest boot in 100-nanosecond units. Section 12.7.3 of the TLFS
//! specifies the following computation:
//!
//! ```text
//! reference_time: u128 = ((tsc * scale) >> 64) + offset
//! ```
//!
//! The host computes the `scale` factor by shifting the reference frequency
//! (1e7) left by 64 places and dividing by the guest's effective TSC frequency
//! to get a scaling fraction, as described above. The `offset` depends on the
//! difference between the host and guest TSC values; this implementation
//! assumes that bhyve will set up the guest such that this offset can always be
//! 0 (i.e., the guest will obtain an appropriately-offset TSC value directly
//! from RDTSC without having to correct it further).
//!
//! Although unlikely on the machines Propolis generally targets, it is
//! theoretically possible for the host TSC frequency to be so low that the
//! scaling factor cannot be expressed as a 0.64 fixed-point fraction. In this
//! case the hypervisor writes a special value to the TSC page's `sequence`
//! field to denote that the rest of the page's contents are invalid. See
//! [`ReferenceTscPage`] for more details.
//!
//! # Live migration
//!
//! When a VM migrates from one host to another, it will usually find that the
//! hosts' TSC values are not in sync, either because they were started at
//! different times or they have different TSC frequencies (or both).
//!
//! Propolis accounts for these differences using hardware TSC scaling and
//! offset features. These are similar to the scale and offset fields on the
//! reference page: the hypervisor programs a fixed-point scaling multiplier and
//! offset into the VM's control structures before entering the guest, and the
//! processor applies these factors when the guest executes RDTSC.
//!
//! This module assumes that if its VM is migrated, the overarching migration
//! protocol will ensure that the guest's observed TSC frequency and offset will
//! remain unchanged, such that the reference TSC page's contents can remain
//! unchanged when a VM migrates. (The propolis-server migration protocol
//! ensures this by requiring migration targets to support hardware-based TSC
//! scaling and offsetting.)

use std::sync::Arc;

use crate::{
    common::{GuestAddr, PAGE_MASK, PAGE_SHIFT, PAGE_SIZE},
    enlightenment::hyperv::{
        overlay::{OverlayKind, OverlayManager},
        TscOverlay,
    },
    vmm::Pfn,
};

use zerocopy::{Immutable, IntoBytes};

/// The position of the enable bit in a reference TSC MSR value.
const ENABLED_BIT: u64 = 0;
/// A mask that isolates the enable bit in a reference TSC MSR value.
const ENABLED_MASK: u64 = 1 << ENABLED_BIT;

/// Represents a value written to the [`HV_X64_MSR_REFERENCE_TSC`] register.
///
/// The wrapped `u64` is the raw register value. Its lowest bit is the enable
/// flag, and the bits above the page offset give the page at which the guest
/// wants the reference TSC page placed.
///
/// [`HV_X64_MSR_REFERENCE_TSC`]: super::bits::HV_X64_MSR_REFERENCE_TSC
#[derive(Clone, Copy, Default)]
pub(super) struct MsrReferenceTscValue(pub(super) u64);

impl std::fmt::Debug for MsrReferenceTscValue {
    /// Formats the raw register value alongside its decoded fields. The raw
    /// value and the GPA are rendered as hexadecimal strings.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let raw = format!("{:#x}", self.0);
        let gpa = format!("{:#x}", self.gpa().0);
        let enabled = self.enabled();

        let mut out = f.debug_struct("MsrReferenceTscValue");
        out.field("raw", &raw);
        out.field("gpa", &gpa);
        out.field("enabled", &enabled);
        out.finish()
    }
}

impl MsrReferenceTscValue {
    /// Yields the PFN at which the guest would like to place the reference TSC
    /// page.
    pub fn gpfn(&self) -> Pfn {
        let frame = self.0 >> PAGE_SHIFT;
        Pfn::new(frame).unwrap()
    }

    /// Yields the guest physical address at which the guest would like to place
    /// the reference TSC page. This is the raw value with its page-offset bits
    /// masked off.
    pub fn gpa(&self) -> GuestAddr {
        let page_mask = PAGE_MASK as u64;
        GuestAddr(self.0 & page_mask)
    }

    /// Returns `true` if the reference TSC overlay is enabled.
    pub fn enabled(&self) -> bool {
        let flag = self.0 & ENABLED_MASK;
        flag != 0
    }
}

/// The contents of a reference TSC page, defined in TLFS section 12.7.2.
///
/// The struct is `repr(packed, C)` and derives `IntoBytes`, so its fields can
/// be viewed as a contiguous byte string in declaration order. Those bytes are
/// what get copied to the start of the overlay page.
#[derive(Clone, Copy, Debug, Default, IntoBytes, Immutable)]
#[repr(packed, C)]
pub(super) struct ReferenceTscPage {
    /// Incremented whenever the `scale` or `offset` fields of this page are
    /// modified. Guests are meant to read the sequence value, read the scale
    /// and offset fields, and re-read the sequence value, consuming the scale
    /// and offset only if the sequence did not change.
    ///
    /// If this value is 0, guests are not to use the scale and offset factors
    /// on this page and are to fall back to another time source.
    ///
    /// This module assumes that if a VM migrates, the overarching migration
    /// protocol will work with bhyve to ensure that the guest TSC offset and
    /// observed frequency remain unchanged.
    sequence: u32,

    /// Reserved for alignment.
    reserved: u32,

    /// The 0.64 fixed-point scaling factor to use to convert guest TSC ticks
    /// into 100-nanosecond time units. This is computed as `((10_000_000u128 <<
    /// 64) / guest_tsc_frequency`. If this value cannot be represented as a
    /// 0.64 fixed-point fraction, the reference TSC page is disabled.
    scale: u64,

    /// The offset, in 100 ns units, that the guest should add to its scaled TSC
    /// readings to obtain the number of 100 ns units that have elapsed since
    /// the guest booted.
    ///
    /// This implementation assumes that bhyve ensures that the guest TSC is
    /// always correctly offset from the host TSC, so it always sets this value
    /// to 0.
    offset: i64,
}

impl ReferenceTscPage {
    /// Builds the reference TSC page contents for a guest whose TSC runs at
    /// `guest_freq`.
    ///
    /// When a scaling factor can be derived from the frequency, the page
    /// carries that factor and a sequence number of 1. Otherwise every field
    /// (including the sequence number) is zero, which tells the guest not to
    /// use the page.
    pub(super) fn new(guest_freq: u64) -> Self {
        match guest_freq_to_scale(guest_freq) {
            Some(scale) => Self { sequence: 1, scale, ..Default::default() },
            None => Self::default(),
        }
    }
}

impl From<&ReferenceTscPage> for Box<[u8; PAGE_SIZE]> {
    /// Produces a heap-allocated, page-sized buffer whose leading bytes are the
    /// raw bytes of `value` and whose remaining bytes are zero.
    fn from(value: &ReferenceTscPage) -> Self {
        let contents = value.as_bytes();
        let mut page = Box::new([0u8; PAGE_SIZE]);

        // The struct's bytes occupy the front of the page; the rest of the
        // page keeps its zero fill.
        page[..contents.len()].copy_from_slice(contents);

        page
    }
}

/// Converts the supplied guest TSC frequency (in ticks per second) into a 0.64
/// fixed-point scaling factor that maps guest TSC ticks to 100-nanosecond
/// units.
///
/// The factor is `(10_000_000 << 64) / guest_freq`, computed in 128-bit
/// arithmetic.
///
/// Returns `None` if the correct factor cannot be expressed as a 0.64
/// fixed-point value. That happens when `guest_freq` is zero (no factor
/// exists) or when it is at most 10 MHz (the factor would be 1.0 or greater).
fn guest_freq_to_scale(guest_freq: u64) -> Option<u64> {
    const HUNDRED_NS_PER_SEC: u128 = 10_000_000;

    // `checked_div` turns a zero frequency into `None` instead of a
    // divide-by-zero panic.
    let scale: u128 =
        (HUNDRED_NS_PER_SEC << 64).checked_div(u128::from(guest_freq))?;

    // Any bits at or above bit 64 are an integer part, which a 0.64 value
    // cannot hold.
    if (scale >> 64) != 0 {
        None
    } else {
        Some(scale as u64)
    }
}

/// The enablement status of a reference TSC enlightenment.
#[derive(Clone, Copy, Debug)]
pub(super) enum ReferenceTsc {
    /// The enlightenment is disabled.
    Disabled,

    /// The enlightenment is enabled, but has not yet been initialized.
    Uninitialized,

    /// The enlightenment is enabled and initialized.
    ///
    /// `msr_value` is the current value of the reference TSC MSR, and
    /// `guest_freq` is the guest TSC frequency from which the reference TSC
    /// page's scaling factor is computed.
    Enabled { msr_value: MsrReferenceTscValue, guest_freq: u64 },
}

impl ReferenceTsc {
    /// Returns `true` if the reference TSC enlightenment is present in this
    /// Hyper-V stack, regardless of whether it has been initialized yet.
    pub(super) fn is_present(&self) -> bool {
        matches!(
            self,
            ReferenceTsc::Uninitialized | ReferenceTsc::Enabled { .. }
        )
    }

    /// Sets this enlightenment's reference TSC MSR value.
    ///
    /// # Panics
    ///
    /// Panics if this enlightenment is not enabled and fully initialized.
    pub(super) fn set_msr_value(&mut self, value: MsrReferenceTscValue) {
        match self {
            Self::Enabled { msr_value, .. } => *msr_value = value,
            _ => panic!(
                "setting TSC MSR value for invalid enlightenment {self:?}"
            ),
        }
    }

    /// Registers a reference TSC overlay page with the supplied overlay manager
    /// at the PFN specified by this struct's `msr_value`.
    ///
    /// # Return value
    ///
    /// `Some` if an overlay was successfully created at the relevant PFN.
    /// `None` if the MSR value indicates the overlay is disabled or if the
    /// overlay could not be created at the requested PFN.
    ///
    /// # Panics
    ///
    /// Panics if this enlightenment is not enabled and fully initialized.
    pub(super) fn create_overlay(
        &self,
        overlay_manager: &Arc<OverlayManager>,
    ) -> Option<TscOverlay> {
        let Self::Enabled { msr_value, guest_freq } = self else {
            panic!(
                "asked to create a TSC overlay for invalid enlightenment \
                {self:?}"
            );
        };

        if !msr_value.enabled() {
            return None;
        }

        let page = ReferenceTscPage::new(*guest_freq);
        overlay_manager
            .add_overlay(msr_value.gpfn(), OverlayKind::ReferenceTsc(page))
            .ok()
            .map(TscOverlay)
    }
}


================================================
FILE: lib/propolis/src/enlightenment/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Components that implement _enlightenments_: mechanisms that allow guest
//! software to cooperate directly with its host hypervisor.
//!
//! # Background
//!
//! Although the high-level point of a virtual machine is to allow guest
//! software to behave as though it was running on its own physical hardware,
//! there are (as with any abstraction) places where a virtual machine is bound
//! to behave differently than a "real" computer. For example, accessing a piece
//! of chipset functionality (like a hardware timer) might be relatively
//! inexpensive on real hardware, but in a virtual machine it requires an
//! expensive VM exit. Similarly, guest OSes may generally assume that hardware
//! timer interrupts will be delivered and serviced very promptly---promptly
//! enough that they can be used as watchdogs to guarantee forward progress; not
//! so in a virtual machine, where the host may elect not to immediately run a
//! vCPU with a pending timer interrupt.
//!
//! To help smooth some of these problems over, many hypervisors implement a set
//! of "enlightenments" that allow the guest and the hypervisor to cooperate
//! directly with one another. This module distinguishes enlightenments from
//! other kinds of virtual devices by the interfaces the guest uses to
//! communicate with the host. For hypervisor enlightenments these include the
//! following:
//!
//! 1. CPUID: The enlightenment stack injects synthetic CPUID values in a
//!    well-known range of leaves (beginning with leaf 0x4000_0000) to advertise
//!    its capabilities to guests.
//! 2. Synthetic MSRs: The enlightenment stack intercepts RDMSR and WRMSR
//!    instructions targeting MSRs in a well-known range (beginning with MSR ID
//!    0x4000_0000) and interprets them according to its interface's
//!    definitions.
//! 3. Direct sharing of guest physical memory: The guest can use MSRs to offer
//!    to share its physical pages with the host, either to communicate directly
//!    or for the host to overlay with larger blocks of information the guest
//!    may wish to read.
//! 4. Special VM exits: Both Intel's VMX and AMD's SVM provide special opcodes
//!    (VMCALL and VMMCALL, respectively) that trigger a VM exit with a unique
//!    exit code. The hypervisor can detect exits with this code, read the
//!    guest's registers, and interpret them as parameters to a virtual function
//!    call.
//!
//! Enlightenment stacks generally do not use port I/O or memory-mapped I/O
//! to receive data from guests. This distinguishes them from other purely
//! virtual devices (like virtio devices or the pvpanic device) that do not
//! emulate any particular kind of physical hardware but nevertheless manifest
//! themselves to the guest as attachments to a virtual bus.
//!
//! # This module
//!
//! This module defines traits that allow other Propolis components (notably
//! vCPUs) to interact with an enlightenment stack. This module's submodules
//! define various kinds of emulated hypervisor platforms and implement the
//! enlightenments they supply.

use std::sync::Arc;

use cpuid_utils::{CpuidIdent, CpuidSet};
use thiserror::Error;

use crate::{
    accessors::MemAccessor,
    common::{Lifecycle, VcpuId},
    msr::{MsrId, RdmsrOutcome, WrmsrOutcome},
    vmm::VmmHdl,
};

pub mod bhyve;
pub mod hyperv;

/// Functionality provided by all enlightenment interfaces.
///
/// Implementors must also implement [`Lifecycle`] and be shareable across
/// threads (`Send + Sync`).
pub trait Enlightenment: Lifecycle + Send + Sync {
    /// Converts a shared handle to this enlightenment stack into a shared
    /// handle to the same object viewed as a [`Lifecycle`] trait object.
    ///
    /// Only available on sized implementors (not on `dyn Enlightenment`).
    fn as_lifecycle(self: Arc<Self>) -> Arc<dyn Lifecycle>
    where
        Self: Sized,
    {
        self
    }

    /// Attaches this enlightenment stack to a VM.
    ///
    /// Users of an enlightenment stack must guarantee that this function is
    /// called exactly once per instance of that stack and must do this before
    /// starting any vCPUs or other VM components that may use the stack.
    ///
    /// # Arguments
    ///
    /// - `mem_acc`: Supplies the root memory accessor for this stack's VM.
    ///   Stacks that wish to access guest memory should call
    ///   [`MemAccessor::new_orphan`] when they're created and then should call
    ///   [`MemAccessor::adopt`] from this function.
    /// - `vmm_hdl`: A handle to the bhyve VMM for the VM that owns this stack.
    fn attach(&self, mem_acc: &MemAccessor, vmm_hdl: Arc<VmmHdl>);

    /// Adds this hypervisor interface's CPUID entries to `cpuid`.
    ///
    /// CPUID leaves from 0x4000_0000 to 0x4000_00FF are reserved for the
    /// hypervisor's use. On entry, the caller must ensure that `cpuid` does not
    /// contain any leaf entries in this range.
    ///
    /// # Errors
    ///
    /// Returns an [`AddCpuidError`] if `cpuid` already holds an entry that
    /// conflicts with one this stack tries to add.
    fn add_cpuid(&self, cpuid: &mut CpuidSet) -> Result<(), AddCpuidError>;

    /// Asks this enlightenment stack to attempt to handle an RDMSR instruction
    /// on the supplied `vcpu` targeting the supplied `msr`.
    fn rdmsr(&self, vcpu: VcpuId, msr: MsrId) -> RdmsrOutcome;

    /// Asks this enlightenment stack to attempt to handle a WRMSR instruction
    /// on the supplied `vcpu` that will write `value` to the supplied `msr`.
    fn wrmsr(&self, vcpu: VcpuId, msr: MsrId, value: u64) -> WrmsrOutcome;
}

/// An error that can arise while inserting hypervisor CPUID leaves into a CPUID
/// set via [`Enlightenment::add_cpuid`].
///
/// These errors indicate caller bugs: `Enlightenment::add_cpuid` requires that
/// the input CPUID set contain no leaves in the hypervisor CPUID region.
///
/// Each variant carries the identifier of the entry whose insertion failed.
#[derive(Debug, Error)]
pub enum AddCpuidError {
    /// The enlightenment tried to insert a leaf that was already present in the
    /// input CPUID set.
    #[error("input CPUID set already contains key {0:?}")]
    LeafAlreadyPresent(CpuidIdent),

    /// The enlightenment tried to insert a leaf or subleaf entry that
    /// conflicted with an existing entry in the input CPUID set.
    #[error("input CPUID set has subleaf presence conflict at key {0:?}")]
    SubleafConflict(CpuidIdent),
}

/// Merges every CPUID entry in `to_add` into `to_modify`.
///
/// An [`Enlightenment`] implementation can collect the hypervisor CPUID
/// entries it wants to expose into its own [`CpuidSet`] and hand that set to
/// this helper, which inserts the entries one by one and reports the right
/// error variant if `to_modify` already contained something that conflicts.
///
/// # Errors
///
/// - [`AddCpuidError::LeafAlreadyPresent`] if an insertion replaced an entry
///   that was already in `to_modify`.
/// - [`AddCpuidError::SubleafConflict`] if `to_modify` rejected an insertion.
///
/// Entries inserted before the failing one remain in `to_modify`.
///
/// # Panics
///
/// Panics if `to_add` contains a leaf outside of the hypervisor range
/// (0x4000_0000 to 0x4000_00FF).
fn add_cpuid(
    to_modify: &mut CpuidSet,
    to_add: CpuidSet,
) -> Result<(), AddCpuidError> {
    for (ident, values) in to_add.iter() {
        assert!((0x4000_0000..0x4000_0100).contains(&ident.leaf));

        // A `CpuidSet` allows a given leaf to appear either with subleaf
        // entries or without them, never both. `insert` reports a violation of
        // that rule as `Err`, and reports the replacement of an existing entry
        // as `Ok(Some)`. Both outcomes mean the caller passed in a set that
        // already held a conflicting hypervisor entry.
        //
        // `to_add` is a `CpuidSet` too, so it cannot itself contain duplicate
        // or leaf/subleaf-conflicting entries; any conflict seen here must
        // come from what was already in `to_modify`.
        let replaced = to_modify
            .insert(ident, values)
            .map_err(|_| AddCpuidError::SubleafConflict(ident))?;

        if replaced.is_some() {
            return Err(AddCpuidError::LeafAlreadyPresent(ident));
        }
    }

    Ok(())
}


================================================
FILE: lib/propolis/src/exits.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Describes transitions from VMs to the VMM.

use std::os::raw::c_void;
use std::time::Duration;

use bhyve_api::{
    vm_entry, vm_entry_cmds, vm_entry_payload, vm_exit, vm_exitcode,
    vm_suspend_how,
};

use crate::common::GuestData;

/// A decoded exit from guest execution on a vCPU: where the guest was, how
/// long the exiting instruction was, and why the exit happened.
pub struct VmExit {
    /// The instruction pointer of the guest at the time of exit.
    pub rip: u64,
    /// The length of the instruction which triggered the exit.
    /// Zero if inapplicable to the exit or unknown.
    pub inst_len: u8,
    /// Describes the reason for triggering an exit.
    pub kind: VmExitKind,
}
impl Default for VmExit {
    /// Yields a `Bogus` exit with a zero instruction pointer and length.
    fn default() -> Self {
        VmExit { kind: VmExitKind::Bogus, inst_len: 0, rip: 0 }
    }
}
impl VmExit {
    /// Decodes a raw bhyve `vm_exit` record.
    ///
    /// `api_version` is passed through to [`VmExitKind::parse`], which uses it
    /// to interpret exit data whose meaning depends on the API version.
    pub fn parse(exit: &vm_exit, api_version: u32) -> Self {
        let kind = VmExitKind::parse(exit, api_version);
        let inst_len = exit.inst_length as u8;

        Self { rip: exit.rip, inst_len, kind }
    }
}

/// Identifies the target of a port I/O access.
#[derive(Copy, Clone, Debug)]
pub struct IoPort {
    /// The I/O port number.
    pub port: u16,
    /// The `bytes` value reported for the access (presumably its width in
    /// bytes -- confirm against the bhyve API).
    pub bytes: u8,
}

/// A port I/O operation that the guest attempted and that caused a VM exit.
#[derive(Copy, Clone, Debug)]
pub enum InoutReq {
    /// The guest is reading from the given port.
    In(IoPort),
    /// The guest is writing the given value to the given port.
    Out(IoPort, u32),
}

/// A guest read of memory-mapped I/O space.
#[derive(Copy, Clone, Debug)]
pub struct MmioReadReq {
    /// The guest physical address being read.
    pub addr: u64,
    /// The `bytes` value reported for the access (presumably its width in
    /// bytes -- confirm against the bhyve API).
    pub bytes: u8,
}
/// A guest write to memory-mapped I/O space.
#[derive(Copy, Clone, Debug)]
pub struct MmioWriteReq {
    /// The guest physical address being written.
    pub addr: u64,
    /// The data the guest is writing.
    pub data: u64,
    /// The `bytes` value reported for the access (presumably its width in
    /// bytes -- confirm against the bhyve API).
    pub bytes: u8,
}

/// A memory-mapped I/O operation that the guest attempted and that caused a
/// VM exit.
#[derive(Copy, Clone, Debug)]
pub enum MmioReq {
    /// The guest is reading from MMIO space.
    Read(MmioReadReq),
    /// The guest is writing to MMIO space.
    Write(MmioWriteReq),
}

/// Details copied from the SVM payload of a `VM_EXITCODE_SVM` exit.
#[derive(Copy, Clone, Debug)]
pub struct SvmDetail {
    /// The `exitcode` field of the raw SVM exit data.
    pub exit_code: u64,
    /// The `exitinfo1` field of the raw SVM exit data.
    pub info1: u64,
    /// The `exitinfo2` field of the raw SVM exit data.
    pub info2: u64,
}
/// Details copied from the VMX payload of a `VM_EXITCODE_VMX` exit.
///
/// Each field mirrors the identically named field of
/// `bhyve_api::vm_exit_vmx`.
#[derive(Copy, Clone, Debug)]
pub struct VmxDetail {
    pub status: i32,
    pub exit_reason: u32,
    pub exit_qualification: u64,
    pub inst_type: i32,
    pub inst_error: i32,
}
impl From<&bhyve_api::vm_exit_vmx> for VmxDetail {
    fn from(raw: &bhyve_api::vm_exit_vmx) -> Self {
        Self {
            status: raw.status,
            exit_reason: raw.exit_reason,
            exit_qualification: raw.exit_qualification,
            inst_type: raw.inst_type,
            inst_error: raw.inst_error,
        }
    }
}

/// Instruction bytes supplied with an instruction-emulation exit.
#[derive(Copy, Clone, Debug)]
pub struct InstEmul {
    /// Buffer holding the instruction bytes; only the first `len` of them are
    /// valid.
    pub inst_data: GuestData<[u8; 15]>,
    /// Number of valid bytes at the start of `inst_data`.
    pub len: u8,
}

impl InstEmul {
    /// Returns the valid instruction bytes: the first `len` bytes of the
    /// buffer, capped at the buffer's size.
    pub fn bytes(&self) -> &[u8] {
        let capacity = self.inst_data.len();
        let valid = usize::min(capacity, self.len as usize);
        &self.inst_data[..valid]
    }
}
impl From<&bhyve_api::vm_inst_emul> for InstEmul {
    /// Copies the instruction bytes and valid-byte count out of the raw bhyve
    /// structure.
    ///
    /// # Panics
    ///
    /// Panics if the raw structure claims more valid bytes than the buffer can
    /// hold.
    fn from(raw: &bhyve_api::vm_inst_emul) -> Self {
        let len = raw.num_valid;
        let mut inst_data: GuestData<[u8; 15]> = GuestData::from([0u8; 15]);
        assert!(len as usize <= inst_data.len());
        inst_data.copy_from_slice(&raw.inst[..]);

        Self { inst_data, len }
    }
}

/// The way in which a VM was suspended.
#[derive(Copy, Clone, Debug)]
pub enum Suspend {
    /// The VM was halted or powered off.
    Halt,
    /// The VM was reset.
    Reset,
    /// The VM suspended because of a triple fault. The payload is the `source`
    /// value from the exit data, or -1 when the bhyve API version is too old
    /// to report it.
    TripleFault(i32),
}

/// Details accompanying a `Suspended` exit.
#[derive(Copy, Clone, Debug)]
pub struct SuspendDetail {
    /// How the VM was suspended.
    pub kind: Suspend,
    /// The `when` value from the exit data, interpreted as nanoseconds. It is
    /// zero when the bhyve API version is too old to report it.
    pub when: Duration,
}

/// The decoded reason for a VM exit, along with any data the exit carries.
#[derive(Copy, Clone, Debug)]
pub enum VmExitKind {
    /// An exit with no guest-visible cause.
    Bogus,
    /// The guest performed port I/O.
    Inout(InoutReq),
    /// The guest accessed memory-mapped I/O space.
    Mmio(MmioReq),
    /// The guest executed RDMSR; the payload is the MSR number.
    Rdmsr(u32),
    /// The guest executed WRMSR; the payload is the MSR number and the value
    /// being written.
    Wrmsr(u32, u64),
    /// A `VM_EXITCODE_VMX` exit, with its VMX details.
    VmxError(VmxDetail),
    /// A `VM_EXITCODE_SVM` exit, with its SVM details.
    SvmError(SvmDetail),
    /// The VM was suspended.
    Suspended(SuspendDetail),
    /// An instruction needs to be emulated.
    InstEmul(InstEmul),
    /// A `VM_EXITCODE_DEBUG` exit.
    Debug,
    /// A paging exit; the payload is the faulting guest physical address and
    /// the fault type.
    Paging(u64, i32),
    /// An exit whose raw code is unrecognized or unexpected; the payload is
    /// that raw code.
    Unknown(i32),
}
impl VmExitKind {
    /// Get the raw `VM_EXITCODE` corresponding to this exit kind
    pub const fn code(&self) -> i32 {
        match self {
            VmExitKind::Bogus => vm_exitcode::VM_EXITCODE_BOGUS as i32,
            VmExitKind::Inout(_) => vm_exitcode::VM_EXITCODE_INOUT as i32,
            VmExitKind::Mmio(_) => vm_exitcode::VM_EXITCODE_MMIO as i32,
            VmExitKind::Rdmsr(_) => vm_exitcode::VM_EXITCODE_RDMSR as i32,
            VmExitKind::Wrmsr(_, _) => vm_exitcode::VM_EXITCODE_WRMSR as i32,
            VmExitKind::VmxError(_) => vm_exitcode::VM_EXITCODE_VMX as i32,
            VmExitKind::SvmError(_) => vm_exitcode::VM_EXITCODE_SVM as i32,
            VmExitKind::InstEmul(_) => {
                vm_exitcode::VM_EXITCODE_INST_EMUL as i32
            }
            VmExitKind::Suspended(_) => {
                vm_exitcode::VM_EXITCODE_SUSPENDED as i32
            }
            VmExitKind::Debug => vm_exitcode::VM_EXITCODE_DEBUG as i32,
            VmExitKind::Paging(_, _) => vm_exitcode::VM_EXITCODE_PAGING as i32,
            VmExitKind::Unknown(code) => *code,
        }
    }

    /// Is the vCPU in a consistent (for save/restore or migration) state at the
    /// point when this VM-exit was taken?
    pub const fn is_consistent(&self) -> bool {
        match self {
            // These exitcodes represent conditions unrelated to behavior of the
            // guest vCPU itself, but rather conditions of the host VMM, such as
            // scheduler contention for the CPU, or other software requests that
            // the thread exit to userspace.
            //
            // The checks which would emit such codes are performed only after
            // the rest of the vCPU state is made consistent prior to entry into
            // VM context.
            VmExitKind::Bogus | VmExitKind::Debug => true,

            // When the vCPU(s) enter the suspended state, no further forward
            // progress can be made until the instance is reset.
            VmExitKind::Suspended(_) => true,

            // The instruction emulation exits, by their nature, leave the vCPU
            // in an inconsistent state until they can be completed
            VmExitKind::Inout(_)
            | VmExitKind::Mmio(_)
            | VmExitKind::Rdmsr(_)
            | VmExitKind::Wrmsr(_, _)
            | VmExitKind::InstEmul(_) => false,

            // Unhandled paging exits are likely terminal for the instance.
            VmExitKind::Paging(_, _) => true,

            // Unhandled errors or exit codes indicate a terminal state for the
            // entire instance.
            VmExitKind::VmxError(_)
            | VmExitKind::SvmError(_)
            | VmExitKind::Unknown(_) => true,
        }
    }
}
impl VmExitKind {
    /// Decodes the exit code and payload of a raw bhyve `vm_exit` record.
    ///
    /// The exit code selects which member of the `exit.u` union is read, so
    /// each `unsafe` access below reads only the member that matches the code
    /// just matched on.
    ///
    /// `api_version` is the bhyve API version in use. It decides how
    /// `VM_EXITCODE_DEPRECATED2` is treated and whether the `source` and
    /// `when` fields of a suspend exit are trusted.
    ///
    /// Exit codes that do not map to a known `vm_exitcode` value, or that are
    /// known but not handled here, are returned as `Unknown`.
    ///
    /// # Panics
    ///
    /// Panics on exit codes that this function does not expect to reach
    /// userspace (HLT, PAUSE, BPT, MTRAP and the bhyve-internal codes), on a
    /// task-switch exit (not implemented), and on a suspend exit whose `how`
    /// value is `VM_SUSPEND_NONE` or unrecognized.
    pub fn parse(exit: &vm_exit, api_version: u32) -> Self {
        let code = match vm_exitcode::from_repr(exit.exitcode) {
            None => return VmExitKind::Unknown(exit.exitcode),
            Some(c) => c,
        };
        match code {
            vm_exitcode::VM_EXITCODE_BOGUS => VmExitKind::Bogus,
            vm_exitcode::VM_EXITCODE_DEPRECATED2 => {
                // Prior to v16, this was REQIDLE, which can be translated into
                // a BOGUS exit.
                if api_version < bhyve_api::ApiVersion::V16 {
                    VmExitKind::Bogus
                } else {
                    // At or after v16, we do not expect to see this code
                    VmExitKind::Unknown(code as i32)
                }
            }
            vm_exitcode::VM_EXITCODE_INOUT => {
                let inout = unsafe { &exit.u.inout };
                let port = IoPort { port: inout.port, bytes: inout.bytes };
                // The INOUT_IN flag distinguishes reads from writes; only a
                // write carries a value (taken from `eax`).
                if inout.flags & bhyve_api::INOUT_IN != 0 {
                    VmExitKind::Inout(InoutReq::In(port))
                } else {
                    VmExitKind::Inout(InoutReq::Out(port, inout.eax))
                }
            }
            vm_exitcode::VM_EXITCODE_RDMSR => {
                let msr = unsafe { &exit.u.msr };
                VmExitKind::Rdmsr(msr.code)
            }
            vm_exitcode::VM_EXITCODE_WRMSR => {
                let msr = unsafe { &exit.u.msr };
                VmExitKind::Wrmsr(msr.code, msr.wval)
            }
            vm_exitcode::VM_EXITCODE_MMIO => {
                let mmio = unsafe { &exit.u.mmio };
                if mmio.read != 0 {
                    VmExitKind::Mmio(MmioReq::Read(MmioReadReq {
                        addr: mmio.gpa,
                        bytes: mmio.bytes,
                    }))
                } else {
                    VmExitKind::Mmio(MmioReq::Write(MmioWriteReq {
                        addr: mmio.gpa,
                        data: mmio.data,
                        bytes: mmio.bytes,
                    }))
                }
            }
            vm_exitcode::VM_EXITCODE_VMX => {
                let vmx = unsafe { &exit.u.vmx };
                VmExitKind::VmxError(VmxDetail::from(vmx))
            }
            vm_exitcode::VM_EXITCODE_SVM => {
                let svm = unsafe { &exit.u.svm };
                VmExitKind::SvmError(SvmDetail {
                    exit_code: svm.exitcode,
                    info1: svm.exitinfo1,
                    info2: svm.exitinfo2,
                })
            }
            vm_exitcode::VM_EXITCODE_SUSPENDED => {
                let detail = unsafe { &exit.u.suspend };
                // Prior to v16, the only field in vm_exit.u.suspend was `how`.
                // The `source` and `when` fields are valid in v16 or later.
                let valid_detail = api_version >= bhyve_api::ApiVersion::V16;
                let kind = match vm_suspend_how::from_repr(detail.how as u32) {
                    Some(vm_suspend_how::VM_SUSPEND_RESET) => Suspend::Reset,
                    Some(vm_suspend_how::VM_SUSPEND_POWEROFF)
                    | Some(vm_suspend_how::VM_SUSPEND_HALT) => Suspend::Halt,
                    Some(vm_suspend_how::VM_SUSPEND_TRIPLEFAULT) => {
                        // Report -1 as the source when it is not valid.
                        Suspend::TripleFault(
                            valid_detail.then_some(detail.source).unwrap_or(-1),
                        )
                    }
                    Some(vm_suspend_how::VM_SUSPEND_NONE) | None => {
                        panic!("invalid vm_suspend_how: {}", detail.how);
                    }
                };
                // Just fake a time if there is not a valid one.
                let when = Duration::from_nanos(
                    valid_detail.then_some(detail.when).unwrap_or(0),
                );

                VmExitKind::Suspended(SuspendDetail { kind, when })
            }
            vm_exitcode::VM_EXITCODE_INST_EMUL => {
                let inst = unsafe { &exit.u.inst_emul };
                VmExitKind::InstEmul(InstEmul::from(inst))
            }
            vm_exitcode::VM_EXITCODE_PAGING => {
                let paging = unsafe { &exit.u.paging };
                // The Paging exit should probably be transformed into an
                // attempted-MMIO exit to make handling easier, but until then
                // we just pass the buck.
                VmExitKind::Paging(paging.gpa, paging.fault_type)
            }
            vm_exitcode::VM_EXITCODE_DEBUG => VmExitKind::Debug,

            vm_exitcode::VM_EXITCODE_TASK_SWITCH => {
                // Intel CPUs do not emulate x86 hardware task switching, so it
                // is left to userspace.
                todo!("Implement task-switching emulation on Intel")
            }
            vm_exitcode::VM_EXITCODE_HLT | vm_exitcode::VM_EXITCODE_PAUSE => {
                // Until propolis is changed to request userspace exits for HLT
                // or PAUSE, we do not ever expect to see them.
                panic!("Unexpected {:?}", code);
            }
            vm_exitcode::VM_EXITCODE_BPT | vm_exitcode::VM_EXITCODE_MTRAP => {
                // Propolis is not using VMX breakpoints or mtraps (yet)
                panic!("Unexpected {:?}", code);
            }
            vm_exitcode::VM_EXITCODE_MWAIT
            | vm_exitcode::VM_EXITCODE_MONITOR
            | vm_exitcode::VM_EXITCODE_VMINSN
            | vm_exitcode::VM_EXITCODE_IOAPIC_EOI
            | vm_exitcode::VM_EXITCODE_MMIO_EMUL
            | vm_exitcode::VM_EXITCODE_HT
            | vm_exitcode::VM_EXITCODE_RUN_STATE => {
                // These exitcodes are used (and handled) internally by bhyve
                // and should never be emitted to userspace.
                panic!("Unexpected internal exit: {:?}", code);
            }
            // Any remaining known code is surfaced as `Unknown` with its raw
            // value.
            c => VmExitKind::Unknown(c as i32),
        }
    }
}

/// The result of a port I/O operation, to be handed back to the vCPU.
pub enum InoutRes {
    /// A completed read from the given port, with the value that was read.
    In(IoPort, u32),
    /// A completed write to the given port.
    Out(IoPort),
}
impl InoutRes {
    /// Builds the result that corresponds to a failed port I/O operation: a
    /// read (INx) returns all 1s, and a write is silently dropped.
    pub fn emulate_failed(req: &InoutReq) -> Self {
        match *req {
            InoutReq::In(port) => Self::In(port, u32::MAX),
            InoutReq::Out(port, _) => Self::Out(port),
        }
    }
}

/// The result of a completed MMIO read.
pub struct MmioReadRes {
    /// The guest physical address that was read.
    pub addr: u64,
    /// The data to return to the guest.
    pub data: u64,
    /// The `bytes` value of the access (presumably its width in bytes --
    /// confirm against the bhyve API).
    pub bytes: u8,
}
/// The result of a completed MMIO write.
pub struct MmioWriteRes {
    /// The guest physical address that was written.
    pub addr: u64,
    /// The `bytes` value of the access (presumably its width in bytes --
    /// confirm against the bhyve API).
    pub bytes: u8,
}

/// The result of an MMIO operation, to be handed back to the vCPU.
pub enum MmioRes {
    /// A completed read.
    Read(MmioReadRes),
    /// A completed write.
    Write(MmioWriteRes),
}
impl MmioRes {
    /// Builds the result that corresponds to a failed MMIO operation: a read
    /// returns all 1s, and a write is silently dropped. The address and access
    /// size are carried over from the request.
    pub fn emulate_failed(req: &MmioReq) -> Self {
        match req {
            MmioReq::Read(read) => {
                let (addr, bytes) = (read.addr, read.bytes);
                Self::Read(MmioReadRes { addr, data: u64::MAX, bytes })
            }
            MmioReq::Write(write) => {
                let (addr, bytes) = (write.addr, write.bytes);
                Self::Write(MmioWriteRes { addr, bytes })
            }
        }
    }
}

/// What to do when (re-)entering guest execution on a vCPU.
pub enum VmEntry {
    /// Enter the guest with the default entry command (no result to deliver).
    Run,
    /// Enter the guest after delivering the result of a port I/O operation.
    InoutFulfill(InoutRes),
    /// Enter the guest after delivering the result of an MMIO operation.
    MmioFulfill(MmioRes),
}
impl VmEntry {
    /// Builds the raw bhyve `vm_entry` structure for this entry.
    ///
    /// `cpuid` is stored as the structure's `cpuid` field, and `exit_ptr` is
    /// stored (as an untyped pointer) as its `exit_data` field. The payload is
    /// filled in only for the fulfillment variants; for `Run` it keeps its
    /// default value.
    pub fn to_raw(&self, cpuid: i32, exit_ptr: *mut vm_exit) -> vm_entry {
        let mut payload = vm_entry_payload::default();
        let cmd = match self {
            VmEntry::Run => vm_entry_cmds::VEC_DEFAULT,
            VmEntry::InoutFulfill(res) => {
                // A read carries the IN flag and the value read; a write
                // carries neither.
                let (io, flags, eax) = match res {
                    InoutRes::In(io, val) => (io, bhyve_api::INOUT_IN, *val),
                    InoutRes::Out(io) => (io, 0, 0),
                };
                payload.inout.flags = flags;
                payload.inout.eax = eax;
                payload.inout.port = io.port;
                payload.inout.bytes = io.bytes;
                vm_entry_cmds::VEC_FULFILL_INOUT
            }
            VmEntry::MmioFulfill(res) => {
                match res {
                    MmioRes::Read(read) => {
                        payload.mmio.read = 1;
                        payload.mmio.data = read.data;
                        payload.mmio.gpa = read.addr;
                        payload.mmio.bytes = read.bytes;
                    }
                    MmioRes::Write(write) => {
                        // `read` and `data` keep their default values.
                        payload.mmio.gpa = write.addr;
                        payload.mmio.bytes = write.bytes;
                    }
                }
                vm_entry_cmds::VEC_FULFILL_MMIO
            }
        };

        let exit_data = exit_ptr as *mut c_void;
        vm_entry { cpuid, cmd: cmd as u32, u: payload, exit_data }
    }
}


================================================
FILE: lib/propolis/src/firmware/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

pub mod smbios;


================================================
FILE: lib/propolis/src/firmware/smbios/bits.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::mem::size_of;

/// Anchor string (`_SM_`) stored at the start of the SMBIOS entry point.
const ANCHOR: [u8; 4] = *b"_SM_";
/// Intermediate anchor string (`_DMI_`) stored within the SMBIOS entry point.
const IANCHOR: [u8; 5] = *b"_DMI_";

/// Each SMBIOS table is expected to terminate with a double-NUL
pub const TABLE_TERMINATOR: [u8; 2] = [0; 2];

/// SMBIOS Structure Table Entry Point
///
/// The struct is `repr(C, packed)`, so its fields are laid out back to back in
/// declaration order with no padding. The intermediate anchor therefore begins
/// at byte offset 0x10.
#[derive(Copy, Clone)]
#[repr(C, packed)]
pub(crate) struct EntryPoint {
    /// Anchor String (_SM_)
    pub anchor: [u8; 4],
    /// Entry Point Structure Checksum
    pub ep_checksum: u8,
    /// Entry Point Length
    pub ep_length: u8,
    /// SMBIOS Major Version
    pub major_version: u8,
    /// SMBIOS Minor Version
    pub minor_version: u8,
    /// Maximum Structure Size
    pub max_struct_sz: u16,
    /// Entry Point Revision
    pub ep_revision: u8,
    /// Formatted Area
    pub formatted_area: [u8; 5],
    /// Intermediate Anchor (_DMI_)
    pub intermed_anchor: [u8; 5],
    /// Intermediate Checksum
    pub intermed_checksum: u8,
    /// Structure Table Length
    pub table_length: u16,
    /// Structure Table Address
    pub table_address: u32,
    /// Number of SMBIOS Structures
    pub num_structs: u16,
    /// SMBIOS BCD Revision
    pub bcd_revision: u8,
}
impl EntryPoint {
    /// Creates an entry point with both anchor strings filled in and every
    /// other field zeroed.
    pub(crate) fn new() -> Self {
        Self {
            anchor: ANCHOR,
            intermed_anchor: IANCHOR,
            formatted_area: [0; 5],
            ep_checksum: 0,
            ep_length: 0,
            major_version: 0,
            minor_version: 0,
            max_struct_sz: 0,
            ep_revision: 0,
            intermed_checksum: 0,
            table_length: 0,
            table_address: 0,
            num_structs: 0,
            bcd_revision: 0,
        }
    }

    /// Recomputes both checksum fields from the structure's current contents.
    ///
    /// Each checksum is chosen so that the bytes it covers sum to zero modulo
    /// 256. The intermediate checksum covers the bytes from offset 0x10 to the
    /// end of the structure; the entry point checksum covers all of it.
    pub(crate) fn update_cksums(&mut self) {
        // Clear both fields first so stale values don't feed into the sums.
        self.ep_checksum = 0;
        self.intermed_checksum = 0;

        let intermed_sum = self.to_raw_bytes()[0x10..]
            .iter()
            .copied()
            .fold(0u8, u8::wrapping_add);
        self.intermed_checksum = intermed_sum.wrapping_neg();

        // This sum includes the intermediate checksum that was just stored.
        let full_sum =
            self.to_raw_bytes().iter().copied().fold(0u8, u8::wrapping_add);
        self.ep_checksum = full_sum.wrapping_neg();
    }

    /// Views this structure as the raw bytes that make it up.
    pub(crate) fn to_raw_bytes(&self) -> &[u8] {
        let base = self as *const Self as *const u8;
        // SAFETY: `base` is derived from `&self`, so it is valid for reads of
        // `size_of::<Self>()` bytes for as long as the returned borrow lives.
        // `Self` is `repr(C, packed)` and built only from integers and integer
        // arrays, so it has no padding and every byte is initialized.
        unsafe { std::slice::from_raw_parts(base, size_of::<Self>()) }
    }
}

/// The header with which every SMBIOS structure defined in this file begins.
#[derive(Copy, Clone)]
#[repr(C, packed)]
pub(crate) struct StructureHeader {
    /// Structure type number.
    pub stype: u8,
    /// Structure length (presumably the length in bytes of the formatted
    /// area, header included -- confirm where it is populated).
    pub length: u8,
    /// Structure handle.
    pub handle: u16,
}

/// Implements `RawTable` for the structure type `$type_name`.
///
/// The generated impl sets the trait's `TYPE` constant to `$type_val` and
/// implements `header_mut` by returning the type's `header` field. The target
/// type must therefore have a field named `header` of type
/// [`StructureHeader`].
// NOTE(review): `RawTable` is an unsafe trait defined outside this view; its
// safety requirements should be checked for every type this macro is applied
// to.
macro_rules! raw_table_impl {
    ($type_name:ident, $type_val:literal) => {
        unsafe impl RawTable for $type_name {
            const TYPE: u8 = $type_val;
            fn header_mut(&mut self) -> &mut StructureHeader {
                &mut self.header
            }
        }
    };
}

/// Type 0 (BIOS Information) - Version 2.7
///
/// Packed, C-layout image of the structure, beginning with the common
/// [`StructureHeader`].
// NOTE(review): the single-byte descriptive fields (e.g. `vendor`,
// `bios_version`, `bios_release_date`) are presumably indices into the
// structure's string set rather than inline text -- confirm against the code
// that builds the tables.
#[derive(Copy, Clone)]
#[repr(C, packed)]
pub(crate) struct Type0 {
    pub header: StructureHeader,
    pub vendor: u8,
    pub bios_version: u8,
    pub bios_starting_seg_addr: u16,
    pub bios_release_date: u8,
    pub bios_rom_size: u8,
    pub bios_characteristics: u64,
    pub bios_ext_characteristics: u16,
    pub bios_major_release: u8,
    pub bios_minor_release: u8,
    pub ec_firmware_major_rel: u8,
    pub ec_firmware_minor_rel: u8,
}
// Implements `RawTable` for `Type0` with structure type number 0.
raw_table_impl!(Type0, 0);

/// Type 1 (System Information) - Version 2.7
///
/// Raw packed layout of the formatted area; the unit tests expect it to be
/// 0x1b bytes.  When this structure is rendered, `manufacturer`,
/// `product_name`, `version`, `serial_number`, `sku_number` and `family` are
/// filled with the `u8` values returned by the string table rather than with
/// string contents.
#[derive(Copy, Clone)]
#[repr(C, packed)]
pub(crate) struct Type1 {
    pub header: StructureHeader,
    pub manufacturer: u8,
    pub product_name: u8,
    pub version: u8,
    pub serial_number: u8,
    pub uuid: [u8; 16],
    pub wake_up_type: u8,
    pub sku_number: u8,
    pub family: u8,
}
// Structure type number 1
raw_table_impl!(Type1, 1);

// We don't create type 2 tables yet, but it's nice to have the definition on hand.
/// Type 2 (Baseboard Information)
///
/// Raw packed layout of the formatted area; the unit tests expect it to be
/// 0xf bytes.
#[derive(Copy, Clone)]
#[repr(C, packed)]
#[allow(dead_code)]
pub(crate) struct Type2 {
    pub header: StructureHeader,
    pub manufacturer: u8,
    pub product: u8,
    pub version: u8,
    pub serial_number: u8,
    pub asset_tag: u8,
    pub feature_flags: u8,
    pub location_in_chassis: u8,
    pub chassis_handle: u16,
    pub board_type: u8,
    pub number_obj_handles: u8,
    /// Zero-length marker for the trailing handle list; it contributes no
    /// bytes to the size of this structure.
    pub obj_handles: [u16; 0],
}
// Structure type number 2
raw_table_impl!(Type2, 2);

// We don't create type 3 tables yet, but it's nice to have the definition on hand.
/// Type 3 (System Enclosure) - Version 2.7
///
/// Raw packed layout of the formatted area; the unit tests expect it to be
/// 0x15 bytes.
#[derive(Copy, Clone)]
#[repr(C, packed)]
#[allow(dead_code)]
pub(crate) struct Type3 {
    pub header: StructureHeader,
    pub manufacturer: u8,
    /// Distinct from `header.stype`, which carries the structure type number
    pub stype: u8,
    pub version: u8,
    pub serial_number: u8,
    pub asset_tag: u8,
    pub bootup_state: u8,
    pub psu_state: u8,
    pub thermal_state: u8,
    pub security_status: u8,
    pub oem_defined: u32,
    pub height: u8,
    pub num_cords: u8,
    pub contained_elem_count: u8,
    pub contained_elem_len: u8,
}
// Structure type number 3
raw_table_impl!(Type3, 3);

/// Type 4 (Processor Information) - Version 2.7
///
/// Raw packed layout of the formatted area; the unit tests expect it to be
/// 0x2a bytes.  When this structure is rendered, `socket_designation`,
/// `proc_manufacturer`, `proc_version`, `serial_number`, `asset_tag` and
/// `part_number` are filled with the `u8` values returned by the string table
/// rather than with string contents.
#[derive(Copy, Clone)]
#[repr(C, packed)]
pub(crate) struct Type4 {
    pub header: StructureHeader,
    pub socket_designation: u8,
    pub proc_type: u8,
    pub proc_family: u8,
    pub proc_manufacturer: u8,
    pub proc_id: u64,
    pub proc_version: u8,
    pub voltage: u8,
    pub external_clock: u16,
    pub max_speed: u16,
    pub current_speed: u16,
    pub status: u8,
    pub proc_upgrade: u8,
    pub l1_cache_handle: u16,
    pub l2_cache_handle: u16,
    pub l3_cache_handle: u16,
    pub serial_number: u8,
    pub asset_tag: u8,
    pub part_number: u8,
    pub core_count: u8,
    pub core_enabled: u8,
    pub thread_count: u8,
    pub proc_characteristics: u16,
    pub proc_family2: u16,
}
// Structure type number 4
raw_table_impl!(Type4, 4);

/// Type 16 (Physical Memory Array) - Version 2.7
///
/// Raw packed layout of the formatted area; the unit tests expect it to be
/// 0x17 bytes.
#[derive(Copy, Clone)]
#[repr(C, packed)]
pub(crate) struct Type16 {
    pub header: StructureHeader,
    pub location: u8,
    pub array_use: u8,
    pub error_correction: u8,
    pub max_capacity: u32,
    pub error_info_handle: u16,
    pub num_mem_devices: u16,
    pub extended_max_capacity: u64,
}
// Structure type number 16
raw_table_impl!(Type16, 16);

/// Type 17 (Memory Device) - Version 2.7
///
/// Raw packed layout of the formatted area; the unit tests expect it to be
/// 0x28 bytes.
#[derive(Copy, Clone)]
#[repr(C, packed)]
pub(crate) struct Type17 {
    pub header: StructureHeader,
    pub phys_mem_array_handle: u16,
    pub mem_err_info_handle: u16,
    pub total_width: u16,
    pub data_width: u16,
    pub size: u16,
    pub form_factor: u8,
    pub device_set: u8,
    pub device_locator: u8,
    pub bank_locator: u8,
    pub memory_type: u8,
    pub type_detail: u16,
    pub speed: u16,
    pub manufacturer: u8,
    pub serial_number: u8,
    pub asset_tag: u8,
    pub part_number: u8,
    pub attributes: u8,
    pub extended_size: u32,
    pub cfgd_mem_clock_speed: u16,
    pub min_voltage: u16,
    pub max_voltage: u16,
    pub cfgd_voltage: u16,
}
// Structure type number 17
raw_table_impl!(Type17, 17);

/// Type 32 (System Boot Information) - Version 2.7
///
/// Raw packed layout of the formatted area; the unit tests expect it to be
/// 0xa bytes (the header plus the six reserved bytes).
#[derive(Copy, Clone)]
#[repr(C, packed)]
pub(crate) struct Type32 {
    pub header: StructureHeader,
    pub reserved: [u8; 6],
    /// Zero-length marker for the trailing boot status data; it contributes
    /// no bytes to the size of this structure.
    pub boot_status: [u8; 0],
}
// Structure type number 32
raw_table_impl!(Type32, 32);

/// Type 127 (End-of-Table)
///
/// Consists of nothing but the structure header.
#[derive(Copy, Clone)]
#[repr(C, packed)]
pub(crate) struct Type127 {
    pub header: StructureHeader,
}
// Structure type number 127
raw_table_impl!(Type127, 127);

pub(crate) unsafe trait RawTable: Sized {
    const TYPE: u8;

    fn new(handle: u16) -> Self {
        // Safety: All of these structs are repr(C,packed) with no padding,
        // and thus safe to initialize as zeroed.
        let mut data =
            unsafe { std::mem::MaybeUninit::<Self>::zeroed().assume_init() };

        let header = data.header_mut();
        header.stype = Self::TYPE;
        header.length = size_of::<Self>() as u8;
        header.handle = handle;

        data
    }

    fn header_mut(&mut self) -> &mut StructureHeader;

    fn to_raw_bytes(&self) -> &[u8] {
        unsafe {
            std::slice::from_raw_parts(
                self as *const Self as *const u8,
                size_of::<Self>(),
            )
        }
    }
}

/// Layout checks for the raw structures: each test pins the expected size of
/// a structure and (for the table types) confirms that `RawTable::new` writes
/// the matching structure type number into the first byte.
#[cfg(test)]
mod test {
    use super::*;
    use std::mem::size_of;

    #[test]
    fn entry_point() {
        assert_eq!(size_of::<EntryPoint>(), 0x1f);
    }
    #[test]
    fn struct_header() {
        assert_eq!(size_of::<StructureHeader>(), 0x4);
    }

    #[test]
    fn bios_information() {
        assert_eq!(size_of::<Type0>(), 0x18);
        let data = Type0::new(0xfffe);
        let data_raw = data.to_raw_bytes();
        assert_eq!(data_raw[0], 0, "type byte incorrect");
    }
    #[test]
    fn system_information() {
        assert_eq!(size_of::<Type1>(), 0x1b);
        let data = Type1::new(0xfffe);
        let data_raw = data.to_raw_bytes();
        assert_eq!(data_raw[0], 1, "type byte incorrect");
    }
    #[test]
    fn baseboard_information() {
        assert_eq!(size_of::<Type2>(), 0xf);
        let data = Type2::new(0xfffe);
        let data_raw = data.to_raw_bytes();
        assert_eq!(data_raw[0], 2, "type byte incorrect");
    }
    #[test]
    fn system_enclosure() {
        assert_eq!(size_of::<Type3>(), 0x15);
        // Exercise Type3 itself (previously this built a Type2 by mistake)
        let data = Type3::new(0xfffe);
        let data_raw = data.to_raw_bytes();
        assert_eq!(data_raw[0], 3, "type byte incorrect");
    }
    #[test]
    fn processor_information() {
        assert_eq!(size_of::<Type4>(), 0x2a);
        // Exercise Type4 itself (previously this built a Type3 by mistake)
        let data = Type4::new(0xfffe);
        let data_raw = data.to_raw_bytes();
        assert_eq!(data_raw[0], 4, "type byte incorrect");
    }
    #[test]
    fn physical_memory_array() {
        assert_eq!(size_of::<Type16>(), 0x17);
        let data = Type16::new(0xfffe);
        let data_raw = data.to_raw_bytes();
        assert_eq!(data_raw[0], 16, "type byte incorrect");
    }
    #[test]
    fn memory_device() {
        assert_eq!(size_of::<Type17>(), 0x28);
        let data = Type17::new(0xfffe);
        let data_raw = data.to_raw_bytes();
        assert_eq!(data_raw[0], 17, "type byte incorrect");
    }
    #[test]
    fn system_boot_information() {
        assert_eq!(size_of::<Type32>(), 0xa);
        let data = Type32::new(0xfffe);
        let data_raw = data.to_raw_bytes();
        assert_eq!(data_raw[0], 32, "type byte incorrect");
    }
    #[test]
    fn end_of_table() {
        // The end-of-table structure is nothing but a header
        assert_eq!(size_of::<Type127>(), 0x4);
        let data = Type127::new(0xfffe);
        let data_raw = data.to_raw_bytes();
        assert_eq!(data_raw[0], 127, "type byte incorrect");
    }
}


================================================
FILE: lib/propolis/src/firmware/smbios/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::BTreeMap;
use std::fmt;

use table::{Table, Type127};

mod bits;
pub mod table;

/// Collection of SMBIOS [table] instances, which will be rendered out
/// into two blocks of raw bytes representing the SMBIOS Entry Point and
/// SMBIOS Structure Table.
pub struct Tables {
    /// Rendered bytes of every added table, keyed (and thus ordered) by handle
    tables: BTreeMap<Handle, Vec<u8>>,
    /// Sum of the rendered sizes of all tables added so far
    total_size: usize,
    /// Rendered size of the largest table added so far
    max_single_size: usize,
    /// Handle of the end-of-table entry, which is emitted last on commit
    eot_handle: Handle,
}
impl Tables {
    /// Create a new [Tables] collection.
    ///
    /// # Arguments
    /// - `eot_handle`: [Handle] for the automatically-added
    ///   [end-of-table](table::Type127) structure, which must terminate the
    ///   Structure Table.
    pub fn new(eot_handle: Handle) -> Self {
        let mut collection = Self {
            tables: BTreeMap::new(),
            total_size: 0,
            max_single_size: 0,
            eot_handle,
        };
        // Nothing has been added yet, so the EoT handle cannot conflict.
        collection.add(eot_handle, &Type127::default()).unwrap();

        collection
    }

    /// Add a [Table] to this collection
    ///
    /// # Arguments
    /// - `handle`: SMBIOS [Handle] to identify this table
    /// - `table`: [Table] to be added
    ///
    /// # Errors
    /// Returns [TableError::HandleConflict] if a table was already added under
    /// `handle`; the existing table is left in place.
    ///
    /// # Panics
    /// Panics if `table` renders to zero bytes.
    pub fn add(
        &mut self,
        handle: Handle,
        table: &dyn Table,
    ) -> Result<(), TableError> {
        use std::collections::btree_map::Entry;

        let rendered = table.render(handle);
        let rendered_len = rendered.len();
        assert!(rendered_len != 0, "table should not be zero-length");

        match self.tables.entry(handle) {
            Entry::Occupied(_) => Err(TableError::HandleConflict(handle)),
            Entry::Vacant(slot) => {
                slot.insert(rendered);
                self.total_size += rendered_len;
                self.max_single_size = self.max_single_size.max(rendered_len);
                Ok(())
            }
        }
    }

    /// Build Entry Point structure.  Emits the raw-byte values of both the
    /// Entry Point and the associated Structure Table data.
    ///
    /// Tables are emitted in ascending handle order, except for the
    /// end-of-table entry which always comes last.
    pub fn commit(self) -> TableBytes {
        let mut entry_point = bits::EntryPoint::new();
        // hardcode to version 2.7 for now
        entry_point.major_version = 2;
        entry_point.minor_version = 7;
        entry_point.bcd_revision = 0x27;
        // NOTE(review): these casts truncate silently should the collection
        // ever outgrow what 16 bits can describe.
        entry_point.table_length = self.total_size as u16;
        entry_point.num_structs = self.tables.len() as u16;
        entry_point.max_struct_sz = self.max_single_size as u16;
        entry_point.update_cksums();

        let eot_handle = self.eot_handle;
        let mut structure_table = Vec::with_capacity(self.total_size);
        // copy all non-EoT tables
        self.tables
            .iter()
            .filter(|(handle, _)| **handle != eot_handle)
            .for_each(|(_, bytes)| structure_table.extend_from_slice(bytes));
        // end-of-table goes at the end
        let eot_bytes =
            self.tables.get(&eot_handle).expect("EoT entry is present");
        structure_table.extend_from_slice(eot_bytes);

        TableBytes {
            entry_point: entry_point.to_raw_bytes().to_vec(),
            structure_table,
        }
    }
}

/// Raw bytes produced by [Tables::commit]
pub struct TableBytes {
    /// Raw bytes of the SMBIOS Entry Point structure
    pub entry_point: Vec<u8>,
    /// Raw bytes of the Structure Table: every added table, with the
    /// end-of-table entry last
    pub structure_table: Vec<u8>,
}

/// Possible errors when adding [Table] entries to [Tables]
#[derive(thiserror::Error, Debug)]
pub enum TableError {
    /// A table had already been added under the contained [Handle]
    #[error("Conflicting handle {0}")]
    HandleConflict(Handle),
}

/// Structure Handle
///
/// A 16-bit number used to uniquely identify a single structure in a collection
/// of SMBIOS tables.
///
/// Defaults to 0xffff, which is considered "Unknown" in most SMBIOS tables.
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Ord, PartialOrd)]
#[repr(transparent)]
pub struct Handle(pub u16);

impl Handle {
    /// The "Unknown" handle value (0xffff)
    pub const UNKNOWN: Self = Handle(0xffff);
}

/// Formats the handle as lower-case hexadecimal, honoring whatever flags were
/// set on the formatter.
impl fmt::Display for Handle {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Handle(raw) = self;
        fmt::LowerHex::fmt(raw, f)
    }
}

/// Default to the Unknown handle
impl Default for Handle {
    fn default() -> Self {
        Handle::UNKNOWN
    }
}

/// Wrap a raw 16-bit value as a handle
impl From<u16> for Handle {
    fn from(value: u16) -> Self {
        Handle(value)
    }
}

/// Unwrap a handle into its raw 16-bit value
impl From<Handle> for u16 {
    fn from(value: Handle) -> Self {
        let Handle(raw) = value;
        raw
    }
}

/// SMBIOS-compatible string
///
/// Strings associated with an SMBIOS table are NUL-terminated, and concatenated
/// together directly following the formatted area of the table.  The
/// [tables](table) string data accept this type in order to expedite proper
/// formatting when they are rendered to raw bytes.
///
/// The contained bytes never include a NUL: the only ways to build a
/// non-empty value are the `TryFrom` conversions, which reject such input.
#[derive(Default, Clone)]
pub struct SmbString(Vec<u8>);

impl SmbString {
    /// Create a string containing no bytes.
    pub const fn empty() -> Self {
        SmbString(Vec::new())
    }

    /// Whether the string contains no bytes.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Length of the string in bytes (no terminator is stored or counted).
    pub fn len(&self) -> usize {
        let SmbString(bytes) = self;
        bytes.len()
    }
}

impl AsRef<[u8]> for SmbString {
    fn as_ref(&self) -> &[u8] {
        self.0.as_slice()
    }
}

/// Take ownership of `value`, provided that it holds no NUL byte.
impl TryFrom<Vec<u8>> for SmbString {
    type Error = SmbStringNulError;

    fn try_from(value: Vec<u8>) -> Result<Self, Self::Error> {
        match value.iter().position(|&byte| byte == 0) {
            Some(_) => Err(SmbStringNulError()),
            None => Ok(SmbString(value)),
        }
    }
}

/// Copy the bytes of `value`, provided that it holds no NUL byte.
impl TryFrom<&str> for SmbString {
    type Error = SmbStringNulError;

    fn try_from(value: &str) -> Result<Self, Self::Error> {
        let owned: Vec<u8> = value.as_bytes().to_vec();
        Self::try_from(owned)
    }
}

/// Take ownership of the bytes of `value`, provided that it holds no NUL byte.
impl TryFrom<String> for SmbString {
    type Error = SmbStringNulError;

    fn try_from(value: String) -> Result<Self, Self::Error> {
        let owned: Vec<u8> = value.into_bytes();
        Self::try_from(owned)
    }
}

/// Error emitted when attempting to convert data bearing a NUL into a
/// [SmbString]
///
/// Carries no detail about where in the input the NUL byte was found.
#[derive(thiserror::Error, Debug)]
#[error("String contains NUL byte")]
pub struct SmbStringNulError();


================================================
FILE: lib/propolis/src/firmware/smbios/table.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! SMBIOS tables.
//!
//! The values of the types in this module are defined by [DSP0134], the _SMBIOS Reference
//! Specification_. Refer to that document for details.
//!
//! [DSP0134]:
//!     https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.7.0.pdf

use crate::common::*;
use crate::firmware::smbios::bits::{self, RawTable};
use crate::firmware::smbios::{Handle, SmbString};
use serde::de::{self, Deserialize};
use serde::ser::{self, Serialize};
use strum::{FromRepr, VariantArray};

/// An SMBIOS table which can be rendered into raw bytes.
pub trait Table {
    /// Render this table, identified by `handle`, into its raw bytes.
    fn render(&self, handle: Handle) -> Vec<u8>;
}

/// BIOS Information (Type 0) table.
///
/// Rendering produces a [`bits::Type0`] in which the string fields (`vendor`,
/// `bios_version`, `bios_release_date`) are replaced by the values handed out
/// by the string table.
#[derive(Default)]
pub struct Type0 {
    pub vendor: SmbString,
    pub bios_version: SmbString,
    pub bios_starting_seg_addr: u16,
    pub bios_release_date: SmbString,
    pub bios_rom_size: u8,
    /// The low 32 bits of the BIOS characteristics field is a set of bitflags
    /// that describes the BIOS.
    pub bios_characteristics: type0::BiosCharacteristics,
    /// The high 32 bits of the 64-bit BIOS characteristics field is reserved
    /// for the BIOS vendor.
    pub bios_characteristics_reserved: u32,
    pub bios_ext_characteristics: type0::BiosExtCharacteristics,
    pub bios_major_release: u8,
    pub bios_minor_release: u8,
    pub ec_firmware_major_rel: u8,
    pub ec_firmware_minor_rel: u8,
}

impl Table for Type0 {
    /// Render the BIOS Information table under the given `handle`.
    fn render(&self, handle: Handle) -> Vec<u8> {
        let mut stab = StringTable::new();
        let mut data = bits::Type0::new(handle.into());

        // Strings are handed to the string table in the same order as the
        // fields appear in the structure.
        data.vendor = stab.add(&self.vendor);
        data.bios_version = stab.add(&self.bios_version);
        data.bios_starting_seg_addr = self.bios_starting_seg_addr;
        data.bios_release_date = stab.add(&self.bios_release_date);
        data.bios_rom_size = self.bios_rom_size;
        // Vendor-reserved bits occupy the high half of the 64-bit field, with
        // the defined flags in the low half.
        data.bios_characteristics =
            (u64::from(self.bios_characteristics_reserved) << 32)
                | u64::from(self.bios_characteristics.bits());
        data.bios_ext_characteristics = self.bios_ext_characteristics.bits();
        data.bios_major_release = self.bios_major_release;
        data.bios_minor_release = self.bios_minor_release;
        data.ec_firmware_major_rel = self.ec_firmware_major_rel;
        data.ec_firmware_minor_rel = self.ec_firmware_minor_rel;

        render_table(data, None, Some(stab))
    }
}

/// Implement serde `Deserialize` and `Serialize` for fieldless enums in terms
/// of their integer representation.
///
/// Each `Enum => repr` pair produces impls which (de)serialize the enum as a
/// bare `repr` value.  The enum must be `Copy`, `Debug`, castable to `repr`
/// with `as`, and provide `VARIANTS` and `from_repr` (as derived by strum's
/// `VariantArray` and `FromRepr`).  Deserializing a value which matches no
/// variant fails with an error listing the accepted values.
///
/// The expansion refers to `Deserialize`, `Serialize`, `de` and `ser`, which
/// must be in scope where the macro is invoked.
macro_rules! serialize_enums {
    ($($Enum:ty => $repr:ty),+ $(,)?) => {
        $(
            #[automatically_derived]
            impl<'de> Deserialize<'de> for $Enum {
                fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
                where
                    D: de::Deserializer<'de>,
                    $repr: Deserialize<'de>,
                {
                    lazy_static::lazy_static! {
                        // Description of the acceptable values.  This is
                        // phrased to follow the "expected " which serde's
                        // `invalid_value` places in front of it.
                        static ref ERR_MSG: String = {
                            match <$Enum>::VARIANTS {
                                [] => panic!("probably don't use the `serialize_enums!` macro with an empty enum..."),
                                &[first] => format!("`{:#04x}` ({first:?})", first as $repr),
                                &[first, second] => format!(
                                    "`{:#04x}` ({first:?}) or `{:#04x}` ({second:?})",
                                    first as $repr, second as $repr,
                                ),
                                variants => {
                                    let variants = variants.iter()
                                        .map(|&v| format!("`{:#04x}` ({v:?})", v as $repr))
                                        .collect::<Vec<_>>()
                                        .join(", ");
                                    format!("one of: {variants}")
                                }
                            }
                        };
                    }

                    let v = <$repr>::deserialize(deserializer)?;
                    <$Enum>::from_repr(v).ok_or_else(||
                       de::Error::invalid_value(de::Unexpected::Unsigned(v as u64), &ERR_MSG.as_str())
                    )
                }
            }

            #[automatically_derived]
            impl Serialize for $Enum {
                fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
                where
                    S: ser::Serializer,
                    $repr: Serialize,
                {
                    // Serialize with the same representation that the
                    // `Deserialize` impl reads back.
                    (*self as $repr).serialize(serializer)
                }
            }
        )+
    };
}

/// Implement serde `Deserialize` and `Serialize` for bitflags types in terms
/// of their underlying integer.
///
/// Each `Flags => repr` pair produces impls which (de)serialize the flags as
/// a bare `repr` value.  Deserialization goes through `from_bits`, and fails
/// with an error naming the permitted bits when that returns `None`.
///
/// All serde items are named by full path, so the macro may be invoked from
/// modules which import nothing.
macro_rules! serialize_bitflags {
    ($($Flags:ty => $repr:ty),+ $(,)?) => {
        $(
            #[automatically_derived]
            impl<'de> serde::de::Deserialize<'de> for $Flags {
                fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
                where
                    D: serde::de::Deserializer<'de>,
                    $repr: serde::de::Deserialize<'de>,
                {
                    let v = <$repr as serde::de::Deserialize>::deserialize(deserializer)?;
                    match <$Flags>::from_bits(v) {
                        Some(flags) => Ok(flags),
                        None => Err(serde::de::Error::custom(format!(
                            "invalid {} value {v}: only bits {:?} may be set",
                            stringify!($Flags),
                            <$Flags>::all(),
                        ))),
                    }
                }
            }

            #[automatically_derived]
            impl serde::ser::Serialize for $Flags {
                fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
                where
                    S: serde::ser::Serializer,
                    $repr: serde::ser::Serialize,
                {
                    let raw = self.bits();
                    raw.serialize(serializer)
                }
            }
        )+
    };
}

/// Generate tests checking that an enum deserializes from its integer
/// representation.
///
/// Each `fn name(Enum, repr) { invalid }` item expands to a `#[test]` which:
/// - for every variant of `Enum`, serializes the variant cast to `repr` as
///   JSON and checks that deserializing it as `Enum` gives the variant back
/// - for every value yielded by iterating `invalid`, serializes it as JSON and
///   checks that deserializing it as `Enum` fails
#[cfg(test)]
macro_rules! enum_deserialize_tests {
    ($(fn $name:ident($Enum:ty, $repr:ty) { $invalid:expr })+) => {
        $(
            #[test]
            fn $name() {
                // Every variant must be accepted in its integer form
                for variant in <$Enum>::VARIANTS {
                    let serialized =
                        dbg!(serde_json::to_string(&(*dbg!(variant) as $repr)))
                            .unwrap();
                    let deserialized =
                        dbg!(serde_json::from_str::<'_, $Enum>(&serialized))
                            .unwrap();
                    assert_eq!(*variant, deserialized);
                }

                // Values outside of the enum must be rejected
                for invalid in $invalid {
                    let serialized = dbg!(serde_json::to_string(&invalid)).unwrap();

                    dbg!(serde_json::from_str::<'_, $Enum>(&serialized))
                        .unwrap_err();
                }
            }
        )+
    }
}

/// Generate tests checking that an enum survives a serde round trip.
///
/// Each `fn name(Enum) {}` item expands to a `#[test]` which serializes every
/// variant of `Enum` to JSON with its own `Serialize` impl and checks that
/// deserializing the result gives the same variant back.
#[cfg(test)]
macro_rules! enum_serde_roundtrip_tests {
    ($(fn $name:ident($Enum:ty) {})+) => {
        $(
            #[test]
            fn $name() {
                for variant in <$Enum>::VARIANTS {
                    let serialized =
                        dbg!(serde_json::to_string(dbg!(variant))).unwrap();
                    let deserialized =
                        dbg!(serde_json::from_str::<'_, $Enum>(&serialized))
                            .unwrap();
                    assert_eq!(*variant, deserialized);
                }
            }
        )+
    }
}

/// Flag types for the characteristics fields of the BIOS Information
/// ([`Type0`]) table.
pub mod type0 {
    bitflags! {
        /// BIOS Characteristics flags.
        ///
        /// See Table 7 in section 7.1.1 of [the SMBIOS Reference
        /// Specification][DSP0134] for details.
        ///
        /// [DSP0134]: https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.7.0.pdf
        #[repr(transparent)]
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
        pub struct BiosCharacteristics: u32 {
            // Bits 0-1 are reserved.

            /// BIOS characteristics are unknown.
            const UNKNOWN = 1 << 2;
            /// BIOS characteristics are not supported.
            const UNSUPPORTED = 1 << 3;
            /// ISA is supported
            const ISA = 1 << 4;
            /// MCA is supported.
            const MCA = 1 << 5;
            /// EISA is supported.
            const EISA = 1 << 6;
            /// PCI is supported.
            const PCI = 1 << 7;
            /// PC card (PCMCIA) is supported.
            const PCMCIA = 1 << 8;
            /// Plug and Play is supported.
            const PLUG_AND_PLAY = 1 << 9;
            /// APM is supported.
            const APM = 1 << 10;
            /// BIOS is upgradeable (flash).
            const UPGRADEABLE = 1 << 11;
            /// BIOS shadowing is allowed.
            const SHADOWING = 1 << 12;
            /// VL-VESA is supported.
            const VL_VESA = 1 << 13;
            /// ESCD support is available.
            const ESCD = 1 << 14;
            /// Boot from CD is supported.
            const BOOT_FROM_CD = 1 << 15;
            /// Selectable boot is supported.
            const BOOT_SELECTABLE = 1 << 16;
            /// BIOS ROM is socketed (e.g. PLCC or SOP socket).
            const ROM_SOCKETED = 1 << 17;
            /// Boot from PC card (PCMCIA) is supported.
            const BOOT_FROM_PCMCIA = 1 << 18;
            /// EDD specification is supported.
            const EDD = 1 << 19;
            /// INT 0x13 --- Japanese floppy for NEC 9800 1.2 MB (3.5”, 1K
            /// bytes/sector, 360 RPM) is supported.
            const FLOPPY_NEC_9800 = 1 << 20;
            /// INT 0x13 --- Japanese floppy for Toshiba 1.2 MB (3.5”, 360
            /// RPM) is supported.
            const FLOPPY_TOSHIBA= 1 << 21;
            /// INT 0x13 --- 5.25”/360 KB floppy services are supported.
            const FLOPPY_5_25_IN_360KB = 1 << 22;
            /// INT 0x13 --- 5.25”/1.2 MB floppy services are supported.
            const FLOPPY_5_25_IN_1_2MB = 1 << 23;
            /// INT 0x13 --- 3.5”/720 KB floppy services are supported.
            const FLOPPY_3_5_IN_720KB = 1 << 24;
            /// INT 0x13 --- 3.5”/2.88 MB floppy services are supported.
            const FLOPPY_3_5_IN_2_88MB = 1 << 25;
            /// INT 0x5, print screen service is supported.
            const PRINT_SCREEN = 1 << 26;
            /// INT 0x9, 8042 keyboard services are supported.
            const KEYBOARD_8042 = 1 << 27;
            /// INT 0x14, serial services are supported.
            const SERIAL = 1 << 28;
            /// INT 0x17, printer services are supported.
            const PRINTER = 1 << 29;
            /// INT 0x10, CGA/mono video services are supported.
            const VIDEO_CGA_MONO = 1 << 30;
            /// NEC PC-98
            const NEC_PC_98 = 1 << 31;
        }
    }

    /// Defaults to only the [`UNKNOWN`](BiosCharacteristics::UNKNOWN) flag
    /// being set, rather than to the empty set.
    impl Default for BiosCharacteristics {
        fn default() -> Self {
            BiosCharacteristics::UNKNOWN
        }
    }

    bitflags! {
        /// BIOS Characteristics Extension flags.
        ///
        /// See Tables 8 and 9 in section 7.1.1 of [the SMBIOS Reference
        /// Specification][DSP0134] for details.
        ///
        /// [DSP0134]: https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.7.0.pdf
        #[repr(transparent)]
        #[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
        pub struct BiosExtCharacteristics: u16 {
            /// ACPI is supported
            const ACPI = 1 << 0;
            /// USB Legacy is supported.
            const USB_LEGACY = 1 << 1;
            /// AGP is supported.
            const AGP = 1 << 2;
            /// I2O boot is supported.
            const BOOT_I2O = 1 << 3;
            /// LS-120 SuperDisk boot is supported.
            const BOOT_LS_120_SUPERDISK = 1 << 4;
            /// ATAPI ZIP drive boot is supported.
            const BOOT_ATAPI_ZIP = 1 << 5;
            /// 1394 boot is supported.
            const BOOT_1394 = 1 << 6;
            /// Smart battery is supported.
            const SMART_BATTERY = 1 << 7;
            /// BIOS boot specification is supported.
            const BIOS_BOOT_SPEC = 1 << 8;
            /// Function key-initiated network service boot is supported.
            ///
            /// When function key-uninitiated network service boot is not supported,
            /// a network adapter option ROM may choose to offer this functionality
            /// on its own, thus offering this capability to legacy systems. When
            /// the function is supported, the network adapter option ROM shall not
            /// offer this capability.
            const NETBOOT_FN_KEY = 1 << 9;
            /// Enable targeted content distribution.
            ///
            /// The manufacturer has ensured that the SMBIOS data is useful in
            /// identifying the computer for targeted delivery of model-specific
            /// software and firmware content through third-party content
            /// distribution services.
            const TARGETED_CONTENT_DIST = 1 << 10;
            /// UEFI specification is supported.
            const UEFI = 1 << 11;
            /// SMBIOS table describes a virtual machine.
            ///
            /// If this bit is not set, no inference can be made about the
            /// virtuality of the system.
            const IS_VM = 1 << 12;
            /// Manufacturing mode is *supported*.
            ///
            /// Manufacturing mode is a special boot mode, not normally available to
            /// end users, that modifies BIOS features and settings for use while
            /// the computer is being manufactured and tested.
            const HAS_MFG_MODE = 1 << 13;
            /// Manufacturing mode is *enabled*.
            const IN_MFG_MODE = 1 << 14;
        }
    }

    // Both flag types (de)serialize as their underlying integer
    serialize_bitflags! {
        BiosCharacteristics => u32,
        BiosExtCharacteristics => u16,
    }
}
/// System Information (Type 1) table.
///
/// Rendering produces a [`bits::Type1`] in which the string fields are
/// replaced by the values handed out by the string table, and `wake_up_type`
/// by its `u8` discriminant.
#[derive(Default)]
pub struct Type1 {
    pub manufacturer: SmbString,
    pub product_name: SmbString,
    pub version: SmbString,
    pub serial_number: SmbString,
    /// Copied into the rendered structure byte-for-byte
    pub uuid: [u8; 16],
    pub wake_up_type: type1::WakeUpType,
    pub sku_number: SmbString,
    pub family: SmbString,
}
impl Table for Type1 {
    /// Render the System Information table under the given `handle`.
    fn render(&self, handle: Handle) -> Vec<u8> {
        let mut stab = StringTable::new();
        let mut data = bits::Type1::new(handle.into());

        // Strings are handed to the string table in the same order as the
        // fields appear in the structure.
        data.manufacturer = stab.add(&self.manufacturer);
        data.product_name = stab.add(&self.product_name);
        data.version = stab.add(&self.version);
        data.serial_number = stab.add(&self.serial_number);
        data.uuid = self.uuid;
        data.wake_up_type = self.wake_up_type as u8;
        data.sku_number = stab.add(&self.sku_number);
        data.family = stab.add(&self.family);

        render_table(data, None, Some(stab))
    }
}

/// Value types for the fields of the System Information ([`Type1`]) table.
pub mod type1 {
    use super::*;

    /// Wake-up type.
    ///
    /// Discriminants are the raw byte values written to the `wake_up_type`
    /// field when a `Type1` table is rendered.
    ///
    /// See Table 12 in section 7.2.2 of DSP0134 for details.
    #[derive(
        Debug, Default, Copy, Clone, PartialEq, Eq, FromRepr, VariantArray,
    )]
    #[repr(u8)]
    pub enum WakeUpType {
        /// Other
        Other = 0x1,
        /// Unknown
        #[default]
        Unknown = 0x2,
        /// APM Timer
        ApmTimer = 0x3,
        /// Modem Ring
        ModemRing = 0x4,
        /// LAN Remote
        LanRemote = 0x5,
        /// Power Switch
        PowerSwitch = 0x6,
        /// PCI PME#
        PciPme = 0x7,
        /// AC Power Restored
        AcPowerRestored = 0x8,
    }

    // (De)serialize as the bare `u8` discriminant
    serialize_enums! {
        WakeUpType => u8,
    }

    #[cfg(test)]
    mod test {
        use super::*;

        enum_serde_roundtrip_tests! {
            fn wake_up_type_serde_roundtrip(WakeUpType) {}
        }
        // 0x9 and 0xff fit in a `u8` but match no variant, while 0x7890 does
        // not fit in a `u8` at all.
        enum_deserialize_tests! {
            fn wake_up_type_deserialize(WakeUpType, u8) { [0x9, 0xff, 0x7890] }
        }
    }
}

/// Processor Information (Type 4) table.
///
/// Rendering produces a [`bits::Type4`] in which the string fields are
/// replaced by the values handed out by the string table, the enum and flag
/// fields by their integer values, and the cache handles by their raw `u16`.
#[derive(Default)]
pub struct Type4 {
    pub socket_designation: SmbString,
    pub proc_type: type4::ProcType,
    /// See [`Type4::set_family`] for filling this in together with
    /// `proc_family2`.
    pub proc_family: u8,
    pub proc_manufacturer: SmbString,
    pub proc_id: u64,
    pub proc_version: SmbString,
    pub voltage: u8,
    pub external_clock: u16,
    pub max_speed: u16,
    pub current_speed: u16,
    pub status: type4::ProcStatus,
    pub proc_upgrade: u8,
    /// Defaults to [`Handle::UNKNOWN`], as do the other cache handles
    pub l1_cache_handle: Handle,
    pub l2_cache_handle: Handle,
    pub l3_cache_handle: Handle,
    pub serial_number: SmbString,
    pub asset_tag: SmbString,
    pub part_number: SmbString,
    pub core_count: u8,
    pub core_enabled: u8,
    pub thread_count: u8,
    pub proc_characteristics: type4::Characteristics,
    /// See [`Type4::set_family`] for filling this in together with
    /// `proc_family`.
    pub proc_family2: u16,
}
impl Type4 {
    /// Set the processor family, spreading it over both family fields.
    ///
    /// A `family` which fits in one byte is stored in `proc_family`, with
    /// `proc_family2` cleared.  Anything larger is stored in `proc_family2`,
    /// with `proc_family` set to 0xfe.
    pub fn set_family(&mut self, family: u16) {
        let fits_in_byte = family <= 0xff;

        self.proc_family = if fits_in_byte { family as u8 } else { 0xfe };
        self.proc_family2 = if fits_in_byte { 0 } else { family };
    }
}
impl Table for Type4 {
    /// Render this structure (formatted area plus string table) to bytes.
    fn render(&self, handle: Handle) -> Vec<u8> {
        let mut strings = StringTable::new();

        // String indices are handed out in insertion order, so the strings
        // are registered in the same order the fields appear in the table.
        let socket_designation = strings.add(&self.socket_designation);
        let proc_manufacturer = strings.add(&self.proc_manufacturer);
        let proc_version = strings.add(&self.proc_version);
        let serial_number = strings.add(&self.serial_number);
        let asset_tag = strings.add(&self.asset_tag);
        let part_number = strings.add(&self.part_number);

        let raw = bits::Type4 {
            socket_designation,
            proc_type: self.proc_type as u8,
            proc_family: self.proc_family,
            proc_manufacturer,
            proc_id: self.proc_id,
            proc_version,
            voltage: self.voltage,
            external_clock: self.external_clock,
            max_speed: self.max_speed,
            current_speed: self.current_speed,
            status: self.status as u8,
            proc_upgrade: self.proc_upgrade,
            l1_cache_handle: self.l1_cache_handle.into(),
            l2_cache_handle: self.l2_cache_handle.into(),
            l3_cache_handle: self.l3_cache_handle.into(),
            serial_number,
            asset_tag,
            part_number,
            core_count: self.core_count,
            core_enabled: self.core_enabled,
            thread_count: self.thread_count,
            proc_characteristics: self.proc_characteristics.bits(),
            proc_family2: self.proc_family2,
            // Remaining fields (including the header) come from `new`.
            ..bits::Type4::new(handle.into())
        };

        render_table(raw, None, Some(strings))
    }
}

/// Enumerations and flags used by the fields of [`Type4`].
pub mod type4 {
    use super::*;

    /// Processor type.
    ///
    /// See Table 21 in section 7.5 of DSP0134 for details.
    #[derive(
        Debug, Default, Copy, Clone, PartialEq, Eq, FromRepr, VariantArray,
    )]
    #[repr(u8)]
    pub enum ProcType {
        /// Other
        Other = 0x01,
        /// Unknown
        #[default]
        Unknown = 0x02,
        /// Central Processor
        Central = 0x03,
        /// Math Processor
        Math = 0x04,
        /// DSP Processor
        Dsp = 0x05,
        /// Video processor
        Video = 0x06,
    }

    /// Processor status.
    ///
    /// The low bits carry the CPU status code and bit 6
    /// (`STATUS_POPULATED`) records whether the socket is populated.
    ///
    /// See Table 21 in section 7.5 of DSP0134 for details.
    #[derive(
        Debug, Default, Copy, Clone, PartialEq, Eq, FromRepr, VariantArray,
    )]
    #[repr(u8)]
    pub enum ProcStatus {
        /// Status unknown, socket unpopulated.
        UnknownUnpopulated = 0x0,
        /// Status unknown, socket populated.
        #[default]
        UnknownPopulated = STATUS_POPULATED,

        /// CPU Enabled
        ///
        /// It...probably doesn't make sense to have a CPU enabled that's
        /// unpopulated?
        Enabled = 0x1 | STATUS_POPULATED,
        /// CPU Disabled by User through BIOS Setup.
        UserDisabled = 0x2 | STATUS_POPULATED,
        /// CPU Disabled by BIOS (POST Error).
        BiosDisabled = 0x3 | STATUS_POPULATED,
        /// CPU is Idle, waiting to be enabled.
        Idle = 0x4 | STATUS_POPULATED,

        /// Other, socket populated.
        OtherPopulated = 0x7 | STATUS_POPULATED,
        /// Other, socket unpopulated.
        OtherUnpopulated = 0x7,
    }

    /// Bit of the status byte that is set when the CPU socket is populated.
    const STATUS_POPULATED: u8 = 1 << 6;

    impl ProcStatus {
        /// Returns `true` if this status has the "socket populated" bit set.
        pub fn is_populated(&self) -> bool {
            (*self as u8) & STATUS_POPULATED != 0
        }
    }

    // Presumably generates serde impls going through the `u8` representation
    // of each enum -- see the macro definition to confirm.
    serialize_enums! {
        ProcStatus => u8,
        ProcType => u8,
    }

    bitflags! {
        /// Processor characteristics.
        ///
        /// See Table 27 in section 7.5.9 of [the SMBIOS Reference
        /// Specification][DSP0134] for details.
        ///
        /// [DSP0134]:
        ///     https://www.dmtf.org/sites/default/files/standards/documents/DSP0134_3.7.0.pdf
        #[repr(transparent)]
        #[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
        pub struct Characteristics: u16 {
            // Bit 0 is reserved

            /// Unknown
            const UNKNOWN = 1 << 1;
            /// 64-bit Capable
            const IS_64_BIT = 1 << 2;
            /// Multi-core
            const MULTI_CORE = 1 << 3;
            /// Hardware Thread
            const HARDWARE_THREAD = 1 << 4;
            /// Execute Protection
            const EXECUTE_PROTECTION = 1 << 5;
            /// Enhanced Virtualization
            const VIRTUALIZATION = 1 << 6;
            /// Power/Performance Control
            const POWER_PERF_CONTROL = 1 << 7;
            /// 128-bit Capable
            const IS_128_BIT = 1 << 8;
            /// Arm64 SoC ID
            const ARM64_SOC_ID = 1 << 9;
        }
    }

    // Presumably generates serde impls going through the `u16` bits of the
    // flags -- see the macro definition to confirm.
    serialize_bitflags! {
        Characteristics => u16,
    }

    #[cfg(test)]
    mod test {
        use super::*;

        // Macro-generated serde round-trip tests for each enum.
        enum_serde_roundtrip_tests! {
            fn proc_status_serde_roundtrip(ProcStatus) {}
            fn proc_type_serde_roundtrip(ProcType) {}
        }

        // The listed values match no variant of the respective enum.
        // NOTE(review): presumably the generated tests check that
        // deserializing them is rejected -- confirm against the macro.
        enum_deserialize_tests! {
            fn proc_status_deserialize(ProcStatus, u8) { [0x9, 0xff, 0x7890] }
            fn proc_type_deserialize(ProcType, u8) { [0x9, 0xff, 0x7890] }
        }
    }
}

/// High-level description of an SMBIOS Type 16 structure.
///
/// The [`Table`] implementation copies these fields into a `bits::Type16`.
/// This structure carries no strings.
#[derive(Default)]
pub struct Type16 {
    pub location: type16::Location,
    pub array_use: type16::ArrayUse,
    pub error_correction: type16::ErrorCorrection,
    /// Maximum capacity; normally populated through
    /// [`Type16::set_max_capacity`], which stores `0x8000_0000` here when
    /// `extended_max_capacity` is in use.
    pub max_capacity: u32,
    pub error_info_handle: Handle,
    pub num_mem_devices: u16,
    /// Extended maximum capacity; see [`Type16::set_max_capacity`].
    pub extended_max_capacity: u64,
}
impl Type16 {
    /// Record the maximum capacity of the memory array, given in bytes.
    ///
    /// Per the Type 16 definition in the SMBIOS specification (DSP0134):
    /// - Capacities below 2 TB are stored in `max_capacity`, expressed in
    ///   kilobytes, and `extended_max_capacity` must be zero.
    /// - Larger capacities set `max_capacity` to the `0x8000_0000` marker
    ///   and store the capacity, expressed in **bytes**, in
    ///   `extended_max_capacity`.
    pub fn set_max_capacity(&mut self, capacity_bytes: usize) {
        if capacity_bytes >= (2 * TB) {
            self.max_capacity = 0x8000_0000;
            // The extended field is defined in bytes, not kilobytes.
            self.extended_max_capacity = capacity_bytes as u64;
        } else {
            // Below 2 TB the KiB count is < 0x8000_0000 and fits in a u32.
            self.max_capacity = (capacity_bytes / KB) as u32;
            // Clear any value left behind by an earlier, larger setting.
            self.extended_max_capacity = 0;
        }
    }
}
impl Table for Type16 {
    /// Render this structure to bytes; it has no extra data and no strings.
    fn render(&self, handle: Handle) -> Vec<u8> {
        // Start from a freshly initialized raw structure for this handle and
        // overwrite the fields described by `self`.
        let defaults = bits::Type16::new(handle.into());
        let raw = bits::Type16 {
            location: self.location as u8,
            array_use: self.array_use as u8,
            error_correction: self.error_correction as u8,
            max_capacity: self.max_capacity,
            error_info_handle: self.error_info_handle.into(),
            num_mem_devices: self.num_mem_devices,
            extended_max_capacity: self.extended_max_capacity,
            ..defaults
        };

        render_table(raw, None, None)
    }
}

/// Enumerations used by the fields of [`Type16`].
pub mod type16 {
    use super::*;
    /// Memory array location.
    ///
    /// See Table 72 in section 7.17.1 of DSP0134 for details.
    #[derive(
        Debug, Default, Copy, Clone, PartialEq, Eq, FromRepr, VariantArray,
    )]
    #[repr(u8)]
    pub enum Location {
        /// Other
        Other = 0x01,
        /// Unknown
        #[default]
        Unknown = 0x02,
        /// System board or motherboard
        SystemBoard = 0x03,
        /// ISA add-on card
        IsaCard = 0x04,
        /// EISA add-on card
        EisaCard = 0x05,
        /// PCI add-on card
        PciCard = 0x06,
        /// MCA add-on card
        McaCard = 0x07,
        /// PCMCIA add-on card
        PcmciaCard = 0x08,
        /// Proprietary add-on card
        ProprietaryCard = 0x09,
        /// NuBus
        NuBus = 0x0A,
        /// PC-98/C20 add-on card
        Pc98C20Card = 0xA0,
        /// PC-98/C24 add-on card
        Pc98C24Card = 0xA1,
        /// PC-98/E add-on card
        Pc98ECard = 0xA2,
        /// PC-98/Local bus add-on card
        Pc98LocalCard = 0xA3,
        /// CXL add-on card
        CxlCard = 0xA4,
    }
    /// Memory array use field.
    ///
    /// See Table 73 in section 7.17.2 of DSP0134 for details.
    #[derive(
        Debug, Default, Copy, Clone, PartialEq, Eq, FromRepr, VariantArray,
    )]
    #[repr(u8)]
    pub enum ArrayUse {
        /// Other
        Other = 0x1,
        /// Unknown
        #[default]
        Unknown = 0x2,
        /// System memory
        System = 0x3,
        /// Video memory
        Video = 0x4,
        /// Flash memory
        Flash = 0x5,
        /// Non-volatile RAM
        NonVolatile = 0x6,
        /// Cache memory
        Cache = 0x7,
    }

    /// Memory array error correction field.
    ///
    /// See Table 74 in section 7.17.3 of DSP0134 for details.
    #[derive(
        Debug, Default, Copy, Clone, PartialEq, Eq, FromRepr, VariantArray,
    )]
    #[repr(u8)]
    pub enum ErrorCorrection {
        /// Other
        Other = 0x1,
        /// Unknown
        #[default]
        Unknown = 0x2,
        /// No error correction.
        None = 0x3,
        /// Parity
        Parity = 0x4,
        /// Single-bit ECC
        SingleBitEcc = 0x5,
        /// Multi-bit ECC
        MultiBitEcc = 0x6,
        /// CRC
        Crc = 0x7,
    }

    // Presumably generates serde impls going through the `u8` representation
    // of each enum -- see the macro definition to confirm.
    serialize_enums! {
        Location => u8,
        ArrayUse => u8,
        ErrorCorrection => u8,
    }

    #[cfg(test)]
    mod test {
        use super::*;

        // Macro-generated serde round-trip tests for each enum.
        enum_serde_roundtrip_tests! {
            fn location_serde_roundtrip(Location) {}
            fn array_use_serde_roundtrip(ArrayUse) {}
            fn error_correction_serde_roundtrip(ErrorCorrection) {}
        }

        // The listed values match no variant of the respective enum.
        // NOTE(review): presumably the generated tests check that
        // deserializing them is rejected -- confirm against the macro.
        enum_deserialize_tests! {
            fn location_deserialize(Location, u8) { [0x11, 0xff, 0x7890] }
            fn array_use_deserialize(ArrayUse, u8) { [0x11, 0xff, 0x7890] }
            fn error_correction_deserialize(ErrorCorrection, u8) { [0x11, 0xff, 0x7890] }
        }
    }
}

/// High-level description of an SMBIOS Type 17 structure.
///
/// The [`Table`] implementation copies these fields into a `bits::Type17`.
/// [`SmbString`] fields are placed in the structure's string table and are
/// referenced from the formatted area by index.
#[derive(Default)]
pub struct Type17 {
    pub phys_mem_array_handle: Handle,
    pub mem_err_info_handle: Handle,
    pub total_width: u16,
    pub data_width: u16,
    /// Device size; normally populated through [`Type17::set_size`], which
    /// uses `0xffff` for "unknown" and `0x7fff` when `extended_size` holds
    /// the real value.
    pub size: u16,
    pub form_factor: u8,
    pub device_set: u8,
    pub device_locator: SmbString,
    pub bank_locator: SmbString,
    pub memory_type: u8,
    pub type_detail: u16,
    pub speed: u16,
    pub manufacturer: SmbString,
    pub serial_number: SmbString,
    pub asset_tag: SmbString,
    pub part_number: SmbString,
    pub attributes: u8,
    /// Extended device size; see [`Type17::set_size`].
    pub extended_size: u32,
    pub cfgd_mem_clock_speed: u16,
    pub min_voltage: u16,
    pub max_voltage: u16,
    pub cfgd_voltage: u16,
}
impl Type17 {
    /// Record the size of the memory device, given in bytes.
    ///
    /// - `None` marks the size as unknown (`size = 0xffff`).
    /// - Sizes below 32 GiB - 1 MiB are stored in `size` in MiB, and
    ///   `extended_size` is cleared.
    /// - Larger sizes set `size` to the `0x7fff` marker and store the size in
    ///   MiB in `extended_size`.
    pub fn set_size(&mut self, size_bytes: Option<usize>) {
        match size_bytes {
            None => {
                self.size = 0xffff;
                self.extended_size = 0;
            }
            // size < 32GiB - 1MiB does not need extended_size
            Some(n) if n < (32767 * MB) => {
                self.size = (n / MB) as u16;
                // Do not leave behind a value from an earlier, larger size.
                self.extended_size = 0;
            }
            Some(n) => {
                self.size = 0x7fff;
                self.extended_size = (n / MB) as u32;
            }
        }
    }
}
impl Table for Type17 {
    /// Render this structure (formatted area plus string table) to bytes.
    fn render(&self, handle: Handle) -> Vec<u8> {
        let mut strings = StringTable::new();

        // String indices are handed out in insertion order, so the strings
        // are registered in the same order the fields appear in the table.
        let device_locator = strings.add(&self.device_locator);
        let bank_locator = strings.add(&self.bank_locator);
        let manufacturer = strings.add(&self.manufacturer);
        let serial_number = strings.add(&self.serial_number);
        let asset_tag = strings.add(&self.asset_tag);
        let part_number = strings.add(&self.part_number);

        let raw = bits::Type17 {
            phys_mem_array_handle: self.phys_mem_array_handle.into(),
            mem_err_info_handle: self.mem_err_info_handle.into(),
            total_width: self.total_width,
            data_width: self.data_width,
            size: self.size,
            form_factor: self.form_factor,
            device_set: self.device_set,
            device_locator,
            bank_locator,
            memory_type: self.memory_type,
            type_detail: self.type_detail,
            speed: self.speed,
            manufacturer,
            serial_number,
            asset_tag,
            part_number,
            attributes: self.attributes,
            extended_size: self.extended_size,
            cfgd_mem_clock_speed: self.cfgd_mem_clock_speed,
            min_voltage: self.min_voltage,
            max_voltage: self.max_voltage,
            cfgd_voltage: self.cfgd_voltage,
            // Remaining fields (including the header) come from `new`.
            ..bits::Type17::new(handle.into())
        };

        render_table(raw, None, Some(strings))
    }
}

/// SMBIOS Type 32 structure; it has no configurable fields.
#[derive(Default)]
pub struct Type32();
impl Table for Type32 {
    /// Render the structure with a single boot-status byte as extra data.
    fn render(&self, handle: Handle) -> Vec<u8> {
        // Boot status code for "no errors detected"
        const NO_ERRORS_DETECTED: [u8; 1] = [0u8];

        let raw = bits::Type32::new(handle.into());
        render_table(raw, Some(&NO_ERRORS_DETECTED), None)
    }
}

/// SMBIOS Type 127 structure; it has no configurable fields.
#[derive(Default)]
pub struct Type127();
impl Table for Type127 {
    /// Render only the raw bytes of the structure: unlike the other tables,
    /// this does not go through `render_table`.
    fn render(&self, handle: Handle) -> Vec<u8> {
        Vec::from(bits::Type127::new(handle.into()).to_raw_bytes())
    }
}

/// Render all components of a SMBIOS table into raw bytes
///
/// The output is the raw structure bytes, followed by `extra_data`, followed
/// by either the rendered string table or (when there are no strings) the
/// bare table terminator.
///
/// # Arguments
/// - `raw_table`: [RawTable] instance representing the structure
/// - `extra_data`: Any data belonging in the formatted area of the structure
///   which is not already covered by its fields (variable length additions)
/// - `stab`: [StringTable] of any associated strings
///
/// # Panics
/// Panics if adding `extra_data` would push the structure length recorded in
/// the header past what fits in its single-byte length field.
fn render_table(
    mut raw_table: impl RawTable,
    extra_data: Option<&[u8]>,
    stab: Option<StringTable>,
) -> Vec<u8> {
    let extra_data = extra_data.unwrap_or(&[]);

    if !extra_data.is_empty() {
        let header = raw_table.header_mut();
        // Do the arithmetic in `usize` so that an oversized `extra_data`
        // cannot be silently truncated before the overflow check.
        let new_length = usize::from(header.length)
            .checked_add(extra_data.len())
            .filter(|total| *total <= usize::from(u8::MAX))
            .expect("extra data does not overflow length");
        header.length = new_length as u8;
    }
    let raw_data = raw_table.to_raw_bytes();

    // non-generic render, for when raw_table has been turned into bytes
    fn _render_table(
        raw_data: &[u8],
        extra_data: &[u8],
        stab: Option<StringTable>,
    ) -> Vec<u8> {
        // `None` here means either no string table was supplied or it had no
        // strings in it; both cases get the bare terminator.
        let stab_data = stab.and_then(|stab| stab.render());
        let trailer: &[u8] = match stab_data.as_deref() {
            Some(strings) => strings,
            None => &bits::TABLE_TERMINATOR,
        };

        let mut buf = Vec::with_capacity(
            raw_data.len() + extra_data.len() + trailer.len(),
        );
        buf.extend_from_slice(raw_data);
        buf.extend_from_slice(extra_data);
        buf.extend_from_slice(trailer);
        buf
    }

    _render_table(raw_data, extra_data, stab)
}

/// Collects the strings referenced by a single SMBIOS structure so they can
/// be emitted after its formatted area.
struct StringTable<'a> {
    /// Strings in the order they were added; index `n` (1-based) refers to
    /// `strings[n - 1]`.
    strings: Vec<&'a SmbString>,
    /// Total length of all strings, counting one NUL terminator for each.
    len_with_nulls: usize,
}
impl<'a> StringTable<'a> {
    /// Create an empty table.
    fn new() -> Self {
        StringTable { len_with_nulls: 0, strings: Vec::new() }
    }
    /// Add a [SmbString] to the [StringTable], returning the index to store
    /// in the structure field which refers to it.
    ///
    /// Empty strings are not stored; they are represented by index 0.
    fn add(&mut self, data: &'a SmbString) -> u8 {
        if data.is_empty() {
            return 0u8;
        }

        assert!(self.strings.len() < 254);
        self.strings.push(data);
        self.len_with_nulls += data.len() + 1;

        // Indices are 1-based, so the new length is the new string's index.
        self.strings.len() as u8
    }
    /// Render the collected strings as raw bytes, ready to be appended to the
    /// associated SMBIOS table.  Returns `None` if no strings were added to
    /// the table.
    fn render(self) -> Option<Vec<u8>> {
        if self.strings.is_empty() {
            return None;
        }

        let mut bytes = Vec::with_capacity(self.len_with_nulls + 1);
        for entry in self.strings {
            bytes.extend_from_slice(entry.as_ref());
            bytes.push(b'\0');
        }
        // table expected to end with double-NUL
        bytes.push(b'\0');
        Some(bytes)
    }
}


================================================
FILE: lib/propolis/src/hw/bhyve/atpic.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;

use crate::common::Lifecycle;
use crate::migrate::*;
use crate::vmm::VmmHdl;

/// Bhyve VMM-emulated PIC (Intel 8259A)
pub struct BhyveAtPic {
    /// Handle to the VMM through which the device state is read and written.
    hdl: Arc<VmmHdl>,
}
impl BhyveAtPic {
    /// Wrap the given VMM handle in a new, reference-counted device.
    pub fn create(hdl: Arc<VmmHdl>) -> Arc<Self> {
        let device = BhyveAtPic { hdl };
        Arc::new(device)
    }
}

impl Lifecycle for BhyveAtPic {
    /// Name identifying this device type.
    fn type_name(&self) -> &'static str {
        "lpc-bhyve-atpic"
    }
    /// The device migrates through its [`MigrateMulti`] implementation.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }
}
impl MigrateMulti for BhyveAtPic {
    /// Read the PIC state from the VMM and append it to the outgoing
    /// payloads.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let state = migrate::AtPicV1::read(&self.hdl)?;
        output.push(state.into())
    }

    /// Take the PIC payload from the offered payloads and write it into the
    /// VMM.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let state = offer.take::<migrate::AtPicV1>()?;
        state.write(&self.hdl)?;
        Ok(())
    }
}

/// Serializable migration payload for the bhyve-emulated PIC, plus the
/// conversions to and from the corresponding `bhyve_api` structures.
pub mod migrate {
    use crate::migrate::*;
    use crate::vmm;

    use serde::{Deserialize, Serialize};

    /// State of both PIC chips.
    #[derive(Copy, Clone, Default, Deserialize, Serialize)]
    pub struct AtPicV1 {
        pub chips: [AtPicChipV1; 2],
    }
    /// State of a single PIC chip; mirrors `bhyve_api::vdi_atpic_chip_v1`
    /// field for field.
    #[derive(Copy, Clone, Default, Deserialize, Serialize)]
    pub struct AtPicChipV1 {
        pub icw_state: u8,
        pub status: u8,
        pub reg_irr: u8,
        pub reg_isr: u8,
        pub reg_imr: u8,
        pub irq_base: u8,
        pub lowprio: u8,
        pub elc: u8,
        pub level: [u32; 8],
    }
    impl From<bhyve_api::vdi_atpic_v1> for AtPicV1 {
        fn from(raw: bhyve_api::vdi_atpic_v1) -> Self {
            AtPicV1 { chips: raw.va_chip.map(AtPicChipV1::from) }
        }
    }
    impl From<AtPicV1> for bhyve_api::vdi_atpic_v1 {
        fn from(state: AtPicV1) -> Self {
            bhyve_api::vdi_atpic_v1 {
                va_chip: state.chips.map(bhyve_api::vdi_atpic_chip_v1::from),
            }
        }
    }

    impl From<bhyve_api::vdi_atpic_chip_v1> for AtPicChipV1 {
        fn from(raw: bhyve_api::vdi_atpic_chip_v1) -> Self {
            AtPicChipV1 {
                icw_state: raw.vac_icw_state,
                status: raw.vac_status,
                reg_irr: raw.vac_reg_irr,
                reg_isr: raw.vac_reg_isr,
                reg_imr: raw.vac_reg_imr,
                irq_base: raw.vac_irq_base,
                lowprio: raw.vac_lowprio,
                elc: raw.vac_elc,
                level: raw.vac_level,
            }
        }
    }
    impl From<AtPicChipV1> for bhyve_api::vdi_atpic_chip_v1 {
        fn from(chip: AtPicChipV1) -> Self {
            let AtPicChipV1 {
                icw_state,
                status,
                reg_irr,
                reg_isr,
                reg_imr,
                irq_base,
                lowprio,
                elc,
                level,
            } = chip;

            bhyve_api::vdi_atpic_chip_v1 {
                vac_icw_state: icw_state,
                vac_status: status,
                vac_reg_irr: reg_irr,
                vac_reg_isr: reg_isr,
                vac_reg_imr: reg_imr,
                vac_irq_base: irq_base,
                vac_lowprio: lowprio,
                vac_elc: elc,
                vac_level: level,
            }
        }
    }

    impl AtPicV1 {
        /// Fetch the PIC state from the VMM via a `VDC_ATPIC` data
        /// operation.
        pub(super) fn read(hdl: &vmm::VmmHdl) -> std::io::Result<Self> {
            let raw = hdl
                .data_op(bhyve_api::VDC_ATPIC, 1)
                .read::<bhyve_api::vdi_atpic_v1>()?;

            Ok(Self::from(raw))
        }

        /// Load this PIC state into the VMM via a `VDC_ATPIC` data
        /// operation.
        pub(super) fn write(self, hdl: &vmm::VmmHdl) -> std::io::Result<()> {
            let raw = bhyve_api::vdi_atpic_v1::from(self);
            hdl.data_op(bhyve_api::VDC_ATPIC, 1)
                .write::<bhyve_api::vdi_atpic_v1>(&raw)?;

            Ok(())
        }
    }
    impl Schema<'_> for AtPicV1 {
        /// Identifier of this payload.
        fn id() -> SchemaId {
            ("bhyve-atpic", 1)
        }
    }
}


================================================
FILE: lib/propolis/src/hw/bhyve/atpit.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;

use crate::common::Lifecycle;
use crate::migrate::*;
use crate::vmm::VmmHdl;

/// Bhyve VMM-emulated PIT (Intel 8254)
pub struct BhyveAtPit {
    /// Handle to the VMM through which the device state is read and written.
    hdl: Arc<VmmHdl>,
}
impl BhyveAtPit {
    /// Wrap the given VMM handle in a new, reference-counted device.
    pub fn create(hdl: Arc<VmmHdl>) -> Arc<Self> {
        let device = BhyveAtPit { hdl };
        Arc::new(device)
    }
}

impl Lifecycle for BhyveAtPit {
    /// Name identifying this device type.
    fn type_name(&self) -> &'static str {
        "lpc-bhyve-atpit"
    }
    /// The device migrates through its [`MigrateMulti`] implementation.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }
}
impl MigrateMulti for BhyveAtPit {
    /// Read the PIT state from the VMM and append it to the outgoing
    /// payloads.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let state = migrate::AtPitV1::read(&self.hdl)?;
        output.push(state.into())
    }

    /// Take the PIT payload from the offered payloads and write it into the
    /// VMM.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let state = offer.take::<migrate::AtPitV1>()?;
        state.write(&self.hdl)?;
        Ok(())
    }
}

/// Serializable migration payload for the bhyve-emulated PIT, plus the
/// conversions to and from the corresponding `bhyve_api` structures.
pub mod migrate {
    use crate::migrate::*;
    use crate::vmm;

    use serde::{Deserialize, Serialize};

    /// State of the three PIT channels.
    #[derive(Copy, Clone, Default, Deserialize, Serialize)]
    pub struct AtPitV1 {
        pub channel: [AtPitChannelV1; 3],
    }
    impl From<bhyve_api::vdi_atpit_v1> for AtPitV1 {
        fn from(value: bhyve_api::vdi_atpit_v1) -> Self {
            Self { channel: value.va_channel.map(Into::into) }
        }
    }
    // Implemented as `From` (rather than a hand-written `Into`) so that both
    // `From` and `Into` are available to callers.
    impl From<AtPitV1> for bhyve_api::vdi_atpit_v1 {
        fn from(value: AtPitV1) -> Self {
            Self { va_channel: value.channel.map(Into::into) }
        }
    }

    /// State of a single PIT channel; mirrors
    /// `bhyve_api::vdi_atpit_channel_v1` field for field.
    #[derive(Copy, Clone, Default, Serialize, Deserialize)]
    pub struct AtPitChannelV1 {
        pub initial: u16,
        pub reg_cr: u16,
        pub reg_ol: u16,
        pub reg_status: u8,
        pub mode: u8,
        pub status: u8,
        pub time_target: i64,
    }
    impl From<bhyve_api::vdi_atpit_channel_v1> for AtPitChannelV1 {
        fn from(value: bhyve_api::vdi_atpit_channel_v1) -> Self {
            Self {
                initial: value.vac_initial,
                reg_cr: value.vac_reg_cr,
                reg_ol: value.vac_reg_ol,
                reg_status: value.vac_reg_status,
                mode: value.vac_mode,
                status: value.vac_status,
                time_target: value.vac_time_target,
            }
        }
    }
    impl From<AtPitChannelV1> for bhyve_api::vdi_atpit_channel_v1 {
        fn from(value: AtPitChannelV1) -> Self {
            Self {
                vac_initial: value.initial,
                vac_reg_cr: value.reg_cr,
                vac_reg_ol: value.reg_ol,
                vac_reg_status: value.reg_status,
                vac_mode: value.mode,
                vac_status: value.status,
                vac_time_target: value.time_target,
            }
        }
    }

    impl AtPitV1 {
        /// Fetch the PIT state from the VMM via a `VDC_ATPIT` data
        /// operation.
        pub(super) fn read(hdl: &vmm::VmmHdl) -> std::io::Result<Self> {
            let vdi = hdl
                .data_op(bhyve_api::VDC_ATPIT, 1)
                .read::<bhyve_api::vdi_atpit_v1>()?;

            Ok(vdi.into())
        }

        /// Load this PIT state into the VMM via a `VDC_ATPIT` data
        /// operation.
        pub(super) fn write(self, hdl: &vmm::VmmHdl) -> std::io::Result<()> {
            hdl.data_op(bhyve_api::VDC_ATPIT, 1)
                .write::<bhyve_api::vdi_atpit_v1>(&self.into())?;

            Ok(())
        }
    }
    impl Schema<'_> for AtPitV1 {
        /// Identifier of this payload.
        fn id() -> SchemaId {
            ("bhyve-atpit", 1)
        }
    }
}


================================================
FILE: lib/propolis/src/hw/bhyve/hpet.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;

use crate::common::Lifecycle;
use crate::migrate::*;
use crate::vmm::VmmHdl;

/// Bhyve VMM-emulated HPET
pub struct BhyveHpet {
    /// Handle to the VMM through which the device state is read and written.
    hdl: Arc<VmmHdl>,
}
impl BhyveHpet {
    /// Wrap the given VMM handle in a new, reference-counted device.
    pub fn create(hdl: Arc<VmmHdl>) -> Arc<Self> {
        let device = BhyveHpet { hdl };
        Arc::new(device)
    }
}

impl Lifecycle for BhyveHpet {
    /// Name identifying this device type.
    fn type_name(&self) -> &'static str {
        "lpc-bhyve-hpet"
    }
    /// The device migrates through its [`MigrateSingle`] implementation.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Single(self)
    }
}
impl MigrateSingle for BhyveHpet {
    /// Read the HPET state from the VMM and return it as the outgoing
    /// payload.
    fn export(
        &self,
        _ctx: &MigrateCtx,
    ) -> Result<PayloadOutput, MigrateStateError> {
        let state = migrate::HpetV1::read(&self.hdl)?;
        Ok(state.into())
    }

    /// Parse the offered payload as HPET state and write it into the VMM.
    fn import(
        &self,
        mut offer: PayloadOffer,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let state = offer.parse::<migrate::HpetV1>()?;
        state.write(&self.hdl)?;
        Ok(())
    }
}

/// Serializable migration payload for the bhyve-emulated HPET, plus the
/// conversions to and from the corresponding `bhyve_api` structures.
pub mod migrate {
    use crate::migrate::*;
    use crate::vmm;

    use serde::{Deserialize, Serialize};

    /// Device-wide HPET state together with the state of its eight timers.
    #[derive(Copy, Clone, Default, Serialize, Deserialize)]
    pub struct HpetV1 {
        pub config: u64,
        pub isr: u64,
        pub count_base: u32,
        pub time_base: i64,

        pub timers: [HpetTimerV1; 8],
    }
    impl From<bhyve_api::vdi_hpet_v1> for HpetV1 {
        fn from(raw: bhyve_api::vdi_hpet_v1) -> Self {
            HpetV1 {
                config: raw.vh_config,
                isr: raw.vh_isr,
                count_base: raw.vh_count_base,
                time_base: raw.vh_time_base,
                timers: raw.vh_timers.map(HpetTimerV1::from),
            }
        }
    }
    impl From<HpetV1> for bhyve_api::vdi_hpet_v1 {
        fn from(state: HpetV1) -> Self {
            let HpetV1 { config, isr, count_base, time_base, timers } = state;

            bhyve_api::vdi_hpet_v1 {
                vh_config: config,
                vh_isr: isr,
                vh_count_base: count_base,
                vh_time_base: time_base,
                vh_timers: timers.map(bhyve_api::vdi_hpet_timer_v1::from),
            }
        }
    }

    /// State of a single HPET timer; mirrors `bhyve_api::vdi_hpet_timer_v1`
    /// field for field.
    #[derive(Copy, Clone, Default, Serialize, Deserialize)]
    pub struct HpetTimerV1 {
        pub config: u64,
        pub msi: u64,
        pub comp_val: u32,
        pub comp_rate: u32,
        pub time_target: i64,
    }
    impl From<bhyve_api::vdi_hpet_timer_v1> for HpetTimerV1 {
        fn from(raw: bhyve_api::vdi_hpet_timer_v1) -> Self {
            HpetTimerV1 {
                config: raw.vht_config,
                msi: raw.vht_msi,
                comp_val: raw.vht_comp_val,
                comp_rate: raw.vht_comp_rate,
                time_target: raw.vht_time_target,
            }
        }
    }
    impl From<HpetTimerV1> for bhyve_api::vdi_hpet_timer_v1 {
        fn from(timer: HpetTimerV1) -> Self {
            let HpetTimerV1 { config, msi, comp_val, comp_rate, time_target } =
                timer;

            bhyve_api::vdi_hpet_timer_v1 {
                vht_config: config,
                vht_msi: msi,
                vht_comp_val: comp_val,
                vht_comp_rate: comp_rate,
                vht_time_target: time_target,
            }
        }
    }

    impl HpetV1 {
        /// Fetch the HPET state from the VMM via a `VDC_HPET` data
        /// operation.
        pub(super) fn read(hdl: &vmm::VmmHdl) -> std::io::Result<Self> {
            let raw = hdl
                .data_op(bhyve_api::VDC_HPET, 1)
                .read::<bhyve_api::vdi_hpet_v1>()?;

            Ok(Self::from(raw))
        }

        /// Load this HPET state into the VMM via a `VDC_HPET` data
        /// operation.
        pub(super) fn write(self, hdl: &vmm::VmmHdl) -> std::io::Result<()> {
            let raw = bhyve_api::vdi_hpet_v1::from(self);
            hdl.data_op(bhyve_api::VDC_HPET, 1)
                .write::<bhyve_api::vdi_hpet_v1>(&raw)?;

            Ok(())
        }
    }
    impl Schema<'_> for HpetV1 {
        /// Identifier of this payload.
        fn id() -> SchemaId {
            ("bhyve-hpet", 1)
        }
    }
}


================================================
FILE: lib/propolis/src/hw/bhyve/ioapic.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;

use crate::common::Lifecycle;
use crate::migrate::*;
use crate::vmm::VmmHdl;

/// Bhyve VMM-emulated IO-APIC (Intel 82093AA)
pub struct BhyveIoApic {
    /// Handle to the VMM through which the device state is read and written.
    hdl: Arc<VmmHdl>,
}
impl BhyveIoApic {
    /// Wrap the given VMM handle in a new, reference-counted device.
    pub fn create(hdl: Arc<VmmHdl>) -> Arc<Self> {
        let device = BhyveIoApic { hdl };
        Arc::new(device)
    }
}

impl Lifecycle for BhyveIoApic {
    /// Name identifying this device type.
    fn type_name(&self) -> &'static str {
        "lpc-bhyve-ioapic"
    }
    /// The device migrates through its [`MigrateMulti`] implementation.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }
}
impl MigrateMulti for BhyveIoApic {
    /// Read the IO-APIC state from the VMM and append it to the outgoing
    /// payloads.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let state = migrate::IoApicV1::read(&self.hdl)?;
        output.push(state.into())
    }

    /// Take the IO-APIC payload from the offered payloads and write it into
    /// the VMM.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let state = offer.take::<migrate::IoApicV1>()?;
        state.write(&self.hdl)?;
        Ok(())
    }
}

/// Serializable migration payload for the bhyve-emulated IO-APIC, plus the
/// conversions to and from the corresponding `bhyve_api` structure.
pub mod migrate {
    use crate::migrate::*;
    use crate::vmm;

    use serde::{Deserialize, Serialize};

    /// IO-APIC state; mirrors `bhyve_api::vdi_ioapic_v1`, with `registers`
    /// and `levels` corresponding to its `vi_pin_reg` and `vi_pin_level`
    /// arrays.
    #[derive(Copy, Clone, Default, Deserialize, Serialize)]
    pub struct IoApicV1 {
        pub id: u32,
        pub reg_sel: u32,
        pub registers: [u64; 32],
        pub levels: [u32; 32],
    }
    impl From<bhyve_api::vdi_ioapic_v1> for IoApicV1 {
        fn from(value: bhyve_api::vdi_ioapic_v1) -> Self {
            Self {
                id: value.vi_id,
                reg_sel: value.vi_reg_sel,
                registers: value.vi_pin_reg,
                levels: value.vi_pin_level,
            }
        }
    }
    // Implemented as `From` (rather than a hand-written `Into`) so that both
    // `From` and `Into` are available to callers.
    impl From<IoApicV1> for bhyve_api::vdi_ioapic_v1 {
        fn from(value: IoApicV1) -> Self {
            Self {
                vi_pin_reg: value.registers,
                vi_pin_level: value.levels,
                vi_id: value.id,
                vi_reg_sel: value.reg_sel,
            }
        }
    }

    impl IoApicV1 {
        /// Fetch the IO-APIC state from the VMM via a `VDC_IOAPIC` data
        /// operation.
        pub(super) fn read(hdl: &vmm::VmmHdl) -> std::io::Result<Self> {
            let vdi = hdl
                .data_op(bhyve_api::VDC_IOAPIC, 1)
                .read::<bhyve_api::vdi_ioapic_v1>()?;

            Ok(vdi.into())
        }

        /// Load this IO-APIC state into the VMM via a `VDC_IOAPIC` data
        /// operation.
        pub(super) fn write(self, hdl: &vmm::VmmHdl) -> std::io::Result<()> {
            hdl.data_op(bhyve_api::VDC_IOAPIC, 1)
                .write::<bhyve_api::vdi_ioapic_v1>(&self.into())?;

            Ok(())
        }
    }
    impl Schema<'_> for IoApicV1 {
        /// Identifier of this payload.
        fn id() -> SchemaId {
            ("bhyve-ioapic", 1)
        }
    }
}


================================================
FILE: lib/propolis/src/hw/bhyve/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Device types backed by bhyve VMM emulation.
//!
//! Each submodule is private; only its device type is re-exported here.

mod atpic;
mod atpit;
mod hpet;
mod ioapic;
mod pmtimer;
mod rtc;

pub use atpic::BhyveAtPic;
pub use atpit::BhyveAtPit;
pub use hpet::BhyveHpet;
pub use ioapic::BhyveIoApic;
pub use pmtimer::BhyvePmTimer;
pub use rtc::BhyveRtc;


================================================
FILE: lib/propolis/src/hw/bhyve/pmtimer.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::{Arc, Mutex};

use crate::common::Lifecycle;
use crate::migrate::*;
use crate::vmm::VmmHdl;

/// Bhyve VMM-emulated ACPI PM timer (Intel PIIX3/4-ish)
pub struct BhyvePmTimer {
    /// Handle to the VMM through which the timer is located and its state is
    /// read and written.
    hdl: Arc<VmmHdl>,
    /// IO port bookkeeping, guarded by a mutex.
    inner: Mutex<Inner>,
}
/// IO port configuration of the PM timer.
struct Inner {
    /// IO port at which the timer should be attached.
    ioport: u16,
    /// IO port for which the last `pmtmr_locate` call succeeded, or `None`
    /// if the attachment is unknown or has failed.
    attached_ioport: Option<u16>,
}
impl BhyvePmTimer {
    /// Create a PM timer device which is to be attached at `ioport`.
    ///
    /// No attachment is attempted here; `attached_ioport` starts as `None`.
    pub fn create(hdl: Arc<VmmHdl>, ioport: u16) -> Arc<Self> {
        let inner = Mutex::new(Inner { ioport, attached_ioport: None });
        Arc::new(Self { hdl, inner })
    }
    /// Make sure the timer is attached at the configured IO port, asking the
    /// VMM to (re)locate it unless it is already recorded as attached there.
    fn update_attachment(&self) {
        let mut state = self.inner.lock().unwrap();

        let wanted = state.ioport;
        if state.attached_ioport == Some(wanted) {
            // Attachment is already correct
            return;
        }

        state.attached_ioport = match self.hdl.pmtmr_locate(wanted) {
            Ok(()) => Some(wanted),
            // TODO: squawk about it?
            Err(_e) => None,
        };
    }
}

impl Lifecycle for BhyvePmTimer {
    fn type_name(&self) -> &'static str {
        "lpc-bhyve-pmtimer"
    }
    fn reset(&self) {
        // A reset of the instance may change the in-kernel attachment of the
        // PM timer IO port.
        //
        // Forget the recorded attachment so that the next call to
        // `update_attachment()` re-issues the in-kernel configuration of the
        // port instead of assuming the old one is still in place.
        let mut state = self.inner.lock().unwrap();
        state.attached_ioport = None;
    }
    fn resume(&self) {
        // `reset()` may have cleared the recorded attachment; re-establish
        // it if so.
        self.update_attachment();
    }
    fn start(&self) -> anyhow::Result<()> {
        // Attachment failures are not reported by `update_attachment()`, so
        // starting never fails here.
        self.update_attachment();
        Ok(())
    }
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }
}
impl MigrateMulti for BhyvePmTimer {
    /// Read the timer state through the VMM handle and add it to `output`.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let state = migrate::PmTimerV1::read(&self.hdl)?;
        output.push(state.into())
    }

    /// Take the `PmTimerV1` payload from `offer` and write it through the
    /// VMM handle.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        offer.take::<migrate::PmTimerV1>()?.write(&self.hdl)?;
        Ok(())
    }
}

pub mod migrate {
    use crate::migrate::*;
    use crate::vmm;

    use serde::{Deserialize, Serialize};

    /// Version 1 of the migration payload for the bhyve PM timer.
    #[derive(Default, Deserialize, Serialize)]
    pub struct PmTimerV1 {
        /// Value of the `vpt_time_base` field of the in-kernel timer data.
        pub start_time: i64,
    }
    impl PmTimerV1 {
        /// Read the version 1 PM timer data through `hdl`.
        pub(super) fn read(hdl: &vmm::VmmHdl) -> std::io::Result<Self> {
            // vdi_pm_timer_v1 also carries the ioport to which the pmtimer
            // is attached, but migration of that state is handled by the
            // chipset PM device, so only the time base is kept here.
            let bhyve_api::vdi_pm_timer_v1 { vpt_time_base, .. } = hdl
                .data_op(bhyve_api::VDC_PM_TIMER, 1)
                .read::<bhyve_api::vdi_pm_timer_v1>()?;

            Ok(Self { start_time: vpt_time_base })
        }

        /// Write this payload back as version 1 PM timer data through `hdl`.
        pub(super) fn write(self, hdl: &vmm::VmmHdl) -> std::io::Result<()> {
            let Self { start_time } = self;
            let payload = bhyve_api::vdi_pm_timer_v1 {
                // The IO-port field is ignored for writes
                vpt_ioport: 0,
                vpt_time_base: start_time,
            };
            hdl.data_op(bhyve_api::VDC_PM_TIMER, 1).write(&payload)?;
            Ok(())
        }
    }
    impl Schema<'_> for PmTimerV1 {
        fn id() -> SchemaId {
            ("bhyve-pmtimer", 1)
        }
    }
}


================================================
FILE: lib/propolis/src/hw/bhyve/rtc.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::io;
use std::sync::Arc;
use std::time::Duration;

use crate::common::Lifecycle;
use crate::migrate::*;
use crate::vmm::VmmHdl;

/// Bhyve VMM-emulated RTC (MC146818 or similar)
pub struct BhyveRtc {
    /// VMM handle through which the RTC time, NVRAM bytes and migration
    /// state are accessed.
    hdl: Arc<VmmHdl>,
}
impl BhyveRtc {
    /// Create an RTC device which operates through `hdl`.
    pub fn create(hdl: Arc<VmmHdl>) -> Arc<Self> {
        Arc::new(Self { hdl })
    }

    /// Sets the time of the RTC within the virtual machine represented by
    /// `hdl` to `time`, accurate to the second.
    ///
    /// `time` is handed unmodified to `VmmHdl::rtc_settime`.
    // NOTE(review): presumably `time` is an offset from the UNIX epoch (so
    // that callers can pass the current system clock) -- confirm against the
    // VMM interface.
    pub fn set_time(&self, time: Duration) -> io::Result<()> {
        self.hdl.rtc_settime(time)
    }

    /// Write `bytes` into consecutive NVRAM (CMOS) locations starting at
    /// `offset`, stopping at the first failed write.
    fn nvram_write(&self, offset: u8, bytes: &[u8]) -> io::Result<()> {
        for (off, byte) in (offset..).zip(bytes) {
            self.hdl.rtc_write(off, *byte)?;
        }
        Ok(())
    }

    /// Store memory size information within the NVRAM area of the RTC device.
    ///
    /// This provides a mechanism for transferring this sizing information
    /// to the host device software.
    /// - `low_mem_bytes`: Memory below 32-bit boundary, must be != 0
    /// - `high_mem_bytes`: Memory above 32-bit boundary
    ///
    /// Size(s) must be aligned to 4KiB.
    ///
    /// Every field saturates at the largest value its CMOS location can hold
    /// rather than wrapping.
    ///
    /// # Panics
    ///
    /// Panics if `low_mem_bytes` is zero or if either size is not 4KiB
    /// aligned.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by an NVRAM write; earlier writes are
    /// not rolled back.
    pub fn memsize_to_nvram(
        &self,
        low_mem_bytes: u32,
        high_mem_bytes: u64,
    ) -> io::Result<()> {
        assert_ne!(low_mem_bytes, 0, "low-mem must not be zero");
        assert_eq!(low_mem_bytes & 0xfff, 0, "low-mem must be 4KiB aligned");
        assert_eq!(high_mem_bytes & 0xfff, 0, "high-mem must be 4KiB aligned");

        // We mimic the CMOS layout of qemu (expected by OVMF) when it comes to
        // communicating the sizing of instance memory:
        //
        // - 0x15-0x16: Base memory in KiB (0-1MiB, less 384KiB BDA)
        // - 0x17-0x18: Extended memory in KiB (1MiB-64MiB)
        // - 0x30-0x31: Extended memory (duplicate)
        // - 0x34-0x35: Low-mem, less 16MiB, in 64KiB units
        // - 0x5b-0x5d: High-mem in 64KiB units

        const CMOS_OFF_MEM_BASE: u8 = 0x15;
        const CMOS_OFF_MEM_EXT: u8 = 0x17;
        const CMOS_OFF_MEM_EXT_DUP: u8 = 0x30;
        const CMOS_OFF_MEM_LOW: u8 = 0x34;
        const CMOS_OFF_MEM_HIGH: u8 = 0x5b;

        const KIB: u64 = 1024;
        const MIB: u64 = 1024 * 1024;
        const CHUNK: u64 = 64 * KIB;

        // Largest values representable in the 16-bit and 24-bit CMOS fields.
        const MAX_U16_FIELD: u64 = 0xffff;
        const MAX_U24_FIELD: u64 = 0x00ff_ffff;

        // All arithmetic is done in u64 so that neither size is truncated
        // before it has been clamped, regardless of the width of `usize`.
        let low_mem = u64::from(low_mem_bytes);
        let high_mem = high_mem_bytes;

        // First 1MiB, less 384KiB.  Clamp *before* narrowing: narrowing first
        // would turn any multiple of 64MiB into zero.
        let base = (low_mem / KIB).min(640) as u16;
        self.nvram_write(CMOS_OFF_MEM_BASE, &base.to_le_bytes())?;

        // Next 64MiB, saturating when there is more low memory than the field
        // can express.
        if low_mem > MIB {
            let ext = ((low_mem - MIB) / KIB).min(MAX_U16_FIELD) as u16;
            let ext = ext.to_le_bytes();

            self.nvram_write(CMOS_OFF_MEM_EXT, &ext)?;

            // ... and in the duplicate location
            self.nvram_write(CMOS_OFF_MEM_EXT_DUP, &ext)?;
        }

        // Low-mem, less 16MiB
        if low_mem > 16 * MIB {
            let low =
                ((low_mem - 16 * MIB) / CHUNK).min(MAX_U16_FIELD) as u16;

            self.nvram_write(CMOS_OFF_MEM_LOW, &low.to_le_bytes())?;
        }

        // High-mem
        if high_mem > 0 {
            // If high_mem is 1TiB or larger, the division below produces a
            // number that overflows the 24 bits available at
            // `CMOS_OFF_MEM_HIGH`. Clamp the value to those 24 bits so guests
            // aren't subjected to arbitrary wrapping. OVMF is told about the
            // highmem layout via E820 table anyway, so the only thing that
            // might care about these bytes are guest OSes that check the RTC
            // CMOS bytes directly.
            let chunks = (high_mem / CHUNK).min(MAX_U24_FIELD) as u32;
            let high = chunks.to_le_bytes();

            // Only the low three bytes have a home in NVRAM.
            self.nvram_write(CMOS_OFF_MEM_HIGH, &high[..3])?;
        }

        Ok(())
    }
}

// Lifecycle hooks for the RTC: only the type name and migration support are
// provided here; no other hooks are overridden.
impl Lifecycle for BhyveRtc {
    fn type_name(&self) -> &'static str {
        "lpc-bhyve-rtc"
    }
    // State is migrated via the `MigrateMulti` implementation on this type.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }
}
impl MigrateMulti for BhyveRtc {
    /// Read the RTC state through the VMM handle and add it to `output`.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let state = migrate::BhyveRtcV2::read(&self.hdl)?;
        output.push(state.into())
    }

    /// Take the `BhyveRtcV2` payload from `offer` and write it through the
    /// VMM handle.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let state: migrate::BhyveRtcV2 = offer.take()?;
        state.write(&self.hdl)?;
        Ok(())
    }
}

pub mod migrate {
    use crate::migrate::*;
    use crate::vmm;

    use serde::{Deserialize, Serialize};

    /// Version 2 of the migration payload for the bhyve RTC.
    ///
    /// Each field mirrors the like-named `vr_*` field of
    /// `bhyve_api::vdi_rtc_v2`.
    #[derive(Deserialize, Serialize)]
    pub struct BhyveRtcV2 {
        pub base_clock: i64,
        pub last_period: i64,
        /// Contents of the RTC's 128 bytes of CMOS/NVRAM (`vr_content`).
        #[serde(with = "serde_arrays")]
        pub cmos: [u8; 128],
        pub addr: u8,
    }
    impl From<bhyve_api::vdi_rtc_v2> for BhyveRtcV2 {
        fn from(value: bhyve_api::vdi_rtc_v2) -> Self {
            Self {
                base_clock: value.vr_base_clock,
                last_period: value.vr_last_period,
                cmos: value.vr_content,
                addr: value.vr_addr,
            }
        }
    }
    // Implemented as `From` rather than a hand-written `Into`: the standard
    // library's blanket impl then supplies `Into<vdi_rtc_v2>` for
    // `BhyveRtcV2`, so existing `.into()` call sites are unaffected.
    impl From<BhyveRtcV2> for bhyve_api::vdi_rtc_v2 {
        fn from(value: BhyveRtcV2) -> Self {
            Self {
                vr_base_clock: value.base_clock,
                vr_last_period: value.last_period,
                vr_content: value.cmos,
                vr_addr: value.addr,
            }
        }
    }

    impl BhyveRtcV2 {
        /// Read the version 2 RTC data through `hdl`.
        pub(super) fn read(hdl: &vmm::VmmHdl) -> std::io::Result<Self> {
            let vdi = hdl
                .data_op(bhyve_api::VDC_RTC, 2)
                .read::<bhyve_api::vdi_rtc_v2>()?;

            Ok(vdi.into())
        }

        /// Write this payload back as version 2 RTC data through `hdl`.
        pub(super) fn write(self, hdl: &vmm::VmmHdl) -> std::io::Result<()> {
            hdl.data_op(bhyve_api::VDC_RTC, 2)
                .write::<bhyve_api::vdi_rtc_v2>(&self.into())?;

            Ok(())
        }
    }
    impl Schema<'_> for BhyveRtcV2 {
        fn id() -> SchemaId {
            ("bhyve-rtc", 2)
        }
    }
}


================================================
FILE: lib/propolis/src/hw/chipset/i440fx.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::atomic::{AtomicU8, Ordering};
use std::sync::{Arc, Mutex};

use crate::common::*;
use crate::hw::bhyve::{
    BhyveAtPic, BhyveAtPit, BhyveIoApic, BhyvePmTimer, BhyveRtc,
};
use crate::hw::chipset::Chipset;
use crate::hw::ibmpc;
use crate::hw::ids::pci::{
    PIIX3_ISA_DEV_ID, PIIX3_ISA_SUB_DEV_ID, PIIX4_HB_DEV_ID,
    PIIX4_HB_SUB_DEV_ID, PIIX4_PM_DEV_ID, PIIX4_PM_SUB_DEV_ID, VENDOR_INTEL,
    VENDOR_OXIDE,
};
use crate::hw::pci::topology::{LogicalBusId, RoutedBusId};
use crate::hw::pci::{
    self, Bdf, INTxPinID, LintrCfg, PcieCfgDecoder, PioCfgDecoder,
};
use crate::intr_pins::{IntrPin, LegacyPIC, LegacyPin, NoOpPin};
use crate::lifecycle;
use crate::migrate::*;
use crate::mmio::MmioFn;
use crate::pio::{PioBus, PioFn};
use crate::util::regmap::RegMap;
use crate::vmm::{Machine, VmmHdl};

use lazy_static::lazy_static;

// Base address and length of the MMIO region registered by the host bridge
// for PCIe ECAM accesses (only when PCIe is enabled).
const ADDR_PCIE_ECAM_REGION: usize = 0xe000_0000;
const LEN_PCI_ECAM_REGION: usize = 0x1000_0000;

// Default PCI locations of the chipset devices: host bridge (HB), LPC bridge
// and power-management (PM) function.
// NOTE(review): arguments are presumably (bus, device, function) -- confirm
// against `Bdf::new_unchecked`.
pub const DEFAULT_HB_BDF: Bdf = Bdf::new_unchecked(0, 0, 0);
pub const DEFAULT_LPC_BDF: Bdf = Bdf::new_unchecked(0, 1, 0);
pub const DEFAULT_PM_BDF: Bdf = Bdf::new_unchecked(0, 1, 3);

/// Interrupt pin whose backing legacy pin can be swapped out at runtime.
///
/// The asserted state is tracked locally so that it can be replayed onto a
/// newly assigned backing pin (see `reassign`).
struct LNKPin {
    inner: Mutex<LNKPinInner>,
}
/// Lock-protected state of a [`LNKPin`].
struct LNKPinInner {
    /// Level most recently set via `assert`/`deassert`/`import_state`.
    asserted: bool,
    /// Backing pin which level changes are forwarded to, if one is assigned.
    pin: Option<LegacyPin>,
}
impl LNKPin {
    /// Create a deasserted pin with no backing pin assigned.
    fn new() -> Self {
        let state = LNKPinInner { asserted: false, pin: None };
        Self { inner: Mutex::new(state) }
    }

    /// Replace the backing pin with `new_pin`.
    ///
    /// If this pin is currently asserted, the outgoing backing pin (if any)
    /// is deasserted first and the incoming one (if any) is then asserted, so
    /// that the asserted level follows the assignment.
    fn reassign(&self, new_pin: Option<LegacyPin>) {
        let mut state = self.inner.lock().unwrap();

        if state.asserted {
            if let Some(outgoing) = &state.pin {
                outgoing.deassert();
            }
            if let Some(incoming) = &new_pin {
                incoming.assert();
            }
        }

        state.pin = new_pin;
    }
}
impl IntrPin for LNKPin {
    /// Record the pin as asserted and forward to the backing pin, if any.
    fn assert(&self) {
        let mut state = self.inner.lock().unwrap();
        state.asserted = true;
        if let Some(backing) = &state.pin {
            backing.assert();
        }
    }
    /// Record the pin as deasserted and forward to the backing pin, if any.
    fn deassert(&self) {
        let mut state = self.inner.lock().unwrap();
        state.asserted = false;
        if let Some(backing) = &state.pin {
            backing.deassert();
        }
    }
    /// Forward a pulse to the backing pin, if any.  The recorded level is
    /// left untouched.
    fn pulse(&self) {
        let state = self.inner.lock().unwrap();
        if let Some(backing) = &state.pin {
            backing.pulse();
        }
    }
    /// Report the locally recorded level.
    fn is_asserted(&self) -> bool {
        self.inner.lock().unwrap().asserted
    }
    /// Overwrite the recorded level with `is_asserted` and pass the same
    /// value on to the backing pin's `import_state`, if any.
    fn import_state(&self, is_asserted: bool) {
        let mut state = self.inner.lock().unwrap();
        state.asserted = is_asserted;
        if let Some(backing) = &state.pin {
            backing.import_state(is_asserted);
        }
    }
}

/// Legacy interrupt plumbing for the chipset: the legacy PIC plus the
/// re-routable pins which are steered onto its IRQ lines.
struct IrqConfig {
    /// Source of pin handles for individual IRQ lines.
    pic: Arc<LegacyPIC>,

    /// The four link pins whose IRQ routes are set via `set_lnk_route`.
    lnk_pins: [Arc<LNKPin>; 4],

    #[allow(unused)]
    // XXX: wire up SCI notifications
    /// Pin assigned to `SCI_IRQ` at creation.
    sci_pin: Arc<LNKPin>,
}
impl IrqConfig {
    /// Build the interrupt configuration: a legacy PIC on `hdl`, an SCI pin
    /// assigned to `SCI_IRQ`, and four link pins with no route.
    fn create(hdl: Arc<VmmHdl>) -> Arc<Self> {
        let pic = LegacyPIC::new(hdl);

        let sci_pin = Arc::new(LNKPin::new());
        sci_pin.reassign(pic.pin_handle(SCI_IRQ));

        let lnk_pins = std::array::from_fn(|_| Arc::new(LNKPin::new()));

        Arc::new(Self { pic, lnk_pins, sci_pin })
    }

    /// Route link pin `idx` (0-3) to `irq`, or remove its route when `irq`
    /// is `None` or the PIC yields no pin handle for it.
    ///
    /// Panics if `idx` is greater than 3.
    fn set_lnk_route(&self, idx: usize, irq: Option<u8>) {
        assert!(idx <= 3);
        let target = irq.and_then(|line| self.pic.pin_handle(line));
        self.lnk_pins[idx].reassign(target);
    }

    /// Shared handle to link pin `idx` (0-3) as a generic interrupt pin.
    ///
    /// Panics if `idx` is greater than 3.
    fn intr_pin(&self, idx: usize) -> Arc<dyn IntrPin> {
        assert!(idx <= 3);
        let pin = &self.lnk_pins[idx];
        Arc::clone(pin) as Arc<dyn IntrPin>
    }
}

// Location and size of the PIR (interrupt route) registers within the custom
// config space region of the LPC device.
const PIR_OFFSET: usize = 0x60;
const PIR_LEN: usize = 4;
const PIR_END: usize = PIR_OFFSET + PIR_LEN;

// Fields of a single PIR register: when the disable bit is set the link has
// no route; otherwise the low nibble selects the IRQ.
const PIR_MASK_DISABLE: u8 = 0x80;
const PIR_MASK_IRQ: u8 = 0x0f;

// IRQ line to which the SCI pin is assigned.
const SCI_IRQ: u8 = 0x9;

/// Whether `irq` is a line to which a PIR register may route a link pin.
fn valid_pir_irq(irq: u8) -> bool {
    // Existing ACPI tables allow 3-7, 9-12, 14-15
    const ROUTABLE: [u8; 11] = [3, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15];
    ROUTABLE.contains(&irq)
}

/// Options for creating an [`I440FxHostBridge`].
#[derive(Default)]
pub struct Opts {
    /// Create a PCIe config decoder, which causes the ECAM MMIO region to be
    /// registered on attach.
    pub enable_pcie: bool,
    /// Pin returned by `Chipset::power_pin`; a no-op pin is used if absent.
    pub power_pin: Option<Arc<dyn IntrPin>>,
    /// Pin returned by `Chipset::reset_pin`; a no-op pin is used if absent.
    pub reset_pin: Option<Arc<dyn IntrPin>>,
}

/// Host bridge device of the emulated i440FX chipset.
///
/// Decodes PCI configuration accesses (port IO and, optionally, PCIe ECAM)
/// and forwards them to the PCI topology.
pub struct I440FxHostBridge {
    pci_state: pci::DeviceState,
    /// Driven from the `Lifecycle` start/pause/resume/halt hooks.
    indicator: lifecycle::Indicator,

    /// Topology which config accesses and device attachments are sent to.
    pci_topology: Arc<pci::topology::Topology>,
    /// Decoder for the port-IO config address/data mechanism.
    pci_cfg: PioCfgDecoder,
    /// Decoder for ECAM accesses; present only when PCIe was enabled.
    pcie_cfg: Option<PcieCfgDecoder>,

    pin_power: Arc<dyn IntrPin>,
    pin_reset: Arc<dyn IntrPin>,
}
impl I440FxHostBridge {
    pub fn create(
        pci_topology: Arc<pci::topology::Topology>,
        opts: Opts,
    ) -> Arc<Self> {
        let pci_state = pci::Builder::new(pci::Ident {
            vendor_id: VENDOR_INTEL,
            device_id: PIIX4_HB_DEV_ID,
            sub_vendor_id: VENDOR_OXIDE,
            sub_device_id: PIIX4_HB_SUB_DEV_ID,
            device_class: pci::bits::CLASS_BRIDGE,
            device_subclass: pci::bits::SUBCLASS_BRIDGE_HOST,
            ..Default::default()
        })
        .finish();

        let pin_power = opts.power_pin.unwrap_or_else(|| Arc::new(NoOpPin {}));
        let pin_reset = opts.reset_pin.unwrap_or_else(|| Arc::new(NoOpPin {}));

        let pci_cfg = PioCfgDecoder::new();
        let pcie_cfg = opts.enable_pcie.then(|| {
            PcieCfgDecoder::new(pci::bits::PCIE_MAX_BUSES_PER_ECAM_REGION)
        });

        Arc::new(Self {
            pci_state,
            indicator: Default::default(),

            pci_topology,
            pci_cfg,
            pcie_cfg,

            pin_power,
            pin_reset,
        })
    }

    pub fn attach(self: &Arc<Self>, machine: &Machine) {
        let pio = &machine.bus_pio;
        let pio_dev = Arc::clone(self);
        let piofn =
            Arc::new(move |port: u16, rwo: RWOp| pio_dev.pio_rw(port, rwo))
                as Arc<PioFn>;
        pio.register(
            pci::bits::PORT_PCI_CONFIG_ADDR,
            pci::bits::LEN_PCI_CONFIG_ADDR,
            Arc::clone(&piofn),
        )
        .unwrap();
        pio.register(
            pci::bits::PORT_PCI_CONFIG_DATA,
            pci::bits::LEN_PCI_CONFIG_DATA,
            piofn,
        )
        .unwrap();

        if self.pcie_cfg.is_some() {
            let mmio = &machine.bus_mmio;
            let mmio_dev = Arc::clone(self);
            let mmio_ecam_fn = Arc::new(move |_addr: usize, rwo: RWOp| {
                mmio_dev.pcie_ecam_rw(rwo);
            }) as Arc<MmioFn>;
            mmio.register(
                ADDR_PCIE_ECAM_REGION,
                LEN_PCI_ECAM_REGION,
                mmio_ecam_fn,
            )
            .unwrap();
        }
    }

    fn pci_cfg_rw(&self, bdf: &Bdf, rwo: RWOp) -> Option<()> {
        self.pci_topology.pci_cfg_rw(
            RoutedBusId(bdf.bus.get()),
            bdf.location,
            rwo,
        )
    }

    fn pio_rw(&self, port: u16, rwo: RWOp) {
        match port {
            pci::bits::PORT_PCI_CONFIG_ADDR => {
                self.pci_cfg.service_addr(rwo);
            }
            pci::bits::PORT_PCI_CONFIG_DATA => self
                .pci_cfg
                .service_data(rwo, |bdf, rwo| self.pci_cfg_rw(bdf, rwo)),
            _ => {
                panic!();
            }
        }
    }

    fn pcie_ecam_rw(&self, rwo: RWOp) {
        let pcie_cfg = self
            .pcie_cfg
            .as_ref()
            .expect("PCIe cfg decoder present when ECAM is enabled");

        pcie_cfg.service(rwo, |bdf, rwo| self.pci_cfg_rw(bdf, rwo));
    }
}
// The host bridge only supplies its PCI device state; no other
// `pci::Device` methods are overridden.
impl pci::Device for I440FxHostBridge {
    fn device_state(&self) -> &pci::DeviceState {
        &self.pci_state
    }
}
// Lifecycle hooks for the host bridge.  Apart from `reset` and `migrate`,
// each hook only forwards the transition to the lifecycle indicator.
impl Lifecycle for I440FxHostBridge {
    fn type_name(&self) -> &'static str {
        "pci-i440fx-hb"
    }
    // Only the PCI device state is reset.
    fn reset(&self) {
        self.pci_state.reset(self);
    }
    fn start(&self) -> anyhow::Result<()> {
        self.indicator.start();
        Ok(())
    }
    fn pause(&self) {
        self.indicator.pause();
    }
    fn resume(&self) {
        self.indicator.resume();
    }
    fn halt(&self) {
        self.indicator.halt();
    }
    // State is migrated via the `MigrateMulti` implementation on this type.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }
}
impl Chipset for I440FxHostBridge {
    /// Attach `dev` to the PCI topology at `bdf`, with the optional legacy
    /// interrupt configuration `lintr_cfg`.
    ///
    /// Panics if the topology rejects the attachment.
    fn pci_attach(
        &self,
        bdf: Bdf,
        dev: Arc<dyn pci::Endpoint>,
        lintr_cfg: Option<LintrCfg>,
    ) {
        let bus = LogicalBusId(bdf.bus.get());
        let outcome =
            self.pci_topology.pci_attach(bus, bdf.location, dev, lintr_cfg);
        outcome.unwrap();
    }
    /// Shared handle to the power pin chosen at creation.
    fn power_pin(&self) -> Arc<dyn IntrPin> {
        Arc::clone(&self.pin_power)
    }
    /// Shared handle to the reset pin chosen at creation.
    fn reset_pin(&self) -> Arc<dyn IntrPin> {
        Arc::clone(&self.pin_reset)
    }
}
impl MigrateMulti for I440FxHostBridge {
    /// Export the PCI device state followed by the latched config address of
    /// the port-IO decoder.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        MigrateMulti::export(&self.pci_state, output, ctx)?;

        let bridge_state =
            migrate::I440FxHostBridgeV1 { pci_cfg_addr: self.pci_cfg.addr() };
        output.push(bridge_state.into())
    }

    /// Import the PCI device state, then restore the config address of the
    /// port-IO decoder from the `I440FxHostBridgeV1` payload.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        MigrateMulti::import(&self.pci_state, offer, ctx)?;

        let migrate::I440FxHostBridgeV1 { pci_cfg_addr } = offer.take()?;
        self.pci_cfg.set_addr(pci_cfg_addr);
        Ok(())
    }
}

/// LPC (PCI-ISA bridge) function of the emulated chipset.
///
/// Owns the bhyve-emulated legacy devices (PIC, PIT, IO-APIC, RTC), the PIR
/// interrupt-route registers, and the fast-A20 and POST-code ports.
pub struct Piix3Lpc {
    pci_state: pci::DeviceState,

    pub pic: Arc<BhyveAtPic>,
    pub pit: Arc<BhyveAtPit>,
    pub ioapic: Arc<BhyveIoApic>,
    pub rtc: Arc<BhyveRtc>,

    /// Raw contents of the PIR registers as last written.
    reg_pir: Mutex<[u8; PIR_LEN]>,
    /// Link pins and PIC which the PIR registers steer.
    irq_config: Arc<IrqConfig>,
    /// Last byte written to the POST-code port.
    post_code: AtomicU8,
}
impl Piix3Lpc {
    pub fn create(hdl: Arc<VmmHdl>) -> Arc<Self> {
        let pci_state = pci::Builder::new(pci::Ident {
            vendor_id: VENDOR_INTEL,
            device_id: PIIX3_ISA_DEV_ID,
            sub_vendor_id: VENDOR_OXIDE,
            sub_device_id: PIIX3_ISA_SUB_DEV_ID,
            device_class: pci::bits::CLASS_BRIDGE,
            device_subclass: pci::bits::SUBCLASS_BRIDGE_ISA,
            ..Default::default()
        })
        .add_custom_cfg(PIR_OFFSET as u8, PIR_LEN as u8)
        .finish();

        let irq_config = IrqConfig::create(hdl.clone());

        Arc::new(Self {
            pci_state,

            pic: BhyveAtPic::create(hdl.clone()),
            pit: BhyveAtPit::create(hdl.clone()),
            ioapic: BhyveIoApic::create(hdl.clone()),
            rtc: BhyveRtc::create(hdl.clone()),

            reg_pir: Mutex::new([0u8; PIR_LEN]),
            post_code: AtomicU8::new(0),
            irq_config,
        })
    }

    pub fn attach(self: &Arc<Self>, pio: &PioBus) {
        let this = Arc::clone(self);
        let piofn = Arc::new(move |port: u16, rwo: RWOp| this.pio_rw(port, rwo))
            as Arc<PioFn>;
        pio.register(
            ibmpc::PORT_FAST_A20,
            ibmpc::LEN_FAST_A20,
            Arc::clone(&piofn),
        )
        .unwrap();
        pio.register(ibmpc::PORT_POST_CODE, ibmpc::LEN_POST_CODE, piofn)
            .unwrap();
    }

    fn pio_rw(&self, port: u16, rwo: RWOp) {
        match port {
            ibmpc::PORT_FAST_A20 => {
                match rwo {
                    RWOp::Read(ro) => {
                        // A20 is always enabled
                        ro.write_u8(0x02);
                    }
                    RWOp::Write(wo) => {
                        let _ = wo.read_u8();
                        // TODO: handle FAST_INIT request
                    }
                }
            }
            ibmpc::PORT_POST_CODE => match rwo {
                RWOp::Read(ro) => {
                    ro.write_u8(self.post_code.load(Ordering::SeqCst));
                }
                RWOp::Write(wo) => {
                    self.post_code.store(wo.read_u8(), Ordering::SeqCst);
                }
            },
            _ => {}
        }
    }

    fn write_pir(&self, idx: usize, val: u8) {
        assert!(idx < PIR_LEN);

        let mut regs = self.reg_pir.lock().unwrap();
        if regs[idx] != val {
            let disabled = (val & PIR_MASK_DISABLE) != 0;
            let irq = val & PIR_MASK_IRQ;

            // XXX better integrate with PCI interrupt routing
            if !disabled && valid_pir_irq(irq) {
                self.irq_config.set_lnk_route(idx, Some(irq));
            } else {
                self.irq_config.set_lnk_route(idx, None);
            }
            regs[idx] = val;
        }
    }

    pub fn route_lintr(&self, bdf: Bdf) -> Option<LintrCfg> {
        if bdf.bus.get() != 0 {
            return None;
        }
        let intx_pin = match (bdf.location.func.get() + 1) % 4 {
            0 => INTxPinID::IntA,
            1 => INTxPinID::IntB,
            2 => INTxPinID::IntC,
            3 => INTxPinID::IntD,
            _ => unreachable!(),
        };
        // D->A->B->C starting at 0:0.0
        let pin_route = (bdf.location.dev.get() + intx_pin as u8 + 2) % 4;
        Some((intx_pin, self.irq_config.intr_pin(pin_route as usize)))
    }

    pub fn irq_pin(&self, irq: u8) -> Option<Box<dyn IntrPin>> {
        self.irq_config
            .pic
            .pin_handle(irq)
            .map(|pin| Box::new(pin) as Box<dyn IntrPin>)
    }
}
impl pci::Device for Piix3Lpc {
    fn device_state(&self) -> &pci::DeviceState {
        &self.pci_state
    }

    /// Handle an access to the custom config region holding the PIR
    /// registers.
    ///
    /// Panics if `region` is not the PIR region or if the access extends
    /// beyond it.
    fn cfg_rw(&self, region: u8, rwo: RWOp) {
        assert_eq!(region as usize, PIR_OFFSET);
        assert!(rwo.offset() + rwo.len() <= PIR_END - PIR_OFFSET);

        match rwo {
            RWOp::Read(ro) => {
                let start = ro.offset();
                let end = start + ro.len();
                let regs = self.reg_pir.lock().unwrap();
                ro.write_bytes(&regs[start..end]);
            }
            RWOp::Write(wo) => {
                // Each byte goes through `write_pir` so that the route of the
                // corresponding link pin is updated along with the register.
                let start = wo.offset();
                let end = start + wo.len();
                for idx in start..end {
                    self.write_pir(idx, wo.read_u8());
                }
            }
        }
    }
}
// Lifecycle hooks for the LPC device.
impl Lifecycle for Piix3Lpc {
    fn type_name(&self) -> &'static str {
        "pci-piix3-lpc"
    }
    // Only the PCI device state is reset here.
    // NOTE(review): the PIR registers, link routes and POST code are not
    // touched by this hook -- confirm whether they should be reset as well.
    fn reset(&self) {
        self.pci_state.reset(self);
    }
    // State is migrated via the `MigrateMulti` implementation on this type.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }
}
impl MigrateMulti for Piix3Lpc {
    /// Export the LPC's own registers (PIR and POST code), followed by the
    /// PCI device state and the state of the PIC, PIT, IO-APIC and RTC.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let pir = self.reg_pir.lock().unwrap();
        output.push(
            migrate::Piix3LpcV1 {
                pir_regs: *pir,
                post_code: self.post_code.load(Ordering::Acquire),
            }
            .into(),
        )?;
        drop(pir);

        MigrateMulti::export(&self.pci_state, output, ctx)?;

        MigrateMulti::export(self.pic.as_ref(), output, ctx)?;
        MigrateMulti::export(self.pit.as_ref(), output, ctx)?;
        MigrateMulti::export(self.ioapic.as_ref(), output, ctx)?;
        MigrateMulti::export(self.rtc.as_ref(), output, ctx)?;
        Ok(())
    }

    /// Import state in the same order in which `export` produced it.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let input: migrate::Piix3LpcV1 = offer.take()?;

        // The device is paused during import. Acquiring the PIR lock will
        // add an implicit barrier, so relaxed ordering is OK here.
        self.post_code.store(input.post_code, Ordering::Relaxed);

        // Restore the PIR registers through `write_pir` rather than by
        // overwriting the array: the link pin routes are derived from these
        // registers, and storing the raw bytes alone would leave every link
        // pin without a route on the migration target.
        for (idx, val) in input.pir_regs.iter().copied().enumerate() {
            self.write_pir(idx, val);
        }

        MigrateMulti::import(&self.pci_state, offer, ctx)?;

        MigrateMulti::import(self.pic.as_ref(), offer, ctx)?;
        MigrateMulti::import(self.pit.as_ref(), offer, ctx)?;
        MigrateMulti::import(self.ioapic.as_ref(), offer, ctx)?;
        MigrateMulti::import(self.rtc.as_ref(), offer, ctx)?;

        Ok(())
    }
}

// Location and size of the custom config space region of the PM device.
const PMCFG_OFFSET: usize = 0x40;
const PMCFG_LEN: usize = 0x98;

// Default base and length of the PM register block in IO port space.
const PMBASE_DEFAULT: u16 = 0xb000;
const PMBASE_LEN: u16 = 0x40;
// NOTE(review): presumably the default base of the SMBus register block --
// its use is not visible in this part of the file.
const SMBBASE_DEFAULT: u16 = 0xb100;
// const SMBBASE_LEN: u16 = 0x40;

/// Registers within the custom config space region of the PM device.
///
/// Their sizes and order are laid out in `PM_CFG_REGS`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum PmCfg {
    PmBase,
    CountA,
    CountB,
    GpInputCtl,
    DevResD,
    DevActA,
    DevActB,
    DevResA,
    DevResB,
    DevResC,
    DevResE,
    DevResF,
    DevResG,
    DevResH,
    DevResI,
    DevResJ,
    PmRegMisc,
    SmbusBase,
    SmbusHostCfg,
    SmbusSlaveCmd,
    SmbusSlaveShadow1,
    SmbusSlaveShadow2,
    SmbusRev,
    // Filler for the gaps between the registers above.
    Reserved,
}
lazy_static! {
    // Register map of the PM device's custom config space region.
    //
    // Entries are (register, size in bytes), packed back-to-back from the
    // start of the region; the sizes add up to `PMCFG_LEN` (0x98).
    static ref PM_CFG_REGS: RegMap<PmCfg> = {
        let layout = [
            (PmCfg::PmBase, 4),
            (PmCfg::CountA, 4),
            (PmCfg::CountB, 4),
            (PmCfg::GpInputCtl, 4),
            (PmCfg::DevResD, 2),
            (PmCfg::Reserved, 2),
            (PmCfg::DevActA, 4),
            (PmCfg::DevActB, 4),
            (PmCfg::DevResA, 4),
            (PmCfg::DevResB, 4),
            (PmCfg::DevResC, 4),
            (PmCfg::DevResE, 4),
            (PmCfg::DevResF, 4),
            (PmCfg::DevResG, 2),
            (PmCfg::Reserved, 2),
            (PmCfg::DevResH, 4),
            (PmCfg::DevResI, 4),
            (PmCfg::DevResJ, 4),
            (PmCfg::PmRegMisc, 1),
            (PmCfg::Reserved, 15),
            (PmCfg::SmbusBase, 4),
            (PmCfg::Reserved, 62),
            (PmCfg::SmbusHostCfg, 1),
            (PmCfg::SmbusSlaveCmd, 1),
            (PmCfg::SmbusSlaveShadow1, 1),
            (PmCfg::SmbusSlaveShadow2, 1),
            (PmCfg::SmbusRev, 1),
            (PmCfg::Reserved, 1),
        ];
        RegMap::create_packed(PMCFG_LEN, &layout, Some(PmCfg::Reserved))
    };
}

/// Registers within the PM register block in IO port space.
///
/// Their sizes and order are laid out in `PM_REGS`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum PmReg {
    PmSts,
    PmEn,
    PmCntrl,
    PmTmr,
    GpSts,
    GpEn,
    PCntrl,
    PLvl2,
    PLvl3,
    GlbSts,
    DevSts,
    GlbEn,
    GlbCtl,
    DevCtl,
    GpiReg,
    GpoReg,
    // Filler for the gaps between the registers above.
    Reserved,
}

lazy_static! {
    // Register map of the PM register block in IO port space.
    //
    // Entries are (register, size in bytes), packed back-to-back from the
    // start of the block; the sizes add up to `PMBASE_LEN` (0x40), and
    // `PmTmr` lands at offset 0x8.
    static ref PM_REGS: RegMap<PmReg> = {
        let layout = [
            (PmReg::PmSts, 2),
            (PmReg::PmEn, 2),
            (PmReg::PmCntrl, 2),
            (PmReg::Reserved, 2),
            (PmReg::PmTmr, 4),
            (PmReg::GpSts, 2),
            (PmReg::GpEn, 2),
            (PmReg::PCntrl, 4),
            (PmReg::PLvl2, 1),
            (PmReg::PLvl3, 1),
            (PmReg::Reserved, 2),
            (PmReg::GlbSts, 2),
            (PmReg::Reserved, 2),
            (PmReg::DevSts, 4),
            (PmReg::GlbEn, 2),
            (PmReg::Reserved, 6),
            (PmReg::GlbCtl, 4),
            (PmReg::DevCtl, 4),
            (PmReg::GpiReg, 4),
            (PmReg::GpoReg, 4),
            (PmReg::Reserved, 8),
        ];
        RegMap::create_packed(
            PMBASE_LEN as usize,
            &layout,
            Some(PmReg::Reserved),
        )
    };
}
// Bits of the PM status register which are tracked by this emulation.
bitflags! {
    #[derive(Default, Copy, Clone)]
    struct PmSts: u16 {
        // NOTE(review): presumably the power-button status bit -- confirm
        // against the chipset datasheet.
        const PWRBTN_STS = 1 << 8;
    }
}
// Bits of the PM enable register which are tracked by this emulation.
bitflags! {
    #[derive(Default, Copy, Clone)]
    struct PmEn: u16 {
        // NOTE(review): presumably the power-button enable bit -- confirm
        // against the chipset datasheet.
        const PWRBTN_EN = 1 << 8;
    }
}
// Bits of the PM control register which are tracked by this emulation.
bitflags! {
    #[derive(Default, Copy, Clone)]
    struct PmCntrl: u16 {
        const SCI_EN = 1;
        // Three-bit field occupying bits 10-12.
        const SUS_TYP = 0b111 << 10;
        const SUS_EN = 1 << 13;

    }
}

// Bits of the `DevResA` PM config register.
bitflags! {
    #[derive(Default)]
    struct DevResA: u32 {
        // Enable bus decodes for keyboard controller
        const KBC_EN_DEV11 = 1 << 28;
    }
}
// Bits of the `DevResB` PM config register.
bitflags! {
    #[derive(Default)]
    struct DevResB: u32 {
        // PCI access to keyboard controller
        const KBC_EIO_EN = 1 << 30;
        // PCI access to FDC
        const EIO_EN_DEV5 = 1 << 29;
    }
}
// Bits of the `DevResC` PM config register.
bitflags! {
    #[derive(Default)]
    struct DevResC: u32 {
        // PCI access to ttyb
        const EIO_EN_DEV7 = 1 << 31;
        // Configure ttyb for COM2 port
        const COMB_DEC_SEL_COM2 = 0b001 << 28;
        // PCI access to ttya
        const EIO_EN_DEV6 = 1 << 27;
        // Configure ttya for COM1 port
        // (this selector value is zero, so the flag contributes no bits)
        const COMA_DEC_SEL_COM1 = 0b000 << 24;
    }
}

// Offset within PMBASE region corresponding to PmTmr register.  It is added
// to the PM base to obtain the IO port of the PM timer.
const PM_TMR_OFFSET: u16 = 0x8;

/// Register state of the PM device.
#[derive(Clone, Copy)]
struct PMRegs {
    /// Base IO port of the PM register block.
    pm_base: u16,
    pm_status: PmSts,
    pm_ena: PmEn,
    pm_ctrl: PmCntrl,
}
// Default register state: the PM base at `PMBASE_DEFAULT` and every flag
// register empty.
impl Default for PMRegs {
    fn default() -> Self {
        Self {
            pm_base: PMBASE_DEFAULT,
            pm_status: PmSts::empty(),
            pm_ena: PmEn::empty(),
            pm_ctrl: PmCntrl::empty(),
        }
    }
}
impl PMRegs {
    /// Restore every register to its default value.
    fn reset(&mut self) {
        *self = Self::default();
    }
    /// IO port of the PM timer register: `pm_base` plus `PM_TMR_OFFSET`.
    ///
    /// Panics if that sum does not fit in a `u16`.
    fn pmtimer_port(&self) -> u16 {
        self.pm_base.checked_add(PM_TMR_OFFSET).unwrap()
    }
}

impl From<PMRegs> for migrate::Piix3PmV1 {
    fn from(value: PMRegs) -> Self {
        Self {
            pm_base: value.pm_base,
            pm_status: value.pm_status.bits(),
            pm_ena: value.pm_ena.bits(),
            pm_ctrl: value.pm_ctrl.bits(),
        }
    }
}
impl TryFrom<migrate::Piix3PmV1> for PMRegs {
    type Error = MigrateStateError;

    /// Rebuild the register state from a migration payload.
    ///
    /// # Errors
    ///
    /// Returns `MigrateStateError::ImportFailed` if any of the saved flag
    /// registers holds bits which `from_bits` rejects for its flag type.
    fn try_from(value: migrate::Piix3PmV1) -> Result<Self, Self::Error> {
        let pm_status = match PmSts::from_bits(value.pm_status) {
            Some(flags) => flags,
            None => {
                return Err(MigrateStateError::ImportFailed(format!(
                    "PIIX3 pm_status: failed to import saved value {:#x}",
                    value.pm_status,
                )))
            }
        };
        let pm_ena = match PmEn::from_bits(value.pm_ena) {
            Some(flags) => flags,
            None => {
                return Err(MigrateStateError::ImportFailed(format!(
                    "PIIX3 pm_ena: failed to import saved value {:#x}",
                    value.pm_ena,
                )))
            }
        };
        let pm_ctrl = match PmCntrl::from_bits(value.pm_ctrl) {
            Some(flags) => flags,
            None => {
                return Err(MigrateStateError::ImportFailed(format!(
                    "PIIX3 pm_ctrl: failed to import saved value {:#x}",
                    value.pm_ctrl,
                )))
            }
        };

        Ok(Self { pm_base: value.pm_base, pm_status, pm_ena, pm_ctrl })
    }
}

/// Power-management function of the emulated chipset.
pub struct Piix3PM {
    pci_state: pci::DeviceState,

    /// ACPI PM Timer
    pub pmtimer: Arc<BhyvePmTimer>,

    /// PM register state.
    regs: Mutex<PMRegs>,
    /// Pin supplied at creation.
    // NOTE(review): presumably signalled when the guest requests power-off
    // -- its use is not visible in this part of the file.
    power_pin: Arc<dyn IntrPin>,
    log: slog::Logger,
}
impl Piix3PM {
    pub fn create(
        hdl: Arc<VmmHdl>,
        power_pin: Arc<dyn IntrPin>,
        log: slog::Logger,
    ) -> Arc<Self> {
        let pci_state = pci::Builder::new(pci::Ident {
            vendor_id: VENDOR_INTEL,
            device_id: PIIX4_PM_DEV_ID,
            sub_vendor_id: VENDOR_OXIDE,
            sub_device_id: PIIX4_PM_SUB_DEV_ID,
            device_class: pci::bits::CLASS_BRIDGE,
            device_subclass: pci::bits::SUBCLASS_BRIDGE_OTHER,
            // Linux will complain about the PM-timer being potentially slow if
            // it detects the ACPI device exposing a revision prior to 0x3.
            revision_id: 0x3,
            ..Default::default()
        })
        .add_custom_cfg(PMCFG_OFFSET as u8, PMCFG_LEN as u8)
        // ACPI device requires lintr for SCI
        .add_lintr()
        .finish();

        let regs = PMRegs::default();
        Arc::new(Self {
            pci_state,

            pmtimer: BhyvePmTimer::create(hdl, regs.pmtimer_port()),

            regs: Mutex::new(regs),
            power_pin,
            log,
        })
    }

    pub fn attach(self: &Arc<Self>, pio: &PioBus) {
        // XXX: static registration for now
        let this = Arc::clone(&self);
        let piofn = Arc::new(move |port: u16, rwo: RWOp| this.pio_rw(port, rwo))
            as Arc<PioFn>;
        pio.register(PMBASE_DEFAULT, PMBASE_LEN, piofn).unwrap();
    }

    fn pio_rw(&self, _port: u16, mut rwo: RWOp) {
        PM_REGS.process(&mut rwo, |id, rwo| match rwo {
            RWOp::Read(ro) => self.pmreg_read(id, ro),
            RWOp::Write(wo) => self.pmreg_write(id, wo),
        });
    }

    fn pmcfg_read(&self, id: &PmCfg, ro: &mut ReadOp) {
        match id {
            PmCfg::PmRegMisc => {
                // Report IO space as enabled
                ro.write_u8(0x1);
            }
            PmCfg::PmBase => {
                let regs = self.regs.lock().unwrap();

                // LSB hardwired to 1 to indicate PMBase in IO space
                ro.write_u32(u32::from(regs.pm_base) | 0x1);
            }
            PmCfg::SmbusBase => {
                // LSB hardwired to 1 to indicate PMBase in IO space
                ro.write_u32(u32::from(SMBBASE_DEFAULT) | 0x1);
            }
            PmCfg::DevResA => {
                ro.write_u32(DevResA::KBC_EN_DEV11.bits());
            }
            PmCfg::DevResB => {
                ro.write_u32(
                    (DevResB::KBC_EIO_EN | DevResB::EIO_EN_DEV5).bits(),
                );
            }
            PmCfg::DevResC => {
                ro.write_u32(
                    (DevResC::EIO_EN_DEV7
                        | DevResC::COMB_DEC_SEL_COM2
                        | DevResC::EIO_EN_DEV6
                        | DevResC::COMA_DEC_SEL_COM1)
                        .bits(),
                );
            }
            PmCfg::DevResD | PmCfg::DevResG => {
                ro.write_u16(0);
            }
            PmCfg::DevResE
            | PmCfg::DevResF
            | PmCfg::DevResH
            | PmCfg::DevResI
            | PmCfg::DevResJ => {
                ro.write_u32(0);
            }
            _ => {
                // XXX: report everything else as zeroed
                slog::info!(self.log, "piix3pm ignored cfg read";
                    "offset" => ro.offset(), "register" => ?id);
                ro.fill(0);
            }
        }
    }
    fn pmcfg_write(&self, id: &PmCfg, _wo: &WriteOp) {
        // XXX: ignore writes for now
        slog::info!(self.log, "piix3pm ignored cfg write";
            "offset" => _wo.offset(), "register" => ?id);
    }
    fn pmreg_read(&self, id: &PmReg, ro: &mut ReadOp) {
        let regs = &self.regs.lock().unwrap();
        match id {
            PmReg::PmSts => {
                ro.write_u16(regs.pm_status.bits());
            }
            PmReg::PmEn => {
                ro.write_u16(regs.pm_ena.bits());
            }
            PmReg::PmCntrl => {
                ro.write_u16(regs.pm_ctrl.bits());
            }

            PmReg::PmTmr
            | PmReg::GpSts
            | PmReg::GpEn
            | PmReg::PCntrl
            | PmReg::PLvl2
            | PmReg::PLvl3
            | PmReg::GlbSts
            | PmReg::DevSts
            | PmReg::GlbEn
            | PmReg::GlbCtl
            | PmReg::DevCtl
            | PmReg::GpiReg
            | PmReg::GpoReg => {
                // TODO: flesh out the rest of PM emulation
                slog::debug!(self.log, "piix3pm unhandled read";
                    "offset" => ro.offset(), "register" => ?id);
                ro.fill(0);
            }
            PmReg::Reserved => {
                ro.fill(0);
            }
        }
    }
    fn pmreg_write(&self, id: &PmReg, wo: &mut WriteOp) {
        let mut regs = self.regs.lock().unwrap();
        match id {
            PmReg::PmSts => {
                let val = PmSts::from_bits_truncate(wo.read_u16());
                // status bits are W1C
                regs.pm_status.remove(val);
            }
            PmReg::PmEn => {
                regs.pm_ena = PmEn::from_bits_truncate(wo.read_u16());
            }
            PmReg::PmCntrl => {
                regs.pm_ctrl = PmCntrl::from_bits_truncate(wo.read_u16());
                if regs.pm_ctrl.contains(PmCntrl::SUS_EN) {
                    // SUS_EN is write-only and should always read 0
                    regs.pm_ctrl.remove(PmCntrl::SUS_EN);

                    let suspend_type = (regs.pm_ctrl & PmCntrl::SUS_TYP).bits();
                    if suspend_type == 0 {
                        // 0b000 corresponds to soft-off
                        self.power_pin.pulse();
                    }
                }
            }
            PmReg::PmTmr
            | PmReg::GpSts
            | PmReg::GpEn
            | PmReg::PCntrl
            | PmReg::PLvl2
            | PmReg::PLvl3
            | PmReg::GlbSts
            | PmReg::DevSts
            | PmReg::GlbEn
            | PmReg::GlbCtl
            | PmReg::DevCtl
            | PmReg::GpiReg
            | PmReg::GpoReg => {
                slog::info!(self.log, "piix3pm unhandled write";
                    "offset" => wo.offset(), "register" => ?id);
            }
            PmReg::Reserved => {}
        }
    }
}
impl pci::Device for Piix3PM {
    /// Expose the PCI state assembled in `Piix3PM::create`.
    fn device_state(&self) -> &pci::DeviceState {
        &self.pci_state
    }

    /// Handle an access to the custom config region.
    ///
    /// The only custom region this device adds is the one at `PMCFG_OFFSET`,
    /// so any other `region` trips the assertion.
    fn cfg_rw(&self, region: u8, mut rwo: RWOp) {
        assert_eq!(usize::from(region), PMCFG_OFFSET);

        PM_CFG_REGS.process(&mut rwo, |id, rwo| match rwo {
            RWOp::Write(wo) => self.pmcfg_write(id, wo),
            RWOp::Read(ro) => self.pmcfg_read(id, ro),
        });
    }
}
impl Lifecycle for Piix3PM {
    fn type_name(&self) -> &'static str {
        "pci-piix3-pm"
    }

    /// Reset the PCI state, then the PM registers, then the PM timer.
    fn reset(&self) {
        self.pci_state.reset(self);

        // Reset PM-specific registers.  If/when modifications to `pm_base` are
        // allowed, it will need to be more cognizant of the state inside the
        // BhyvePmTimer device.
        {
            let mut regs = self.regs.lock().unwrap();
            regs.reset();
        }

        self.pmtimer.reset();
    }

    /// Only the PM timer has work to do on resume.
    fn resume(&self) {
        self.pmtimer.resume();
    }

    /// Starting the device amounts to starting the PM timer.
    fn start(&self) -> anyhow::Result<()> {
        self.pmtimer.start()
    }

    /// Migration goes through the `MigrateMulti` implementation.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }
}
impl MigrateMulti for Piix3PM {
    fn export(
        &self,
        output: &mut PayloadOutputs,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let regs = self.regs.lock().unwrap();
        output.push(Into::<migrate::Piix3PmV1>::into(*regs).into())?;

        MigrateMulti::export(&self.pci_state, output, ctx)?;

        MigrateMulti::export(self.pmtimer.as_ref(), output, ctx)?;

        Ok(())
    }

    fn import(
        &self,
        offer: &mut PayloadOffers,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let data: migrate::Piix3PmV1 = offer.take()?;
        let xlated_regs: PMRegs = data.try_into()?;

        *self.regs.lock().unwrap() = xlated_regs;

        MigrateMulti::import(&self.pci_state, offer, ctx)?;

        MigrateMulti::import(self.pmtimer.as_ref(), offer, ctx)?;

        Ok(())
    }
}

/// Serialized payloads for the devices in this file.
///
/// Each payload type declares its schema identifier through `Schema::id`.
mod migrate {
    use crate::migrate::*;
    use serde::{Deserialize, Serialize};

    /// Payload identified as `("i440fx-hb", 1)`.
    #[derive(Deserialize, Serialize)]
    pub struct I440FxHostBridgeV1 {
        pub pci_cfg_addr: u32,
    }
    impl Schema<'_> for I440FxHostBridgeV1 {
        fn id() -> SchemaId {
            ("i440fx-hb", 1)
        }
    }

    /// Payload identified as `("piix3-lpc", 1)`.
    #[derive(Deserialize, Serialize)]
    pub struct Piix3LpcV1 {
        pub pir_regs: [u8; super::PIR_LEN],
        pub post_code: u8,
    }
    impl Schema<'_> for Piix3LpcV1 {
        fn id() -> SchemaId {
            ("piix3-lpc", 1)
        }
    }

    /// Payload identified as `("piix3-pm", 1)`: the raw values of the PM
    /// registers held in `PMRegs`.
    #[derive(Deserialize, Serialize)]
    pub struct Piix3PmV1 {
        /// Raw `pm_base` value.
        pub pm_base: u16,
        /// Raw bits of the `PmSts` register.
        pub pm_status: u16,
        /// Raw bits of the `PmEn` register.
        pub pm_ena: u16,
        /// Raw bits of the `PmCntrl` register.
        pub pm_ctrl: u16,
    }
    impl Schema<'_> for Piix3PmV1 {
        fn id() -> SchemaId {
            ("piix3-pm", 1)
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::hw::pci::device::test::*;
    use crate::hw::pci::test::Scaffold;
    use crate::hw::pci::{self, Endpoint};
    use crate::intr_pins::NoOpPin;
    use crate::vmm::VmmHdl;

    use slog::{Discard, Logger};

    /// Create a bus from `scaffold` with `dev` attached at slot 0, func 0.
    fn setup_attach(scaffold: &Scaffold, dev: Arc<dyn Endpoint>) -> pci::Bus {
        let bus = scaffold.create_bus();
        let location = pci::BusLocation::new(0, 0).unwrap();
        bus.attach(location, dev, None);
        bus
    }

    /// Attach `dev` at slot 0, func 0 of logical bus 0 in `topo`.
    fn topo_attach(topo: &pci::topology::Topology, dev: Arc<dyn Endpoint>) {
        let bus_id = pci::topology::LogicalBusId(0);
        let location = pci::BusLocation::new(0, 0).unwrap();
        topo.pci_attach(bus_id, location, dev, None).unwrap();
    }

    /// VMM handle for tests.
    fn test_hdl() -> Arc<VmmHdl> {
        Arc::new(VmmHdl::new_test(0).unwrap())
    }

    /// Host bridge options with PCIe disabled and no power/reset pins.
    fn hb_opts() -> Opts {
        Opts { enable_pcie: false, power_pin: None, reset_pin: None }
    }

    /// PM device with a no-op power pin and a discarding logger.
    fn new_pm(hdl: Arc<VmmHdl>) -> Arc<Piix3PM> {
        let log = Logger::root(Discard, slog::o!());
        let power_pin = Arc::new(NoOpPin {});
        Piix3PM::create(hdl, power_pin, log)
    }

    #[test]
    fn hb_pci_cfg_read() {
        let scaffold = Scaffold::new();
        let topo = scaffold.basic_topo();

        let hb = I440FxHostBridge::create(topo.clone(), hb_opts());
        topo_attach(&topo, hb.clone());

        cfg_read(hb.as_ref() as &dyn Endpoint);
    }

    #[test]
    fn hb_pci_cfg_write() {
        let scaffold = Scaffold::new();
        let topo = scaffold.basic_topo();

        let hb = I440FxHostBridge::create(topo.clone(), hb_opts());
        topo_attach(&topo, hb.clone());

        cfg_write(hb.as_ref() as &dyn Endpoint);
    }

    #[test]
    fn lpc_pci_cfg_read() {
        let hdl = test_hdl();
        let scaffold = Scaffold::new();

        let lpc = Piix3Lpc::create(hdl);
        let _bus = setup_attach(&scaffold, lpc.clone());

        cfg_read(lpc.as_ref() as &dyn Endpoint);
    }

    #[test]
    fn lpc_pci_cfg_write() {
        let hdl = test_hdl();
        let scaffold = Scaffold::new();

        let lpc = Piix3Lpc::create(hdl);
        let _bus = setup_attach(&scaffold, lpc.clone());

        cfg_write(lpc.as_ref() as &dyn Endpoint);
    }

    #[test]
    fn pm_pci_cfg_read() {
        let hdl = test_hdl();
        let scaffold = Scaffold::new();

        let pm = new_pm(hdl);
        let _bus = setup_attach(&scaffold, pm.clone());

        cfg_read(pm.as_ref() as &dyn Endpoint);
    }

    #[test]
    fn pm_pci_cfg_write() {
        let hdl = test_hdl();
        let scaffold = Scaffold::new();

        let pm = new_pm(hdl);
        let _bus = setup_attach(&scaffold, pm.clone());

        cfg_write(pm.as_ref() as &dyn Endpoint);
    }
}


================================================
FILE: lib/propolis/src/hw/chipset/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;

use crate::hw::pci::{Bdf, Endpoint, LintrCfg};
use crate::intr_pins::IntrPin;

pub mod i440fx;

/// Interface implemented by emulated chipsets.
///
/// Implementations must be `Send + Sync`.
pub trait Chipset: Send + Sync {
    /// Attach the PCI endpoint `dev` at `bdf`, with an optional legacy
    /// interrupt configuration.
    // NOTE(review): no error is returned; how a conflicting or invalid `bdf`
    // is handled is up to the implementation -- confirm against implementors.
    fn pci_attach(
        &self,
        bdf: Bdf,
        dev: Arc<dyn Endpoint>,
        lintr_cfg: Option<LintrCfg>,
    );
    /// Pin associated with power events for this chipset.
    fn power_pin(&self) -> Arc<dyn IntrPin>;
    /// Pin associated with reset events for this chipset.
    fn reset_pin(&self) -> Arc<dyn IntrPin>;
}


================================================
FILE: lib/propolis/src/hw/ibmpc.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! IO port and IRQ definitions for standard IBM PC hardware

// Serial ports: base IO port for each of COM1-COM4.
pub const PORT_COM1: u16 = 0x3f8;
pub const PORT_COM2: u16 = 0x2f8;
pub const PORT_COM3: u16 = 0x3e8;
pub const PORT_COM4: u16 = 0x2e8;
// Serial port IRQs: COM1 and COM3 share IRQ 4, COM2 and COM4 share IRQ 3.
pub const IRQ_COM1: u8 = 4;
pub const IRQ_COM2: u8 = 3;
pub const IRQ_COM3: u8 = 4;
pub const IRQ_COM4: u8 = 3;

// IO ports for the fast A20 gate and the POST code register.
pub const PORT_FAST_A20: u16 = 0x92;
pub const PORT_POST_CODE: u16 = 0x80;

// Lengths paired with the two ports above (one port each).
pub const LEN_FAST_A20: u16 = 1;
pub const LEN_POST_CODE: u16 = 1;

// PS/2 controller: data port and combined command/status port.
pub const PORT_PS2_DATA: u16 = 0x60;
pub const PORT_PS2_CMD_STATUS: u16 = 0x64;

// PS/2 controller IRQs for the primary and auxiliary ports.
pub const IRQ_PS2_PRI: u8 = 1;
pub const IRQ_PS2_AUX: u8 = 12;


================================================
FILE: lib/propolis/src/hw/ids.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Definitions of various IDs assigned to the devices we expose to guest instances.

/// Oxide's IEEE MA-L assignment.
pub const OXIDE_OUI: [u8; 3] = [0xa8, 0x40, 0x25];

/// PCI Specific IDs
pub mod pci {
    // Vendor IDs

    /// Oxide's PCI-SIG assigned Vendor ID.
    ///
    /// Devices emulating existing hardware will generally use the corresponding vendor &
    /// device IDs but specify Oxide as the Subsystem Vendor. Virtual devices defined
    /// entirely by Propolis shall use Oxide's Vendor ID.
    pub const VENDOR_OXIDE: u16 = 0x1de;

    /// RedHat's PCI-SIG assigned Vendor ID, as used for Virtio devices.
    ///
    /// See Virtio 1.1 Section 4.1.2 PCI Device Discovery
    pub const VENDOR_VIRTIO: u16 = 0x1AF4;

    /// Intel's PCI-SIG assigned Vendor ID.
    pub const VENDOR_INTEL: u16 = 0x8086;

    // Emulated Device IDs

    /// PCI Device ID for the PIIX4 Host Bridge.
    pub const PIIX4_HB_DEV_ID: u16 = 0x1237;

    /// PCI Device ID for the PIIX3 ISA Controller.
    pub const PIIX3_ISA_DEV_ID: u16 = 0x7000;

    /// PCI Device ID for the PIIX4 ACPI PM Controller.
    pub const PIIX4_PM_DEV_ID: u16 = 0x7113;

    // Subsystem Device IDs (for devices emulated by propolis)

    /// PCI Subsystem Device ID for the PIIX4 Host Bridge as emulated by propolis.
    pub const PIIX4_HB_SUB_DEV_ID: u16 = 0xfffe;

    /// PCI Subsystem Device ID for the PIIX3 ISA Controller as emulated by propolis.
    pub const PIIX3_ISA_SUB_DEV_ID: u16 = 0xfffd;

    /// PCI Subsystem Device ID for the PIIX4 ACPI PM Controller as emulated by propolis.
    pub const PIIX4_PM_SUB_DEV_ID: u16 = 0xfffc;

    /// PCI Subsystem Device ID for the Propolis Virtio Network device.
    pub const VIRTIO_NET_SUB_DEV_ID: u16 = 0xfffb;

    /// PCI Subsystem Device ID for the Propolis Virtio Block device.
    pub const VIRTIO_BLOCK_SUB_DEV_ID: u16 = 0xfffa;

    // Propolis-specific Device IDs

    /// PCI Device ID for the Propolis NVMe controller.
    pub const PROPOLIS_NVME_DEV_ID: u16 = 0x0;

    /// PCI Device ID for the Propolis xHCI controller.
    pub const PROPOLIS_XHCI_DEV_ID: u16 = 0x1;

    /// PCI Device ID for the Propolis PCI-PCI bridge.
    pub const PROPOLIS_BRIDGE_DEV_ID: u16 = 0x2;
}


================================================
FILE: lib/propolis/src/hw/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

pub mod bhyve;
pub mod chipset;
pub mod ibmpc;
pub mod ids;
pub mod nvme;
pub mod pci;
pub mod ps2;
pub mod qemu;
pub mod testdev;
pub mod uart;
pub mod virtio;


================================================
FILE: lib/propolis/src/hw/nvme/admin.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::mem::size_of;

use crate::common::{GuestAddr, GuestRegion, PAGE_SIZE};
use crate::hw::nvme;
use crate::vmm::MemCtx;

use super::bits::*;
use super::queue::{
    sqid_to_block_qid, DoorbellBuffer, QueueId, ADMIN_QUEUE_ID,
};
use super::{
    cmds, NvmeCtrl, NvmeError, PciNvme, MAX_NUM_IO_QUEUES, MAX_NUM_QUEUES,
};

use zerocopy::IntoBytes;

/// USDT probes for NVMe admin commands, under the `propolis` provider.
#[usdt::provider(provider = "propolis")]
mod probes {
    /// Fired when an Abort command is serviced, with the command's `cid` and
    /// the device-qualified ID of the targeted submission queue.
    fn nvme_abort(cid: u16, devsq_id: u64) {}
}

impl NvmeCtrl {
    /// Service Abort command.
    ///
    /// Nothing is ever aborted: a request naming a nonexistent Submission
    /// Queue is rejected, and any other request completes reporting that the
    /// command was not aborted.
    ///
    /// See NVMe 1.0e Section 5.1 Abort command
    pub(super) fn acmd_abort(&self, cmd: &cmds::AbortCmd) -> cmds::Completion {
        let devsq_id = nvme::devq_id(self.device_id, cmd.sqid);
        probes::nvme_abort!(|| (cmd.cid, devsq_id));

        // The SQ named by the command must currently exist
        let idx = cmd.sqid as usize;
        let sq_exists = idx < MAX_NUM_QUEUES && self.sqs[idx].is_some();
        if !sq_exists {
            return cmds::Completion::generic_err(STS_INVAL_FIELD).dnr();
        }

        // TODO: Support aborting in-flight commands.

        // The NVMe spec does not make any guarantees about being able to
        // successfully abort commands and allows indicating a failure to
        // do so back to the host software. We do so here by returning a
        // "success" value with bit 0 set to '1'.
        cmds::Completion::success_val(1)
    }

    /// Service Create I/O Completion Queue command.
    ///
    /// The request is validated (entry size configured, interrupt vector in
    /// range, physically contiguous) before the queue is created.
    ///
    /// See NVMe 1.0e Section 5.3 Create I/O Completion Queue command
    pub(super) fn acmd_create_io_cq(
        &mut self,
        cmd: &cmds::CreateIOCQCmd,
        nvme: &PciNvme,
    ) -> cmds::Completion {
        // Without an IOCQES from the host, the request cannot be honored
        if self.ctrl.cc.iocqes() == 0 {
            return cmds::Completion::specific_err(
                StatusCodeType::CmdSpecific,
                STS_CREATE_IO_Q_INVAL_QSIZE,
            );
        }

        // The interrupt vector must be one we actually expose
        if cmd.intr_vector >= super::NVME_MSIX_COUNT {
            return cmds::Completion::specific_err(
                StatusCodeType::CmdSpecific,
                STS_CREATE_IO_Q_INVAL_INT_VEC,
            );
        }

        // Only physically contiguous queues are supported
        if !cmd.phys_contig {
            return cmds::Completion::generic_err(STS_INVAL_FIELD);
        }

        // Validation passed: create the Completion Queue
        let params = super::queue::CreateParams {
            id: cmd.qid,
            device_id: self.device_id,
            base: GuestAddr(cmd.prp),
            size: cmd.qsize,
        };
        let created = self.create_cq(params, false, cmd.intr_vector, nvme);

        match created {
            Ok(_) => cmds::Completion::success(),
            Err(
                NvmeError::InvalidCompQueue(_)
                | NvmeError::CompQueueAlreadyExists(_),
            ) => cmds::Completion::specific_err(
                StatusCodeType::CmdSpecific,
                STS_CREATE_IO_Q_INVAL_QID,
            ),
            Err(NvmeError::QueueCreateErr(err)) => err.into(),
            Err(_) => cmds::Completion::generic_err(STS_INTERNAL_ERR),
        }
    }

    /// Service Create I/O Submission Queue command.
    ///
    /// The request is validated (entry size configured, physically
    /// contiguous) before the queue is created and handed to
    /// `io_sq_post_create`.
    ///
    /// See NVMe 1.0e Section 5.4 Create I/O Submission Queue command
    pub(super) fn acmd_create_io_sq(
        &mut self,
        cmd: &cmds::CreateIOSQCmd,
        nvme: &PciNvme,
    ) -> cmds::Completion {
        // Without an IOSQES from the host, the request cannot be honored
        if self.ctrl.cc.iosqes() == 0 {
            return cmds::Completion::specific_err(
                StatusCodeType::CmdSpecific,
                STS_CREATE_IO_Q_INVAL_QSIZE,
            );
        }

        // Only physically contiguous queues are supported
        if !cmd.phys_contig {
            return cmds::Completion::generic_err(STS_INVAL_FIELD);
        }

        // Validation passed: create the Submission Queue
        let params = super::queue::CreateParams {
            id: cmd.qid,
            device_id: self.device_id,
            base: GuestAddr(cmd.prp),
            size: cmd.qsize,
        };
        let created = self.create_sq(params, false, cmd.cqid, nvme);

        match created {
            Ok(sq) => {
                self.io_sq_post_create(nvme, sq);
                cmds::Completion::success()
            }
            Err(NvmeError::InvalidCompQueue(_)) => {
                cmds::Completion::specific_err(
                    StatusCodeType::CmdSpecific,
                    STS_CREATE_IO_Q_INVAL_CQ,
                )
            }
            Err(
                NvmeError::InvalidSubQueue(_)
                | NvmeError::SubQueueAlreadyExists(_),
            ) => cmds::Completion::specific_err(
                StatusCodeType::CmdSpecific,
                STS_CREATE_IO_Q_INVAL_QID,
            ),
            Err(NvmeError::QueueCreateErr(err)) => err.into(),
            Err(_) => cmds::Completion::generic_err(STS_INTERNAL_ERR),
        }
    }

    /// Service Delete I/O Completion Queue command.
    ///
    /// See NVMe 1.0e Section 5.5 Delete I/O Completion Queue command
    pub(super) fn acmd_delete_io_cq(
        &mut self,
        cqid: QueueId,
        nvme: &PciNvme,
    ) -> cmds::Completion {
        // The Admin Completion Queue may never be deleted
        if cqid == ADMIN_QUEUE_ID {
            return cmds::Completion::specific_err(
                StatusCodeType::CmdSpecific,
                STS_DELETE_IO_Q_INVAL_QID,
            );
        }

        // Drop the CQ from our list of active CQs.  Every SQ associated with
        // it should already be gone by now; if not, the deletion is refused
        // with an error.
        let deleted = self.delete_cq(cqid, nvme);

        match deleted {
            Ok(()) => cmds::Completion::success(),
            Err(NvmeError::InvalidCompQueue(_)) => {
                cmds::Completion::specific_err(
                    StatusCodeType::CmdSpecific,
                    STS_DELETE_IO_Q_INVAL_QID,
                )
            }
            Err(NvmeError::AssociatedSubQueuesStillExist(_, _)) => {
                cmds::Completion::specific_err(
                    StatusCodeType::CmdSpecific,
                    STS_DELETE_IO_Q_INVAL_Q_DELETION,
                )
            }
            _ => cmds::Completion::generic_err(STS_INTERNAL_ERR),
        }
    }

    /// Service Delete I/O Submission Queue command.
    ///
    /// On success the queue is also dissociated from the block attachment.
    ///
    /// See NVMe 1.0e Section 5.6 Delete I/O Submission Queue command
    pub(super) fn acmd_delete_io_sq(
        &mut self,
        sqid: QueueId,
        nvme: &PciNvme,
    ) -> cmds::Completion {
        // The Admin Submission Queue may never be deleted
        if sqid == ADMIN_QUEUE_ID {
            return cmds::Completion::specific_err(
                StatusCodeType::CmdSpecific,
                STS_DELETE_IO_Q_INVAL_QID,
            );
        }

        // Drop the SQ from our list of active SQs, which stops us from
        // accepting any new requests for it.
        // That should be the only strong ref left to the SubQueue: in-flight
        // I/O requests which have not completed yet only hold a weak ref
        // (via CompQueueEntryPermit).
        // Note: The NVMe 1.0e spec says "The command causes all commands
        //       submitted to the indicated Submission Queue that are still in
        //       progress to be aborted."
        let deleted = self.delete_sq(sqid, nvme);

        match deleted {
            Ok(()) => {
                let block_qid = sqid_to_block_qid(sqid);
                nvme.block_attach.queue_dissociate(block_qid);
                // TODO: wait until requests are done?
                cmds::Completion::success()
            }
            Err(NvmeError::InvalidSubQueue(_)) => {
                cmds::Completion::specific_err(
                    StatusCodeType::CmdSpecific,
                    STS_DELETE_IO_Q_INVAL_QID,
                )
            }
            _ => cmds::Completion::generic_err(STS_INTERNAL_ERR),
        }
    }

    /// Service Get Log Page command.
    ///
    /// No log is kept: every data region named by the command is zero-filled.
    /// If any of those regions is not writable, nothing is written and a data
    /// transfer error is returned.
    ///
    /// See NVMe 1.0e Section 5.10 Get Log Page command
    pub(super) fn acmd_get_log_page(
        &self,
        cmd: &cmds::GetLogPageCmd,
        mem: &MemCtx,
    ) -> cmds::Completion {
        // Resolve all regions up front so a bad one fails the whole command
        let writable = cmd
            .data(mem)
            .map(|r| mem.writable_region(&r))
            .collect::<Option<Vec<_>>>();

        match writable {
            Some(regions) => {
                // TODO: Keep a log to write back instead of 0s
                for region in regions {
                    let _ = region.write_byte(0, region.len());
                }
                cmds::Completion::success()
            }
            None => cmds::Completion::generic_err(STS_DATA_XFER_ERR),
        }
    }

    /// Service Identify command.
    ///
    /// Writes the namespace data (NSID 1 only) or the controller data to the
    /// guest, depending on the CNS field.
    ///
    /// See NVMe 1.0e Section 5.11 Identify command
    pub(super) fn acmd_identify(
        &self,
        cmd: &cmds::IdentifyCmd,
        mem: &MemCtx,
    ) -> cmds::Completion {
        match cmd.cns {
            IDENT_CNS_NAMESPACE => {
                // 0 is not a valid NSID (See NVMe 1.0e, Section 6.1 Namespaces)
                // We also don't currently support namespace management
                // and so treat the 'broadcast' NSID (0xffffffff) as invalid
                // along with any other namespace besides 1
                if cmd.nsid != 1 {
                    return cmds::Completion::generic_err(STS_INVALID_NS);
                }

                assert!(size_of::<IdentifyNamespace>() <= PAGE_SIZE);
                let written = Self::write_admin_result(
                    cmd.data(mem),
                    &self.ns_ident,
                    mem,
                );
                if written.is_some() {
                    cmds::Completion::success()
                } else {
                    cmds::Completion::generic_err(STS_DATA_XFER_ERR)
                }
            }
            IDENT_CNS_CONTROLLER => {
                assert!(size_of::<IdentifyController>() <= PAGE_SIZE);

                let written = Self::write_admin_result(
                    cmd.data(mem),
                    &self.ctrl_ident,
                    mem,
                );
                if written.is_some() {
                    cmds::Completion::success()
                } else {
                    cmds::Completion::generic_err(STS_DATA_XFER_ERR)
                }
            }
            // We currently present NVMe version 1.0 in which CNS is a 1-bit field
            // and hence only need to support the NAMESPACE and CONTROLLER cases
            _ => cmds::Completion::generic_err(STS_INVAL_FIELD),
        }
    }

    /// Service Get Features command.
    ///
    /// Most features report a fixed value.  The exceptions are Temperature
    /// Threshold and Interrupt Vector Configuration, which echo selectors
    /// from `cdw11`, and the Oxide device features, which reflect the
    /// controller's `read_only` flag.  Reserved, LBA Range Type, Software
    /// Progress Marker and vendor feature identifiers are rejected with
    /// "invalid field" and Do Not Retry set.
    ///
    /// See NVMe 1.0e Section 5.9 Get Features command
    pub(super) fn acmd_get_features(
        &self,
        cmd: &cmds::GetFeaturesCmd,
    ) -> cmds::Completion {
        match cmd.fid {
            // Mandatory features
            cmds::FeatureIdent::Arbitration => {
                // no-limit for arbitration burst, all other fields zeroed
                let val = 0b111;
                cmds::Completion::success_val(val)
            }
            cmds::FeatureIdent::PowerManagement => {
                // Empty value with unspecified workload hint
                cmds::Completion::success_val(0)
            }
            cmds::FeatureIdent::TemperatureThreshold => {
                // The selectors for which threshold is being queried are
                // carried in cdw11
                let query = cmds::FeatTemperatureThreshold(cmd.cdw11);

                use cmds::{
                    ThresholdTemperatureSelect as TempSel,
                    ThresholdTypeSelect as TypeSel,
                };
                match (query.tmpsel(), query.thsel()) {
                    (TempSel::Reserved(_), _) | (_, TypeSel::Reserved(_)) => {
                        // squawk about reserved bits being set
                        cmds::Completion::generic_err(STS_INVAL_FIELD)
                    }
                    (TempSel::Composite, typesel) => {
                        // Composite temperature: over-threshold is reported
                        // as KELVIN_0C + 100, under-threshold as 0
                        const KELVIN_0C: u16 = 273;
                        let mut out = cmds::FeatTemperatureThreshold(0);
                        out.set_tmpsel(TempSel::Composite);
                        out.set_thsel(typesel);
                        match typesel {
                            TypeSel::Over => out.set_tmpth(KELVIN_0C + 100),
                            TypeSel::Under => out.set_tmpth(0),
                            // Reserved type selectors were rejected above
                            TypeSel::Reserved(_) => unreachable!(),
                        }
                        cmds::Completion::success_val(out.0)
                    }
                    (tempsel, typesel) => {
                        // Any other temperature selector: over-threshold is
                        // reported as 0xffff, under-threshold as 0
                        let mut out = cmds::FeatTemperatureThreshold(0);
                        out.set_tmpsel(tempsel);
                        out.set_thsel(typesel);
                        match typesel {
                            TypeSel::Over => out.set_tmpth(0xffff),
                            TypeSel::Under => out.set_tmpth(0),
                            // Reserved type selectors were rejected above
                            TypeSel::Reserved(_) => unreachable!(),
                        }
                        cmds::Completion::success_val(out.0)
                    }
                }
            }
            cmds::FeatureIdent::ErrorRecovery => {
                // Empty value indicating we do none of this
                cmds::Completion::success_val(0)
            }
            cmds::FeatureIdent::NumberOfQueues => {
                // Until we track the maximums set by the guest, just report the
                // maximums supported
                cmds::Completion::success_val(
                    cmds::FeatNumberQueues {
                        ncq: MAX_NUM_IO_QUEUES as u16,
                        nsq: MAX_NUM_IO_QUEUES as u16,
                    }
                    .into(),
                )
            }
            cmds::FeatureIdent::InterruptCoalescing => {
                // A value of 0 indicates no configured coalescing
                cmds::Completion::success_val(0)
            }
            cmds::FeatureIdent::InterruptVectorConfiguration => {
                // The vector being queried is carried in cdw11
                let cfg: cmds::FeatInterruptVectorConfig = cmd.cdw11.into();

                // report disabled coalescing for all vectors
                cmds::Completion::success_val(
                    cmds::FeatInterruptVectorConfig { iv: cfg.iv, cd: true }
                        .into(),
                )
            }
            cmds::FeatureIdent::WriteAtomicity => {
                // Value of 0 indicates no Disable Normal setting
                cmds::Completion::success_val(0)
            }
            cmds::FeatureIdent::AsynchronousEventConfiguration => {
                // None of the defined events result in AEN transmission
                cmds::Completion::success_val(0)
            }

            // Optional features
            cmds::FeatureIdent::VolatileWriteCache => {
                // TODO: wire into actual write cache state
                //
                // Until that is done, indicate an enabled write cache to ensure
                // IO flushes for the backends which require it for consistency.
                cmds::Completion::success_val(
                    cmds::FeatVolatileWriteCache { wce: true }.into(),
                )
            }

            cmds::FeatureIdent::OxideDeviceFeatures => {
                if cmd.cdw11 != 0 {
                    // We don't currently accept any parameters for this feature
                    cmds::Completion::generic_err(STS_INVAL_FIELD)
                } else {
                    cmds::Completion::success_val(
                        cmds::OxideDeviceFeatures(0)
                            .with_read_only(self.read_only)
                            .0,
                    )
                }
            }

            // Identifiers for which no value is reported
            cmds::FeatureIdent::Reserved
            | cmds::FeatureIdent::LbaRangeType
            | cmds::FeatureIdent::SoftwareProgressMarker
            | cmds::FeatureIdent::Vendor(_) => {
                cmds::Completion::generic_err(STS_INVAL_FIELD).dnr()
            }
        }
    }

    /// Service Set Features command.
    ///
    /// See NVMe 1.0e Section 5.12 Set Features command
    pub(super) fn acmd_set_features(
        &self,
        cmd: &cmds::SetFeaturesCmd,
    ) -> cmds::Completion {
        match cmd.fid {
            cmds::FeatureIdent::NumberOfQueues => {
                let nq: cmds::FeatNumberQueues = match cmd.cdw11.try_into() {
                    Ok(f) => f,
                    Err(_) => {
                        return cmds::Completion::generic_err(STS_INVAL_FIELD);
                    }
                };

                // TODO: error if called after initialization

                // If they ask for too many queues, just return our max possible
                let clamped = cmds::FeatNumberQueues {
                    ncq: nq.ncq.min(MAX_NUM_IO_QUEUES as u16),
                    nsq: nq.nsq.min(MAX_NUM_IO_QUEUES as u16),
                };

                cmds::Completion::success_val(clamped.into())
            }
            cmds::FeatureIdent::VolatileWriteCache => {
                // NVMe 1.0e Figure 66 Identify - Identify Controller Data
                // Structure "If a volatile write cache [VWC] is present, then
                // the host may ... control whether it is enabled with Set
                // Features specifying the Volatile Write Cache feature
                // identifier."
                cmds::Completion::success()
            }
            cmds::FeatureIdent::Reserved
            | cmds::FeatureIdent::Arbitration
            | cmds::FeatureIdent::PowerManagement
            | cmds::FeatureIdent::LbaRangeType
            | cmds::FeatureIdent::TemperatureThreshold
            | cmds::FeatureIdent::ErrorRecovery
            | cmds::FeatureIdent::InterruptCoalescing
            | cmds::FeatureIdent::InterruptVectorConfiguration
            | cmds::FeatureIdent::WriteAtomicity
            | cmds::FeatureIdent::AsynchronousEventConfiguration
            | cmds::FeatureIdent::SoftwareProgressMarker
            | cmds::FeatureIdent::OxideDeviceFeatures
            | cmds::FeatureIdent::Vendor(_) => {
                cmds::Completion::generic_err(STS_INVAL_FIELD).dnr()
            }
        }
    }

    /// Handle the Doorbell Buffer Config admin command.
    ///
    /// The shadow-doorbell and event-index addresses in `cmd` must both be
    /// aligned to the size returned by `get_mps()`; if either is not, the
    /// command completes with `STS_INVAL_FIELD` and no state is changed.
    ///
    /// On success the address pair is stored in `self.doorbell_buf` and
    /// passed to every completion and submission queue that currently
    /// exists, after which a plain success completion is returned.
    pub(super) fn acmd_doorbell_buf_cfg(
        &mut self,
        cmd: &cmds::DoorbellBufCfgCmd,
    ) -> cmds::Completion {
        let (shadow, eventidx) =
            (cmd.shadow_doorbell_buffer, cmd.eventidx_buffer);

        // Assumes `get_mps()` is a power of two, so that `size - 1` masks
        // the within-page offset bits.  A low bit set in either address
        // means that address is misaligned.
        let page_mask = self.get_mps() - 1;
        if (shadow | eventidx) & page_mask != 0 {
            return cmds::Completion::generic_err(STS_INVAL_FIELD);
        }

        let db_buf = DoorbellBuffer {
            shadow: GuestAddr(shadow),
            eventidx: GuestAddr(eventidx),
        };
        self.doorbell_buf = Some(db_buf);

        // Empty queue slots are skipped by `flatten()`.
        self.cqs.iter().flatten().for_each(|cq| {
            cq.set_db_buf(Some(db_buf), false);
        });
        self.sqs.iter().flatten().for_each(|sq| {
            sq.set_db_buf(Some(db_buf), false);
        });

        cmds::Completion::success()
    }

    /// Copy the result structure of an admin command into guest memory.
    ///
    /// `prp` yields the guest regions supplied for the command's output.
    /// Nothing is written unless those regions add up to at least
    /// `size_of::<T>()` bytes and every one of them can be obtained as a
    /// writable region from `mem`.
    ///
    /// The `data` type must be `repr(packed(1))`: when the destination spans
    /// more than one region, the value is copied out through a raw byte
    /// view, which is only sound for a type with no padding.
    ///
    /// Returns `Some(())` if successful, else `None` (buffer too small, a
    /// region could not be made writable, or a write failed).
    ///
    /// # Panics
    ///
    /// Panics if the multi-region path finishes having copied a number of
    /// bytes other than `size_of::<T>()`.
    fn write_admin_result<T: Copy + IntoBytes>(
        prp: cmds::PrpIter,
        data: &T,
        mem: &MemCtx,
    ) -> Option<()> {
        let data_len = size_of::<T>();

        let bufs: Vec<GuestRegion> = prp.collect();
        let capacity: usize = bufs.iter().map(|r| r.1).sum();
        if capacity < data_len {
            // Not enough space
            return None;
        }

        let regions = bufs
            .into_iter()
            .map(|r| mem.writable_region(&r))
            .collect::<Option<Vec<_>>>()?;

        if regions.len() == 1 {
            // Can be copied to one contiguous page
            regions[0].write(data).ok()?;
            return Some(());
        }

        // Split across multiple pages

        // SAFETY:
        //
        // We expect and demand that the resulting structs written through
        // this function are packed, such that there is no padding to risk
        // UB through the [u8] slice creation.
        let bytes = unsafe {
            std::slice::from_raw_parts(
                (data as *const T).cast::<u8>(),
                data_len,
            )
        };

        let mut remaining = bytes;
        let mut copied = 0;
        for region in regions {
            // Fill this region as far as it goes, then carry the rest over
            // to the next one.
            let fits = region.len().min(remaining.len());
            let (chunk, rest) = remaining.split_at(fits);
            copied += region.write_bytes(chunk).ok()?;
            remaining = rest;

            if remaining.is_empty() {
                break;
            }
        }
        assert_eq!(copied, data_len);
        Some(())
    }
}


================================================
FILE: lib/propolis/src/hw/nvme/bits.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![allow(dead_code)]

use crate::block::{ByteLen, ByteOffset};
use bitstruct::bitstruct;
use zerocopy::{FromBytes, IntoBytes};

/// A Submission Queue Entry as represented in memory.
///
/// The struct is `repr(C, packed(1))` and its fields add up to 64 bytes, so
/// the in-memory image has no padding. It derives zerocopy's `FromBytes`.
///
/// See NVMe 1.0e Section 4.2 Submission Queue Entry - Command Format
#[derive(Debug, Default, Copy, Clone, FromBytes)]
#[repr(C, packed(1))]
pub struct SubmissionQueueEntry {
    /// Command Dword 0 (CDW0)
    ///
    /// Field common to all commands and defined as:
    ///
    /// Bits
    /// 31:16 - Command Identifier (CID)
    /// 15:10 - Reserved
    /// 09:08 - Fused Operation (FUSE)
    /// 07:00 - Opcode (OPC)
    ///
    /// The CID and OPC sub-fields are exposed through
    /// [`SubmissionQueueEntry::cid`] and [`SubmissionQueueEntry::opcode`].
    ///
    /// See NVMe 1.0e Section 4.2, Figure 6 Command Dword 0
    pub cdw0: u32,

    /// Namespace Identifier (NSID)
    ///
    /// The namespace that this command applies to.
    pub nsid: u32,

    /// Reserved - Bytes 15:08
    pub rsvd: u64,

    /// Metadata Pointer (MPTR)
    ///
    /// If the command has metadata not interleaved with the logical
    /// block data, then this field contains the address of a contiguous
    /// physical buffer of metadata. The metadata pointer shall be
    /// DWORD aligned.
    ///
    /// See NVMe 1.0e Section 4.4 Metadata Region (MR)
    pub mptr: u64,

    /// PRP Entry 1 (PRP1)
    ///
    /// The first Physical Region Page (PRP) entry for the command or a
    /// PRP List pointer.
    ///
    /// See NVMe 1.0e Section 4.3 Physical Region Page Entry and List
    pub prp1: u64,

    /// PRP Entry 2 (PRP2)
    ///
    /// Either reserved, the second Physical Region Page (PRP) entry or
    /// a PRP List pointer.
    ///
    /// See NVMe 1.0e Section 4.3 Physical Region Page Entry and List
    pub prp2: u64,

    /// Command Dword 10 (CDW10)
    ///
    /// A command specific value.
    pub cdw10: u32,

    /// Command Dword 11 (CDW11)
    ///
    /// A command specific value.
    pub cdw11: u32,

    /// Command Dword 12 (CDW12)
    ///
    /// A command specific value.
    pub cdw12: u32,

    /// Command Dword 13 (CDW13)
    ///
    /// A command specific value.
    pub cdw13: u32,

    /// Command Dword 14 (CDW14)
    ///
    /// A command specific value.
    pub cdw14: u32,

    /// Command Dword 15 (CDW15)
    ///
    /// A command specific value.
    pub cdw15: u32,
}

impl SubmissionQueueEntry {
    /// Returns the Command Identifier (CID): bits 31:16 of CDW0.
    ///
    /// The command identifier along with the Submission Queue ID
    /// specify a unique identifier for the command.
    pub fn cid(&self) -> u16 {
        // Little-endian byte order puts bits 31:16 in the last two bytes.
        let [_, _, lo, hi] = self.cdw0.to_le_bytes();
        u16::from_le_bytes([lo, hi])
    }

    /// Returns the Opcode (OPC): bits 07:00 of CDW0.
    pub fn opcode(&self) -> u8 {
        self.cdw0.to_le_bytes()[0]
    }
}

/// A Completion Queue Entry as represented in memory.
///
/// The struct is `repr(C, packed(1))` and its fields add up to 16 bytes, so
/// the in-memory image has no padding. It derives zerocopy's `IntoBytes`.
///
/// See NVMe 1.0e Section 4.5 Completion Queue Entry
#[derive(Debug, Default, Copy, Clone, IntoBytes)]
#[repr(C, packed(1))]
pub struct CompletionQueueEntry {
    /// Dword 0 (DW0)
    ///
    /// A command specific value.
    pub dw0: u32,

    /// Reserved (DW1) - Bytes 07:04
    pub rsvd: u32,

    /// Submission Queue Head Pointer (SQHD)
    ///
    /// Indicates the current Submission Queue Head pointer
    /// for the Submission Queue identified by `sqid`.
    ///
    /// Bits 15:0 of Dword 2 (DW2)
    ///
    /// See NVMe 1.0e Section 4.5, Figure 13 Completion Queue Entry: DW 2
    pub sqhd: u16,

    /// Submission Queue Identifier (SQID)
    ///
    /// Indicates the Submission Queue to which the command completed
    /// by this Completion Entry was submitted.
    ///
    /// Bits 31:16 of Dword 2 (DW2)
    ///
    /// See NVMe 1.0e Section 4.5, Figure 13 Completion Queue Entry: DW 2
    pub sqid: u16,

    /// Command Identifier (CID)
    ///
    /// The identifier of the command completed by this Completion Entry.
    /// The command identifier along with the Submission Queue ID
    /// specify a unique identifier for the command.
    ///
    /// Bits 15:0 of Dword 3 (DW3)
    ///
    /// See NVMe 1.0e Section 4.5, Figure 14 Completion Queue Entry: DW 3
    pub cid: u16,

    /// The status of the command that's being completed along with
    /// the current phase tag.
    ///
    /// Bit      0 Phase Tag (P)      ===  Bit 16 of Dword 3 (DW3)
    /// Bits 15:01 Status Field (SF)  ===  Bits 31:17 of Dword 3 (DW3)
    ///
    /// The phase tag is updated with [`CompletionQueueEntry::set_phase`].
    ///
    /// See NVMe 1.0e Section 4.5.1 Status Field Definition
    /// See NVMe 1.0e Section 4.5, Figure 14 Completion Queue Entry: DW 3
    pub status_phase: u16,
}
impl CompletionQueueEntry {
    /// Build the entry that reports `comp` for the command identified by
    /// `cid`.
    ///
    /// `dw0` and the status word are copied from `comp`. The reserved dword,
    /// `sqhd` and `sqid` start out as zero.
    pub fn new(comp: super::cmds::Completion, cid: u16) -> Self {
        let dw0 = comp.dw0;
        let status_phase = comp.status;
        Self { cid, dw0, status_phase, rsvd: 0, sqhd: 0, sqid: 0 }
    }

    /// Set (`true`) or clear (`false`) the Phase Tag, bit 0 of
    /// `status_phase`, leaving the Status Field bits untouched.
    pub fn set_phase(&mut self, phase: bool) {
        const PHASE_TAG: u16 = 0b1;

        if phase {
            self.status_phase |= PHASE_TAG;
        } else {
            self.status_phase &= !PHASE_TAG;
        }
    }
}

/// A Dataset Management Range Definition as represented in memory.
///
/// The struct is `repr(C, packed(1))` and its fields add up to 16 bytes.
///
/// See NVMe 1.0e Section 6.6 Figure 114: Dataset Management – Range Definition
#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, FromBytes, IntoBytes)]
#[repr(C, packed(1))]
pub struct DatasetManagementRangeDefinition {
    /// The context attributes specified for each range provides information about how the range
    /// is intended to be used by host software. The use of this information is optional and the
    /// controller is not required to perform any specific action.
    pub context_attributes: u32,

    /// Length of the range, counted in logical blocks.
    pub number_logical_blocks: u32,

    /// Logical block address at which the range starts.
    pub starting_lba: u64,
}
impl DatasetManagementRangeDefinition {
    /// Assemble a range definition from its three raw fields.
    pub fn new(
        context_attributes: u32,
        number_logical_blocks: u32,
        starting_lba: u64,
    ) -> Self {
        Self { starting_lba, number_logical_blocks, context_attributes }
    }

    /// Convert the range from logical blocks into a byte offset and byte
    /// length, given the size in bytes of one logical block.
    ///
    /// # Errors
    ///
    /// Returns a static message if `starting_lba * lba_data_size`,
    /// `number_logical_blocks * lba_data_size`, or the sum of those two
    /// products does not fit in a `u64`.
    pub fn offset_len(
        &self,
        lba_data_size: u64,
    ) -> Result<(ByteOffset, ByteLen), &'static str> {
        let Some(byte_offset) = self.starting_lba.checked_mul(lba_data_size)
        else {
            return Err(
                "Starting LBA and LBA data size multiplication overflowed",
            );
        };

        let block_count = u64::from(self.number_logical_blocks);
        let Some(byte_len) = block_count.checked_mul(lba_data_size) else {
            return Err(
                "Number of logical blocks and LBA data size multiplication overflowed",
            );
        };

        // The end of the range must be representable as well.
        if byte_offset.checked_add(byte_len).is_none() {
            return Err("Byte offset and byte length addition overflowed");
        }

        Ok((byte_offset as ByteOffset, byte_len as ByteLen))
    }
}

// Register bits

bitstruct! {
    /// Representation of the Controller Capabilities (CAP) register.
    ///
    /// The fields below cover all 64 bits of the register, reserved ranges
    /// included.
    ///
    /// See NVMe 1.0e Section 3.1.1 Offset 00h: CAP - Controller Capabilities
    #[derive(Clone, Copy, Debug, Default)]
    pub struct Capabilities(pub u64) {
        /// Maximum Queue Entries Supported (MQES)
        ///
        /// The maximum individual queue size that the controller supports.
        /// This is a 0's based value and the minimum value is 1 (indicating a
        /// max size of 2).
        pub mqes: u16 = 0..16;

        /// Contiguous Queues Required (CQR)
        ///
        /// Whether or not the controller requires I/O Completion/Submission
        /// Queues to be physically contiguous.
        pub cqr: bool = 16;

        /// Arbitration Mechanism Supported (AMS) - Weighted Round Robin bit
        ///
        /// Whether or not the controller supports Weighted Round Robin with Urgent.
        pub ams_roundrobin: bool = 17;

        /// Arbitration Mechanism Supported (AMS) - Vendor Specific bit
        ///
        /// Whether or not the controller supports a vendor specific arbitration mechanism.
        pub ams_vendor: bool = 18;

        /// Reserved
        reserved1: u8 = 19..24;

        /// Timeout (TO)
        ///
        /// The worst case time that host software shall wait for the controller to become ready.
        /// Specified as TO * 500ms
        pub to: u8 = 24..32;

        /// Doorbell Stride (DSTRD)
        ///
        /// Size between each completion/submission queue doorbell. Specified as 2^(2 + DSTRD) bytes.
        pub dstrd: u8 = 32..36;

        /// Reserved
        reserved2: u8 = 36;

        /// Command Sets Supported (CSS) - NVM command set bit
        ///
        /// Whether or not the controller supports NVM I/O command set.
        pub css_nvm: bool = 37;

        /// Command Sets Supported (CSS) - remaining bits
        ///
        /// Reserved bits for indicating other supported I/O command sets.
        css_reserved: u8 = 38..45;

        /// Reserved
        reserved3: u8 = 45..48;

        /// Memory Page Size Minimum (MPSMIN)
        ///
        /// The minimum host memory page size the controller supports.
        /// Specified as 2^(12 + MPSMIN) bytes.
        pub mpsmin: u8 = 48..52;

        /// Memory Page Size Maximum (MPSMAX)
        ///
        /// The maximum host memory page size the controller supports.
        /// Specified as 2^(12 + MPSMAX) bytes.
        pub mpsmax: u8 = 52..56;

        /// Reserved
        reserved4: u8 = 56..64;
    }
}
impl Capabilities {
    /// Minimum supported memory page size in bytes, i.e.
    /// `2^(12 + MPSMIN)`.
    pub fn mpsmin_sz(&self) -> usize {
        let shift = 12 + self.mpsmin();
        1usize << shift
    }
}

bitstruct! {
    /// Representation of the Controller Configuration (CC) register.
    ///
    /// The `css`, `ams` and `shn` fields are decoded into the
    /// [`IOCommandSet`], [`ArbitrationMechanism`] and
    /// [`ShutdownNotification`] enums respectively.
    ///
    /// See NVMe 1.0e Section 3.1.5 Offset 14h: CC - Controller Configuration
    #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
    pub struct Configuration(pub u32) {
        /// Enable (EN)
        ///
        /// When set to 1, the controller shall begin to process commands based
        /// on Submission Queue Tail Doorbell writes. When cleared to 0, the
        /// controller shall not process commands nor post completion queue
        /// entries. Transitioning from 1 to 0 indicates a Controller Reset.
        pub enabled: bool = 0;

        /// Reserved
        reserved1: u8 = 1..4;

        /// I/O Command Set Selected (CSS)
        ///
        /// The I/O Command Set selected by the host. Must be a supported
        /// command set as indicated by CAP.CSS. This field shall only be
        /// changed when the controller is disabled.
        pub css: IOCommandSet = 4..7;

        /// Memory Page Size (MPS)
        ///
        /// The host memory page size, respecting CAP.MPSMIN/MAX.
        /// Specified as 2^(12 + MPS) bytes.
        pub mps: u8 = 7..11;

        /// Arbitration Mechanism Selected (AMS)
        ///
        /// The Arbitration Mechanism selected by the host. Must be a supported
        /// mechanism as indicated by CAP.AMS. This field shall only be changed
        /// when the controller is disabled.
        pub ams: ArbitrationMechanism = 11..14;

        /// Shutdown Notification (SHN)
        ///
        /// Host writes to this field to indicate shutdown processing.
        pub shn: ShutdownNotification = 14..16;

        /// I/O Submission Queue Entry Size (IOSQES)
        ///
        /// Defines the I/O Submission Queue Entry size.
        /// Specified as 2^IOSQES bytes.
        pub iosqes: u8 = 16..20;

        /// I/O Completion Queue Entry Size (IOCQES)
        ///
        /// Defines the I/O Completion Queue Entry size.
        /// Specified as 2^IOCQES bytes.
        pub iocqes: u8 = 20..24;

        /// Reserved
        reserved2: u8 = 24..32;
    }
}

/// Selected I/O Command Set
///
/// Decoded form of the 3-bit CSS field of the [`Configuration`] register.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum IOCommandSet {
    /// NVM Command Set (raw value `0b000`).
    Nvm,
    /// Any other raw value (`0b001..=0b111`), carried through unchanged.
    Reserved(u8),
}

impl bitstruct::FromRaw<u8, IOCommandSet> for Configuration {
    /// Decode the raw CC.CSS bits.
    ///
    /// Panics (via `unreachable!`) if `raw` does not fit in three bits.
    fn from_raw(raw: u8) -> IOCommandSet {
        if raw > 0b111 {
            unreachable!();
        }

        if raw == 0b000 {
            IOCommandSet::Nvm
        } else {
            IOCommandSet::Reserved(raw)
        }
    }
}

impl bitstruct::IntoRaw<u8, IOCommandSet> for Configuration {
    fn into_raw(target: IOCommandSet) -> u8 {
        match target {
            IOCommandSet::Nvm => 0b000,
            IOCommandSet::Reserved(raw) => raw,
        }
    }
}

/// Arbitration Mechanisms
///
/// Decoded form of the 3-bit AMS field of the [`Configuration`] register.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ArbitrationMechanism {
    /// Round Robin (raw value `0b000`).
    RoundRobin,
    /// Weighted Round Robin with Urgent (raw value `0b001`).
    WeightedRoundRobinWithUrgent,
    /// Reserved raw value (`0b010..=0b110`), carried through unchanged.
    Reserved(u8),
    /// Vendor specific (raw value `0b111`).
    Vendor,
}

impl bitstruct::FromRaw<u8, ArbitrationMechanism> for Configuration {
    fn from_raw(raw: u8) -> ArbitrationMechanism {
        match raw {
            0b000 => ArbitrationMechanism::RoundRobin,
            0b001 => ArbitrationMechanism::WeightedRoundRobinWithUrgent,
            0b010..=0b110 => ArbitrationMechanism::Reserved(raw),
            0b111 => ArbitrationMechanism::Vendor,
            _ => unreachable!(),
        }
    }
}

impl bitstruct::IntoRaw<u8, ArbitrationMechanism> for Configuration {
    fn into_raw(target: ArbitrationMechanism) -> u8 {
        match target {
            ArbitrationMechanism::RoundRobin => 0b000,
            ArbitrationMechanism::WeightedRoundRobinWithUrgent => 0b001,
            ArbitrationMechanism::Reserved(raw) => raw,
            ArbitrationMechanism::Vendor => 0b111,
        }
    }
}

/// Shutdown Notification Values
///
/// Decoded form of the 2-bit SHN field of the [`Configuration`] register.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ShutdownNotification {
    /// No notification (raw value `0b00`).
    None,
    /// Normal shutdown notification (raw value `0b01`).
    Normal,
    /// Abrupt shutdown notification (raw value `0b10`).
    Abrupt,
    /// Reserved (raw value `0b11`).
    Reserved,
}

impl bitstruct::FromRaw<u8, ShutdownNotification> for Configuration {
    /// Decode the raw CC.SHN bits.
    ///
    /// Panics (via `unreachable!`) if `raw` does not fit in two bits.
    fn from_raw(raw: u8) -> ShutdownNotification {
        // Indexed by the raw field value.
        const DECODE: [ShutdownNotification; 4] = [
            ShutdownNotification::None,
            ShutdownNotification::Normal,
            ShutdownNotification::Abrupt,
            ShutdownNotification::Reserved,
        ];

        match DECODE.get(usize::from(raw)) {
            Some(&shn) => shn,
            Option::None => unreachable!(),
        }
    }
}

impl bitstruct::IntoRaw<u8, ShutdownNotification> for Configuration {
    fn into_raw(target: ShutdownNotification) -> u8 {
        match target {
            ShutdownNotification::None => 0b00,
            ShutdownNotification::Normal => 0b01,
            ShutdownNotification::Abrupt => 0b10,
            ShutdownNotification::Reserved => 0b11,
        }
    }
}

bitstruct! {
    /// Representation of the Controller Status (CSTS) register.
    ///
    /// The `shst` field is decoded into the [`ShutdownStatus`] enum.
    ///
    /// See NVMe 1.0e Section 3.1.6 Offset 1Ch: CSTS - Controller Status
    #[derive(Clone, Copy, Debug, Default)]
    pub struct Status(pub u32) {
        /// Ready (RDY)
        ///
        /// Controller sets this field to 1 to indicate it is ready to accept
        /// Submission Queue Tail Doorbell writes.
        pub ready: bool = 0;

        /// Controller Fatal Status (CFS)
        ///
        /// Controller sets this field to 1 when a fatal error occurs.
        pub cfs: bool = 1;

        /// Shutdown Status (SHST)
        ///
        /// Indicates the current shutdown processing state.
        pub shst: ShutdownStatus = 2..4;

        /// Reserved
        reserved: u32 = 4..32;
    }
}

/// Shutdown Status
///
/// Decoded form of the 2-bit SHST field of the [`Status`] register.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ShutdownStatus {
    /// Normal operation (raw value `0b00`).
    Normal,
    /// Shutdown processing in progress (raw value `0b01`).
    Processing,
    /// Shutdown processing complete (raw value `0b10`).
    Complete,
    /// Reserved (raw value `0b11`).
    Reserved,
}

impl bitstruct::FromRaw<u8, ShutdownStatus> for Status {
    fn from_raw(raw: u8) -> ShutdownStatus {
        match raw {
            0b00 => ShutdownStatus::Normal,
            0b01 => ShutdownStatus::Processing,
            0b10 => ShutdownStatus::Complete,
            0b11 => ShutdownStatus::Reserved,
            _ => unreachable!(),
        }
    }
}

impl bitstruct::IntoRaw<u8, ShutdownStatus> for Status {
    fn into_raw(target: ShutdownStatus) -> u8 {
        match target {
            ShutdownStatus::Normal => 0b00,
            ShutdownStatus::Processing => 0b01,
            ShutdownStatus::Complete => 0b10,
            ShutdownStatus::Reserved => 0b11,
        }
    }
}

bitstruct! {
    /// Representation of the Admin Queue Attributes (AQA) register.
    ///
    /// Both size fields are 12 bits wide and 0's based.
    ///
    /// See NVMe 1.0e Section 3.1.7 Offset 24h: AQA - Admin Queue Attributes
    #[derive(Clone, Copy, Debug, Default)]
    pub struct AdminQueueAttrs(pub u32) {
        /// Admin Submission Queue Size (ASQS)
        ///
        /// Defines the size of the Admin Submission Queue as a 0's
        /// based value.
        pub asqs: u16 = 0..12;

        /// Reserved
        reserved1: u8 = 12..16;

        /// Admin Completion Queue Size (ACQS)
        ///
        /// Defines the size of the Admin Completion Queue as a 0's
        /// based value.
        pub acqs: u16 = 16..28;

        /// Reserved
        reserved2: u8 = 28..32;
    }
}

// Version definitions

/// Controller Version NVM Express 1.0
///
/// Bits 31:16  Major Version Number (MJR) = "1"
/// Bits 15:00  Minor Version Number (MNR) = "0"
///
/// That is, major version 1 in the upper half-word and minor version 0 in
/// the lower half-word.
///
/// See NVMe 1.0e Section 3.1.2 Offset 08h: VS - Version
pub const NVME_VER_1_0: u32 = 0x00010000;

// Admin Command Opcodes
// See NVMe 1.0e Section 5, Figure 25 Opcodes for Admin Commands

/// Delete I/O Submission Queue Command Opcode
pub const ADMIN_OPC_DELETE_IO_SQ: u8 = 0x00;
/// Create I/O Submission Queue Command Opcode
pub const ADMIN_OPC_CREATE_IO_SQ: u8 = 0x01;
/// Get Log Page Command Opcode
pub const ADMIN_OPC_GET_LOG_PAGE: u8 = 0x02;
/// Delete I/O Completion Queue Command Opcode
pub const ADMIN_OPC_DELETE_IO_CQ: u8 = 0x04;
/// Create I/O Completion Queue Command Opcode
pub const ADMIN_OPC_CREATE_IO_CQ: u8 = 0x05;
/// Identify Command Opcode
pub const ADMIN_OPC_IDENTIFY: u8 = 0x06;
/// Abort Command Opcode
pub const ADMIN_OPC_ABORT: u8 = 0x08;
/// Set Features Command Opcode
pub const ADMIN_OPC_SET_FEATURES: u8 = 0x09;
/// Get Features Command Opcode
pub const ADMIN_OPC_GET_FEATURES: u8 = 0x0A;
/// Asynchronous Event Request Command Opcode
pub const ADMIN_OPC_ASYNC_EVENT_REQ: u8 = 0x0c;
/// Doorbell Buffer Config Command Opcode
///
/// NOTE(review): this opcode does not appear to be part of the NVMe 1.0e
/// figure cited above; presumably it comes from a later revision of the
/// specification - confirm and cite the right section.
pub const ADMIN_OPC_DOORBELL_BUF_CFG: u8 = 0x7c;

// NVM Command Opcodes
// See NVMe 1.0e Section 6, Figure 99 Opcodes for NVM Commands

/// Flush Command Opcode
pub const NVM_OPC_FLUSH: u8 = 0x00;
/// Write Command Opcode
pub const NVM_OPC_WRITE: u8 = 0x01;
/// Read Command Opcode
pub const NVM_OPC_READ: u8 = 0x02;
/// Dataset Management Command Opcode
pub const NVM_OPC_DATASET_MANAGEMENT: u8 = 0x09;

// Generic Command Status values
// See NVMe 1.0e Section 4.5.1.2.1, Figure 17 Status Code - Generic Command Status Values

/// Successful Completion
///
/// The command completed successfully.
pub const STS_SUCCESS: u8 = 0x0;

/// Invalid Command Opcode
///
/// The associated command opcode field is not valid.
pub const STS_INVAL_OPC: u8 = 0x1;

/// Invalid Field in Command
///
/// An invalid field was specified in the command parameters.
pub const STS_INVAL_FIELD: u8 = 0x2;

/// Command ID Conflict
///
/// The command identifier is already in use.
pub const STS_CID_CONFLICT: u8 = 0x3;

/// Data Transfer Error
///
/// Transferring the data or metadata associated with a command had an error.
pub const STS_DATA_XFER_ERR: u8 = 0x4;

/// Commands Aborted due to Power Loss Notification
///
/// Indicates that the command was aborted due to a power loss notification.
pub const STS_PWR_LOSS_ABRT: u8 = 0x5;

/// Internal Device Error
///
/// The command was not completed successfully due to an internal device error.
pub const STS_INTERNAL_ERR: u8 = 0x6;

/// Command Abort Requested
///
/// The command was aborted due to a Command Abort command.
pub const STS_ABORT_REQ: u8 = 0x7;

/// Command Aborted due to SQ Deletion
///
/// The command was aborted due to a Delete I/O Submission Queue request.
pub const STS_ABORT_SQ_DEL: u8 = 0x8;

/// Command Aborted due to Failed Fused Command
///
/// The command was aborted due to the other command in a fused command failing.
pub const STS_FAILED_FUSED: u8 = 0x9;

/// Command Aborted due to Missing Fused Command
///
/// The command was aborted due to the companion fused command not being found.
pub const STS_MISSING_FUSED: u8 = 0xA;

/// Invalid Namespace or Format
///
/// The namespace or the format of that namespace is invalid.
pub const STS_INVALID_NS: u8 = 0xB;

/// Command Sequence Error
///
/// The command was aborted due to a protocol violation in a multi-command sequence.
pub const STS_COMMAND_SEQ_ERR: u8 = 0xC;

// Command Specific Status values
// See NVMe 1.0e Section 4.5.1.2.2, Figure 19 Status Code - Command Specific Status Values
//
// These codes are scoped to the command that produced them, which is why
// some numeric values below are shared between constants (for example the
// queue-creation and queue-deletion "Invalid Queue Identifier" codes) and
// why they overlap with the generic status values.

/// Completion Queue Invalid
pub const STS_CREATE_IO_Q_INVAL_CQ: u8 = 0x0;

/// Invalid Queue Identifier (Queue Creation)
pub const STS_CREATE_IO_Q_INVAL_QID: u8 = 0x1;

/// Invalid Queue Size (Queue Creation)
pub const STS_CREATE_IO_Q_INVAL_QSIZE: u8 = 0x2;

/// Invalid Interrupt Vector (Queue Creation)
pub const STS_CREATE_IO_Q_INVAL_INT_VEC: u8 = 0x8;

/// Invalid Queue Identifier (Queue Deletion)
pub const STS_DELETE_IO_Q_INVAL_QID: u8 = 0x1;

/// Invalid Queue Deletion
pub const STS_DELETE_IO_Q_INVAL_Q_DELETION: u8 = 0xC;

// NVM Command Specific Status values
// See NVMe 1.0e Section 4.5.1.2.2, Figure 20 Status Code - Command Specific Status Values, NVM Command Set

/// Conflicting Attributes
pub const STS_READ_CONFLICTING_ATTRS: u8 = 0x80;

/// Invalid Protection Information
pub const STS_READ_INVALID_PROT_INFO: u8 = 0x81;

/// Attempted Write to Read Only Range
pub const STS_WRITE_READ_ONLY_RANGE: u8 = 0x82;

// Feature identifiers
// See NVMe 1.0e Section 5.12.1, Figure 73 Set Features - Feature Identifiers

/// Arbitration
///
/// See NVMe 1.0e Section 5.12.1.1 Arbitration (Feature Identifier 01h)
pub const FEAT_ID_ARBITRATION: u8 = 0x01;

/// Power Management
///
/// See NVMe 1.0e Section 5.12.1.2 Power Management (Feature Identifier 02h)
pub const FEAT_ID_POWER_MGMT: u8 = 0x02;

/// LBA Range Type
///
/// See NVMe 1.0e Section 5.12.1.3 LBA Range Type (Feature Identifier 03h)
pub const FEAT_ID_LBA_RANGE_TYPE: u8 = 0x03;

/// Temperature Threshold
///
/// See NVMe 1.0e Section 5.12.1.4 Temperature Threshold (Feature Identifier 04h)
pub const FEAT_ID_TEMP_THRESH: u8 = 0x04;

/// Error Recovery
///
/// See NVMe 1.0e Section 5.12.1.5 Error Recovery (Feature Identifier 05h)
pub const FEAT_ID_ERROR_RECOVERY: u8 = 0x05;

/// Volatile Write Cache
///
/// See NVMe 1.0e Section 5.12.1.6 Volatile Write Cache (Feature Identifier 06h)
pub const FEAT_ID_VOLATILE_WRITE_CACHE: u8 = 0x06;

/// Number of Queues
///
/// See NVMe 1.0e Section 5.12.1.7 Number of Queues (Feature Identifier 07h)
pub const FEAT_ID_NUM_QUEUES: u8 = 0x07;

/// Interrupt Coalescing
///
/// See NVMe 1.0e Section 5.12.1.8 Interrupt Coalescing (Feature Identifier 08h)
pub const FEAT_ID_INTR_COALESCE: u8 = 0x08;

/// Interrupt Vector Configuration
///
/// See NVMe 1.0e Section 5.12.1.9 Interrupt Vector Configuration (Feature Identifier 09h)
pub const FEAT_ID_INTR_VEC_CFG: u8 = 0x09;

/// Write Atomicity
///
/// See NVMe 1.0e Section 5.12.1.10 Write Atomicity (Feature Identifier 0Ah)
pub const FEAT_ID_WRITE_ATOMIC: u8 = 0x0A;

/// Asynchronous Event Configuration
///
/// See NVMe 1.0e Section 5.12.1.11 Asynchronous Event Configuration (Feature Identifier 0Bh)
pub const FEAT_ID_ASYNC_EVENT_CFG: u8 = 0x0B;

/// Oxide-specific feature.
///
/// This identifier is not one of the standard feature identifiers listed
/// above.
///
/// Provides device-specific features beyond the standard NVMe capabilities as
/// a single Dword result:
///   Bit 0 [ReadOnly] - If set, the device will complete all writes with
///                      STS_WRITE_READ_ONLY_RANGE.
///   Bits 31-1        - Reserved.
pub const FEAT_ID_OXIDE_DEVICE_FEATURES: u8 = 0xF0;

// Identify CNS values
// These select which data structure an Identify command returns.

/// Identify - Namespace Structure
///
/// Return the Identify Namespace data structure in response to Identify command.
/// See NVMe 1.0e Section 5.11
pub const IDENT_CNS_NAMESPACE: u8 = 0x0;

/// Identify - Controller Structure
///
/// Return the Identify Controller data structure in response to Identify command.
/// See NVMe 1.0e Section 5.11
pub const IDENT_CNS_CONTROLLER: u8 = 0x1;

/// The type of value specified in the Status Field (SF) of a command completion.
///
/// The enum is `repr(u8)` with explicit discriminants, so each variant
/// converts to its raw value with `as u8`.
///
/// See NVMe 1.0e Section 4.5.1.1 Status Code Type (SCT)
#[derive(Copy, Clone, Debug, PartialEq)]
#[repr(u8)]
pub enum StatusCodeType {
    /// Generic Command Status (0h)
    Generic = 0,
    /// Command Specific Status (1h)
    CmdSpecific = 1,
    /// Media and Data Integrity Errors (2h)
    MediaDataIntegrity = 2,
    /// Vendor Specific (7h)
    VendorSpecific = 7,
}

/// Power State Descriptor (PSD) Data Structure
///
/// Describes the characteristics of a specific power state.
///
/// The struct is `repr(C, packed(1))` and its fields add up to 32 bytes
/// (256 bits), matching the bit positions quoted on the fields below.
///
/// See NVMe 1.0e Section 5.11, Figure 67 Identify - Power State Descriptor Data Structure
#[derive(Default, Copy, Clone, IntoBytes)]
#[repr(C, packed(1))]
pub struct PowerStateDescriptor {
    /// Maximum Power (MP)
    ///
    /// Maximum power consumed by NVM subsystem in this power state.
    /// The value multiplied by 0.01 is equal to the power in Watts.
    pub mp: u16,

    /// Reserved - Bits 31:16
    pub _resv1: u16,

    /// Entry Latency (ENLAT)
    ///
    /// The maximum entry latency in microseconds.
    pub enlat: u32,

    /// Exit Latency (EXLAT)
    ///
    /// The maximum exit latency in microseconds.
    pub exlat: u32,

    /// Relative Read Throughput (RRT)
    ///
    /// Must be less than the number of supported power states.
    ///
    /// Top 3 bits are reserved - Bits 103:101
    pub rrt: u8,

    /// Relative Read Latency (RRL)
    ///
    /// Must be less than the number of supported power states.
    ///
    /// Top 3 bits are reserved - Bits 111:109
    pub rrl: u8,

    /// Relative Write Throughput (RWT)
    ///
    /// Must be less than the number of supported power states.
    ///
    /// Top 3 bits are reserved - Bits 119:117
    pub rwt: u8,

    /// Relative Write Latency (RWL)
    ///
    /// Must be less than the number of supported power states.
    ///
    /// Top 3 bits are reserved - Bits 127:125
    pub rwl: u8,

    /// Reserved - Bits 255:128
    pub _resv: [u8; 16],
}

bitstruct! {
    /// Queue Entry Size Required & Maximum (both Completion & Submission)
    ///
    /// Defines the required and maximum Queue entry sizes when using the NVM Command Set.
    /// Both values are packed into a single byte as two 4-bit exponents.
    #[derive(Copy, Clone, IntoBytes)]
    pub struct NvmQueueEntrySize(pub u8) {
        /// The required (minimum) Queue Entry Size.
        ///
        /// Specified as 2^required bytes. It shall be 6 (64 bytes).
        pub required: u8 = 0..4;

        /// The maximum Queue Entry Size and is >= the required size.
        ///
        /// Specified as 2^maximum bytes.
        /// The recommended maximum is 6 (64 bytes) for standard NVM Command Set.
        /// Controllers with proprietary extensions may support a larger size.
        pub maximum: u8 = 4..8;
    }
}

/// Identify Controller Data Structure
///
/// Describes the characteristics of the controller.
///
/// The structure is byte-packed (`repr(C, packed(1))`), so the field order
/// and sizes below define the exact layout; the byte ranges in the comments
/// add up to 4096 bytes, which the `entry_sizing` test asserts.
///
/// See NVMe 1.0e Section 5.11, Figure 66 Identify - Identify Controller Data Structure
#[derive(Copy, Clone, IntoBytes)]
#[repr(C, packed(1))]
pub struct IdentifyController {
    // bytes 0-255 - Controller Capabilities and Features
    /// PCI Vendor ID (VID)
    ///
    /// Same value reported in ID register.
    /// See NVMe 1.0e Section 2.1.1 Offset 00h: ID - Identifiers
    pub vid: u16,
    /// PCI Subsystem Vendor ID (SSVID)
    ///
    /// Same value reported in SS register.
    /// See NVMe 1.0e Section 2.1.17 Offset 2Ch: SS - Sub System Identifiers
    pub ssvid: u16,
    /// Serial Number (SN)
    ///
    /// See NVMe 1.0e Section 7.7 Unique Identifier
    pub sn: [u8; 20],
    /// Model Number (MN)
    ///
    /// See NVMe 1.0e Section 7.7 Unique Identifier
    pub mn: [u8; 40],
    /// Firmware Revision (FR)
    ///
    /// Same revision information returned via Get Log Page command.
    /// See NVMe 1.0e Section 5.10.1.3 Firmware Slot Information (Log Identifier 03h)
    pub fr: [u8; 8],
    /// Recommended Arbitration Burst (RAB)
    ///
    /// See NVMe 1.0e Section 4.7 Command Arbitration
    pub rab: u8,
    /// IEEE OUI Identifier (IEEE)
    pub ieee: [u8; 3],
    /// Multi-Interface Capabilities (MIC)
    ///
    /// Whether there are multiple physical PCIe interfaces and associated capabilities.
    /// Bits 7:1 are optional.
    pub cmic: u8,
    /// Maximum Data Transfer Size (MDTS)
    ///
    /// The host (VM) should not submit a command that exceeds this transfer size.
    /// The value is in units of the minimum memory page size (CAP.MPSMIN) and is
    /// reported as a power of two (2^n). A value of 0h indicates no restrictions on
    /// transfer size. The restriction includes interleaved metadata.
    pub mdts: u8,
    /// Reserved - Bytes 255:78
    pub _resv1: [u8; 178],

    // bytes 256-511 - Admin Command Set Attributes & Optional Controller Capabilities
    /// Optional Admin Command Support (OACS)
    ///
    /// Bits 15:3 are reserved.
    /// Bit 2 indicates Firmware Activate & Download command support.
    /// Bit 1 indicates Format NVM command support.
    /// Bit 0 indicates Security Send/Receive command support.
    pub oacs: u16,
    /// Abort Command Limit (ACL)
    ///
    /// Maximum number of concurrently outstanding Abort commands supported.
    /// This is a 0's based value.
    /// See NVMe 1.0e Section 5.1 Abort command
    pub acl: u8,
    /// Asynchronous Event Request Limit (AERL)
    ///
    /// Maximum number of concurrently outstanding Asynchronous Event Request commands
    /// supported.
    /// This is a 0's based value.
    /// See NVMe 1.0e Section 5.2 Asynchronous Event Request command
    pub aerl: u8,
    /// Firmware Updates (FRMW)
    ///
    /// Bits 7:4 are reserved.
    /// Bits 3:1 indicate number of firmware slots device supports (between 1-7, inclusive)
    /// Bit 0 indicates if the first firmware slot (slot 1) is read-only.
    /// See NVMe 1.0e Section 8.1 Firmware Update Process
    pub frmw: u8,
    /// Log Page Attributes (LPA)
    ///
    /// Bits 7:1 are reserved.
    /// Bit 0 indicates per-namespace SMART/Health information log support.
    pub lpa: u8,
    /// Error Log Page Entries (ELPE)
    ///
    /// Number of Error Information log entries that are stored by the controller.
    /// This is a 0's based value.
    pub elpe: u8,
    /// Number of Power States Support (NPSS)
    ///
    /// Number of NVMe power states supported by the controller (up to 32 total).
    /// This is a 0's based value, i.e. 1 minimum.
    /// See NVMe 1.0e Section 8.4 Power Management
    pub npss: u8,
    /// Admin Vendor Specific Command Configuration (AVSCC)
    ///
    /// Bits 7:1 are reserved.
    /// Bit 0 indicates that all Admin Vendor Specific Commands use format in Figure 8.
    /// See NVMe 1.0e Section 4.2, Figure 8 Command Format - Admin and NVM Vendor Specific Commands (Optional)
    pub avscc: u8,
    /// Reserved - Bytes 511:265
    pub _resv2: [u8; 247],

    // bytes 512-2047 - NVM Command Set Attributes
    /// Submission Queue Entry Size (SQES)
    ///
    /// Defines the required and maximum Submission Queue entry sizes when using the NVM Command Set.
    /// Bits 7:4 define the maximum SQES and is >= the required SQES.
    /// Bits 3:0 define the required (minimum) SQES. It shall be 6 (64 bytes).
    ///
    /// The recommended maximum SQES is 6 (64 bytes) for standard NVM Command Set.
    /// Controllers with proprietary extensions may support a larger size.
    /// Both the required and maximum SQES values are in bytes and reported as powers of two (2^n).
    pub sqes: NvmQueueEntrySize,
    /// Completion Queue Entry Size (CQES)
    ///
    /// Defines the required and maximum Completion Queue entry sizes when using the NVM Command Set.
    /// Bits 7:4 define the maximum CQES and is >= the required CQES.
    /// Bits 3:0 define the required (minimum) CQES. It shall be 4 (16 bytes).
    ///
    /// The recommended maximum CQES is 4 (16 bytes) for standard NVM Command Set.
    /// Controllers with proprietary extensions may support a larger size.
    /// Both the required and maximum CQES values are in bytes and reported as powers of two (2^n).
    pub cqes: NvmQueueEntrySize,
    /// Reserved - Bytes 515:514
    pub _resv3: [u8; 2],
    /// Number of Namespaces (NN)
    ///
    /// The number of valid namespaces present for the controller. Namespaces shall start
    /// with namespace ID 1 and be packed sequentially.
    pub nn: u32,
    /// Optional NVM Command Support (ONCS)
    ///
    /// Bits 15:3 are reserved.
    /// Bit 2 indicates Dataset Management command support.
    /// Bit 1 indicates Write Uncorrectable command support.
    /// Bit 0 indicates Compare command support.
    pub oncs: u16,
    /// Fused Operation Support (FUSES)
    ///
    /// Bits 15:1 are reserved.
    /// Bit 0 indicates Compare and Write fused operation support.
    pub fuses: u16,
    /// Format NVM Attributes (FNA)
    ///
    /// Bits 7:3 are reserved.
    /// Bit 2 indicates cryptographic erase support.
    /// Bit 1 indicates whether secure erase is a global (1) or per-namespace (0) operation.
    /// Bit 0 indicates whether format is a global (1) or per-namespace (0) operation.
    pub fna: u8,
    /// Volatile Write Cache (VWC)
    ///
    /// Bits 7:1 are reserved.
    /// Bit 0 indicates whether a volatile write cache is present.
    pub vwc: u8,
    /// Atomic Write Unit Normal (AWUN)
    ///
    /// Indicates the atomic write size for the controller during normal operation.
    /// This field is specified in logical blocks and is a 0's based value.
    pub awun: u16,
    /// Atomic Write Unit Power Fail (AWUPF)
    ///
    /// Indicates the atomic write size for the controller during a power fail condition.
    /// This field is specified in logical blocks and is a 0's based value.
    pub awupf: u16,
    /// NVM Vendor Specific Command Configuration (NVSCC)
    ///
    /// Bits 7:1 are reserved.
    /// Bit 0 indicates that all NVM Vendor Specific Commands use format in Figure 8.
    /// See NVMe 1.0e Section 4.2, Figure 8 Command Format - Admin and NVM Vendor Specific Commands (Optional)
    /// See NVMe 1.0e Section 8.7 Standard Vendor Specific Command Format
    pub nvscc: u8,
    /// Reserved - Bytes 703:531
    pub _resv4: [u8; 173],
    /// Reserved (I/O Command Set Attributes) - Bytes 2047:704
    pub _resv5: [u8; 1344],

    // bytes 2048-3071 - Power State Descriptors
    /// Power State Descriptors (PSD0-PSD31)
    pub psd: [PowerStateDescriptor; 32],

    // bytes 3072-4095 - Vendor Specific
    /// Vendor Specific (VS)
    pub vs: [u8; 1024],
}

// `Default` has to be written by hand: the standard library does not yet
// implement it for arrays `[T; N]` with N > 32 (rust #61415).
impl Default for IdentifyController {
    /// Builds an Identify Controller structure in which every integer and
    /// byte-array field (reserved regions included) is zero and every power
    /// state descriptor is `PowerStateDescriptor::default()`.
    fn default() -> Self {
        Self {
            // bytes 0-255 - Controller Capabilities and Features
            vid: 0,
            ssvid: 0,
            sn: [0; 20],
            mn: [0; 40],
            fr: [0; 8],
            rab: 0,
            ieee: [0; 3],
            cmic: 0,
            mdts: 0,
            _resv1: [0; 178],

            // bytes 256-511 - Admin Command Set Attributes
            oacs: 0,
            acl: 0,
            aerl: 0,
            frmw: 0,
            lpa: 0,
            elpe: 0,
            npss: 0,
            avscc: 0,
            _resv2: [0; 247],

            // bytes 512-2047 - NVM Command Set Attributes
            sqes: NvmQueueEntrySize(0),
            cqes: NvmQueueEntrySize(0),
            _resv3: [0; 2],
            nn: 0,
            oncs: 0,
            fuses: 0,
            fna: 0,
            vwc: 0,
            awun: 0,
            awupf: 0,
            nvscc: 0,
            _resv4: [0; 173],
            _resv5: [0; 1344],

            // bytes 2048-3071 - Power State Descriptors
            psd: [PowerStateDescriptor::default(); 32],

            // bytes 3072-4095 - Vendor Specific
            vs: [0; 1024],
        }
    }
}

/// LBA Format Data Structure
///
/// Describes a specific Logical Block Address (LBA) format.
/// The structure is byte-packed and occupies 4 bytes (asserted by the
/// `entry_sizing` test).
/// See NVMe 1.0e Section 5.11, Figure 69 Identify - LBA Format Data Structure, NVM Command Set Specific
#[derive(Default, Copy, Clone, IntoBytes)]
#[repr(C, packed(1))]
pub struct LbaFormat {
    /// Metadata Size (MS)
    ///
    /// The number of metadata bytes provided per LBA.
    pub ms: u16,
    /// LBA Data Size (LBADS)
    ///
    /// The supported LBA data size, reported as a power of two (2^n).
    /// The minimum required value is 9 (512 bytes).
    /// If the value is 0h, then the LBA format is not supported.
    pub lbads: u8,
    /// Relative Performance (RP)
    ///
    /// Bits 7:2 are reserved.
    /// Bits 1:0 indicate the performance of this LBA format relative to other supported LBA formats.
    ///     00b = Best Performance
    ///     01b = Better Performance
    ///     10b = Good Performance
    ///     11b = Degraded Performance
    pub rp: u8,
}

/// Identify Namespace Data Structure
///
/// Describes the characteristics of a namespace.
///
/// The structure is byte-packed (`repr(C, packed(1))`), so the field order
/// and sizes below define the exact layout; they add up to 4096 bytes, which
/// the `entry_sizing` test asserts.
///
/// See NVMe 1.0e Section 5.11, Figure 68 Identify - Identify Namespace Data Structure, NVM Command Set Specific
#[derive(Copy, Clone, IntoBytes)]
#[repr(C, packed(1))]
pub struct IdentifyNamespace {
    /// Namespace Size (NSZE)
    ///
    /// The total size of the namespace in logical blocks. A namespace of size
    /// n consists of Logical Block Addresses (LBA) 0 through n-1.
    pub nsze: u64,
    /// Namespace Capacity (NCAP)
    ///
    /// NCAP <= NSZE.
    /// The maximum number of logical blocks that may be allocated in the namespace
    /// at any point in time. This field is used in the case of thin provisioning.
    /// A logical block is allocated when written with a Write (Uncorrectable) command.
    /// A value of 0h indicates that the namespace is not available for use.
    pub ncap: u64,
    /// Namespace Utilization (NUSE)
    ///
    /// NUSE <= NCAP
    /// The current number of logical blocks allocated in the namespace.
    pub nuse: u64,
    /// Namespace Features (NSFEAT)
    ///
    /// Bits 7:1 are reserved.
    /// Bit 0 indicates whether the namespace supports thin provisioning.
    pub nsfeat: u8,
    /// Number of LBA Formats (NLBAF)
    ///
    /// The number of supported LBA data size and metadata size combinations.
    /// LBA formats shall be allocated (starting with 0) and packed sequentially.
    /// This is a 0's based value.
    /// The maximum number of LBA formats that may be indicated as supported is 16.
    /// The supported LBA formats are defined in the `lbaf` field.
    pub nlbaf: u8,
    /// Formatted LBA Size (FLBAS)
    ///
    /// Bits 7:5 are reserved.
    /// Bit 4 indicates that the metadata is transferred at the end of the data LBA.
    /// Bits 3:0 indicate one of the 16 supported combinations in `lbaf`.
    pub flbas: u8,
    /// Metadata Capabilities (MC)
    ///
    /// Bits 7:2 are reserved.
    /// Bit 1 indicates whether namespace supports transferring metadata in a separate buffer.
    /// Bit 0 indicates whether namespace supports transferring metadata as part of extended data LBA.
    pub mc: u8,
    /// End-to-end Data Protection Capabilities (DPC)
    ///
    /// Bits 7:5 are reserved.
    /// Bit 4 indicates whether namespace supports protection information in last 8 bytes of metadata.
    /// Bit 3 indicates whether namespace supports protection information in first 8 bytes of metadata.
    /// Bit 2 indicates whether namespace supports Protection Information Type 3.
    /// Bit 1 indicates whether namespace supports Protection Information Type 2.
    /// Bit 0 indicates whether namespace supports Protection Information Type 1.
    /// See NVMe 1.0e Section 8.3 End-to-end Data Protection (Optional)
    pub dpc: u8,
    /// End-to-end Data Protection Type Settings (DPS)
    ///
    /// Bits 7:4 are reserved.
    /// Bit 3 indicates that the protection information, if enabled, is transferred as first 8 bytes of metadata (1) or last 8 bytes (0).
    /// Bits 2:0 indicate whether Protection Information is enabled and the type.
    ///     000b = Protection information is not enabled
    ///     001b = Protection information is enabled, Type 1
    ///     010b = Protection information is enabled, Type 2
    ///     011b = Protection information is enabled, Type 3
    ///     100b-111b = Reserved
    /// See NVMe 1.0e Section 8.3 End-to-end Data Protection (Optional)
    pub dps: u8,
    /// Reserved - Bytes 127:30
    pub _resv1: [u8; 98],
    /// LBA Formats (LBAF0-LBAF15) - Bytes 191:128
    ///
    /// The list of supported LBA formats.
    pub lbaf: [LbaFormat; 16],
    /// Reserved - Bytes 383:192
    pub _resv2: [u8; 192],
    /// Vendor Specific (VS) - Bytes 4095:384
    pub vs: [u8; 3712],
}

// `Default` has to be written by hand: the standard library does not yet
// implement it for arrays `[T; N]` with N > 32 (rust #61415).
impl Default for IdentifyNamespace {
    /// Builds an Identify Namespace structure in which every integer and
    /// byte-array field (reserved regions included) is zero and every LBA
    /// format entry is `LbaFormat::default()`.
    fn default() -> Self {
        Self {
            // bytes 0-29 - sizes, features and protection settings
            nsze: 0,
            ncap: 0,
            nuse: 0,
            nsfeat: 0,
            nlbaf: 0,
            flbas: 0,
            mc: 0,
            dpc: 0,
            dps: 0,
            _resv1: [0; 98],

            // bytes 128-191 - LBA formats
            lbaf: [LbaFormat::default(); 16],
            _resv2: [0; 192],

            // bytes 384-4095 - Vendor Specific
            vs: [0; 3712],
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use std::mem::size_of;

    /// Pins the in-memory size of each structure checked below, in bytes.
    #[test]
    fn entry_sizing() {
        // (type name, actual size, expected size)
        let sizes: [(&str, usize, usize); 7] = [
            ("SubmissionQueueEntry", size_of::<SubmissionQueueEntry>(), 64),
            ("CompletionQueueEntry", size_of::<CompletionQueueEntry>(), 16),
            ("PowerStateDescriptor", size_of::<PowerStateDescriptor>(), 32),
            ("IdentifyController", size_of::<IdentifyController>(), 4096),
            ("LbaFormat", size_of::<LbaFormat>(), 4),
            ("IdentifyNamespace", size_of::<IdentifyNamespace>(), 4096),
            (
                "DatasetManagementRangeDefinition",
                size_of::<DatasetManagementRangeDefinition>(),
                16,
            ),
        ];

        for &(name, actual, expected) in sizes.iter() {
            assert_eq!(actual, expected, "unexpected size for {}", name);
        }
    }
}


================================================
FILE: lib/propolis/src/hw/nvme/cmds.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::bits::{
    self, DatasetManagementRangeDefinition, StatusCodeType,
    SubmissionQueueEntry,
};
use super::queue::{QueueCreateErr, QueueId};
use crate::block;
use crate::common::*;
use crate::vmm::MemCtx;

use bitstruct::bitstruct;
use thiserror::Error;

// USDT probe definitions, registered under the `propolis` provider.
// NOTE(review): the call sites are outside this block; judging by the names
// these trace PRP entry / PRP list traversal and PRP errors — confirm against
// the code that fires them.
#[usdt::provider(provider = "propolis")]
mod probes {
    // Takes an iteration value and a PRP value.
    fn nvme_prp_entry(iter: u64, prp: u64) {}
    // Takes an iteration value, a PRP value and an index.
    fn nvme_prp_list(iter: u64, prp: u64, idx: u16) {}
    // Takes a static description of the error.
    fn nvme_prp_error(err: &'static str) {}
}

/// Errors that may be encountered during command parsing.
#[derive(Debug, Error)]
pub enum ParseErr {
    /// We do not currently support fused operations
    ///
    /// Reported when the FUSE bits of `CDW0` (bits 9:8) mark the command as
    /// the first (`01b`) or second (`10b`) half of a fused operation.
    #[error("Fused ops not supported")]
    Fused,

    /// An invalid value was specified in the FUSE bits of `CDW0`.
    ///
    /// Reported for the reserved encoding `11b`.
    #[error("reserved FUSE value specified")]
    ReservedFuse,

    /// A reserved field was set to a non-zero value.
    #[error("reserved field value specified")]
    Reserved,
}

/// A parsed Admin Command
///
/// Produced by [`AdminCmd::parse`] from a raw Submission Queue Entry; each
/// variant carries the command-specific fields decoded from that entry.
#[derive(Debug)]
pub enum AdminCmd {
    /// Delete the specified I/O Submission Queue
    ///
    /// Carries the queue ID taken from the low 16 bits of `CDW10`.
    DeleteIOSubQ(QueueId),
    /// Create the specified I/O Submission Queue
    CreateIOSubQ(CreateIOSQCmd),
    /// Get Log Page Command
    GetLogPage(GetLogPageCmd),
    /// Delete the specified I/O Completion Queue
    ///
    /// Carries the queue ID taken from the low 16 bits of `CDW10`.
    DeleteIOCompQ(QueueId),
    /// Create the specified I/O Completion Queue
    CreateIOCompQ(CreateIOCQCmd),
    /// Identify Command
    Identify(IdentifyCmd),
    /// Abort Command
    Abort(AbortCmd),
    /// Set Features Command
    SetFeatures(SetFeaturesCmd),
    /// Get Features Command
    GetFeatures(GetFeaturesCmd),
    /// Asynchronous Event Request Command
    ///
    /// No fields of the submission entry are decoded for this command.
    AsyncEventReq,
    /// Doorbell Buffer Config Command
    DoorbellBufCfg(DoorbellBufCfgCmd),
    /// An unknown admin command
    ///
    /// Carries the raw submission entry whose opcode did not match any of the
    /// commands above.
    Unknown(#[allow(dead_code)] GuestData<SubmissionQueueEntry>),
}

impl AdminCmd {
    /// Try to parse an `AdminCmd` out of a raw Submission Entry.
    ///
    /// The opcode of `raw` selects the variant, and the command-specific
    /// fields are decoded from `NSID`, `PRP1`/`PRP2`, `CDW10` and `CDW11`.
    /// Opcodes without a dedicated variant are returned as
    /// [`AdminCmd::Unknown`] carrying the raw entry.
    ///
    /// # Errors
    ///
    /// The FUSE bits (`CDW0` bits 9:8) are checked for every opcode:
    /// * [`ParseErr::Fused`] if they mark the first or second command of a
    ///   fused operation.
    /// * [`ParseErr::ReservedFuse`] if they hold the reserved value `11b`.
    pub fn parse(
        raw: GuestData<SubmissionQueueEntry>,
    ) -> Result<Self, ParseErr> {
        let cmd = match raw.opcode() {
            bits::ADMIN_OPC_DELETE_IO_SQ => {
                // QID lives in the low 16 bits of CDW10
                AdminCmd::DeleteIOSubQ(raw.cdw10 as u16)
            }
            bits::ADMIN_OPC_CREATE_IO_SQ => {
                // QPRIO is the 2-bit field at CDW11 bits 2:1, so the four
                // arms below cover every possible value.
                let queue_prio = match (raw.cdw11 >> 1) & 0b11 {
                    0b00 => QueuePriority::Urgent,
                    0b01 => QueuePriority::High,
                    0b10 => QueuePriority::Medium,
                    _ => QueuePriority::Low,
                };
                AdminCmd::CreateIOSubQ(CreateIOSQCmd {
                    prp: raw.prp1,
                    qsize: (raw.cdw10 >> 16) + 1, // Convert from 0's based
                    qid: raw.cdw10 as u16,
                    cqid: (raw.cdw11 >> 16) as u16,
                    queue_prio,
                    phys_contig: (raw.cdw11 & 1) != 0,
                })
            }
            bits::ADMIN_OPC_GET_LOG_PAGE => {
                AdminCmd::GetLogPage(GetLogPageCmd {
                    nsid: raw.nsid,
                    // NUMD is the 12-bit field at CDW10 bits 27:16: shift it
                    // down *before* masking, then convert the 0's based
                    // dword count into a byte length.
                    len: (((raw.cdw10 >> 16) & 0xFFF) + 1) * 4,
                    log_page_ident: LogPageIdent::from(raw.cdw10 as u8),
                    prp1: raw.prp1,
                    prp2: raw.prp2,
                })
            }
            bits::ADMIN_OPC_DELETE_IO_CQ => {
                // QID lives in the low 16 bits of CDW10
                AdminCmd::DeleteIOCompQ(raw.cdw10 as u16)
            }
            bits::ADMIN_OPC_CREATE_IO_CQ => {
                AdminCmd::CreateIOCompQ(CreateIOCQCmd {
                    prp: raw.prp1,
                    qsize: (raw.cdw10 >> 16) + 1, // Convert from 0's based
                    qid: raw.cdw10 as u16,
                    intr_vector: (raw.cdw11 >> 16) as u16,
                    intr_enable: (raw.cdw11 & 0b10) != 0,
                    phys_contig: (raw.cdw11 & 0b1) != 0,
                })
            }
            bits::ADMIN_OPC_IDENTIFY => AdminCmd::Identify(IdentifyCmd {
                // Only the last bit is used for NVMe 1.0e
                cns: raw.cdw10 as u8 & 0b1,
                nsid: raw.nsid,
                prp1: raw.prp1,
                prp2: raw.prp2,
            }),
            bits::ADMIN_OPC_ABORT => AdminCmd::Abort(AbortCmd {
                cid: (raw.cdw10 >> 16) as u16,
                sqid: raw.cdw10 as u16,
            }),
            bits::ADMIN_OPC_SET_FEATURES => {
                AdminCmd::SetFeatures(SetFeaturesCmd {
                    fid: FeatureIdent::from(raw.cdw10 as u8),
                    cdw11: raw.cdw11,
                })
            }
            bits::ADMIN_OPC_GET_FEATURES => {
                AdminCmd::GetFeatures(GetFeaturesCmd {
                    fid: FeatureIdent::from(raw.cdw10 as u8),
                    cdw11: raw.cdw11,
                })
            }
            bits::ADMIN_OPC_ASYNC_EVENT_REQ => AdminCmd::AsyncEventReq,
            bits::ADMIN_OPC_DOORBELL_BUF_CFG => {
                AdminCmd::DoorbellBufCfg(DoorbellBufCfgCmd {
                    shadow_doorbell_buffer: raw.prp1,
                    eventidx_buffer: raw.prp2,
                })
            }
            _ => AdminCmd::Unknown(raw),
        };

        // Fused operations are rejected regardless of the opcode.
        match (raw.cdw0 >> 8) & 0b11 {
            0b00 => Ok(()),               // Normal (non-fused) operation
            0b01 => Err(ParseErr::Fused), // First fused op
            0b10 => Err(ParseErr::Fused), // Second fused op
            _ => Err(ParseErr::ReservedFuse),
        }?;
        Ok(cmd)
    }
}

/// Create I/O Completion Queue Command Parameters
#[allow(dead_code)]
#[derive(Debug)]
pub struct CreateIOCQCmd {
    /// PRP Entry 1 (PRP1)
    ///
    /// If the queue is physically contiguous, then this is the 64-bit base address
    /// of the Completion Queue in guest memory. Otherwise it is a PRP List pointer
    /// to the pages that constitute the Completion Queue.
    pub prp: u64,

    /// Queue Size (QSIZE)
    ///
    /// The size of the Completion Queue to be created.
    /// See NVMe 1.0e Section 4.1.3 Queue Size
    /// NOTE: This has already been converted from a 0's based value.
    pub qsize: u32,

    /// Queue Identifier (QID)
    ///
    /// The identifier to assign to the Completion Queue to be created.
    /// See NVMe 1.0e Section 4.1.4 Queue Identifier
    pub qid: QueueId,

    /// Interrupt Vector (IV)
    ///
    /// The Interrupt Vector used to signal to the host (VM) upon pushing
    /// entries onto the Completion Queue.
    pub intr_vector: u16,

    /// Interrupts Enabled (IEN)
    ///
    /// Whether or not interrupts are enabled for this Completion Queue.
    pub intr_enable: bool,

    /// Physically Contiguous (PC)
    ///
    /// Whether or not the Completion Queue is physically contiguous in memory.
    pub phys_contig: bool,
}

/// Create I/O Submission Queue Command Parameters
#[allow(dead_code)]
#[derive(Debug)]
pub struct CreateIOSQCmd {
    /// PRP Entry 1 (PRP1)
    ///
    /// If the queue is physically contiguous, then this is the 64-bit base address
    /// of the Submission Queue in guest memory. Otherwise it is a PRP List pointer
    /// to the pages that constitute the Submission Queue.
    pub prp: u64,

    /// Queue Size (QSIZE)
    ///
    /// The size of the Submission Queue to be created.
    /// See NVMe 1.0e Section 4.1.3 Queue Size
    /// NOTE: This has already been converted from a 0's based value.
    pub qsize: u32,

    /// Queue Identifier (QID)
    ///
    /// The identifier to assign to the Submission Queue to be created.
    /// See NVMe 1.0e Section 4.1.4 Queue Identifier
    pub qid: QueueId,

    /// Completion Queue Identifier (CQID)
    ///
    /// The ID of the corresponding Completion Queue for this Submission Queue.
    pub cqid: QueueId,

    /// Queue Priority (QPRIO)
    ///
    /// The priority service class for commands within this Submission Queue.
    /// Only used when the weighted round robin with an urgent priority service
    /// class is the arbitration mechanism selected.
    /// See NVMe 1.0e Section 4.7 Command Arbitration
    pub queue_prio: QueuePriority,

    /// Physically Contiguous (PC)
    ///
    /// Whether or not the Submission Queue is physically contiguous in memory.
    pub phys_contig: bool,
}

/// Priority Levels
///
/// Decoded from the 2-bit QPRIO field (bits 2:1 of `CDW11`) of the Create I/O
/// Submission Queue command.
#[derive(Debug)]
pub enum QueuePriority {
    /// Highest strict priority class (excluding Commands submitted to Admin Submission Queue)
    ///
    /// Encoded as `00b`.
    Urgent,
    /// Lowest strict priority class: Level - High
    ///
    /// Encoded as `01b`.
    High,
    /// Lowest strict priority class: Level - Medium
    ///
    /// Encoded as `10b`.
    Medium,
    /// Lowest strict priority class: Level - Low
    ///
    /// Encoded as `11b`.
    Low,
}

/// Get Log Page Command Parameters
#[allow(dead_code)]
#[derive(Debug)]
pub struct GetLogPageCmd {
    /// Namespace Identifier (NSID)
    ///
    /// The namespace that this command applies to.
    pub nsid: u32,

    /// The number of bytes to return.
    ///
    /// Derived from the 0's based Number of Dwords (NUMD) value in `CDW10`,
    /// already converted to a byte count.
    pub len: u32,

    /// Log Page Identifier (LID)
    ///
    /// The ID of the log page to retrieve, taken from the low byte of `CDW10`.
    pub log_page_ident: LogPageIdent,

    /// PRP Entry 1 (PRP1)
    ///
    /// The first PRP entry specifying the start of the data buffer.
    prp1: u64,

    /// PRP Entry 2 (PRP2)
    ///
    /// If PRP1 specifies enough space, then PRP2 is reserved. Otherwise
    /// PRP2 specifies the second page and remainder of the data. It may
    /// not be a PRP List.
    prp2: u64,
}

impl GetLogPageCmd {
    /// Builds the [`PrpIter`] describing where in guest memory the log page
    /// data should be written.
    ///
    /// The iterator is set up to cover [`Self::len`] bytes (the `NUMD` dword
    /// count already converted to bytes), starting from this command's
    /// PRP1/PRP2 entries.
    pub fn data<'a>(&self, mem: &'a MemCtx) -> PrpIter<'a> {
        let byte_len = u64::from(self.len);
        PrpIter::new(byte_len, self.prp1, self.prp2, mem)
    }
}

/// Identifies which log page a Get Log Page command is asking for.
///
/// See NVMe 1.0e Section 5.10.1, Figure 58 Get Log Page - Log Page Identifiers
#[allow(dead_code)]
#[derive(Debug)]
pub enum LogPageIdent {
    /// Reserved Log Page (identifier 00h or 04h-7Fh)
    Reserved,
    /// Error Information Log Page (identifier 01h)
    Error,
    /// SMART / Health Information Log Page (identifier 02h)
    Smart,
    /// Firmware Slot Information Log Page (identifier 03h)
    Firmware,
    /// I/O Command Set Specific Log Page (identifier 80h-BFh, kept as-is)
    IOSpecifc(u8),
    /// Vendor Specific Log Page (identifier C0h-FFh, kept as-is)
    Vendor(u8),
}

impl From<u8> for LogPageIdent {
    /// Classifies a raw Log Page Identifier byte.
    fn from(ident: u8) -> Self {
        match ident {
            0x01 => Self::Error,
            0x02 => Self::Smart,
            0x03 => Self::Firmware,
            0x00 | 0x04..=0x7F => Self::Reserved,
            0x80..=0xBF => Self::IOSpecifc(ident),
            0xC0..=0xFF => Self::Vendor(ident),
        }
    }
}

/// Identify Command Parameters
#[derive(Debug)]
pub struct IdentifyCmd {
    /// The type of Identify data structure to return
    ///
    /// Only bit 0 of `CDW10` is retained when the command is parsed, so this
    /// is either 0 or 1.
    pub cns: u8,

    /// Namespace Identifier (NSID)
    ///
    /// The namespace that this command applies to.
    pub nsid: u32,

    /// PRP Entry 1 (PRP1)
    ///
    /// The first PRP entry specifying the start of the data buffer.
    prp1: u64,

    /// PRP Entry 2 (PRP2)
    ///
    /// If PRP1 specifies enough space, then PRP2 is reserved. Otherwise
    /// PRP2 specifies the second page and remainder of the data. It may
    /// not be a PRP List.
    prp2: u64,
}

impl IdentifyCmd {
    /// Builds the [`PrpIter`] describing where in guest memory the identify
    /// structure should be written.
    ///
    /// The transfer length is always `PAGE_SIZE` bytes, starting from this
    /// command's PRP1/PRP2 entries.
    pub fn data<'a>(&self, mem: &'a MemCtx) -> PrpIter<'a> {
        let xfer_len = PAGE_SIZE as u64;
        PrpIter::new(xfer_len, self.prp1, self.prp2, mem)
    }
}

/// Abort Command Parameters
#[derive(Debug)]
pub struct AbortCmd {
    /// The command identifier of the command to be aborted.
    ///
    /// Taken from the upper 16 bits of `CDW10`.
    pub cid: u16,

    /// The ID of the Submission Queue associated with the command to be
    /// aborted.
    ///
    /// Taken from the lower 16 bits of `CDW10`.
    pub sqid: u16,
}

/// Get Features Command Parameters
#[derive(Debug)]
pub struct GetFeaturesCmd {
    /// Feature Identifier (FID)
    ///
    /// The feature whose attributes are being retrieved, decoded from the
    /// low byte of `CDW10`.
    pub fid: FeatureIdent,

    /// Command Dword 11, copied unmodified from the submission entry.
    pub cdw11: u32,
}

/// Set Features Command Parameters
#[derive(Debug)]
pub struct SetFeaturesCmd {
    /// Feature Identifier (FID)
    ///
    /// The feature that attributes are being specified for, decoded from the
    /// low byte of `CDW10`.
    pub fid: FeatureIdent,

    /// Command Dword 11, copied unmodified from the submission entry.
    pub cdw11: u32,
}

/// Doorbell Buffer Config Command Parameters
#[derive(Debug)]
pub struct DoorbellBufCfgCmd {
    /// Value of PRP Entry 1 (PRP1) from the submission entry.
    pub shadow_doorbell_buffer: u64,
    /// Value of PRP Entry 2 (PRP2) from the submission entry.
    pub eventidx_buffer: u64,
}

/// Feature Identifiers
///
/// See NVMe 1.0e Section 5.12.1, Figure 73 Set Features - Feature Identifiers
/// TODO: Fill out parameters for rest of variants
#[derive(Debug)]
pub enum FeatureIdent {
    /// Reserved or unknown feature identifier
    Reserved,
    /// Arbitration
    ///
    /// Controls command arbitration.
    /// See NVMe 1.0e Section 4.7 Command Arbitration
    Arbitration,
    /// Power Management
    ///
    /// Allows configuring power state.
    PowerManagement,
    /// LBA Range Type
    ///
    /// Indicates the type and attributes of LBA ranges that are part of the specified namespace.
    LbaRangeType,
    /// Temperature Threshold
    ///
    /// Indicates the threshold for the temperature of the overall device (controller and NVM) in Kelvin.
    TemperatureThreshold,
    /// Error Recovery
    ///
    /// Controls error recovery attributes.
    ErrorRecovery,
    /// Volatile Write Cache
    ///
    /// Controls the volatile write cache, if present.
    VolatileWriteCache,
    /// Number of Queues
    ///
    /// Indicates the number of queues requested to the controller.
    /// Only allowed during initialization and cannot change between resets.
    NumberOfQueues,
    /// Interrupt Coalescing
    ///
    /// Allows configuring interrupt coalescing settings.
    InterruptCoalescing,
    /// Interrupt Vector Configuration
    ///
    /// Allows configuring settings specific to a particular interrupt vector.
    InterruptVectorConfiguration,
    /// Write Atomicity
    ///
    /// Controls write atomicity.
    WriteAtomicity,
    /// Asynchronous Event Configuration
    ///
    /// Controls the events that trigger an asynchronous event notification.
    AsynchronousEventConfiguration,
    /// Software Progress Marker
    ///
    /// This feature is persistent across power states.
    /// See NVMe 1.0e Section 7.6.1.1 Software Progress Marker
    SoftwareProgressMarker,

    // Vendor specific features.
    /// Oxide-specific feature - returns relevant device features.
    OxideDeviceFeatures,

    /// All other vendor specific features.
    ///
    /// Carries the raw feature identifier byte.
    Vendor(#[allow(dead_code)] u8),
}

impl From<u8> for FeatureIdent {
    fn from(fid: u8) -> Self {
        use super::bits::*;
        use FeatureIdent::*;
        match fid {
            0 => Reserved,
            FEAT_ID_ARBITRATION => Arbitration,
            FEAT_ID_POWER_MGMT => PowerManagement,
            FEAT_ID_LBA_RANGE_TYPE => LbaRangeType,
            FEAT_ID_TEMP_THRESH => TemperatureThreshold,
            FEAT_ID_ERROR_RECOVERY => ErrorRecovery,
            FEAT_ID_VOLATILE_WRITE_CACHE => VolatileWriteCache,
            FEAT_ID_NUM_QUEUES => NumberOfQueues,
            FEAT_ID_INTR_COALESCE => InterruptCoalescing,
            FEAT_ID_INTR_VEC_CFG => InterruptVectorConfiguration,
            FEAT_ID_WRITE_ATOMIC => WriteAtomicity,
            FEAT_ID_ASYNC_EVENT_CFG => AsynchronousEventConfiguration,
            0xC..=0x7F => Reserved,
            0x80 => SoftwareProgressMarker,
            0x81..=0xBF => Reserved,
            FEAT_ID_OXIDE_DEVICE_FEATURES => OxideDeviceFeatures,
            0xC0..=0xFF => Vendor(fid),
        }
    }
}

bitstruct! {
    /// Temperature Threshold feature value, packed into a single 32-bit dword.
    ///
    /// NOTE(review): presumably this is the `CDW11` layout used by Get/Set
    /// Features for the Temperature Threshold feature — confirm against the
    /// feature handlers.
    #[derive(Clone, Copy, Default)]
    pub struct FeatTemperatureThreshold(pub u32) {
        /// Temperature Threshold (TMPTH) - Bits 15:0
        pub tmpth: u16 = 0..16;

        /// Threshold Temperature Select (TMPSEL) - Bits 19:16
        pub tmpsel: ThresholdTemperatureSelect = 16..20;

        /// Threshold Type Select (THSEL) - Bits 21:20
        pub thsel: ThresholdTypeSelect = 20..22;

        /// Reserved - Bits 31:22
        reserved: u32 = 22..32;
    }
}

/// Value of the Threshold Temperature Select (TMPSEL) field of
/// [`FeatTemperatureThreshold`].
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum ThresholdTemperatureSelect {
    /// Raw value 0h
    Composite,
    /// Raw value 1h
    Sensor1,
    /// Raw value 2h
    Sensor2,
    /// Raw value 3h
    Sensor3,
    /// Raw value 4h
    Sensor4,
    /// Raw value 5h
    Sensor5,
    /// Raw value 6h
    Sensor6,
    /// Raw value 7h
    Sensor7,
    /// Raw value 8h
    Sensor8,
    /// Any raw value without a dedicated variant; the value is kept as-is.
    Reserved(u8),
    /// Raw value Fh
    All,
}
impl bitstruct::FromRaw<u8, ThresholdTemperatureSelect>
    for FeatTemperatureThreshold
{
    /// Decodes the raw field value into its selector.
    fn from_raw(raw: u8) -> ThresholdTemperatureSelect {
        use ThresholdTemperatureSelect as Sel;
        match raw {
            0x0 => Sel::Composite,
            0x1 => Sel::Sensor1,
            0x2 => Sel::Sensor2,
            0x3 => Sel::Sensor3,
            0x4 => Sel::Sensor4,
            0x5 => Sel::Sensor5,
            0x6 => Sel::Sensor6,
            0x7 => Sel::Sensor7,
            0x8 => Sel::Sensor8,
            0xF => Sel::All,
            other => Sel::Reserved(other),
        }
    }
}
impl bitstruct::IntoRaw<u8, ThresholdTemperatureSelect>
    for FeatTemperatureThreshold
{
    /// Encodes a selector back into its raw field value.
    fn into_raw(target: ThresholdTemperatureSelect) -> u8 {
        use ThresholdTemperatureSelect as Sel;
        match target {
            Sel::Composite => 0x0,
            Sel::Sensor1 => 0x1,
            Sel::Sensor2 => 0x2,
            Sel::Sensor3 => 0x3,
            Sel::Sensor4 => 0x4,
            Sel::Sensor5 => 0x5,
            Sel::Sensor6 => 0x6,
            Sel::Sensor7 => 0x7,
            Sel::Sensor8 => 0x8,
            Sel::All => 0xF,
            Sel::Reserved(other) => other,
        }
    }
}

/// Decoded form of the Threshold Type Select (THSEL) field of
/// [`FeatTemperatureThreshold`].
#[derive(Copy, Clone, PartialEq, Eq)]
pub enum ThresholdTypeSelect {
    /// Raw value `0`
    Over,
    /// Raw value `1`
    Under,
    /// Any other raw value, preserved verbatim
    Reserved(u8),
}
impl bitstruct::FromRaw<u8, ThresholdTypeSelect> for FeatTemperatureThreshold {
    /// Decode a raw THSEL value, keeping unassigned encodings as
    /// [`ThresholdTypeSelect::Reserved`].
    fn from_raw(raw: u8) -> ThresholdTypeSelect {
        match raw {
            0 => ThresholdTypeSelect::Over,
            1 => ThresholdTypeSelect::Under,
            other => ThresholdTypeSelect::Reserved(other),
        }
    }
}
impl bitstruct::IntoRaw<u8, ThresholdTypeSelect> for FeatTemperatureThreshold {
    /// Encode a threshold type back into its raw THSEL value.
    fn into_raw(target: ThresholdTypeSelect) -> u8 {
        match target {
            ThresholdTypeSelect::Reserved(raw) => raw,
            ThresholdTypeSelect::Over => 0,
            ThresholdTypeSelect::Under => 1,
        }
    }
}

/// Volatile Write Cache feature value.
pub(crate) struct FeatVolatileWriteCache {
    /// Set when bit 0 of the feature dword is set.
    pub wce: bool,
}
impl From<u32> for FeatVolatileWriteCache {
    /// Decode the feature from its dword; only bit 0 is examined.
    fn from(cdw11: u32) -> Self {
        let wce = (cdw11 & 1) == 1;
        Self { wce }
    }
}
impl From<FeatVolatileWriteCache> for u32 {
    /// Encode the feature as a dword with `wce` in bit 0 and all other bits
    /// clear.
    fn from(value: FeatVolatileWriteCache) -> Self {
        let FeatVolatileWriteCache { wce } = value;
        u32::from(wce)
    }
}

/// Number of Queues feature value, held as actual (1's based) queue counts.
pub(crate) struct FeatNumberQueues {
    /// Number of I/O Completion Queues Requested/Allocated (NCQR/NCQA)
    ///
    /// Does not include Admin Completion Queue. Minimum of 2 shall be requested.
    pub ncq: u16,
    /// Number of I/O Submission Queues Requested/Allocated (NSQR/NSQA)
    ///
    /// Does not include Admin Submission Queue. Minimum of 2 shall be requested.
    pub nsq: u16,
}
impl TryFrom<u32> for FeatNumberQueues {
    type Error = ();

    /// Decode the 0's based queue counts: completion queues from the upper 16
    /// bits, submission queues from the lower 16 bits.
    ///
    /// Fails if either raw field is `u16::MAX`, since the resulting count
    /// would not fit in a `u16`.
    fn try_from(cdw11: u32) -> Result<Self, Self::Error> {
        let raw_cq = (cdw11 >> 16) as u16;
        let raw_sq = cdw11 as u16;

        // A max value of 65534 is allowed, implying 65535 queues of the
        // respective type(s).  `checked_add` rejects requests which exceed
        // that while converting from the 0's based values.
        match (raw_cq.checked_add(1), raw_sq.checked_add(1)) {
            (Some(ncq), Some(nsq)) => Ok(Self { ncq, nsq }),
            _ => Err(()),
        }
    }
}
// The SetFeature and GetFeature results for NumberOfQueues share one format,
// so this conversion serves both.
impl From<FeatNumberQueues> for u32 {
    /// Encode the counts as a 0's based DW0 value.  A count of zero saturates
    /// to a raw value of zero rather than wrapping.
    fn from(value: FeatNumberQueues) -> Self {
        let cq_field = u32::from(value.ncq.saturating_sub(1));
        let sq_field = u32::from(value.nsq.saturating_sub(1));
        (cq_field << 16) | sq_field
    }
}

/// Interrupt Vector Configuration feature value.
pub(crate) struct FeatInterruptVectorConfig {
    /// Interrupt Vector (IV)
    pub iv: u16,
    /// Coalescing Disable (CD)
    pub cd: bool,
}
impl From<u32> for FeatInterruptVectorConfig {
    /// Decode the feature dword: IV from bits 15:0 and CD from bit 16.
    /// Higher bits are ignored.
    fn from(cdw11: u32) -> Self {
        let iv = (cdw11 & 0xFFFF) as u16;
        let cd = (cdw11 >> 16) & 1 == 1;
        Self { iv, cd }
    }
}
impl From<FeatInterruptVectorConfig> for u32 {
    /// Encode the feature dword with IV in bits 15:0 and CD in bit 16.
    fn from(value: FeatInterruptVectorConfig) -> Self {
        let cd_bit: u32 = if value.cd { 1 << 16 } else { 0 };
        cd_bit | u32::from(value.iv)
    }
}

// Bit-level view of a 32-bit Oxide device features value.
//
// Only bit 0 (`read_only`) is defined; bits 31:1 are reserved.
// NOTE(review): presumably this is the value associated with the
// vendor-specific `FEAT_ID_OXIDE_DEVICE_FEATURES` feature identifier --
// confirm against the Get Features handling.
bitstruct! {
    pub struct OxideDeviceFeatures(pub u32) {
        /// Indicates the device is read-only and will complete all attempted
        /// writes with `STS_WRITE_READ_ONLY_RANGE`.
        pub read_only: bool = 0;

        /// Reserved
        reserved: u32 = 1..32;
    }
}

/// A parsed NVM Command
///
/// Produced by [`NvmCmd::parse`] from a raw Submission Queue Entry.  Opcodes
/// that the parser does not recognize are not an error: they are handed back
/// in [`NvmCmd::Unknown`] together with the original entry.
#[allow(dead_code)]
#[derive(Debug)]
pub enum NvmCmd {
    /// Commit data and metadata
    Flush,
    /// Write data and metadata
    Write(WriteCmd),
    /// Read data and metadata
    Read(ReadCmd),
    /// Dataset Management Command
    DatasetManagement(DatasetManagementCmd),
    /// An unknown NVM command
    ///
    /// Carries the unmodified submission entry it was parsed from.
    Unknown(GuestData<SubmissionQueueEntry>),
}

impl NvmCmd {
    /// Decode a raw Submission Queue Entry into an [`NvmCmd`].
    ///
    /// # Errors
    ///
    /// - [`ParseErr::Fused`] if the entry is marked as either half of a fused
    ///   operation.
    /// - [`ParseErr::ReservedFuse`] if the fuse bits hold the reserved
    ///   encoding.
    /// - [`ParseErr::Reserved`] if a Dataset Management command sets any bit
    ///   of CDW11 above the three attribute bits.
    ///
    /// Unrecognized opcodes are returned as [`NvmCmd::Unknown`], not as an
    /// error.
    pub fn parse(
        raw: GuestData<SubmissionQueueEntry>,
    ) -> Result<Self, ParseErr> {
        // Bits 9:8 of CDW0 select fused operation; only normal (non-fused)
        // commands are accepted.
        match (raw.cdw0 >> 8) & 0b11 {
            0b00 => {}
            // First or second half of a fused operation
            0b01 | 0b10 => return Err(ParseErr::Fused),
            _ => return Err(ParseErr::ReservedFuse),
        }

        let parsed = match raw.opcode() {
            bits::NVM_OPC_FLUSH => Self::Flush,
            bits::NVM_OPC_WRITE => {
                let slba = (u64::from(raw.cdw11) << 32) | u64::from(raw.cdw10);
                // Convert from 0's based value.
                // NOTE(review): a raw NLB of 0xFFFF (65536 blocks) does not
                // fit in the u16 field, so this addition overflows (panic
                // with overflow checks, wrap to 0 without).  Fixing it needs
                // a wider `nlb` or a dedicated parse error.
                let nlb = raw.cdw12 as u16 + 1;
                Self::Write(WriteCmd {
                    slba,
                    nlb,
                    prp1: raw.prp1,
                    prp2: raw.prp2,
                })
            }
            bits::NVM_OPC_READ => {
                let slba = (u64::from(raw.cdw11) << 32) | u64::from(raw.cdw10);
                // Convert from 0's based value; see the overflow note on the
                // write path, which applies equally here.
                let nlb = raw.cdw12 as u16 + 1;
                Self::Read(ReadCmd {
                    slba,
                    nlb,
                    prp1: raw.prp1,
                    prp2: raw.prp2,
                })
            }
            bits::NVM_OPC_DATASET_MANAGEMENT => {
                let attrs = raw.cdw11;
                // Only the lowest 3 bits of CDW11 are used for Dataset
                // Management, so reject if any other bits are set.
                if attrs & !0b111 != 0 {
                    return Err(ParseErr::Reserved);
                }
                // NR lives in the low byte of CDW10 and is 0's based, so the
                // converted count is at most 256 and cannot overflow.
                let nr = (raw.cdw10 & 0xFF) as u16 + 1;
                Self::DatasetManagement(DatasetManagementCmd {
                    prp1: raw.prp1,
                    prp2: raw.prp2,
                    nr,
                    ad: attrs & (1 << 2) != 0,
                    _idw: attrs & (1 << 1) != 0,
                    _idr: attrs & (1 << 0) != 0,
                })
            }
            _ => Self::Unknown(raw),
        };
        Ok(parsed)
    }
}

/// Parameters of an NVM Write command
#[derive(Debug)]
pub struct WriteCmd {
    /// Starting LBA (SLBA)
    ///
    /// The 64-bit address of the first logical block to be written.
    pub slba: u64,

    /// Number of Logical Blocks (NLB)
    ///
    /// How many logical blocks are to be written.
    pub nlb: u16,

    /// PRP Entry 1 (PRP1)
    ///
    /// First PRP entry, marking the start of the data buffer that the data is
    /// transferred from.
    prp1: u64,

    /// PRP Entry 2 (PRP2)
    ///
    /// Reserved when PRP1 alone covers the transfer.  Otherwise it is either a
    /// second PRP entry or a pointer to a PRP list, as needed.
    prp2: u64,
}

impl WriteCmd {
    /// Returns an Iterator that yields the [`GuestRegion`]s holding the data
    /// to be written, covering `sz` bytes in total.
    pub fn data<'a>(&self, sz: u64, mem: &'a MemCtx) -> PrpIter<'a> {
        let Self { prp1, prp2, .. } = *self;
        PrpIter::new(sz, prp1, prp2, mem)
    }
}

/// Parameters of an NVM Read command
#[derive(Debug)]
pub struct ReadCmd {
    /// Starting LBA (SLBA)
    ///
    /// The 64-bit address of the first logical block to be read.
    pub slba: u64,

    /// Number of Logical Blocks (NLB)
    ///
    /// How many logical blocks are to be read.
    pub nlb: u16,

    /// PRP Entry 1 (PRP1)
    ///
    /// First PRP entry, marking the start of the data buffer that the data is
    /// transferred to.
    prp1: u64,

    /// PRP Entry 2 (PRP2)
    ///
    /// Reserved when PRP1 alone covers the transfer.  Otherwise it is either a
    /// second PRP entry or a pointer to a PRP list, as needed.
    prp2: u64,
}

impl ReadCmd {
    /// Returns an Iterator that yields the [`GuestRegion`]s which the data
    /// read should be written into, covering `sz` bytes in total.
    pub fn data<'a>(&self, sz: u64, mem: &'a MemCtx) -> PrpIter<'a> {
        let Self { prp1, prp2, .. } = *self;
        PrpIter::new(sz, prp1, prp2, mem)
    }
}

/// Dataset Management Command Parameters
///
/// Built by [`NvmCmd::parse`]; the range definitions themselves live in guest
/// memory and are reached through `prp1`/`prp2`.
#[derive(Debug)]
pub struct DatasetManagementCmd {
    /// PRP Entry 1 (PRP1)
    ///
    /// Indicates a data buffer that contains the LBA range information.
    prp1: u64,

    /// PRP Entry 2 (PRP2)
    ///
    /// Indicates a second data buffer that contains LBA range information.  It may not be a PRP
    /// List.
    prp2: u64,

    /// Number of Ranges (NR)
    ///
    /// Indicates the number of 16 byte range sets that are specified in the command.
    ///
    /// This is the actual count: the parser has already converted it from the 0's based value
    /// in the low byte of CDW10, so it lies in `1..=256`.
    pub nr: u16,

    /// Attribute – Deallocate (AD)
    ///
    /// If set to ‘1’ then the NVM subsystem may deallocate all provided ranges. If a read occurs
    /// to a deallocated range, the NVM Express subsystem shall return all zeros, all ones, or
    /// the last data written to the associated LBA.
    ///
    /// Note: The operation of the Deallocate function is similar to the ATA DATA SET MANAGEMENT
    /// with Trim feature described in ACS-2 and SCSI UNMAP command described in SBC-3.
    ad: bool,

    /// Attribute – Integral Dataset for Write (IDW)
    ///
    /// If set to ‘1’ then the dataset should be optimized for write access as an integral unit.
    /// The host expects to perform operations on all ranges provided as an integral unit for
    /// writes, indicating that if a portion of the dataset is written it is expected that all of
    /// the ranges in the dataset are going to be written.
    ///
    /// Note: this field is advisory, and we ignore it.
    _idw: bool,

    /// Attribute – Integral Dataset for Read (IDR)
    ///
    /// If set to ‘1’ then the dataset should be optimized for read access as an integral unit.
    /// The host expects to perform operations on all ranges provided as an integral unit for
    /// reads, indicating that if a portion of the dataset is read it is expected that all of the
    /// ranges in the dataset are going to be read.
    ///
    /// Note: this field is advisory, and we ignore it.
    _idr: bool,
}

impl DatasetManagementCmd {
    /// Returns an Iterator that yields the [`GuestRegion`]s which contain the array of LBA
    /// ranges.
    pub fn data<'a>(&self, mem: &'a MemCtx) -> PrpIter<'a> {
        PrpIter::new(
            // given that self.nr is at most 256, the multiplication here cannot overflow a u64
            u64::from(self.nr)
                * size_of::<DatasetManagementRangeDefinition>() as u64,
            self.prp1,
            self.prp2,
            mem,
        )
    }

    /// Returns an Iterator that yields the LBA ranges specified in this command.
    ///
    /// Two kinds of failure are reported as `Err` items:
    ///
    /// - If a region holding range definitions cannot be read from guest memory, a single error
    ///   is yielded in place of that region's ranges and iteration continues with the next
    ///   region.
    /// - If walking the PRP entries themselves fails (for example a PRP2 with an inappropriate
    ///   offset), the error recorded by the [`PrpIter`] is yielded as the final item, so that a
    ///   truncated range list is never mistaken for a complete one.
    pub fn ranges<'a>(
        &self,
        mem: &'a MemCtx,
    ) -> impl Iterator<
        Item = Result<DatasetManagementRangeDefinition, &'static str>,
    > + 'a {
        type RangeResult =
            Result<DatasetManagementRangeDefinition, &'static str>;

        let mut regions = self.data(mem);
        // Results decoded from the most recently visited region which have not
        // been handed out yet.
        let mut pending: std::vec::IntoIter<RangeResult> =
            Vec::new().into_iter();
        // Set once the PRP walk has ended, keeping the iterator fused.
        let mut exhausted = false;

        std::iter::from_fn(move || loop {
            if let Some(item) = pending.next() {
                return Some(item);
            }
            if exhausted {
                return None;
            }
            let Some(region) = regions.next() else {
                exhausted = true;
                // `PrpIter` signals a failed walk by stopping early with its
                // error recorded; surface that rather than dropping it.
                return regions.error.map(Err);
            };
            pending = if let Some(Ok(defs)) = mem
                .readable_region(&region)
                .map(|mapping| mapping.read_many_owned())
            {
                defs.into_iter().map(Ok).collect::<Vec<_>>().into_iter()
            } else {
                vec![Err("Failed to read LBA range")].into_iter()
            };
        })
    }

    /// Whether the Deallocate (AD) attribute was set on this command.
    pub fn is_deallocate(&self) -> bool {
        self.ad
    }
}

/// Indicates the possible states of a [`PrpIter`].
///
/// Each state names where the *next* region of the transfer is found.
#[derive(Clone, Copy, Eq, PartialEq, Debug)]
enum PrpNext {
    /// The next region is described by PRP1 (the initial state).
    Prp1,
    /// The next region is described by PRP2, used as a plain PRP entry.
    Prp2,
    /// The next region is described by an entry in a PRP list: the first
    /// value is the page-aligned base address of the list and the second is
    /// the index of the entry to read from it.
    List(u64, u16),
    /// The transfer has been fully described; no regions remain.
    Done,
}

/// The last valid PRP list entry index in a single Physical Region Page (PRP) List
///
/// A PRP list does not cross a page boundary and each of its entries is a
/// 64-bit (8 byte) value, so one 4 KiB page holds 512 entries with indices
/// `0..=511`.
/// Note this relies on a Page Size of 4k (2^12).
/// More generally the number of entries is `PAGE_SIZE / 8`, that is
/// 2^(lg(PAGE_SIZE) - 3), and this constant is that count minus one.
///
/// See NVMe 1.0e Section 4.3 Physical Region Page Entry and List
const PRP_LIST_MAX: u16 = 511;

/// A helper object for iterating over the guest memory regions described by a
/// single PRP, a pair of PRPs, or a list of PRPs.
///
/// Iteration ends either when the requested number of bytes has been
/// described or when an error is encountered, in which case the error is
/// recorded in `error` and no further regions are yielded.
pub struct PrpIter<'a> {
    /// PRP Entry 1 (PRP1)
    ///
    /// The first PRP entry specifying the start of the data buffer.  It may
    /// have a non-zero offset within its memory page.
    prp1: u64,

    /// PRP Entry 2 (PRP2)
    ///
    /// The second PRP entry or a PRP list pointer or reserved if PRP1 was enough.
    prp2: u64,

    /// Handle to Guest's [`MemCtx`]
    mem: &'a MemCtx,

    /// How many bytes remaining to be read/written
    remain: u64,

    /// The next PRP state
    next: PrpNext,

    /// Any error we might've encountered
    ///
    /// Once set, the iterator yields no further regions.
    error: Option<&'static str>,
}

impl<'a> PrpIter<'a> {
    /// Create a new `PrpIter` object.
    ///
    /// See corresponding `data` methods on any relevant commands.
    pub fn new(size: u64, prp1: u64, prp2: u64, mem: &'a MemCtx) -> Self {
        // prp1 and prp2 are expected to be 32-bit aligned
        assert!(prp1 & 0b11 == 0);
        assert!(prp2 & 0b11 == 0);
        Self { prp1, prp2, mem, remain: size, next: PrpNext::Prp1, error: None }
    }
}

impl PrpIter<'_> {
    /// Grab the next memory region to read/write
    ///
    /// Advances the iterator state machine by one region: on success the
    /// returned region's size is subtracted from `remain` and `next` is
    /// updated to describe where the following region will come from.
    ///
    /// Returns an error (leaving `remain` untouched) if PRP2 or a PRP list
    /// entry is misaligned for its role, or if a PRP list entry cannot be
    /// read from guest memory.
    ///
    /// Callers must only invoke this while bytes remain and no error has been
    /// recorded; both conditions are asserted.
    fn get_next(&mut self) -> Result<GuestRegion, &'static str> {
        assert!(self.remain > 0);
        assert!(self.error.is_none());

        // PRP Entry Layout
        // | 63 . . . . . . . . . . . . . . . n + 1 | n . . . . . . 2 | 1 0 |
        // |       page base address                |      offset     | 0 0 |
        let (addr, size, next) = match self.next {
            PrpNext::Prp1 => {
                // The first PRP entry contained within the command may have a
                // non-zero offset within the memory page.
                probes::nvme_prp_entry!(|| (
                    self as *const Self as u64,
                    self.prp1
                ));
                let offset = self.prp1 & PAGE_OFFSET as u64;
                // PRP1 covers from its offset to the end of its page, or
                // less if the whole transfer is smaller than that.
                let size = u64::min(PAGE_SIZE as u64 - offset, self.remain);
                let after = self.remain - size;
                let next = if after == 0 {
                    PrpNext::Done
                } else if after <= PAGE_SIZE as u64 {
                    // Remaining data can be covered by single additional PRP
                    // entry which should be present in PRP2
                    PrpNext::Prp2
                } else {
                    // If the remaining length is larger than the page size,
                    // PRP2 points to a list.
                    //
                    // The first PRP List entry:
                    // - shall be Qword aligned, and
                    // - may also have a non-zero offset within the memory page.
                    if !self.prp2.is_multiple_of(8) {
                        return Err("PRP2 not Qword aligned!");
                    }

                    // PRP2 is allowed a non-zero offset into the page, meaning
                    // this operation's PRP list could start in the middle of
                    // the page. For example, idx below could be anywhere from 0
                    // to PRP_LIST_MAX:
                    //
                    //                | idx
                    //  --------------| ---
                    //  PRP List base | 0
                    //  PRP List base | 1
                    //  PRP List base | 2
                    //  ...
                    //  PRP List base | PRP_LIST_MAX - 1
                    //  PRP List base | PRP_LIST_MAX
                    //
                    // Note that lists cannot cross page boundaries. If idx =
                    // PRP_LIST_MAX is reached, the last entry will point to
                    // another list (unless the remaining size is satisfied by
                    // the end of the list).
                    let base = self.prp2 & (PAGE_MASK as u64);
                    let idx = (self.prp2 & PAGE_OFFSET as u64) / 8;
                    probes::nvme_prp_list!(|| (
                        self as *const Self as u64,
                        base,
                        idx as u16,
                    ));
                    PrpNext::List(base, idx as u16)
                };
                (self.prp1, size, next)
            }
            PrpNext::Prp2 => {
                // If a second PRP entry is present within a command, it shall
                // have a memory page offset of 0h
                if self.prp2 & PAGE_OFFSET as u64 != 0 {
                    return Err("Inappropriate PRP2 offset");
                }
                probes::nvme_prp_entry!(|| (
                    self as *const Self as u64,
                    self.prp2
                ));
                // This state is only entered when at most one page remains,
                // so PRP2 always completes the transfer.
                let size = self.remain;
                assert!(size <= PAGE_SIZE as u64);
                (self.prp2, size, PrpNext::Done)
            }
            PrpNext::List(base, idx) => {
                assert!(idx <= PRP_LIST_MAX);
                // Each list entry is 8 bytes wide.
                let entry_addr = base + u64::from(idx) * 8;
                let entry: GuestData<u64> = self
                    .mem
                    .read(GuestAddr(entry_addr))
                    .ok_or_else(|| "Unable to read PRP list entry")?;
                probes::nvme_prp_entry!(
                    || (self as *const Self as u64, entry,)
                );

                // Whether it names a data page or the next list, an entry
                // read from a list must be page aligned.
                if *entry & PAGE_OFFSET as u64 != 0 {
                    return Err("Inappropriate PRP list entry offset");
                }

                if self.remain <= PAGE_SIZE as u64 {
                    // This entry finishes the transfer, so it is a data
                    // pointer even when it sits in the last slot of the list.
                    (*entry, self.remain, PrpNext::Done)
                } else if idx != PRP_LIST_MAX {
                    (*entry, PAGE_SIZE as u64, PrpNext::List(base, idx + 1))
                } else {
                    // The last PRP in this list chains to another
                    // (page-aligned) list with the next PRP.
                    self.next = PrpNext::List(*entry, 0);
                    probes::nvme_prp_list!(|| (
                        self as *const Self as u64,
                        *entry,
                        0,
                    ));
                    // The retry starts at index 0 of the new list, so it
                    // cannot take this chaining branch again immediately.
                    return self.get_next();
                }
            }
            PrpNext::Done => {
                // prior checks of self.remain should prevent us from ever
                // reaching this
                panic!()
            }
        };

        // Sanity checks: a region never exceeds what is left, and consuming
        // everything that is left must coincide with the `Done` state.
        assert!(size <= self.remain);
        if size == self.remain {
            assert_eq!(next, PrpNext::Done);
        }
        self.remain -= size;
        self.next = next;

        Ok(GuestRegion(GuestAddr(addr), size as usize))
    }
}

impl Iterator for PrpIter<'_> {
    type Item = GuestRegion;

    /// Yield the next guest memory region of the transfer.
    ///
    /// Returns `None` once every byte has been described, or after an error:
    /// the first error fires the `nvme_prp_error` probe and is latched in
    /// `self.error`, ending iteration from then on.
    fn next(&mut self) -> Option<Self::Item> {
        let finished = self.remain == 0;
        if finished || self.error.is_some() {
            return None;
        }

        self.get_next()
            .inspect_err(|&e| {
                probes::nvme_prp_error!(|| e);
                self.error = Some(e);
            })
            .ok()
    }
}

/// A Command Completion result
#[derive(Debug)]
pub struct Completion {
    /// Status Code Type and Status Code
    ///
    /// As assembled by the constructors on this type: the Status Code sits in
    /// bits 8:1, the Status Code Type starts at bit 9, and bit 15 is the
    /// Do Not Retry flag.  Bit 0 is left clear.
    pub status: u16,
    /// Command Specific Result (DW0)
    pub dw0: u32,
}

impl Completion {
    /// Create a successful Completion result
    pub fn success() -> Self {
        Self::success_val(0)
    }

    /// Create a successful Completion result with a specific value
    ///
    /// `cdw0` is returned to the guest as the command specific result (DW0).
    pub fn success_val(cdw0: u32) -> Self {
        Self {
            dw0: cdw0,
            status: Self::status_field(
                StatusCodeType::Generic,
                bits::STS_SUCCESS,
            ),
        }
    }

    /// Create an error Completion result with a specific type and status
    ///
    /// # Panics
    ///
    /// Panics if the combination is generic success, which is not an error.
    pub fn specific_err(sct: StatusCodeType, status: u8) -> Self {
        // success doesn't belong in an error
        assert_ne!((sct, status), (StatusCodeType::Generic, bits::STS_SUCCESS));

        Self { dw0: 0, status: Self::status_field(sct, status) }
    }

    /// Create a generic error Completion result with a specific status
    ///
    /// # Panics
    ///
    /// Panics if `status` is the success status code.
    pub fn generic_err(status: u8) -> Self {
        // success doesn't belong in an error
        assert_ne!(status, bits::STS_SUCCESS);

        Self {
            dw0: 0,
            status: Self::status_field(StatusCodeType::Generic, status),
        }
    }

    /// Set do-not-retry bit on a Completion already bearing an error status
    ///
    /// # Panics
    ///
    /// Panics if the Completion carries the (generic) success status.
    pub fn dnr(mut self) -> Self {
        // Status Code (bits 8:1) together with Status Code Type (bits 11:9).
        // Both must be examined: a non-generic error may legitimately carry a
        // status code equal to zero, and looking at the status code alone
        // would mistake it for success.
        const SC_SCT_MASK: u16 = 0x0FFE;
        assert_ne!(
            self.status & SC_SCT_MASK,
            Self::status_field(StatusCodeType::Generic, bits::STS_SUCCESS),
            "cannot set DNR on non-error"
        );
        self.status |= 1 << 15;
        self
    }

    /// Helper method to combine [StatusCodeType] and status code
    ///
    /// The status code is placed at bit 1 and the status code type at bit 9.
    /// The "more" (bit 14) and "do not retry" (bit 15) flags are left clear.
    const fn status_field(sct: StatusCodeType, sc: u8) -> u16 {
        ((sc as u16) << 1) | (((sct as u8) as u16) << 9)
        // ((more as u16) << 14) | ((dnr as u16) << 15)
    }
}

impl From<QueueCreateErr> for Completion {
    /// Translate a queue creation failure into the Completion reported to the
    /// guest: an invalid base address becomes a generic "invalid field"
    /// error, while size and queue ID problems map to their command-specific
    /// Create I/O Queue statuses.
    fn from(e: QueueCreateErr) -> Self {
        let cmd_specific = |sc: u8| {
            Completion::specific_err(StatusCodeType::CmdSpecific, sc)
        };

        match e {
            QueueCreateErr::InvalidBaseAddr => {
                Completion::generic_err(bits::STS_INVAL_FIELD)
            }
            QueueCreateErr::InvalidSize => {
                cmd_specific(bits::STS_CREATE_IO_Q_INVAL_QSIZE)
            }
            QueueCreateErr::SubQueueIdAlreadyExists(_) => {
                cmd_specific(bits::STS_CREATE_IO_Q_INVAL_QID)
            }
        }
    }
}

impl From<block::Result> for Completion {
    /// Translate the outcome of a block operation into the Completion
    /// reported to the guest.
    fn from(res: block::Result) -> Completion {
        let cmd_specific = |sc: u8| {
            Completion::specific_err(bits::StatusCodeType::CmdSpecific, sc)
        };

        match res {
            block::Result::Success => Completion::success(),
            block::Result::Failure => {
                Completion::generic_err(bits::STS_DATA_XFER_ERR)
            }
            block::Result::ReadOnly => {
                cmd_specific(bits::STS_WRITE_READ_ONLY_RANGE)
            }
            block::Result::Unsupported => {
                cmd_specific(bits::STS_READ_CONFLICTING_ATTRS)
            }
        }
    }
}

#[cfg(test)]
mod test {
    use crate::accessors::MemAccessor;
    use crate::common::*;
    use crate::hw::nvme::bits::DatasetManagementRangeDefinition;
    use crate::hw::nvme::cmds::DatasetManagementCmd;
    use crate::vmm::mem::PhysMap;

    use super::PrpIter;

    /// Size of the guest memory used by these tests
    const VM_SIZE: usize = 256 * PAGE_SIZE;
    /// Number of 8-byte PRP entries which fit in a single page
    const PRP_PER_PAGE: usize = PAGE_SIZE / 8;

    /// Build a test physical map with a single memory segment covering
    /// `[0, VM_SIZE)`, returning it with the accessor used to reach it.
    fn setup() -> (PhysMap, MemAccessor) {
        let mut pmap = PhysMap::new_test(VM_SIZE);
        pmap.add_test_mem("lowmem".to_string(), 0, VM_SIZE)
            .expect("lowmem seg creation should succeed");

        let acc_mem = pmap.finalize();
        (pmap, acc_mem)
    }

    // Simple helpers to make math below more terse
    const fn region(addr: u64, sz: u64) -> GuestRegion {
        GuestRegion(GuestAddr(addr), sz as usize)
    }
    const fn pages(c: u64) -> u64 {
        PAGE_SIZE as u64 * c
    }

    /// Transfers which are fully described by PRP1.
    #[test]
    fn test_prp_single() {
        let (_pmap, acc_mem) = setup();
        let memctx = acc_mem.access().unwrap();

        // Basic single page
        let mut iter = PrpIter::new(pages(1), 0x1000, 0, &memctx);
        assert_eq!(iter.next(), Some(region(0x1000, pages(1))));
        assert_eq!(iter.next(), None);

        // Sub-single page
        let sub = 0x200;
        let mut iter = PrpIter::new(pages(1) - sub, 0x1000, 0, &memctx);
        assert_eq!(iter.next(), Some(region(0x1000, pages(1) - sub)));
        assert_eq!(iter.next(), None);

        // Sub-single page (with offset)
        let mut iter = PrpIter::new(pages(1) - sub, 0x1000 + sub, 0, &memctx);
        assert_eq!(iter.next(), Some(region(0x1000 + sub, pages(1) - sub)));
        assert_eq!(iter.next(), None);
    }

    /// Transfers which need PRP2 as a second, direct PRP entry.
    #[test]
    fn test_prp_dual() {
        let (_pmap, acc_mem) = setup();
        let memctx = acc_mem.access().unwrap();

        // Basic dual page
        let mut iter = PrpIter::new(pages(2), 0x1000, 0x2000, &memctx);
        assert_eq!(iter.next(), Some(region(0x1000, pages(1))));
        assert_eq!(iter.next(), Some(region(0x2000, pages(1))));
        assert_eq!(iter.next(), None);

        // single page, split by offset
        let sub = 0x200;
        let mut iter = PrpIter::new(pages(1), 0x1000 + sub, 0x2000, &memctx);
        assert_eq!(iter.next(), Some(region(0x1000 + sub, pages(1) - sub)));
        assert_eq!(iter.next(), Some(region(0x2000, sub)));
        assert_eq!(iter.next(), None);

        // less than single page, split by offset
        let sz = pages(1) - 0x200;
        let off = 0x400;
        let mut iter = PrpIter::new(sz, 0x1000 + off, 0x2000, &memctx);
        assert_eq!(iter.next(), Some(region(0x1000 + off, pages(1) - off)));
        assert_eq!(iter.next(), Some(region(0x2000, sz - (pages(1) - off))));
        assert_eq!(iter.next(), None);
    }

    /// Transfers where PRP2 points at a (single page) PRP list.
    #[test]
    fn test_prp_list() {
        // Basic triple page (aligned prplist)
        let (_pmap, acc_mem) = setup();
        let memctx = acc_mem.access().unwrap();

        let listprps: [u64; 2] = [0x2000, 0x3000];
        let listaddr = 0x80000;
        memctx.write(GuestAddr(listaddr), &listprps);
        let mut iter = PrpIter::new(pages(3), 0x1000, listaddr, &memctx);
        assert_eq!(iter.next(), Some(region(0x1000, pages(1))));
        assert_eq!(iter.next(), Some(region(0x2000, pages(1))));
        assert_eq!(iter.next(), Some(region(0x3000, pages(1))));
        assert_eq!(iter.next(), None);

        // Basic triple page (offset prplist)
        let (_pmap, acc_mem) = setup();
        let memctx = acc_mem.access().unwrap();

        let listprps: [u64; 2] = [0x2000, 0x3000];
        let listaddr = 0x80010;
        memctx.write(GuestAddr(listaddr), &listprps);
        let mut iter = PrpIter::new(pages(3), 0x1000, listaddr, &memctx);
        assert_eq!(iter.next(), Some(region(0x1000, pages(1))));
        assert_eq!(iter.next(), Some(region(0x2000, pages(1))));
        assert_eq!(iter.next(), Some(region(0x3000, pages(1))));
        assert_eq!(iter.next(), None);

        // Offset triple page
        let (_pmap, acc_mem) = setup();
        let memctx = acc_mem.access().unwrap();

        let listprps: [u64; 3] = [0x2000, 0x3000, 0x4000];
        let listaddr = 0x80000;
        let off = 0x200;
        memctx.write(GuestAddr(listaddr), &listprps);
        let mut iter = PrpIter::new(pages(3), 0x1000 + off, listaddr, &memctx);
        assert_eq!(iter.next(), Some(region(0x1000 + off, pages(1) - off)));
        assert_eq!(iter.next(), Some(region(0x2000, pages(1))));
        assert_eq!(iter.next(), Some(region(0x3000, pages(1))));
        assert_eq!(iter.next(), Some(region(0x4000, off)));
        assert_eq!(iter.next(), None);
    }

    /// A full-page PRP list whose final slot is consumed as a data entry
    /// (rather than a chain pointer) because it completes the transfer.
    #[test]
    fn test_prp_list_offset_last() {
        // List with offset, where last entry covers less than one page
        let (_pmap, acc_mem) = setup();
        let memctx = acc_mem.access().unwrap();

        let listaddr = 0x80000u64;
        let mut prps: Vec<u64> = Vec::with_capacity(PRP_PER_PAGE);
        let mut bufaddr = 0x2000u64;
        for _idx in 0..PRP_PER_PAGE {
            prps.push(bufaddr);
            bufaddr += pages(1);
        }
        memctx.write_many(GuestAddr(listaddr), &prps);
        let off = 0x200;
        let mut iter = PrpIter::new(
            pages(PRP_PER_PAGE as u64),
            0x1000 + off,
            listaddr,
            &memctx,
        );
        assert_eq!(
            iter.next(),
            Some(region(0x1000 + off, pages(1) - off)),
            "prp1 entry incorrect"
        );

        let mut bufaddr = 0x2000u64;
        for idx in 0..(PRP_PER_PAGE - 1) {
            assert_eq!(
                iter.next(),
                Some(region(bufaddr, pages(1))),
                "bad prp at idx: {idx}"
            );
            bufaddr += pages(1);
        }
        // last prplist entry should be the remaining bytes left over from
        // offsetting the prp1 entry
        assert_eq!(iter.next(), Some(region(bufaddr, off)));
        // and with that the transfer is complete
        assert_eq!(iter.next(), None);
    }

    /// A PRP list whose final slot chains to a second list page.
    #[test]
    fn test_prp_multiple() {
        // Basic multiple-page prplist
        let (_pmap, acc_mem) = setup();
        let memctx = acc_mem.access().unwrap();

        let listaddrs = [0x80000u64, 0x81000u64];
        let mut prps: Vec<u64> = Vec::with_capacity(PRP_PER_PAGE);
        let mut bufaddr = 0x2000u64;

        let entries_first = PRP_PER_PAGE - 1;
        for _idx in 0..entries_first {
            prps.push(bufaddr);
            bufaddr += pages(1);
        }
        // Link to the next list page
        prps.push(listaddrs[1]);
        memctx.write_many(GuestAddr(listaddrs[0]), &prps[..]);

        // populate a few more entries in the next prplist
        let entries_second = 4;
        for slot in prps.iter_mut().take(entries_second) {
            *slot = bufaddr;
            bufaddr += pages(1);
        }
        memctx.write_many(GuestAddr(listaddrs[1]), &prps[..entries_second]);

        let total_entries = 1 + entries_first + entries_second;
        let mut iter = PrpIter::new(
            pages(total_entries as u64),
            0x1000,
            listaddrs[0],
            &memctx,
        );

        let mut bufaddr = 0x1000u64;
        for idx in 0..total_entries {
            assert_eq!(
                iter.next(),
                Some(region(bufaddr, pages(1))),
                "bad prp at idx: {idx}"
            );
            bufaddr += pages(1);
        }
        assert_eq!(iter.next(), None);
    }

    /// Range definitions shared by the Dataset Management tests below
    static RANGES: [DatasetManagementRangeDefinition; 3] = [
        DatasetManagementRangeDefinition {
            context_attributes: 0,
            starting_lba: 0x1000,
            number_logical_blocks: 0x10,
        },
        DatasetManagementRangeDefinition {
            context_attributes: 0,
            starting_lba: 0x2000,
            number_logical_blocks: 0x20,
        },
        DatasetManagementRangeDefinition {
            context_attributes: 0,
            starting_lba: 0x3000,
            number_logical_blocks: 0x30,
        },
    ];

    /// All ranges live in the single buffer named by PRP1.
    #[test]
    fn test_dsmgmt_ranges() {
        let (_pmap, acc_mem) = setup();
        let memctx = acc_mem.access().unwrap();

        let listaddr = 0x80000u64;
        memctx.write_many(GuestAddr(listaddr), &RANGES);

        let cmd = DatasetManagementCmd {
            prp1: listaddr,
            prp2: 0,
            nr: RANGES.len() as u16,
            ad: true,
            _idw: false,
            _idr: false,
        };

        let mut iter = cmd.ranges(&memctx);
        for expected in &RANGES {
            assert_eq!(
                iter.next(),
                Some(Ok(*expected)),
                "bad range definition"
            );
        }
        assert_eq!(iter.next(), None);
    }

    /// The ranges are split across PRP1 and PRP2 on a range boundary.
    #[test]
    fn test_dsmgmt_ranges_dual() {
        let (_pmap, acc_mem) = setup();
        let memctx = acc_mem.access().unwrap();

        let listaddr1 = 0x80FF0u64;
        let listaddr2 = 0x90000u64;
        memctx.write_many(GuestAddr(listaddr1), &RANGES[0..1]);
        memctx.write_many(GuestAddr(listaddr2), &RANGES[1..]);

        let cmd = DatasetManagementCmd {
            prp1: listaddr1,
            prp2: listaddr2,
            nr: RANGES.len() as u16,
            ad: true,
            _idw: false,
            _idr: false,
        };

        let mut iter = cmd.ranges(&memctx);
        for expected in &RANGES {
            assert_eq!(
                iter.next(),
                Some(Ok(*expected)),
                "bad range definition"
            );
        }
        assert_eq!(iter.next(), None);
    }

    /// PRP1 leaves only 8 bytes before the end of its page, which cannot hold
    /// a whole range definition, so the first item must be an error.
    #[test]
    fn test_dsmgmt_ranges_bad_dual() {
        let (_pmap, acc_mem) = setup();
        let memctx = acc_mem.access().unwrap();

        let listaddr1 = 0x80FF8u64;
        let listaddr2 = 0x90000u64;
        memctx.write_many(GuestAddr(listaddr1), &RANGES[0..1]);
        memctx.write_many(GuestAddr(listaddr2), &RANGES[1..]);

        let cmd = DatasetManagementCmd {
            prp1: listaddr1,
            prp2: listaddr2,
            nr: RANGES.len() as u16,
            ad: true,
            _idw: false,
            _idr: false,
        };

        let mut iter = cmd.ranges(&memctx);
        match iter.next() {
            Some(Err(_)) => {}
            other => panic!("expected alignment error, got {other:?}"),
        }
    }

    /// PRP1 points just past the end of guest memory, so the range buffer
    /// cannot be read at all.
    #[test]
    fn test_dsmgmt_ranges_bad_address() {
        let (_pmap, acc_mem) = setup();
        let memctx = acc_mem.access().unwrap();

        let cmd = DatasetManagementCmd {
            prp1: VM_SIZE as u64, // out of bounds
            prp2: 0,
            nr: 1,
            ad: true,
            _idw: false,
            _idr: false,
        };

        let mut iter = cmd.ranges(&memctx);
        match iter.next() {
            Some(Err(_)) => {}
            other => panic!("expected address error, got {other:?}"),
        }
    }
}


================================================
FILE: lib/propolis/src/hw/nvme/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::convert::TryInto;
use std::mem::size_of;
use std::num::NonZeroUsize;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, MutexGuard, Weak};

use crate::accessors::Guard;
use crate::block;
use crate::common::*;
use crate::hw::ids::pci::{PROPOLIS_NVME_DEV_ID, VENDOR_OXIDE};
use crate::hw::ids::OXIDE_OUI;
use crate::hw::pci;
use crate::migrate::*;
use crate::util::id::define_id;
use crate::util::regmap::RegMap;
use crate::vmm::MemAccessed;

use futures::future::BoxFuture;
use lazy_static::lazy_static;
use thiserror::Error;

mod admin;
mod bits;
mod cmds;
mod queue;
mod requests;

use bits::*;
use queue::{CompQueue, QueueId, SubQueue};

define_id! {
    /// Identifier for which NVMe controller in the VM an operation is happening
    /// on.
    ///
    /// This is mostly useful for NVMe-related DTrace probes, where otherwise a
    /// queue number or command ID may be ambiguous across distinct NVMe
    /// controllers in a VM.
    ///
    /// The wrapped `u32` is combined with a queue ID by `devq_id` to form the
    /// single `u64` value handed to probes.
    #[derive(Copy, Clone)]
    pub struct DeviceId(u32);
}

// USDT probe declarations for the NVMe emulation, registered under the
// `propolis` provider.  The function bodies are empty on purpose: only the
// names and argument lists are consumed by the `usdt::provider` attribute.
//
// NOTE(review): the sites which fire these probes are outside this part of the
// file, so the per-probe notes below are inferred from the argument names and
// should be confirmed against those sites.
#[usdt::provider(provider = "propolis")]
mod probes {
    // Presumably a doorbell write: `off` looks like the register offset,
    // `devq_id` the combined device/queue ID (see `devq_id()`), `is_cq` a
    // flag telling CQ from SQ doorbells, and `val` the value written.
    fn nvme_doorbell(off: u64, devq_id: u64, is_cq: u8, val: u16) {}
    // Presumably a write of `val` to the admin CQ doorbell.
    fn nvme_doorbell_admin_cq(val: u16) {}
    // Presumably a write of `val` to the admin SQ doorbell.
    fn nvme_doorbell_admin_sq(val: u16) {}
    // Presumably an admin command being processed, identified by its opcode
    // and the two PRP entries.
    fn nvme_admin_cmd(opcode: u8, prp1: u64, prp2: u64) {}
    // Presumably a notification to the block layer for an SQ, carrying both
    // the NVMe-side and block-side queue identifiers.
    fn nvme_block_notify(devsq_id: u64, block_devqid: u64, occupied_hint: u16) {
    }
}

/// Pack an NVMe device ID and a queue ID into one `u64` for use in probes.
///
/// The queue ID lands in the low 16 bits, with the device ID placed directly
/// above it.
pub(crate) fn devq_id(dev: DeviceId, queue: QueueId) -> u64 {
    // The packing below only works while queue IDs fit in 16 bits, so check
    // that at compile time.  Clippy considers the comparison absurd for the
    // current `QueueId` type, hence the `allow`.  A rustc error here means the
    // type of `QueueId` changed and the layout needs to be revisited.
    #[allow(clippy::absurd_extreme_comparisons)]
    {
        static_assertions::const_assert!(QueueId::MAX <= u16::MAX);
    }

    let device_bits = (dev.0 as u64) << 16;
    let queue_bits = queue as u64;
    device_bits | queue_bits
}

/// The max number of MSI-X interrupts we support
///
/// NOTE(review): presumably the MSI-X table size advertised when the PCI
/// device is built; the use site is outside this part of the file -- confirm.
const NVME_MSIX_COUNT: u16 = 1024;

/// Errors produced by the emulated NVMe controller.
///
/// Variants carrying `#[from]` are converted automatically from the
/// corresponding error types of the `queue` and `cmds` submodules.
#[derive(Debug, Error)]
pub enum NvmeError {
    /// Unsupported CQES requested
    #[error("the requested CQ entry size is unsupported")]
    UnsupportedCompQueueEntrySize,

    /// Unsupported SQES requested
    #[error("the requested SQ entry size is unsupported")]
    UnsupportedSubQueueEntrySize,

    /// Unsupported AMS requested
    #[error("the requested arbitration mechanism is unsupported")]
    UnsupportedArbitrationMechanism,

    /// Unsupported MPS requested
    #[error("the requested memory page size is unsupported")]
    UnsupportedMemPageSize,

    /// Unsupported CSS requested
    #[error("the requested command set is unsupported")]
    UnsupportedCommandSet,

    /// The specified Completion Queue ID did not correspond to a valid
    /// Completion Queue
    #[error("the completion queue specified ({0}) is invalid")]
    InvalidCompQueue(QueueId),

    /// The specified Submission Queue ID did not correspond to a valid
    /// Submission Queue
    #[error("the submission queue specified ({0}) is invalid")]
    InvalidSubQueue(QueueId),

    /// The specified Completion Queue ID already exists
    #[error("the completion queue specified ({0}) already exists")]
    CompQueueAlreadyExists(QueueId),

    /// The specified Submission Queue ID already exists
    #[error("the submission queue specified ({0}) already exists")]
    SubQueueAlreadyExists(QueueId),

    /// Can't delete a CQ with associated SQs.  Carries the CQ ID and the
    /// number of SQs still associated with it.
    #[error("the completion queue specified ({0}) still has ({1}) associated submission queue(s)")]
    AssociatedSubQueuesStillExist(QueueId, usize),

    /// Failed to create Queue
    #[error("failed to create queue: {0}")]
    QueueCreateErr(#[from] queue::QueueCreateErr),

    /// Failed to update Queue
    #[error("failed to update queue: {0}")]
    QueueUpdateError(#[from] queue::QueueUpdateError),

    /// MSI-X Interrupt handle is unavailable
    #[error("the MSI-X interrupt handle is unavailable")]
    MsixHdlUnavailable,

    /// Couldn't parse command
    #[error("failed to parse command: {0}")]
    CommandParseErr(#[from] cmds::ParseErr),

    /// Maximum number of namespaces already attached to controller
    #[error("maximum number of namespaces already attached to controller")]
    TooManyNamespaces,

    /// The specified Namespace ID did not correspond to a valid Namespace
    #[error("the namespace specified ({0}) is invalid")]
    InvalidNamespace(u32),

    /// Controller cannot access guest memory
    #[error("memory access inaccessible")]
    MemoryInaccessible,
}

/// Internal NVMe Controller State
///
/// Holds the controller register values together with the admin queue base
/// addresses.  A controller reset (`NvmeCtrl::reset`) zeroes `cc` and `csts`
/// and leaves the remaining fields as they are.
#[derive(Debug, Default)]
struct CtrlState {
    /// Controller Capabilities
    cap: Capabilities,

    /// Controller Configuration
    cc: Configuration,

    /// Controller Status
    csts: Status,

    /// Admin Queue Attributes
    ///
    /// The admin queue sizes stored here are 0's based; one is added when the
    /// admin queues are created.
    aqa: AdminQueueAttrs,

    /// The 64-bit Guest address for the Admin Submission Queue
    ///
    /// ASQB
    /// See NVMe 1.0e Section 3.1.8 Offset 28h: ASQ - Admin Submission Queue
    /// Base Address
    admin_sq_base: u64,

    /// The 64-bit Guest address for the Admin Completion Queue
    ///
    /// ACQB
    /// See NVMe 1.0e Section 3.1.9 Offset 30h: ACQ - Admin Completion Queue
    /// Base Address
    admin_cq_base: u64,
}

/// The max number of completion or submission queues we support.
/// Note: This includes the admin completion/submission queues.
///
/// This sizes the per-queue-ID slot arrays in `NvmeCtrl` and `NvmeQueues`, so
/// any queue ID at or above this value is rejected as invalid.
const MAX_NUM_QUEUES: usize = 16;

/// The max number of I/O completion or submission queues we support.
/// Always 1 less than the total number w/ the admin queues.
const MAX_NUM_IO_QUEUES: usize = MAX_NUM_QUEUES - 1;

/// NVMe Controller
///
/// The authoritative controller state: registers, the lists of live queues,
/// and the Identify data returned to the guest.
struct NvmeCtrl {
    /// A distinguishing identifier for this NVMe controller across the VM.
    /// Useful mostly to distinguish queues and commands as seen in probes.
    /// `device_id` is held constant across NVMe resets, but not persisted
    /// across export and import.
    device_id: DeviceId,

    /// Internal NVMe Controller state
    ctrl: CtrlState,

    /// Doorbell Buffer Config state
    ///
    /// When set, it is applied to every queue created afterwards.  Cleared on
    /// controller reset.
    doorbell_buf: Option<queue::DoorbellBuffer>,

    /// MSI-X Interrupt Handle to signal VM
    ///
    /// Completion queues cannot be created while this is `None`.
    msix_hdl: Option<pci::MsixHdl>,

    /// The list of Completion Queues handled by the controller, indexed by
    /// queue ID (slot 0 is the admin CQ)
    cqs: [Option<Arc<CompQueue>>; MAX_NUM_QUEUES],

    /// The list of Submission Queues handled by the controller, indexed by
    /// queue ID (slot 0 is the admin SQ)
    sqs: [Option<Arc<SubQueue>>; MAX_NUM_QUEUES],

    /// The Identify structure returned for Identify controller commands
    ctrl_ident: IdentifyController,

    /// The Identify structure returned for Identify namespace commands
    ns_ident: IdentifyNamespace,

    /// Read-only flag copied from the `block::DeviceInfo` most recently
    /// passed to `update_block_info`
    read_only: bool,
}

impl NvmeCtrl {
    /// Builds the admin completion queue and then the admin submission queue
    /// from the ACQ/ASQ base addresses and the sizes held in AQA.
    ///
    /// Both queues use ID `0` (`queue::ADMIN_QUEUE_ID`).  Any failure from
    /// creating either queue is returned to the caller.
    fn create_admin_queues(&mut self, nvme: &PciNvme) -> Result<(), NvmeError> {
        // Admin CQ uses interrupt vector 0 (See NVMe 1.0e Section 3.1.9 ACQ)
        let admin_cq = queue::CreateParams {
            id: queue::ADMIN_QUEUE_ID,
            device_id: self.device_id,
            base: GuestAddr(self.ctrl.admin_cq_base),
            // AQA.ACQS is 0's based, so add one for the real size
            size: u32::from(self.ctrl.aqa.acqs()) + 1,
        };
        self.create_cq(admin_cq, true, 0, nvme)?;

        let admin_sq = queue::CreateParams {
            id: queue::ADMIN_QUEUE_ID,
            device_id: self.device_id,
            base: GuestAddr(self.ctrl.admin_sq_base),
            // AQA.ASQS is 0's based, so add one for the real size
            size: u32::from(self.ctrl.aqa.asqs()) + 1,
        };
        self.create_sq(admin_sq, true, queue::ADMIN_QUEUE_ID, nvme)?;

        Ok(())
    }

    /// Instantiates a [completion queue](CompQueue) and registers it with the
    /// controller.
    ///
    /// `is_admin` states which kind of queue the caller means to create: an
    /// admin queue must use ID 0, while an IO queue must use a non-zero ID.
    ///
    /// # Errors
    ///
    /// Fails if the ID is at or above `MAX_NUM_QUEUES`, if an IO queue is
    /// requested with ID 0, if the ID is already in use, if no MSI-X handle
    /// is available, or if constructing the queue itself fails.
    ///
    /// # Panics
    ///
    /// Panics if `is_admin` is set and the ID is not 0.
    fn create_cq(
        &mut self,
        params: queue::CreateParams,
        is_admin: bool,
        iv: u16,
        nvme: &PciNvme,
    ) -> Result<Arc<CompQueue>, NvmeError> {
        let cqid = params.id;
        let slot = cqid as usize;
        if slot >= MAX_NUM_QUEUES {
            return Err(NvmeError::InvalidCompQueue(cqid));
        }
        match (is_admin, cqid) {
            // Creating admin queue(s) with wrong ID is programmer error
            (true, _) => assert_eq!(cqid, 0),
            // A guest may not create an IO queue using the ID which belongs
            // to the admin queue.
            (false, 0) => return Err(NvmeError::InvalidCompQueue(cqid)),
            (false, _) => {}
        }
        if self.cqs[slot].is_some() {
            return Err(NvmeError::CompQueueAlreadyExists(cqid));
        }

        let msix_hdl = match self.msix_hdl.as_ref() {
            Some(hdl) => hdl.clone(),
            None => return Err(NvmeError::MsixHdlUnavailable),
        };
        let acc_mem =
            nvme.pci_state.acc_mem.child(Some(format!("CompQueue-{cqid}")));
        let cq = Arc::new(CompQueue::new(params, iv, msix_hdl, acc_mem)?);

        // Queues created after Doorbell Buffer Config inherit that config.
        if let Some(db_buf) = self.doorbell_buf {
            cq.set_db_buf(Some(db_buf), false);
        }

        // Record the queue both in the authoritative list and in the
        // per-slot locked view held by the device.
        self.cqs[slot] = Some(Arc::clone(&cq));
        nvme.queues.set_cq_slot(cqid, Some(Arc::clone(&cq)));
        Ok(cq)
    }

    /// Creates a new [submission queue](SubQueue) for the controller.
    ///
    /// The SQ ID must not already be in use.  For the admin queue, it must be
    /// 0, while for IO queues, it must _not_ be 0.  This is explicitly enforced
    /// through the `is_admin` argument.  The `cqid` to which this SQ will be
    /// associated must correspond to an existing CQ.
    ///
    /// # Errors
    ///
    /// - [`NvmeError::InvalidSubQueue`] if `sqid` is at or above
    ///   `MAX_NUM_QUEUES`, or is 0 for an IO queue.
    /// - [`NvmeError::InvalidCompQueue`] if `cqid` is 0 for an IO queue or
    ///   does not name an existing CQ.
    /// - [`NvmeError::SubQueueAlreadyExists`] if `sqid` is already in use.
    /// - Any error from constructing the queue itself.
    ///
    /// # Panics
    ///
    /// Panics if `is_admin` is set and either `sqid` or `cqid` is not 0.
    fn create_sq(
        &mut self,
        params: queue::CreateParams,
        is_admin: bool,
        cqid: QueueId,
        nvme: &PciNvme,
    ) -> Result<Arc<SubQueue>, NvmeError> {
        let sqid = params.id;
        if (sqid as usize) >= MAX_NUM_QUEUES {
            return Err(NvmeError::InvalidSubQueue(sqid));
        }
        if is_admin {
            // Creating admin queue(s) with wrong ID is programmer error
            assert_eq!(sqid, 0);
            // So too is associating an admin SQ to an IO CQ
            assert_eq!(cqid, 0);
        } else {
            if sqid == 0 {
                // Guest requests to create an IO queue with the ID belonging to
                // the admin queue is not allowed.  Report the offending SQ ID
                // (not the CQ ID it asked to be associated with).
                return Err(NvmeError::InvalidSubQueue(sqid));
            }
            if cqid == 0 {
                // Guest requests to associate the to-be-created IO SQ with an
                // admin CQ is not allowed.
                return Err(NvmeError::InvalidCompQueue(cqid));
            }
        }
        if self.sqs[sqid as usize].is_some() {
            return Err(NvmeError::SubQueueAlreadyExists(sqid));
        }
        let cq = self.get_cq(cqid)?;
        let sq = SubQueue::new(
            params,
            cq,
            nvme.pci_state.acc_mem.child(Some(format!("SubQueue-{sqid}"))),
        )?;
        // Queues created after Doorbell Buffer Config inherit that config.
        if self.doorbell_buf.is_some() {
            sq.set_db_buf(self.doorbell_buf, false);
        }
        self.sqs[sqid as usize] = Some(sq.clone());
        nvme.queues.set_sq_slot(sqid, Some(sq.clone()));
        Ok(sq)
    }

    /// Drops the [`CompQueue`] registered under `cqid`.
    ///
    /// # Errors
    ///
    /// Returns [`NvmeError::InvalidCompQueue`] when `cqid` is out of range or
    /// names an empty slot, and
    /// [`NvmeError::AssociatedSubQueuesStillExist`] when submission queues
    /// are still associated with the CQ.
    fn delete_cq(
        &mut self,
        cqid: QueueId,
        nvme: &PciNvme,
    ) -> Result<(), NvmeError> {
        let slot = self
            .cqs
            .get_mut(cqid as usize)
            .ok_or(NvmeError::InvalidCompQueue(cqid))?;

        // Make sure this CQ has no more associated SQs
        let sqs = match slot.as_ref() {
            Some(cq) => cq.associated_sqs(),
            None => return Err(NvmeError::InvalidCompQueue(cqid)),
        };
        if sqs > 0 {
            return Err(NvmeError::AssociatedSubQueuesStillExist(cqid, sqs));
        }

        // Remove it from the authoritative list of CQs
        *slot = None;
        nvme.queues.set_cq_slot(cqid, None);
        Ok(())
    }

    /// Drops the [`SubQueue`] registered under `sqid`.
    ///
    /// **NOTE:** Only our list of active SQs is updated here; IO requests
    ///           already in flight for this SQ may still exist.  Once this
    ///           returns, new doorbell requests for the SQ are no longer
    ///           answered.
    ///
    /// # Errors
    ///
    /// Returns [`NvmeError::InvalidSubQueue`] when `sqid` is out of range or
    /// names an empty slot.
    fn delete_sq(
        &mut self,
        sqid: QueueId,
        nvme: &PciNvme,
    ) -> Result<(), NvmeError> {
        match self.sqs.get_mut(sqid as usize) {
            // Remove it from the authoritative list of SQs
            Some(slot) if slot.is_some() => *slot = None,
            _ => return Err(NvmeError::InvalidSubQueue(sqid)),
        }
        nvme.queues.set_sq_slot(sqid, None);
        Ok(())
    }

    /// Looks up the [`CompQueue`] registered under `cqid` and returns a new
    /// handle to it.
    ///
    /// Fails with [`NvmeError::InvalidCompQueue`] when `cqid` is out of range
    /// or no queue exists under that ID.
    fn get_cq(&self, cqid: QueueId) -> Result<Arc<CompQueue>, NvmeError> {
        self.cqs
            .get(cqid as usize)
            .and_then(Option::as_ref)
            .cloned()
            .ok_or(NvmeError::InvalidCompQueue(cqid))
    }

    /// Looks up the [`SubQueue`] registered under `sqid` and returns a new
    /// handle to it.
    ///
    /// Fails with [`NvmeError::InvalidSubQueue`] when `sqid` is out of range
    /// or no queue exists under that ID.
    fn get_sq(&self, sqid: QueueId) -> Result<Arc<SubQueue>, NvmeError> {
        self.sqs
            .get(sqid as usize)
            .and_then(Option::as_ref)
            .cloned()
            .ok_or(NvmeError::InvalidSubQueue(sqid))
    }

    /// Returns a handle to the Admin [`CompQueue`] (queue ID
    /// `queue::ADMIN_QUEUE_ID`).
    ///
    /// Fails with [`NvmeError::InvalidCompQueue`] if the admin CQ has not been
    /// created.
    fn get_admin_cq(&self) -> Result<Arc<CompQueue>, NvmeError> {
        self.get_cq(queue::ADMIN_QUEUE_ID)
    }

    /// Returns a handle to the Admin [`SubQueue`] (queue ID
    /// `queue::ADMIN_QUEUE_ID`).
    ///
    /// Fails with [`NvmeError::InvalidSubQueue`] if the admin SQ has not been
    /// created.
    fn get_admin_sq(&self) -> Result<Arc<SubQueue>, NvmeError> {
        self.get_sq(queue::ADMIN_QUEUE_ID)
    }

    /// Finishes bringing up a freshly created IO SQ.
    ///
    /// Pushes the current transfer parameters to the queue, then associates
    /// it with the block attachment under the block queue ID derived from its
    /// SQ ID.
    ///
    /// # Panics
    ///
    /// Panics if `sq` is the admin SQ (ID 0).
    fn io_sq_post_create(&self, nvme: &PciNvme, sq: Arc<SubQueue>) {
        let sqid = sq.id();
        assert!(sqid != 0, "attempting IO SQ setup on admin SQ");

        let params = self.transfer_params();
        sq.update_params(params);

        let block_qid = queue::sqid_to_block_qid(sqid);
        let acc_mem =
            nvme.pci_state.acc_mem.child(Some(format!("SubQueue-{sqid}")));
        let block_queue = requests::NvmeBlockQueue::new(sq, acc_mem);
        nvme.block_attach.queue_associate(block_qid, block_queue);
    }

    /// Applies a guest-requested Controller Configuration (`CC`) value.
    ///
    /// Queue entry sizes are validated and stored whenever they are non-zero.
    /// The arbitration mechanism, memory page size and command set are only
    /// validated and stored while the controller is disabled.  Settings are
    /// applied as they are validated, so an error may leave earlier ones in
    /// place.
    ///
    /// On any validation failure Controller Fail Status (`CSTS.CFS`) is set
    /// and the error is returned.
    fn configure(&mut self, cc: Configuration) -> Result<(), NvmeError> {
        let mut apply = || -> Result<(), NvmeError> {
            // Make sure the requested Queue sizes match our expectations
            // Note: we only compare to `required` as we mandate that
            //       required == maximum. See `Capabilities::mqes` value.
            let iocqes = cc.iocqes();
            if iocqes > 0 {
                if iocqes != self.ctrl_ident.cqes.required() {
                    return Err(NvmeError::UnsupportedCompQueueEntrySize);
                }
                self.ctrl.cc.set_iocqes(iocqes);
            }
            let iosqes = cc.iosqes();
            if iosqes > 0 {
                if iosqes != self.ctrl_ident.sqes.required() {
                    return Err(NvmeError::UnsupportedSubQueueEntrySize);
                }
                self.ctrl.cc.set_iosqes(iosqes);
            }

            // Everything else may only be configured while we're disabled
            if self.ctrl.cc.enabled() {
                return Ok(());
            }

            // We only support round robin arbitration
            if cc.ams() != ArbitrationMechanism::RoundRobin {
                return Err(NvmeError::UnsupportedArbitrationMechanism);
            }

            // The MPS must lie within the range advertised in CAP
            let mps = cc.mps();
            if mps < self.ctrl.cap.mpsmin() || mps > self.ctrl.cap.mpsmax() {
                return Err(NvmeError::UnsupportedMemPageSize);
            }

            // No non-standard command sets
            if cc.css() != IOCommandSet::Nvm {
                return Err(NvmeError::UnsupportedCommandSet);
            }

            self.ctrl.cc.set_ams(cc.ams());
            self.ctrl.cc.set_mps(mps);
            self.ctrl.cc.set_css(cc.css());
            Ok(())
        };

        let outcome = apply();
        if outcome.is_err() {
            // Got some bad config, set Controller Fail Status
            self.ctrl.csts.set_cfs(true);
        }
        outcome
    }

    /// Readies the controller for processing requests.
    ///
    /// At present that consists solely of creating the Admin Queues; any
    /// failure doing so is returned to the caller.
    fn enable(&mut self, nvme: &PciNvme) -> Result<(), NvmeError> {
        self.create_admin_queues(nvme)
    }

    /// Performs a Controller Reset.
    ///
    /// The reset deletes all I/O Submission & Completion Queues, resets the
    /// Admin Submission & Completion Queues, and brings the hardware to an
    /// idle state.  PCI Express registers and the Admin Queue registers (AQA,
    /// ASQ, or ACQ) are not affected.  All other controller registers, and
    /// internal controller state that is not persistent across power states,
    /// are reset to their default values.  The controller shall ensure that
    /// there is no data loss for commands that have had corresponding
    /// completion queue entries posted to an I/O Completion Queue prior to
    /// the reset operation.
    fn reset(&mut self, nvme: &PciNvme) {
        // Flag the controller as disabled up front, so no inbound doorbells
        // are accepted on the queues which are about to be deleted.
        nvme.is_enabled.store(false, Ordering::Release);

        // Let go of our queue references, which should be the only strong
        // refs at this point.  In-flight I/O commands are implicitly aborted
        // once they attempt to issue their completions.
        for (idx, slot) in self.sqs.iter_mut().enumerate() {
            // The removed reference is held until the end of this iteration.
            let removed = slot.take();
            if removed.is_none() {
                continue;
            }
            let sqid = idx as QueueId;
            if sqid != queue::ADMIN_QUEUE_ID {
                // TODO: cancel any existing requests?
                nvme.block_attach
                    .queue_dissociate(queue::sqid_to_block_qid(sqid));
            }
            nvme.queues.set_sq_slot(sqid, None);
        }
        for (idx, slot) in self.cqs.iter_mut().enumerate() {
            // The removed reference is held until the end of this iteration.
            let removed = slot.take();
            if removed.is_none() {
                continue;
            }
            nvme.queues.set_cq_slot(idx as QueueId, None);
        }

        // Clear the CC & CSTS registers
        // Sets CC.EN=0 and CSTS.RDY=0
        self.ctrl.cc = Configuration(0);
        self.ctrl.csts = Status(0);

        // Other bits which are cleared on reset
        self.doorbell_buf = None;

        // The other registers (e.g. CAP/VS) we never modify
        // and thus don't need to do anything on reset
    }

    /// Derives the data transfer parameters handed to Submission Queues.
    ///
    /// The LBA data size comes from the currently selected LBA format (low
    /// four bits of FLBAS).  The maximum transfer size comes from MDTS: zero
    /// means unlimited, otherwise it is the minimum page size shifted left
    /// by MDTS.
    fn transfer_params(&self) -> queue::TransferParams {
        let format_idx = (self.ns_ident.flbas & 0xF) as usize;
        let lba_data_size = 1u64 << self.ns_ident.lbaf[format_idx].lbads;

        let mdts = self.ctrl_ident.mdts;
        let max_data_transfer_size = if mdts == 0 {
            u64::MAX
        } else {
            (self.ctrl.cap.mpsmin_sz() as u64) << mdts
        };

        queue::TransferParams { lba_data_size, max_data_transfer_size }
    }

    /// Refreshes namespace data from the attached block device's `info`.
    ///
    /// Updates the namespace size fields and the data size of LBA format 0,
    /// pushes the resulting transfer parameters to every existing SQ, and
    /// records whether the device is read-only.
    fn update_block_info(&mut self, info: block::DeviceInfo) {
        // No thin provisioning so nsze == ncap == nuse
        let nsze = info.total_size;
        self.ns_ident.nsze = nsze;
        self.ns_ident.ncap = nsze;
        self.ns_ident.nuse = nsze;
        self.ns_ident.lbaf[0].lbads = info.block_size.trailing_zeros() as u8;

        // Communicate new parameters to SQs
        let params = self.transfer_params();
        for sq in self.sqs.iter().flatten() {
            sq.update_params(params);
        }

        self.read_only = info.read_only;
    }

    /// Returns the configured Memory Page Size (MPS) in bytes.
    fn get_mps(&self) -> u64 {
        // "The memory page size is (2 ^ (12 + MPS))"
        let shift = 12 + self.ctrl.cc.mps();
        1u64 << shift
    }

    /// Captures the controller state as a migration payload.
    ///
    /// Every existing CQ and SQ is exported.  The Doorbell Buffer addresses
    /// take their default values when no doorbell buffer is configured.
    fn export(&self) -> migrate::NvmeCtrlV1 {
        let cqs = self
            .cqs
            .iter()
            .filter_map(Option::as_ref)
            .map(|cq| cq.export())
            .collect();
        let sqs = self
            .sqs
            .iter()
            .filter_map(Option::as_ref)
            .map(|sq| sq.export())
            .collect();
        let (dbbuf_shadow, dbbuf_evtidx) = match &self.doorbell_buf {
            Some(buf) => (buf.shadow.0, buf.eventidx.0),
            None => Default::default(),
        };

        let regs = &self.ctrl;
        migrate::NvmeCtrlV1 {
            cap: regs.cap.0,
            cc: regs.cc.0,
            csts: regs.csts.0,
            aqa: regs.aqa.0,
            acq_base: regs.admin_cq_base,
            asq_base: regs.admin_sq_base,
            dbbuf_shadow,
            dbbuf_evtidx,
            cqs,
            sqs,
        }
    }

    /// Replaces the controller state with the contents of a migration
    /// payload.
    ///
    /// The controller is reset first, then the registers are loaded and the
    /// queues described by the payload are recreated (CQs before SQs, since
    /// each SQ must refer to an existing CQ).  Doorbell Buffer state, if
    /// present in the payload, is applied once all queues exist.
    ///
    /// # Errors
    ///
    /// Returns [`MigrateStateError::ImportFailed`] if a queue cannot be
    /// created from the payload, including the case of an admin SQ which is
    /// associated with a non-admin CQ.  Errors from importing the per-queue
    /// state are passed through.
    fn import(
        &mut self,
        state: migrate::NvmeCtrlV1,
        nvme: &PciNvme,
    ) -> Result<(), MigrateStateError> {
        // TODO: verify that controller state is consistent with SQ/CQs defined
        // in the payload

        // If any queues exist, clear them out first through a reset.
        self.reset(nvme);

        // TODO: bitstruct doesn't have a validation routine?
        self.ctrl.cap.0 = state.cap;
        self.ctrl.cc.0 = state.cc;
        self.ctrl.csts.0 = state.csts;
        self.ctrl.aqa.0 = state.aqa;

        self.ctrl.admin_cq_base = state.acq_base;
        self.ctrl.admin_sq_base = state.asq_base;

        // Begin with empty DoorbellBuffer state, so it is not automatically
        // configured as we are creating CQs & SQs.
        self.doorbell_buf = None;

        for cqi in state.cqs {
            let is_admin_queue = cqi.id == 0;
            self.create_cq(
                queue::CreateParams {
                    id: cqi.id,
                    device_id: self.device_id,
                    base: GuestAddr(cqi.base),
                    size: cqi.size,
                },
                is_admin_queue,
                cqi.iv,
                nvme,
            )
            .map_err(|e| {
                MigrateStateError::ImportFailed(format!(
                    "NVMe: failed to create CQ: {}",
                    e
                ))
            })?
            .import(cqi)?;
        }

        for sqi in state.sqs {
            let is_admin_queue = sqi.id == 0;
            // `create_sq` treats an admin SQ bound to a non-admin CQ as a
            // programmer error and panics.  Here the IDs come from the
            // payload, so reject such input as a failed import instead.
            if is_admin_queue && sqi.cq_id != 0 {
                return Err(MigrateStateError::ImportFailed(format!(
                    "NVMe: admin SQ associated with non-admin CQ {}",
                    sqi.cq_id
                )));
            }
            let sq = self
                .create_sq(
                    queue::CreateParams {
                        id: sqi.id,
                        device_id: self.device_id,
                        base: GuestAddr(sqi.base),
                        size: sqi.size,
                    },
                    is_admin_queue,
                    sqi.cq_id,
                    nvme,
                )
                .map_err(|e| {
                    MigrateStateError::ImportFailed(format!(
                        "NVMe: failed to create SQ: {}",
                        e
                    ))
                })?;
            if !is_admin_queue {
                self.io_sq_post_create(nvme, sq.clone());
            }
            sq.import(sqi)?;
        }

        // With the queues created, we can inject any Doorbell Buffer state.
        //
        // When a guest enables this feature, it results in a write to the
        // buffer page with the current state.  We explicitly skip that step
        // (specifying `is_import = true`) since copying of the guest memory
        // pages will have migrated that state already.
        if state.dbbuf_shadow != 0 && state.dbbuf_evtidx != 0 {
            self.doorbell_buf = Some(queue::DoorbellBuffer {
                shadow: GuestAddr(state.dbbuf_shadow),
                eventidx: GuestAddr(state.dbbuf_evtidx),
            });
            for cq in self.cqs.iter().flatten() {
                cq.set_db_buf(self.doorbell_buf, true);
            }
            for sq in self.sqs.iter().flatten() {
                sq.set_db_buf(self.doorbell_buf, true);
            }
        }

        Ok(())
    }
}

/// Per-slot locked view of the controller's Submission and Completion queues.
///
/// Each queue ID has its own [Mutex], so looking up or replacing one queue
/// does not contend with operations on another.  Every slot is `None` until a
/// queue is installed with `set_sq_slot`/`set_cq_slot`.
#[derive(Default)]
struct NvmeQueues {
    /// Submission Queue slots, indexed by queue ID
    sqs: [Mutex<Option<Arc<SubQueue>>>; MAX_NUM_QUEUES],
    /// Completion Queue slots, indexed by queue ID
    cqs: [Mutex<Option<Arc<CompQueue>>>; MAX_NUM_QUEUES],
}
impl NvmeQueues {
    /// Swaps `queue` into the [SubQueue] slot for `sqid`.
    ///
    /// # Panics
    ///
    /// Panics if `sqid` is out of range, if a queue is being installed into
    /// an occupied slot, or if an empty slot is being vacated.
    fn set_sq_slot(&self, sqid: QueueId, queue: Option<Arc<SubQueue>>) {
        let installing = queue.is_some();

        let mut slot = self
            .sqs
            .get(sqid as usize)
            .expect("sqid should be valid")
            .lock()
            .unwrap();
        let previous = std::mem::replace(&mut *slot, queue);
        // Release the slot lock before checking what was there.
        drop(slot);

        // The only expected transitions are empty -> populated (queue
        // creation) and populated -> empty (queue deletion).  Exchanging one
        // SQ for a different one in a single step, or clearing a slot which
        // was already empty, would be an unexpected operation.
        if installing {
            assert!(previous.is_none(), "SQ slot should be empty");
        } else {
            assert!(previous.is_some(), "SQ slot should be occupied");
        }
    }

    /// Swaps `queue` into the [CompQueue] slot for `cqid`.
    ///
    /// # Panics
    ///
    /// Panics if `cqid` is out of range, if a queue is being installed into
    /// an occupied slot, or if an empty slot is being vacated.
    fn set_cq_slot(&self, cqid: QueueId, queue: Option<Arc<CompQueue>>) {
        let installing = queue.is_some();

        let mut slot = self
            .cqs
            .get(cqid as usize)
            .expect("cqid should be valid")
            .lock()
            .unwrap();
        let previous = std::mem::replace(&mut *slot, queue);
        // Release the slot lock before checking what was there.
        drop(slot);

        // As with SQ slots, only empty -> populated and populated -> empty
        // transitions are expected.
        if installing {
            assert!(previous.is_none(), "CQ slot should be empty");
        } else {
            assert!(previous.is_some(), "CQ slot should be occupied");
        }
    }

    /// Locks the slot for `sqid` and hands back its guard, provided the slot
    /// currently holds a [SubQueue].
    ///
    /// Returns `None` when `sqid` is out of range or the slot is empty, so a
    /// returned `Some(guard)` implies `guard.is_some()`.
    fn get_sq(
        &self,
        sqid: QueueId,
    ) -> Option<MutexGuard<'_, Option<Arc<SubQueue>>>> {
        let slot = self.sqs.get(sqid as usize)?;
        let guard = slot.lock().unwrap();
        if guard.is_some() {
            Some(guard)
        } else {
            None
        }
    }

    /// Locks the slot for `cqid` and hands back its guard, provided the slot
    /// currently holds a [CompQueue].
    ///
    /// Returns `None` when `cqid` is out of range or the slot is empty, so a
    /// returned `Some(guard)` implies `guard.is_some()`.
    fn get_cq(
        &self,
        cqid: QueueId,
    ) -> Option<MutexGuard<'_, Option<Arc<CompQueue>>>> {
        let slot = self.cqs.get(cqid as usize)?;
        let guard = slot.lock().unwrap();
        if guard.is_some() {
            Some(guard)
        } else {
            None
        }
    }
}

/// NVMe over PCIe
///
/// Combines the lock-protected controller state ([NvmeCtrl]) with the pieces
/// which are reachable without taking that lock: the enabled flag, the device
/// ID, and the per-slot locked queue references.
pub struct PciNvme {
    /// NVMe Controller
    state: Mutex<NvmeCtrl>,

    /// Duplicate of the controller-enabled (`CC.EN`) state, but not requiring
    /// locking [NvmeCtrl] to read.  It is used to gate per-queue doorbell
    /// accesses without stacking them up behind one central lock.
    ///
    /// Controller reset clears this flag before any queues are removed.
    is_enabled: AtomicBool,

    /// Duplicate of the controller NVMe device ID, but not requiring locking
    /// [NvmeCtrl] to read.  This is used to provide additional context in
    /// NVMe-related probes.
    device_id: DeviceId,

    /// Access to NVMe Submission and Completion queues.
    ///
    /// These are protected with per-slot (queue ID) locks, so actions taken on
    /// a single queue will not contend with others.  The queue references
    /// contained within are kept in sync with those housed in the [NvmeCtrl]
    /// state.
    queues: NvmeQueues,

    /// PCI device state
    pci_state: pci::DeviceState,

    /// Block attachment point
    ///
    /// IO submission queues are associated with (and dissociated from) this
    /// attachment as they are created and torn down.
    pub block_attach: block::DeviceAttachment,

    /// Logger resource
    log: slog::Logger,
}

impl PciNvme {
    /// Create a new pci-nvme device with the given values
    pub fn create(
        serial_number: &[u8; 20],
        mdts: Option<u8>,
        log: slog::Logger,
    ) -> Arc<Self> {
        let builder = pci::Builder::new(pci::Ident {
            vendor_id: VENDOR_OXIDE,
            device_id: PROPOLIS_NVME_DEV_ID,
            sub_vendor_id: VENDOR_OXIDE,
            sub_device_id: PROPOLIS_NVME_DEV_ID,
            device_class: pci::bits::CLASS_STORAGE,
            device_subclass: pci::bits::SUBCLASS_STORAGE_NVM,
            prog_if: pci::bits::PROGIF_ENTERPRISE_NVME,
            ..Default::default()
        });

        // We have unit tests that these are 16 and 64 bytes, respectively
        // But just make sure as we specify these as powers of 2 in places
        debug_assert!(size_of::<CompletionQueueEntry>().is_power_of_two());
        debug_assert!(size_of::<SubmissionQueueEntry>().is_power_of_two());
        let cqes = size_of::<CompletionQueueEntry>().trailing_zeros() as u8;
        let sqes = size_of::<SubmissionQueueEntry>().trailing_zeros() as u8;

        // Initialize the Identify structure returned when the host issues
        // an Identify Controller command.
        let ctrl_ident = bits::IdentifyController {
            vid: VENDOR_OXIDE,
            ssvid: VENDOR_OXIDE,
            sn: *serial_number,
            ieee: OXIDE_OUI,
            mdts: mdts.unwrap_or(0),
            // We use standard Completion/Submission Queue Entry structures with no extra
            // data, so required (minimum) == maximum
            sqes: NvmQueueEntrySize(0).with_maximum(sqes).with_required(sqes),
            cqes: NvmQueueEntrySize(0).with_maximum(cqes).with_required(cqes),
            // Supporting multiple namespaces complicates I/O dispatching,
            // so for now we limit the device to a single namespace.
            nn: 1,
            // bit 2 indicates support for the Dataset Management command
            oncs: (1 << 2),
            // bit 0 indicates volatile write cache is present
            vwc: 1,
            // bit 8 indicates Doorbell Buffer support
            oacs: (1 << 8),
            ..Default::default()
        };

        // The Identify structure (returned by Identify command issued by guest)
        // will be further updated when a backend is attached to make the
        // underlying device info available.
        let ns_ident = bits::IdentifyNamespace {
            nlbaf: 0, // We only support a single LBA format (1 but 0-based)
            flbas: 0, // And it is at index 0 in the lbaf array
            ..Default::default()
        };

        // Initialize the CAP "register" leaving most values
        // at their defaults (0):
        //  TO      = 0 => 0ms to wait for controller to be ready
        //  DSTRD   = 0 => 2^(2+0) byte stride for doorbell registers
        //  MPSMIN  = 0 => 2^(12+0) bytes, 4K
        //  MPSMAX  = 0 => 2^(12+0) bytes, 4K
        let cap = Capabilities(0)
            // Allow up to the spec max supported queue size
            // converted to 0's based
            .with_mqes((queue::MAX_QUEUE_SIZE - 1) as u16)
            // I/O Queues must be physically contiguous
            .with_cqr(true)
            // We support the NVM command set
            .with_css_nvm(true);

        // Initialize the CC "register"
        //  EN      = 0 => controller initially disabled
        //  CSS     = 0 => NVM Command Set selected
        //  MPS     = 0 => 2^(12+0) bytes, 4K pages
        //  AMS     = 0 => Round Robin Arbitration
        //  SHN     = 0 => Shutdown Notification Cleared
        //  IOCQES  = 0 => No I/O CQ Entry Size set yet
        //  IOSQES  = 0 => No I/O SQ Entry Size set yet
        let cc = Configuration(0);

        // Initialize the CSTS "register" leaving most values
        // at their defaults (0):
        //  RDY     = 0 => controller not ready
        //  CFS     = 0 => no fatal controller errors
        //  SHST    = 0 => no shutdown in process, normal operation
        let csts = Status(0);

        let state = NvmeCtrl {
            device_id: DeviceId::new(),
            ctrl: CtrlState { cap, cc, csts, ..Default::default() },
            doorbell_buf: None,
            msix_hdl: None,
            cqs: Default::default(),
            sqs: Default::default(),
            ctrl_ident,
            ns_ident,
            read_only: false,
        };

        let pci_state = builder
            // BAR0/1 are used for the main config and doorbell registers
            .add_bar_mmio64(pci::BarN::BAR0, CONTROLLER_REG_SZ as u64)
            // BAR2 is for the optional index/data registers
            // Place MSIX in BAR4 for now
            .add_cap_msix(pci::BarN::BAR4, NVME_MSIX_COUNT)
            .finish();

        let block_attach = block::DeviceAttachment::new(
            NonZeroUsize::new(MAX_NUM_IO_QUEUES).unwrap(),
            pci_state.acc_mem.child(Some("block backend".to_string())),
        );

        Arc::new_cyclic(move |self_weak: &Weak<PciNvme>| {
            let this = self_weak.clone();
            block_attach.on_attach(Box::new(move |info| {
                if let Some(this) = Weak::upgrade(&this) {
                    this.state.lock().unwrap().update_block_info(info);
                }
            }));

            // Cache device ID before we move it into the Mutex below.
            let device_id = state.device_id;

            PciNvme {
                state: Mutex::new(state),
                is_enabled: AtomicBool::new(false),
                device_id,
                pci_state,
                queues: NvmeQueues::default(),
                block_attach,
                log,
            }
        })
    }

    /// Service a write to the NVMe Controller Configuration from the VM
    ///
    /// Handles, in order:
    /// 1. propagating a changed `CC` value into the controller,
    /// 2. `CC.EN` transitions (enabling or resetting the controller), and
    /// 3. shutdown notification (`CC.SHN`) changes, reflected in `CSTS.SHST`.
    ///
    /// # Errors
    ///
    /// Returns the error from applying the new configuration, or from enabling
    /// the controller.  In the latter case Controller Fatal Status
    /// (`CSTS.CFS`) is set as well.
    fn ctrlr_cfg_write(&self, new: Configuration) -> Result<(), NvmeError> {
        let mut state = self.state.lock().unwrap();

        // Propagate any CC changes first
        if state.ctrl.cc != new {
            state.configure(new)?;
        }

        let was_enabled = state.ctrl.cc.enabled();
        match (was_enabled, new.enabled()) {
            (false, true) => {
                slog::debug!(self.log, "Enabling controller");

                // Get the controller ready to service requests
                if let Err(e) = state.enable(self) {
                    // Couldn't enable controller, set Controller Fail Status
                    state.ctrl.csts.set_cfs(true);
                    return Err(e);
                }

                // Controller now ready to start servicing requests
                // Set CC.EN=1 and CSTS.RDY=1
                state.ctrl.cc.set_enabled(true);
                state.ctrl.csts.set_ready(true);
                self.is_enabled.store(true, Ordering::Release);
            }
            (true, false) => {
                slog::debug!(self.log, "Disabling controller");

                // Reset controller state which will set CC.EN=0 and CSTS.RDY=0
                state.reset(self);
            }
            // No change in the enabled state
            (false, false) | (true, true) => {}
        }

        let shutdown_requested = new.shn() != ShutdownNotification::None;
        let status_normal = state.ctrl.csts.shst() == ShutdownStatus::Normal;
        match (shutdown_requested, status_normal) {
            (true, true) => {
                // Host has indicated to shutdown
                // TODO: Issue flush to underlying block devices
                state.ctrl.csts.set_shst(ShutdownStatus::Complete);
            }
            (false, false) => {
                // Notification was cleared: back to normal operation
                state.ctrl.csts.set_shst(ShutdownStatus::Normal);
            }
            // Status already agrees with the notification
            (true, false) | (false, true) => {}
        }

        Ok(())
    }

    /// Service an NVMe register read from the VM
    ///
    /// `id` names the register being read and `ro` receives its contents.
    /// The controller lock is taken only for registers backed by controller
    /// state.
    ///
    /// The admin queue registers (AQA, ASQ, ACQ) are emitted only while the
    /// controller is disabled; otherwise nothing is written to `ro` for them.
    // NOTE(review): what the guest observes for those reads while the
    // controller is enabled depends on how the caller initialized `ro` --
    // confirm that is intended.
    ///
    /// As written, this always returns `Ok(())`.
    fn reg_ctrl_read(
        &self,
        id: &CtrlrReg,
        ro: &mut ReadOp,
    ) -> Result<(), NvmeError> {
        match id {
            CtrlrReg::Version => {
                ro.write_u32(NVME_VER_1_0);
            }

            CtrlrReg::CtrlrCaps => {
                let nvme = self.state.lock().unwrap();
                ro.write_u64(nvme.ctrl.cap.0);
            }
            CtrlrReg::CtrlrCfg => {
                let nvme = self.state.lock().unwrap();
                ro.write_u32(nvme.ctrl.cc.0);
            }
            CtrlrReg::CtrlrStatus => {
                let nvme = self.state.lock().unwrap();
                ro.write_u32(nvme.ctrl.csts.0);
            }

            CtrlrReg::AdminQueueAttr => {
                let nvme = self.state.lock().unwrap();
                if !nvme.ctrl.cc.enabled() {
                    ro.write_u32(nvme.ctrl.aqa.0);
                }
            }
            CtrlrReg::AdminSubQAddr => {
                let nvme = self.state.lock().unwrap();
                if !nvme.ctrl.cc.enabled() {
                    ro.write_u64(nvme.ctrl.admin_sq_base);
                }
            }
            CtrlrReg::AdminCompQAddr => {
                let nvme = self.state.lock().unwrap();
                if !nvme.ctrl.cc.enabled() {
                    ro.write_u64(nvme.ctrl.admin_cq_base);
                }
            }

            // Registers which read as zeroes:
            // - INTMS/INTMC: only MSI-X is exposed for now, so these are
            //   undefined
            // - Reserved space
            // - Doorbells: the host should not read from them, and the
            //   contents can be vendor/implementation specific (in our case,
            //   zeroed)
            CtrlrReg::IntrMaskSet
            | CtrlrReg::IntrMaskClear
            | CtrlrReg::Reserved
            | CtrlrReg::DoorBellAdminSQ
            | CtrlrReg::DoorBellAdminCQ
            | CtrlrReg::IOQueueDoorBells => {
                ro.fill(0);
            }
        }

        Ok(())
    }

    /// Service an NVMe register write from the VM
    ///
    /// `id` names the register being written and `wo` supplies the data.
    ///
    /// - Writes to read-only, reserved and interrupt-mask registers are
    ///   ignored.
    /// - Writes to the admin queue registers (AQA, ASQ, ACQ) are applied only
    ///   while the controller is disabled, and ignored otherwise.
    /// - Admin doorbell writes are handled here under the controller lock;
    ///   I/O queue doorbell writes are passed on to
    ///   [`PciNvme::ring_doorbell`].
    ///
    /// # Errors
    ///
    /// Returns an error if a `CC` write fails, if a doorbell is written while
    /// the controller is disabled, or if the doorbell update or the resulting
    /// queue processing fails.
    fn reg_ctrl_write(
        &self,
        id: &CtrlrReg,
        wo: &mut WriteOp,
    ) -> Result<(), NvmeError> {
        match id {
            CtrlrReg::CtrlrCaps
            | CtrlrReg::Version
            | CtrlrReg::CtrlrStatus
            | CtrlrReg::Reserved => {
                // Read-only registers
            }
            CtrlrReg::IntrMaskSet | CtrlrReg::IntrMaskClear => {
                // Only MSI-X is exposed for now, so this is undefined
            }

            CtrlrReg::CtrlrCfg => {
                self.ctrlr_cfg_write(Configuration(wo.read_u32()))?;
            }
            CtrlrReg::AdminQueueAttr => {
                let mut state = self.state.lock().unwrap();
                if !state.ctrl.cc.enabled() {
                    state.ctrl.aqa = AdminQueueAttrs(wo.read_u32());
                }
            }
            CtrlrReg::AdminSubQAddr => {
                let mut state = self.state.lock().unwrap();
                if !state.ctrl.cc.enabled() {
                    // The stored base address is masked with PAGE_MASK
                    state.ctrl.admin_sq_base = wo.read_u64() & PAGE_MASK as u64;
                }
            }
            CtrlrReg::AdminCompQAddr => {
                let mut state = self.state.lock().unwrap();
                if !state.ctrl.cc.enabled() {
                    // The stored base address is masked with PAGE_MASK
                    state.ctrl.admin_cq_base = wo.read_u64() & PAGE_MASK as u64;
                }
            }

            CtrlrReg::DoorBellAdminSQ => {
                // 32-bit register but ignore reserved top 16-bits
                let val = wo.read_u32() as u16;
                // The probe fires before the controller lock is taken
                probes::nvme_doorbell_admin_sq!(|| val);
                let state = self.state.lock().unwrap();

                if !state.ctrl.cc.enabled() {
                    slog::warn!(
                        self.log,
                        "Doorbell write while controller is disabled"
                    );
                    return Err(NvmeError::InvalidSubQueue(
                        queue::ADMIN_QUEUE_ID,
                    ));
                }

                let admin_sq = state.get_admin_sq()?;
                admin_sq.notify_tail(val)?;

                // Process any new SQ entries
                // (the controller lock guard is handed over to the callee)
                self.process_admin_queue(state, admin_sq)?;
            }
            CtrlrReg::DoorBellAdminCQ => {
                // 32-bit register but ignore reserved top 16-bits
                let val = wo.read_u32() as u16;
                // The probe fires before the controller lock is taken
                probes::nvme_doorbell_admin_cq!(|| val);
                let state = self.state.lock().unwrap();

                if !state.ctrl.cc.enabled() {
                    slog::warn!(
                        self.log,
                        "Doorbell write while controller is disabled"
                    );
                    return Err(NvmeError::InvalidCompQueue(
                        queue::ADMIN_QUEUE_ID,
                    ));
                }

                let admin_cq = state.get_admin_cq()?;
                admin_cq.notify_head(val)?;

                // We may have skipped pulling entries off the admin sq
                // due to no available completion entry permit, so just
                // kick it here again in case.
                if admin_cq.kick().is_some() {
                    let admin_sq = state.get_admin_sq()?;
                    self.process_admin_queue(state, admin_sq)?;
                }
            }

            CtrlrReg::IOQueueDoorBells => {
                // Submission Queue y Tail Doorbell offset
                //  = 0x1000 + (2y * (4 << CAP.DSTRD))
                // Completion Queue y Head Doorbell offset
                //  = 0x1000 + ((2y + 1) * (4 << CAP.DSTRD))
                //
                // See NVMe 1.0e Section 3.1.10 & 3.1.11
                //
                // But note that we only support CAP.DSTRD = 0
                //
                // NOTE: Normally the `wo.offset()` would be relative to the
                // beginning of the RegMap-ed register, but writes to the
                // doorbells have a special fast path via PciNvme::bar_write().
                let off = wo.offset() - 0x1000;

                // With 4-byte doorbells laid out as SQ/CQ pairs, bit 2 of the
                // offset selects the CQ doorbell and the bits above it are the
                // queue ID.
                let is_cq = (off >> 2) & 0b1 == 0b1;
                let qid = if is_cq { (off - 4) >> 3 } else { off >> 3 };

                // Queue IDs should be 16-bit and we know
                // `off <= CONTROLLER_REG_SZ (0x4000)`
                let qid = qid.try_into().unwrap();

                // 32-bit register but ignore reserved top 16-bits
                let val = wo.read_u32() as u16;

                // Mix in the device ID for probe purposes
                let devq_id = devq_id(self.device_id, qid);

                probes::nvme_doorbell!(|| (
                    off as u64,
                    devq_id,
                    u8::from(is_cq),
                    val
                ));
                self.ring_doorbell(qid, is_cq, val)?;
            }
        }

        Ok(())
    }

    /// Perform the actual work of a doorbell ring
    ///
    /// * `qid`: ID of the I/O queue whose doorbell was written
    /// * `is_cq`: `true` for a Completion Queue head doorbell, `false` for a
    ///   Submission Queue tail doorbell
    /// * `val`: the new head/tail value written by the guest
    ///
    /// This path does not take the central [NvmeCtrl] lock.  It relies on the
    /// `is_enabled` flag and on the per-slot queue locks instead.
    ///
    /// # Errors
    ///
    /// Returns `InvalidCompQueue`/`InvalidSubQueue` if the controller is
    /// disabled or no queue exists for `qid`, and propagates any error from
    /// updating the queue head/tail.
    fn ring_doorbell(
        &self,
        qid: QueueId,
        is_cq: bool,
        val: u16,
    ) -> Result<(), NvmeError> {
        if !self.is_enabled.load(Ordering::Acquire) {
            slog::debug!(
                self.log,
                "Doorbell write while controller is disabled"
            );
            return Err(if is_cq {
                NvmeError::InvalidCompQueue(qid)
            } else {
                NvmeError::InvalidSubQueue(qid)
            });
        }

        // Note:
        //
        // When notifying SQs as part of a doorbell ring, it is necessary to
        // drop the locks required to access said SQs.
        //
        // Without the protection of those locks, it is possible that racing
        // guest operations to destroy/create IO queues could cause the SQIDs on
        // which we are notifying to be "stale".  This is not a concern, as
        // spurious notifications to the block layer do not have ill effects.

        if is_cq {
            // Completion Queue y Head Doorbell
            let guard = self
                .queues
                .get_cq(qid)
                .ok_or(NvmeError::InvalidCompQueue(qid))?;
            // `get_cq` only returns guards for occupied slots
            let cq = guard.as_ref().unwrap();

            cq.notify_head(val)?;

            // If this CQ was previously full, SQs may have become corked while
            // trying to get permits.  Notify them that there may now be
            // capacity.
            let Some(to_notify) = cq.kick() else {
                // No associated SQs to notify about
                return Ok(());
            };

            // Querying of SQs (for number of entries available to block layer)
            // and delivery of said notifications must be done with neither CQ
            // or SQ locks held.
            drop(guard);

            // Each SQ slot lock is held only inside the `filter_map` closure
            // (to read the occupancy) and is released before the loop body
            // delivers the notification.  SQs which no longer exist are
            // skipped.
            for (sqid, num_occupied) in
                to_notify.into_iter().filter_map(|sqid| {
                    assert_ne!(
                        sqid,
                        queue::ADMIN_QUEUE_ID,
                        "IO queues should not associate with Admin queue IDs"
                    );
                    let sq_guard = self.queues.get_sq(sqid)?;
                    let sq = sq_guard.as_ref().unwrap();
                    Some((sqid, sq.num_occupied()))
                })
            {
                let block_qid = queue::sqid_to_block_qid(sqid);
                let devsq_id = devq_id(self.device_id, sqid);
                let block_devqid =
                    block::devq_id(self.block_attach.device_id(), block_qid);
                probes::nvme_block_notify!(|| (
                    devsq_id,
                    block_devqid,
                    num_occupied
                ));
                // An occupancy of 0 is passed along as `None`
                self.block_attach.notify(
                    block_qid,
                    NonZeroUsize::new(num_occupied as usize),
                );
            }
        } else {
            // Submission Queue y Tail Doorbell
            let guard = self
                .queues
                .get_sq(qid)
                .ok_or(NvmeError::InvalidSubQueue(qid))?;
            // `get_sq` only returns guards for occupied slots
            let sq = guard.as_ref().unwrap();

            let num_occupied = sq.notify_tail(val)?;
            let devsq_id = sq.devq_id();
            // Notification to block layer cannot be issued with SQ lock held
            drop(guard);

            assert_ne!(qid, queue::ADMIN_QUEUE_ID);
            let block_qid = queue::sqid_to_block_qid(qid);
            let block_devqid =
                block::devq_id(self.block_attach.device_id(), block_qid);
            probes::nvme_block_notify!(|| (
                devsq_id,
                block_devqid,
                num_occupied
            ));
            // An occupancy of 0 is passed along as `None`
            self.block_attach
                .notify(block_qid, NonZeroUsize::new(num_occupied as usize));
        };
        Ok(())
    }

    /// Process any new entries in the Admin Submission Queue
    ///
    /// Pops entries from `sq` until none remain (or none can be popped),
    /// executes each admin command against the controller `state`, and posts
    /// the result through the permit handed out with the entry.  An interrupt
    /// is then fired on the Admin Completion Queue.
    ///
    /// The caller hands over its lock guard on the controller state; it is
    /// held for the duration of this call and released on return.
    ///
    /// # Errors
    ///
    /// Returns an error if the Admin Completion Queue cannot be retrieved.
    ///
    /// # Panics
    ///
    /// Panics if guest memory access is not available.
    fn process_admin_queue(
        &self,
        mut state: MutexGuard<NvmeCtrl>,
        sq: Arc<SubQueue>,
    ) -> Result<(), NvmeError> {
        use cmds::AdminCmd;

        // Grab the Admin CQ too
        let cq = state.get_admin_cq()?;

        // XXX: set controller error state (rather than panic) if memory is
        // not accessible?
        let mem = self.mem_access().unwrap();

        while let Some((sub, permit, _idx)) = sq.pop() {
            probes::nvme_admin_cmd!(|| (sub.opcode(), sub.prp1, sub.prp2));

            let cmd = match AdminCmd::parse(sub) {
                Ok(parsed) => parsed,
                // Since unknown admin commands are already parsed into
                // AdminCmd::Unknown, a failure here means invalid field
                // contents (such as the fuse bits being set).
                //
                // XXX: set the controller into an error state instead of
                // reacting in the same manner as unknown command?
                Err(_) => AdminCmd::Unknown(sub),
            };

            let comp = match cmd {
                AdminCmd::Abort(cmd) => state.acmd_abort(&cmd),
                AdminCmd::CreateIOCompQ(cmd) => {
                    state.acmd_create_io_cq(&cmd, self)
                }
                AdminCmd::CreateIOSubQ(cmd) => {
                    state.acmd_create_io_sq(&cmd, self)
                }
                AdminCmd::GetLogPage(cmd) => {
                    state.acmd_get_log_page(&cmd, &mem)
                }
                AdminCmd::Identify(cmd) => state.acmd_identify(&cmd, &mem),
                AdminCmd::GetFeatures(cmd) => state.acmd_get_features(&cmd),
                AdminCmd::SetFeatures(cmd) => state.acmd_set_features(&cmd),
                AdminCmd::DeleteIOCompQ(cqid) => {
                    state.acmd_delete_io_cq(cqid, self)
                }
                AdminCmd::DeleteIOSubQ(sqid) => {
                    state.acmd_delete_io_sq(sqid, self)
                }
                AdminCmd::AsyncEventReq => {
                    // Async event requests do not appear to be an optional
                    // feature, but are not yet supported.  The only
                    // command-specific error available would be "async event
                    // limit exceeded".
                    //
                    // qemu's emulated NVMe does not support async events
                    // either, and answers with invalid opcode plus the
                    // do-not-retry flag.  Do the same, so guest drivers which
                    // check for this can detect it and stop posting async
                    // events.
                    cmds::Completion::generic_err(bits::STS_INVAL_OPC).dnr()
                }
                AdminCmd::DoorbellBufCfg(cmd) => {
                    state.acmd_doorbell_buf_cfg(&cmd)
                }
                AdminCmd::Unknown(_) => {
                    cmds::Completion::generic_err(bits::STS_INTERNAL_ERR)
                }
            };

            permit.complete(comp);
        }

        // Notify for any newly added completions
        cq.fire_interrupt();

        Ok(())
    }

    /// Attempt to acquire guest memory access through the memory accessor of
    /// this device's PCI state.
    fn mem_access(&self) -> Option<Guard<'_, MemAccessed>> {
        let accessor = &self.pci_state.acc_mem;
        accessor.access()
    }
}

impl pci::Device for PciNvme {
    /// Service a guest access to BAR0, the only BAR handled by this device.
    ///
    /// Accesses at or beyond the I/O queue doorbell offset bypass the register
    /// map and are dispatched as [CtrlrReg::IOQueueDoorBells] directly, so the
    /// offset they carry is relative to the BAR rather than to the register.
    /// Any failure while servicing the access is logged, not propagated.
    fn bar_rw(&self, bar: pci::BarN, mut rwo: RWOp) {
        assert_eq!(bar, pci::BarN::BAR0);

        let handle = |reg: &CtrlrReg, mut op: RWOp<'_, '_>| {
            let outcome = match &mut op {
                RWOp::Write(wo) => self.reg_ctrl_write(reg, wo),
                RWOp::Read(ro) => self.reg_ctrl_read(reg, ro),
            };
            // TODO: is there a better way to report errors
            let Err(err) = outcome else {
                return;
            };
            slog::error!(self.log, "nvme reg r/w failure";
                "offset" => op.offset(),
                "register" => ?reg,
                "error" => %err
            );
        };

        let is_io_doorbell = rwo.offset() >= CONTROLLER_REGS.db_offset;
        if is_io_doorbell {
            // This is an I/O DoorBell op, so skip RegMaps's process
            handle(&CtrlrReg::IOQueueDoorBells, rwo);
        } else {
            // Otherwise deal with every other register as normal
            CONTROLLER_REGS.map.process(&mut rwo, handle)
        }
    }

    /// Cache the MSI-X handle from the PCI state in the controller.
    ///
    /// # Panics
    ///
    /// Panics if no MSI-X handle is available.
    fn attach(&self) {
        // TODO: Update the controller logic to reach out to `pci_state` to get
        // access to the MSIX handle, rather than caching it internally
        let mut state = self.state.lock().unwrap();
        state.msix_hdl = self.pci_state.msix_hdl();
        assert!(state.msix_hdl.is_some());
    }

    /// Expose the PCI state backing this device.
    fn device_state(&self) -> &pci::DeviceState {
        &self.pci_state
    }
}

impl MigrateMulti for PciNvme {
    /// Export the controller payload, followed by the PCI state payload(s).
    ///
    /// The controller lock is released before the PCI state is exported.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        {
            let ctrl = self.state.lock().unwrap();
            output.push(ctrl.export().into())?;
        }

        MigrateMulti::export(&self.pci_state, output, ctx)
    }

    /// Import the controller payload, followed by the PCI state payload(s).
    ///
    /// The controller lock is released before the PCI state is imported.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let input: migrate::NvmeCtrlV1 = offer.take()?;

        {
            let mut ctrl = self.state.lock().unwrap();
            ctrl.import(input, self)?;
        }

        MigrateMulti::import(&self.pci_state, offer, ctx)
    }
}

impl Lifecycle for PciNvme {
    fn type_name(&self) -> &'static str {
        "pci-nvme"
    }

    /// Reset the NVMe controller and then the PCI state.
    ///
    /// The controller lock stays held across both resets.
    fn reset(&self) {
        let mut nvme = self.state.lock().unwrap();
        nvme.reset(self);
        self.pci_state.reset(self);
    }

    /// Pause the block attachment.
    fn pause(&self) {
        self.block_attach.pause();
    }

    /// Resume the block attachment.
    fn resume(&self) {
        self.block_attach.resume();
    }

    /// Future which resolves according to the block attachment's
    /// `none_processing()`.
    fn paused(&self) -> BoxFuture<'static, ()> {
        let idle = self.block_attach.none_processing();
        Box::pin(idle)
    }

    /// This device migrates as multiple payloads (see its `MigrateMulti`
    /// impl).
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }
}

/// Migration payload definitions for the NVMe controller.
pub mod migrate {
    use crate::migrate::*;

    use serde::{Deserialize, Serialize};

    use super::queue::migrate::{NvmeCompQueueV1, NvmeSubQueueV1};

    /// Serialized NVMe controller state (version 1 of the `nvme-ctrl` schema).
    ///
    /// The register fields carry raw register values.
    // NOTE(review): the mapping of these fields to controller state is
    // inferred from their names; confirm against the controller's
    // export/import logic.
    #[derive(Deserialize, Serialize)]
    pub struct NvmeCtrlV1 {
        /// Controller Capabilities (CAP)
        pub cap: u64,
        /// Controller Configuration (CC)
        pub cc: u32,
        /// Controller Status (CSTS)
        pub csts: u32,
        /// Admin Queue Attributes (AQA)
        pub aqa: u32,

        /// Admin Completion Queue base address
        pub acq_base: u64,
        /// Admin Submission Queue base address
        pub asq_base: u64,

        /// Doorbell Buffer: shadow doorbell address (presumably guest-physical
        /// -- TODO confirm)
        pub dbbuf_shadow: u64,
        /// Doorbell Buffer: event index address (presumably guest-physical
        /// -- TODO confirm)
        pub dbbuf_evtidx: u64,

        /// State of the Completion Queues
        pub cqs: Vec<NvmeCompQueueV1>,
        /// State of the Submission Queues
        pub sqs: Vec<NvmeSubQueueV1>,
    }
    impl Schema<'_> for NvmeCtrlV1 {
        fn id() -> SchemaId {
            ("nvme-ctrl", 1)
        }
    }
}

/// NVMe Controller Registers
///
/// Identifies which register (or register region) of the controller register
/// space in BAR0 is being accessed.
///
/// See NVMe 1.0e Section 3.1 Register Definition
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum CtrlrReg {
    /// Reserved register.
    ///
    /// Also used for the padding that fills out the rest of the register
    /// space.
    Reserved,

    /// Controller Capabilities (CAP)
    ///
    /// See NVMe 1.0e Section 3.1.1 Offset 00h: CAP - Controller Capabilities
    CtrlrCaps,
    /// Version (VS)
    ///
    /// See NVMe 1.0e Section 3.1.2 Offset 08h: VS - Version
    Version,
    /// Interrupt Mask Set (INTMS)
    ///
    /// See NVMe 1.0e Section 3.1.3 Offset 0Ch: INTMS - Interrupt Mask Set
    IntrMaskSet,
    /// Interrupt Mask Clear (INTMC)
    ///
    /// See NVMe 1.0e Section 3.1.4 Offset 10h: INTMC - Interrupt Mask Clear
    IntrMaskClear,
    /// Controller Configuration (CC)
    ///
    /// See NVMe 1.0e Section 3.1.5 Offset 14h: CC - Controller Configuration
    CtrlrCfg,
    /// Controller Status (CSTS)
    ///
    /// See NVMe 1.0e Section 3.1.6 Offset 1Ch: CSTS - Controller Status
    CtrlrStatus,
    /// Admin Queue Attributes (AQA)
    ///
    /// See NVMe 1.0e Section 3.1.7 Offset 24h: AQA - Admin Queue Attributes
    AdminQueueAttr,
    /// Admin Submission Queue Base Address (ASQ)
    ///
    /// See NVMe 1.0e Section 3.1.8 Offset 28h: ASQ - Admin Submission Queue
    /// Base Address
    AdminSubQAddr,
    /// Admin Completion Queue Base Address (ACQ)
    ///
    /// See NVMe 1.0e Section 3.1.9 Offset 30h: ACQ - Admin Completion Queue
    /// Base Address
    AdminCompQAddr,

    /// Admin Submission Queue Tail Doorbell
    ///
    /// See NVMe 1.0e Section 3.1.10
    DoorBellAdminSQ,
    /// Admin Completion Queue Head Doorbell
    ///
    /// See NVMe 1.0e Section 3.1.11
    DoorBellAdminCQ,

    /// I/O Submission Tail and Completion Head Doorbells
    ///
    /// A single variant covers the doorbells of every I/O queue; the queue
    /// being addressed is derived from the offset of the access.
    ///
    /// See NVMe 1.0e Section 3.1.10 & 3.1.11
    IOQueueDoorBells,
}

/// Size of the Controller Register space
///
/// We specify a size of 0x4000 even though we're not using anywhere near that
/// much space, because the NVMe spec requires that bits 13:04 of MLBAR be R/O
/// and 0 on reset.  We do that by basically returning a size of 0x4000, which
/// makes us ignore any writes to the bottom 14 bits as needed.  See
/// `pci::Bars::reg_write`.
///
/// This is also the size with which BAR0 is registered.
///
/// See NVMe 1.0e Section 2.1.10 Offset 10h: MLBAR (BAR0) - Memory Register
/// Base Address, lower 32 bits
const CONTROLLER_REG_SZ: usize = 0x4000;

/// Layout of the controller register space, along with the location of the
/// I/O queue doorbells within it.
struct CtrlRegs {
    /// Register map used to dispatch accesses to individual registers.
    map: RegMap<CtrlrReg>,
    /// Offset of the first I/O queue doorbell.  Accesses at or beyond this
    /// offset skip `map` and are handled as doorbell accesses directly.
    db_offset: usize,
}
lazy_static! {
    /// Register map for the controller register space (BAR0), plus the offset
    /// at which the I/O queue doorbells begin.
    static ref CONTROLLER_REGS: CtrlRegs = {
        // Registers in offset order, packed back-to-back from offset 0
        let mut layout = [
            (CtrlrReg::CtrlrCaps, 8),
            (CtrlrReg::Version, 4),
            (CtrlrReg::IntrMaskSet, 4),
            (CtrlrReg::IntrMaskClear, 4),
            (CtrlrReg::CtrlrCfg, 4),
            (CtrlrReg::Reserved, 4),
            (CtrlrReg::CtrlrStatus, 4),
            (CtrlrReg::Reserved, 4),
            (CtrlrReg::AdminQueueAttr, 4),
            (CtrlrReg::AdminSubQAddr, 8),
            (CtrlrReg::AdminCompQAddr, 8),
            (CtrlrReg::Reserved, 0xec8),
            (CtrlrReg::Reserved, 0x100),
            // CAP.DSTRD = 0 hence 0 stride and doorbells are 4 bytes apart
            (CtrlrReg::DoorBellAdminSQ, 4),
            (CtrlrReg::DoorBellAdminCQ, 4),
            (CtrlrReg::IOQueueDoorBells, 8 * MAX_NUM_IO_QUEUES),
            // Padding entry: sized below once the rest has been tallied
            (CtrlrReg::Reserved, 0),
        ];

        // Grow the trailing `Reserved` entry so the layout spans the whole
        // controller register space
        let regs_sz: usize = layout.iter().map(|&(_, sz)| sz).sum();
        assert!(regs_sz <= CONTROLLER_REG_SZ);
        let pad_idx = layout.len() - 1;
        layout[pad_idx].1 = CONTROLLER_REG_SZ - regs_sz;

        // The I/O doorbells begin where everything laid out before them ends
        let mut db_offset: usize = 0;
        for (reg, sz) in layout.iter() {
            if *reg == CtrlrReg::IOQueueDoorBells {
                break;
            }
            db_offset += *sz;
        }

        let map = RegMap::create_packed(
            CONTROLLER_REG_SZ,
            &layout,
            Some(CtrlrReg::Reserved),
        );

        CtrlRegs { map, db_offset }
    };
}


================================================
FILE: lib/propolis/src/hw/nvme/queue.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::{HashMap, HashSet};
use std::fmt::{self, Debug};
use std::sync::atomic::{fence, AtomicU16, Ordering};
use std::sync::{Arc, Mutex, MutexGuard, Weak};

use super::bits::{CompletionQueueEntry, SubmissionQueueEntry};
use super::cmds::Completion;
use crate::accessors::MemAccessor;
use crate::block;
use crate::common::*;
use crate::hw::nvme::DeviceId;
use crate::hw::pci;
use crate::migrate::MigrateStateError;
use crate::vmm::MemCtx;

use thiserror::Error;

// USDT probes for NVMe queue activity, registered under the "propolis"
// provider.
//
// Queues are identified by the `devsq_id`/`devcq_id` arguments, which combine
// a controller ID with a queue ID (see `QueueId`).
#[usdt::provider(provider = "propolis")]
mod probes {
    fn nvme_cqe(devcq_id: u64, idx: u16, phase: u8) {}
    fn nvme_sq_dbbuf_read(devsq_id: u64, val: u32, tail: u16) {}
    fn nvme_sq_dbbuf_write(devsq_id: u64, head: u16) {}
    fn nvme_sq_dbbuf_write_shadow(devsq_id: u64, head: u16) {}
    fn nvme_cq_dbbuf_read(devcq_id: u64, val: u32, tail: u16) {}
    fn nvme_cq_dbbuf_write(devcq_id: u64, head: u16) {}
    fn nvme_cq_dbbuf_write_shadow(devcq_id: u64, head: u16) {}
}

/// Each queue is identified by a 16-bit ID.
///
/// See NVMe 1.0e Section 4.1.4 Queue Identifier
///
/// Submission and completion queue IDs are distinct namespaces, so a device
/// can have both a "Submission Queue 1" and "Completion Queue 1".
///
/// For USDT probes, we combine this ID with an NVMe controller ID to produce a
/// `devq_id`. This combined identifier is still ambiguous between one
/// submission queue and one completion queue. Contextually there is typically
/// only one reasonable interpretation of the ID, but the probe arguments are
/// named `devsq_id` or `devcq_id` to be explicit about identifying a Submission
/// or Completion queue, respectively.
pub type QueueId = u16;

/// The minimum number of entries in either a Completion or Submission Queue.
///
/// Note: One entry will always be unavailable for use due to the Head and Tail
///       entry pointer definitions.
/// See NVMe 1.0e Section 4.1.3 Queue Size
const MIN_QUEUE_SIZE: u32 = 2;

/// The maximum number of entries (65536) in either a Completion or Submission
/// Queue.
///
/// See NVMe 1.0e Section 4.1.3 Queue Size
pub const MAX_QUEUE_SIZE: u32 = 1 << 16;

/// The maximum number of entries (4096) in the Admin Completion or Admin
/// Submission Queues.
///
/// See NVMe 1.0e Section 4.1.3 Queue Size
const MAX_ADMIN_QUEUE_SIZE: u32 = 1 << 12;

/// The Admin Completion and Submission Queues are defined to have ID 0.
///
/// See NVMe 1.0e Section 1.6.1 Admin Queue
pub const ADMIN_QUEUE_ID: QueueId = 0;

/// Completion Queue State
///
/// The Completion-Queue-specific portion of the state tracked by a
/// [QueueState].
struct CompQueueState {
    /// Number of entries that are available for use.
    ///
    /// Starts off as queue size - 1 and gets decremented for each corresponding
    /// Submission Queue entry we begin servicing.
    avail: u16,

    /// The current phase tag.
    ///
    /// The Phase Tag is used to identify to the host (VM) that a Completion
    /// entry is new. Flips every time the Tail entry pointer wraps around.
    ///
    /// See NVMe 1.0e Section 4.5 Completion Queue Entry - Phase Tag (P)
    phase: bool,

    /// Collection of Submission Queues (IDs) which issue their completions to
    /// this CQ, and which became "corked": unable to acquire a permit while
    /// attempting to process a pending entry.
    corked: HashSet<QueueId>,
}

/// Submission Queue State
///
/// The Submission-Queue-specific portion of [QueueInner]; it lives behind the
/// same mutex as the queue's head and tail pointers.
struct SubQueueState {
    /// Transfer parameters associated with this queue. Read and replaced as a
    /// whole through `SubQueue::params()` and `SubQueue::update_params()`.
    params: TransferParams,
}

/// Helper for manipulating Completion/Submission Queues
///
/// The type parameter `QS` is used to constrain the set of methods exposed
/// based on whether the queue in question is a Completion or Submission queue.
///
/// Use either [CompQueueState] or [SubQueueState].
///
/// All mutable state is reached through `lock()`, which hands out a
/// [QueueGuard] bundling the locked state with `size` and `acc_mem`.
struct QueueState<QS> {
    /// The size of the queue in question, in entries.
    ///
    /// Asserted to lie within `MIN_QUEUE_SIZE..=MAX_QUEUE_SIZE` at
    /// construction.
    ///
    /// See NVMe 1.0e Section 4.1.3 Queue Size
    size: u32,

    /// The actual queue state that is updated during the normal course of
    /// operation.
    ///
    /// Either `CompQueueState` for a Completion Queue or
    /// a `SubQueueState` for a Submission Queue.
    inner: Mutex<QueueInner<QS>>,

    /// This queue's memory accessor node.
    ///
    /// Be careful about lock ordering when using this accessor; access_locked()
    /// holds this node's lock. If a user of this queue state requires both
    /// `access_locked()` and `QueueInner`, the protocol is to lock queue
    /// state first and this accessor second.
    acc_mem: MemAccessor,
}
impl<QS> QueueState<QS> {
    /// Builds the shared bookkeeping for a queue of `size` entries.
    ///
    /// The head and tail pointers start at 0 and no doorbell buffer is
    /// configured. `inner` carries the queue-type-specific state.
    ///
    /// # Panics
    ///
    /// Panics if `size` is outside of `MIN_QUEUE_SIZE..=MAX_QUEUE_SIZE`.
    fn new(size: u32, acc_mem: MemAccessor, inner: QS) -> Self {
        assert!(size >= MIN_QUEUE_SIZE && size <= MAX_QUEUE_SIZE);

        let initial = QueueInner { head: 0, tail: 0, db_buf: None, inner };
        Self { size, inner: Mutex::new(initial), acc_mem }
    }

    /// Locks the queue state, returning a guard which also exposes the queue
    /// size and memory accessor.
    ///
    /// # Panics
    ///
    /// Panics if the state mutex has been poisoned.
    fn lock(&self) -> QueueGuard<'_, QS> {
        let state = self.inner.lock().unwrap();
        QueueGuard { size: &self.size, acc_mem: &self.acc_mem, state }
    }
}

/// Adds `off` to the queue index `idx`, wrapping around at `size`.
///
/// Both `idx` and `off` are expected to be less than `size` (checked only in
/// debug builds), so at most one wrap is ever needed.
fn wrap_add(size: u32, idx: u16, off: u16) -> u16 {
    debug_assert!(u32::from(idx) < size);
    debug_assert!(u32::from(off) < size);

    // Widen before adding so the intermediate sum cannot overflow.
    let sum = u32::from(idx) + u32::from(off);
    // `checked_sub` yields `None` exactly when `sum < size`, i.e. when no
    // wrap occurred.
    sum.checked_sub(size).unwrap_or(sum) as u16
}
/// Subtracts `off` from the queue index `idx`, wrapping around at `size`.
///
/// Both `idx` and `off` are expected to be less than `size` (checked only in
/// debug builds).
fn wrap_sub(size: u32, idx: u16, off: u16) -> u16 {
    debug_assert!(u32::from(idx) < size);
    debug_assert!(u32::from(off) < size);

    match idx.checked_sub(off) {
        // No wrap: the plain difference is already a valid index.
        Some(diff) => diff,
        // `off` exceeds `idx`: borrow one full lap of the queue first.
        None => ((u32::from(idx) + size) - u32::from(off)) as u16,
    }
}

/// Checks that `base` and `size` are acceptable for creating an SQ/CQ.
///
/// # Errors
///
/// - [`QueueCreateErr::InvalidBaseAddr`] if `base` has any of the bits in
///   `PAGE_OFFSET` set.
/// - [`QueueCreateErr::InvalidSize`] if `size` is below `MIN_QUEUE_SIZE` or
///   above the maximum for the queue type (`MAX_ADMIN_QUEUE_SIZE` for admin
///   queues, `MAX_QUEUE_SIZE` otherwise).
///
/// The base address is checked first.
fn validate(
    is_admin_queue: bool,
    base: GuestAddr,
    size: u32,
) -> Result<(), QueueCreateErr> {
    let misaligned = (base.0 & PAGE_OFFSET as u64) != 0;
    if misaligned {
        return Err(QueueCreateErr::InvalidBaseAddr);
    }

    let upper_bound = match is_admin_queue {
        true => MAX_ADMIN_QUEUE_SIZE,
        false => MAX_QUEUE_SIZE,
    };
    if (MIN_QUEUE_SIZE..=upper_bound).contains(&size) {
        Ok(())
    } else {
        Err(QueueCreateErr::InvalidSize)
    }
}

/// Mutable per-queue state, kept behind the mutex in [QueueState].
///
/// Holds the pointers common to both queue types plus the type-specific
/// state `QS`.
struct QueueInner<QS> {
    /// The Queue Head entry pointer.
    ///
    /// The consumer of entries on a queue uses the current Head entry pointer
    /// to identify the next entry to be pulled off the queue.
    ///
    /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition
    head: u16,

    /// The Queue Tail entry pointer.
    ///
    /// The submitter of entries to a queue uses the current Tail entry pointer
    /// to identify the next open queue entry space.
    ///
    /// See NVMe 1.0e Section 4.1 Submission Queue & Completion Queue Definition
    tail: u16,

    /// Doorbell Buffer for assisting in elision of doorbell ringing
    ///
    /// `None` until one is configured via `set_db_buf()` on the owning queue.
    db_buf: Option<DoorbellBuffer>,

    /// Additional state specific to the queue type (completion or submission)
    inner: QS,
}

/// Locked view of a [QueueState], as returned by `QueueState::lock()`.
///
/// The queue's mutex is held for as long as this guard is alive.
struct QueueGuard<'a, QS> {
    /// The locked head/tail pointers, doorbell buffer and type-specific state.
    state: MutexGuard<'a, QueueInner<QS>>,
    /// The size of the queue, borrowed from the owning [QueueState].
    size: &'a u32,
    /// The queue's memory accessor, borrowed from the owning [QueueState].
    acc_mem: &'a MemAccessor,
}
impl<QS> QueueGuard<'_, QS> {
    /// Reports whether the queue currently holds no entries.
    ///
    /// A queue is empty when the Head entry pointer equals the Tail entry
    /// pointer.
    ///
    /// See: NVMe 1.0e Section 4.1.1 Empty Queue
    fn is_empty(&self) -> bool {
        let inner = &self.state;
        inner.tail == inner.head
    }

    /// Reports whether the queue currently has no room for another entry.
    ///
    /// The queue is full when the Head entry pointer equals one more than the
    /// Tail entry pointer (taking wrap-around into account). The number of
    /// entries in a queue will always be 1 less than the queue size.
    ///
    /// See: NVMe 1.0e Section 4.1.2 Full Queue
    fn is_full(&self) -> bool {
        let (head, tail) = (self.state.head, self.state.tail);

        // The slot immediately before `head`, wrapping to the last index of
        // the queue when `head` is 0.
        let before_head = match head.checked_sub(1) {
            Some(prev) => prev,
            None => (*self.size - 1) as u16,
        };
        tail == before_head
    }

    /// How many queue entries are currently occupied?
    fn num_occupied(&self) -> u16 {
        let (head, tail) = (self.state.head, self.state.tail);
        wrap_sub(*self.size, tail, head)
    }

    /// Computes `idx + off`, wrapping at the size of the queue.
    fn idx_add(&self, idx: u16, off: u16) -> u16 {
        let size = *self.size;
        wrap_add(size, idx, off)
    }

    /// Computes `idx - off`, wrapping at the size of the queue.
    fn idx_sub(&self, idx: u16, off: u16) -> u16 {
        let size = *self.size;
        wrap_sub(size, idx, off)
    }
}
impl QueueGuard<'_, CompQueueState> {
    /// Attempt to return the Tail entry pointer and then move it forward by 1.
    ///
    /// If the queue is full this method returns [`None`].
    /// Otherwise, this method returns the current Tail entry pointer (paired
    /// with the phase tag to use for that entry) and then increments the Tail
    /// entry pointer by 1 (wrapping if necessary).
    fn push_tail(&mut self) -> Option<(u16, bool)> {
        if self.is_full() {
            return None;
        }
        let tail = self.state.tail;
        let phase = self.state.inner.phase;

        self.state.tail = self.idx_add(tail, 1);
        if self.state.tail < tail {
            // We wrapped, so flip phase
            self.state.inner.phase = !self.state.inner.phase;
        }
        Some((tail, phase))
    }

    /// How many slots are occupied between the head and the tail i.e., how
    /// many entries can we read from the queue currently.
    fn avail_occupied(&self) -> u16 {
        self.idx_sub(self.state.tail, self.state.head)
    }

    /// Attempt to move the Head entry pointer forward to the given index.
    ///
    /// The given index must be less than the size of the queue. The queue
    /// must have enough occupied slots otherwise we return an error.
    /// Conceptually this method indicates some entries have been consumed
    /// from the queue.
    ///
    /// # Errors
    ///
    /// - [`QueueUpdateError::InvalidEntry`] if `idx` is not a valid index.
    /// - [`QueueUpdateError::TooManyEntries`] if moving the head to `idx`
    ///   would consume more entries than are currently occupied.
    fn pop_head_to(&mut self, idx: u16) -> Result<(), QueueUpdateError> {
        if u32::from(idx) >= *self.size {
            return Err(QueueUpdateError::InvalidEntry);
        }
        let pop_count = self.idx_sub(idx, self.state.head);
        if pop_count > self.avail_occupied() {
            return Err(QueueUpdateError::TooManyEntries);
        }
        // Replace head with given idx and update the number of available slots
        self.state.head = idx;
        self.state.inner.avail += pop_count;

        Ok(())
    }

    /// Is there available space in the CQ to push an entry?
    fn has_avail(&self) -> bool {
        self.state.inner.avail != 0
    }

    /// Attempt to reserve one CQ slot on behalf of `sq`.
    ///
    /// Returns `true` if a slot was reserved. Otherwise `sq` is recorded as
    /// corked on this CQ and `false` is returned.
    fn take_avail(&mut self, sq: &Arc<SubQueue>) -> bool {
        if let Some(avail) = self.state.inner.avail.checked_sub(1) {
            self.state.inner.avail = avail;
            true
        } else {
            // Make sure we kick the SQs when we have space available again
            self.record_corked(sq);
            false
        }
    }

    /// Return a previously reserved (but unused) CQ slot.
    ///
    /// # Panics
    ///
    /// Panics if releasing the slot would make the number of available
    /// entries reach or exceed the queue size.
    fn release_avail(&mut self) {
        if let Some(avail) = self.state.inner.avail.checked_add(1) {
            assert!(
                u32::from(avail) < *self.size,
                "attempted to overflow CQ available size"
            );
            self.state.inner.avail = avail;
        } else {
            panic!("attempted to overflow CQ available");
        }
    }

    /// Record an SQ as being corked on this CQ due to lack of permit capacity.
    fn record_corked(&mut self, sq: &Arc<SubQueue>) {
        self.state.inner.corked.insert(sq.id);
    }

    /// Get list of SQ IDs which were corked on this CQ
    ///
    /// Returns [`None`] if no SQs were corked. Otherwise the corked set is
    /// emptied and its contents returned.
    fn kick(&mut self) -> Option<Vec<QueueId>> {
        if !self.state.inner.corked.is_empty() {
            Some(self.state.inner.corked.drain().collect())
        } else {
            None
        }
    }

    /// Write update to the EventIdx in Doorbell Buffer page, if possible
    fn db_buf_write(&mut self, devq_id: u64, mem: &MemCtx) {
        if let Some(db_buf) = self.state.db_buf {
            // Update EventIdx as far as we're willing to forego doorbell
            // updates about the CQ head.  We are not especially concerned with
            // receiving timely doorbell updates from the guest about the head.
            // We keep our own tally of how many entries in the CQ are available
            // for completions to land.  In the typical case, we will read the
            // shadow doorbell from db_buf JIT to update the available CQ space.
            //
            // However, simply keeping EventIdx matching the queue tail means we
            // opt out of *any* notification that a completion is posted.  Then,
            // if an I/O was submitted, not processed immediately due to
            // insufficient CQ space, but the guest submits no further I/Os on
            // that queue, we would never notice that there is space in the CQ.
            // The I/O would go unfulfilled forever.
            //
            // For now, leave EventIdx one before the actual CQ tail, so that
            // making the CQ empty requires a doorbell notify.  This is
            // excessive; there are many cases where we don't care if the CQ is
            // to be emptied.
            if self.avail_occupied() <= 1 {
                // The queue was empty and just became non-empty.  So `tail` has
                // not advanced far enough that we can actually advance
                // EventIdx.
                return;
            }

            let next_evtidx = self.idx_sub(self.state.tail, 1);

            probes::nvme_cq_dbbuf_write!(|| (devq_id, next_evtidx));
            fence(Ordering::Release);
            // Doorbell buffer slots are accessed as 32-bit values elsewhere in
            // this file, so write the whole slot rather than leaving its upper
            // half holding stale data.
            mem.write(db_buf.eventidx, &u32::from(next_evtidx));
        }
    }

    /// Write update to the Shadow Doorbell Buffer page, if possible.
    ///
    /// We would expect the guest driver to keep this value in a valid state per
    /// the specification, but qemu notes that certain consumers fail to do so
    /// on the admin queue.  We follow their lead to avoid issues.
    fn db_buf_write_shadow(&mut self, devq_id: u64, mem: &MemCtx) {
        if let Some(db_buf) = self.state.db_buf {
            probes::nvme_cq_dbbuf_write_shadow!(|| (devq_id, self.state.head));
            fence(Ordering::Release);
            // Write the full 32-bit slot, matching how it is read back in
            // `db_buf_read()`.
            mem.write(db_buf.shadow, &u32::from(self.state.head));
        }
    }

    /// Read update from the Shadow in Doorbell Buffer page, if possible
    ///
    /// The value read is treated as a new head index. Values which do not
    /// constitute a valid head update are silently ignored.
    fn db_buf_read(&mut self, devq_id: u64, mem: &MemCtx) {
        if let Some(db_buf) = self.state.db_buf {
            if let Some(new_head) = mem.read::<u32>(db_buf.shadow) {
                let new_head = *new_head;
                probes::nvme_cq_dbbuf_read!(|| (
                    devq_id,
                    new_head,
                    self.state.head
                ));
                fence(Ordering::Acquire);
                // Only the low 16 bits of the slot are used as the index.
                // TODO: roll back on bad input?
                let _ = self.pop_head_to(new_head as u16);
            }
        }
    }
}

impl CompQueueState {
    /// Create a new `QueueState` for a Completion Queue
    ///
    /// All but one of the `size` entries start out available, since one entry
    /// is always unusable given the head/tail pointer definitions.
    fn new(size: u32, acc_mem: MemAccessor) -> QueueState<CompQueueState> {
        let initial = CompQueueState {
            avail: (size - 1) as u16,
            // As the device side, we start with our phase tag as asserted (1)
            // since the host side (VM) will create all the Completion Queue
            // entries with the phase initially zeroed out.
            phase: true,
            corked: HashSet::new(),
        };
        QueueState::new(size, acc_mem, initial)
    }
}

impl SubQueueState {
    /// Create a new `QueueState` for a Submission Queue
    fn new(size: u32, acc_mem: MemAccessor) -> QueueState<SubQueueState> {
        QueueState::new(
            size,
            acc_mem,
            SubQueueState { params: Default::default() },
        )
    }
}
impl QueueGuard<'_, SubQueueState> {
    /// How many slots are empty between the tail and the head i.e., how many
    /// entries can we write to the queue currently.
    fn avail_empty(&self) -> u16 {
        self.idx_sub(self.idx_sub(self.state.head, 1), self.state.tail)
    }

    /// Attempt to return the Head entry pointer and then move it forward by 1.
    ///
    /// If the queue is empty this method returns [`None`].
    /// Otherwise, this method returns the current Head entry pointer and then
    /// increments the Head entry pointer by 1 (wrapping if necessary). The new
    /// head value is also published to `last_head`.
    fn pop_head(&mut self, last_head: &AtomicU16) -> Option<u16> {
        if self.is_empty() {
            return None;
        }

        let old_head = self.state.head;
        self.state.head = self.idx_add(old_head, 1);
        last_head.store(self.state.head, Ordering::Release);
        Some(old_head)
    }

    /// Attempt to move the Tail entry pointer forward to the given index.
    ///
    /// The given index must be less than the size of the queue. The queue must
    /// have enough empty slots available otherwise we return an error.
    /// Conceptually this method indicates new entries have been added to the
    /// queue.
    ///
    /// # Errors
    ///
    /// - [`QueueUpdateError::InvalidEntry`] if `idx` is not a valid index.
    /// - [`QueueUpdateError::TooManyEntries`] if moving the tail to `idx`
    ///   would add more entries than there are empty slots.
    fn push_tail_to(&mut self, idx: u16) -> Result<(), QueueUpdateError> {
        if u32::from(idx) >= *self.size {
            return Err(QueueUpdateError::InvalidEntry);
        }
        let push_count = self.idx_sub(idx, self.state.tail);
        if push_count > self.avail_empty() {
            return Err(QueueUpdateError::TooManyEntries);
        }
        // Replace tail with given idx
        self.state.tail = idx;

        Ok(())
    }

    /// Write update to the EventIdx in Doorbell Buffer page, if possible
    fn db_buf_write(&mut self, devq_id: u64, mem: &MemCtx) {
        if let Some(db_buf) = self.state.db_buf {
            probes::nvme_sq_dbbuf_write!(|| (devq_id, self.state.head));
            // Keep EventIdx populated with the position of the SQ head.  As
            // long as there are entries available between the head and tail, we
            // do not want the guest taking exits for ultimately redundant
            // doorbells.
            //
            // We proactively read from the db_buf shadow while attempting to
            // pop entries submitted to the queue.  Only once it is empty, with
            // the head/tail being equal, do we want doorbell calls from the
            // guest.
            fence(Ordering::Release);
            // Doorbell buffer slots are accessed as 32-bit values elsewhere in
            // this file, so write the whole slot rather than leaving its upper
            // half holding stale data.
            mem.write(db_buf.eventidx, &u32::from(self.state.head));
        }
    }

    /// Write update to the Shadow Doorbell Buffer page, if possible.
    ///
    /// See [QueueGuard<CompQueueState>::db_buf_write_shadow()] for why we would
    /// write to a "guest-owned" page.
    fn db_buf_write_shadow(&mut self, devq_id: u64, mem: &MemCtx) {
        if let Some(db_buf) = self.state.db_buf {
            probes::nvme_sq_dbbuf_write_shadow!(|| (devq_id, self.state.tail));
            fence(Ordering::Release);
            // Write the full 32-bit slot, matching how it is read back in
            // `db_buf_read()`.
            mem.write(db_buf.shadow, &u32::from(self.state.tail));
        }
    }

    /// Read update from the Shadow in Doorbell Buffer page, if possible
    ///
    /// The value read is treated as a new tail index. Values which do not
    /// constitute a valid tail update are silently ignored.
    fn db_buf_read(&mut self, devq_id: u64, mem: &MemCtx) {
        if let Some(db_buf) = self.state.db_buf {
            if let Some(new_tail) = mem.read::<u32>(db_buf.shadow) {
                let new_tail = *new_tail;
                probes::nvme_sq_dbbuf_read!(|| (
                    devq_id,
                    new_tail,
                    self.state.head
                ));
                fence(Ordering::Acquire);
                // Only the low 16 bits of the slot are used as the index.
                // TODO: roll back on bad input?
                let _ = self.push_tail_to(new_tail as u16);
            }
        }
    }
}

/// Errors that may be encountered during Queue creation.
#[derive(Error, Debug)]
pub enum QueueCreateErr {
    /// The specified base address is invalid.
    #[error("invalid base address")]
    InvalidBaseAddr,

    /// The specified size (number of entries) is invalid.
    #[error("invalid size")]
    InvalidSize,

    /// A Submission Queue with the given ID is already associated with the
    /// target Completion Queue.
    #[error("the SQ ID {0} is already associated with the CQ")]
    SubQueueIdAlreadyExists(QueueId),
}

/// Errors that may be encountered while adjusting Queue head/tail pointers.
#[derive(Error, Debug)]
pub enum QueueUpdateError {
    /// The requested index is not less than the size of the queue.
    #[error("tried to move head or tail pointer to an invalid index")]
    InvalidEntry,

    /// Moving the pointer to the requested index would consume more occupied
    /// entries (head) or fill more empty entries (tail) than the queue
    /// currently has.
    #[error(
        "tried to push or pop too many entries given the current head/tail"
    )]
    TooManyEntries,
}

/// Basic parameters for Submission & Completion Queue creation
#[derive(Copy, Clone)]
pub struct CreateParams {
    /// The ID of the queue being created. [ADMIN_QUEUE_ID] selects the admin
    /// queue size limit during validation.
    pub id: QueueId,
    // Not strictly necessary for submission or completion queues, but helpful
    // to disambiguate the queue in probes.
    pub device_id: DeviceId,
    /// Guest address at which the queue's entries are located.
    pub base: GuestAddr,
    /// The size of the queue, in entries.
    pub size: u32,
}

/// Type for manipulating Submission Queues.
///
/// A `SubQueue` registers itself with its Completion Queue when created and
/// removes that registration again when dropped.
pub struct SubQueue {
    /// The ID of this Submission Queue.
    id: QueueId,

    /// The ID of the device that owns this submission queue. Kept here only to
    /// produce `devsq_id` for DTrace probes.
    device_id: DeviceId,

    /// The corresponding Completion Queue.
    cq: Arc<CompQueue>,

    /// Queue state such as the size and current head/tail entry pointers.
    state: QueueState<SubQueueState>,

    /// Duplicate of head pointer value from inside [SubQueueState], kept in
    /// sync for lockless access during [SubQueue::annotate_completion()] calls.
    cur_head: AtomicU16,

    /// The [`GuestAddr`] at which the Queue is mapped.
    base: GuestAddr,
}

impl Drop for SubQueue {
    /// Unregisters this Submission Queue from its Completion Queue.
    ///
    /// # Panics
    ///
    /// Panics if the CQ's lock is poisoned, or if the CQ holds no link for
    /// this SQ's ID.
    fn drop(&mut self) {
        // Remove the CQ-SQ link; the lock is released at the end of the
        // statement.
        self.cq.sqs.lock().unwrap().remove(&self.id).unwrap();
    }
}

impl SubQueue {
    /// Create a Submission Queue object backed by the guest memory at the
    /// given base address.
    pub fn new(
        params: CreateParams,
        cq: Arc<CompQueue>,
        acc_mem: MemAccessor,
    ) -> Result<Arc<Self>, QueueCreateErr> {
        let CreateParams { id, device_id, base, size } = params;
        validate(id == ADMIN_QUEUE_ID, base, size)?;
        let sq = Arc::new(Self {
            id,
            device_id,
            cq,
            state: SubQueueState::new(size, acc_mem),
            cur_head: AtomicU16::new(0),
            base,
        });

        use std::collections::hash_map::Entry;
        // Associate this SQ with the given CQ
        let mut cq_sqs = sq.cq.sqs.lock().unwrap();
        match cq_sqs.entry(id) {
            Entry::Occupied(_) => {
                Err(QueueCreateErr::SubQueueIdAlreadyExists(id))
            }
            Entry::Vacant(entry) => {
                entry.insert(Arc::downgrade(&sq));
                drop(cq_sqs);
                Ok(sq)
            }
        }
    }

    /// Attempt to move the Tail entry pointer forward to the given index.
    pub fn notify_tail(&self, idx: u16) -> Result<u16, QueueUpdateError> {
        let mut state = self.state.lock();
        state.push_tail_to(idx)?;
        if self.id == ADMIN_QUEUE_ID {
            if let Some(mem) = state.acc_mem.access() {
                state.db_buf_write_shadow(self.devq_id(), &mem);
            }
        }

        Ok(state.num_occupied())
    }

    pub fn num_occupied(&self) -> u16 {
        self.state.lock().num_occupied()
    }

    /// Returns the next entry off of the Queue or [`None`] if it is empty.
    pub fn pop(
        self: &Arc<SubQueue>,
    ) -> Option<(GuestData<SubmissionQueueEntry>, Permit, u16)> {
        // Lock the SubQueueState early to conform to lock ordering requirement;
        // see docs on QueueState::acc_mem.
        let mut state = self.state.lock();

        let Some(mem) = self.state.acc_mem.access_locked() else { return None };
        let mem = mem.view();

        // Attempt to reserve an entry on the Completion Queue
        let permit = self.cq.reserve_entry(&self, &mem)?;

        // Check for last-minute updates to the tail via any configured doorbell
        // page, prior to attempting the pop itself.
        state.db_buf_read(self.devq_id(), &mem);

        if let Some(idx) = state.pop_head(&self.cur_head) {
            let addr = self.base.offset::<SubmissionQueueEntry>(idx as usize);

            if let Some(ent) = mem.read::<SubmissionQueueEntry>(addr) {
                let devq_id = self.devq_id();
                state.db_buf_write(devq_id, &mem);
                state.db_buf_read(devq_id, &mem);
                return Some((ent, permit.promote(ent.cid()), idx));
            }
            // TODO: set error state on queue/ctrl if we cannot read entry
        }

        // Drop lock on SQ before releasing permit (which locks CQ)
        drop(state);

        // No Submission Queue entry, so return the CQE permit
        permit.remit();
        None
    }

    /// Returns the ID of this Submission Queue.
    pub(super) fn id(&self) -> QueueId {
        self.id
    }

    pub(super) fn set_db_buf(
        &self,
        db_buf: Option<DoorbellBuffer>,
        is_import: bool,
    ) {
        let mut state = self.state.lock();
        state.state.db_buf =
            db_buf.map(|dbb| dbb.offset_for_queue(false, self.id()));

        if !is_import {
            // Mimic qemu and sync out the SQ tail during setup
            if let (Some(mem), Some(db_buf)) =
                (state.acc_mem.access(), state.state.db_buf)
            {
                mem.write(db_buf.shadow, &(state.state.tail as u32));
            }
        }
    }

    pub(super) fn update_params(&self, params: TransferParams) {
        self.state.lock().state.inner.params = params;
    }
    pub(super) fn params(&self) -> TransferParams {
        self.state.lock().state.inner.params
    }

    /// Annotate a CQE with data (ID and head index) from this SQ
    fn annotate_completion(&self, cqe: &mut CompletionQueueEntry) {
        cqe.sqid = self.id;
        cqe.sqhd = self.cur_head.load(Ordering::Acquire);
    }

    /// Return a VM-unique identifier for this submission queue
    pub(crate) fn devq_id(&self) -> u64 {
        super::devq_id(self.device_id, self.id)
    }

    pub(super) fn export(&self) -> migrate::NvmeSubQueueV1 {
        let inner = self.state.inner.lock().unwrap();
        migrate::NvmeSubQueueV1 {
            id: self.id,
            size: self.state.size,
            head: inner.head,
            tail: inner.tail,
            base: self.base.0,
            cq_id: self.cq.id,
        }
    }

    pub(super) fn import(
        &self,
        state: migrate::NvmeSubQueueV1,
    ) -> Result<(), MigrateStateError> {
        // These must've been provided at construction
        assert_eq!(self.id, state.id);
        assert_eq!(self.cq.id, state.cq_id);
        assert_eq!(self.base.0, state.base);
        assert_eq!(self.state.size, state.size);

        let mut inner = self.state.inner.lock().unwrap();
        inner.head = state.head;
        inner.tail = state.tail;

        Ok(())
    }
}

/// Type for manipulating Completion Queues.
pub struct CompQueue {
    /// The ID of this Completion Queue.
    id: QueueId,

    /// The ID of the device that owns this completion queue. Kept here only to
    /// produce `devcq_id` for DTrace probes.
    device_id: DeviceId,

    /// The Interrupt Vector used to signal to the host (VM) upon pushing
    /// entries onto the Completion Queue.
    iv: u16,

    /// Queue state such as the size and current head/tail entry pointers.
    state: QueueState<CompQueueState>,

    /// The [`GuestAddr`] at which the Queue is mapped.
    base: GuestAddr,

    /// MSI-X object associated with PCIe device to signal host (VM).
    hdl: pci::MsixHdl,

    /// [`SubQueue`]s associated with this Completion Queue, keyed by their ID.
    ///
    /// Entries are added when a [`SubQueue`] is created and removed when it is
    /// dropped.
    sqs: Mutex<HashMap<QueueId, Weak<SubQueue>>>,
}

impl CompQueue {
    /// Creates a Completion Queue object backed by the guest memory at the
    /// given base address.
    ///
    /// Fails if `params` does not pass validation of its base address and
    /// size.
    pub fn new(
        params: CreateParams,
        iv: u16,
        hdl: pci::MsixHdl,
        acc_mem: MemAccessor,
    ) -> Result<Self, QueueCreateErr> {
        let CreateParams { id, device_id, base, size } = params;
        validate(id == ADMIN_QUEUE_ID, base, size)?;
        Ok(Self {
            id,
            device_id,
            iv,
            state: CompQueueState::new(size, acc_mem),
            base,
            hdl,
            sqs: Mutex::new(HashMap::new()),
        })
    }

    /// Attempt to move the Head entry pointer forward to the given index.
    ///
    /// For the admin queue, the new head is also written out to the shadow
    /// doorbell (if a doorbell buffer is configured and guest memory is
    /// accessible).
    pub fn notify_head(&self, idx: u16) -> Result<(), QueueUpdateError> {
        let mut state = self.state.lock();
        state.pop_head_to(idx)?;
        if self.id == ADMIN_QUEUE_ID {
            if let Some(mem) = state.acc_mem.access() {
                state.db_buf_write_shadow(self.devq_id(), &mem)
            }
        }
        Ok(())
    }

    /// Fires an interrupt to the guest with the associated interrupt vector
    /// if the queue is not currently empty.
    pub fn fire_interrupt(&self) {
        let state = self.state.lock();
        if !state.is_empty() {
            self.hdl.fire(self.iv);
        }
    }

    /// Takes the IDs of the Submission Queues which became corked on this CQ
    /// because no permit was available.
    ///
    /// Returns [`None`] if no SQs were corked. Otherwise the set of corked SQs
    /// is emptied and its contents are returned.
    pub fn kick(&self) -> Option<Vec<QueueId>> {
        self.state.lock().kick()
    }

    /// Returns the ID of this Completion Queue.
    pub fn id(&self) -> QueueId {
        self.id
    }

    /// Returns the number of SQs associated with this Completion Queue.
    pub fn associated_sqs(&self) -> usize {
        let sqs = self.sqs.lock().unwrap();
        sqs.len()
    }

    /// Attempt to reserve an entry in the Completion Queue.
    ///
    /// An entry permit allows the user to push onto the Completion Queue.
    ///
    /// If no entry is available, `sq` is recorded as corked on this CQ and
    /// [`None`] is returned.
    fn reserve_entry(
        self: &Arc<Self>,
        sq: &Arc<SubQueue>,
        mem: &MemCtx,
    ) -> Option<ProtoPermit> {
        let mut state = self.state.lock();
        if !state.has_avail() {
            // If the CQ appears full, but the db_buf shadow is configured, do a
            // last-minute check to see if entries have been consumed/freed
            // without a doorbell call.
            state.db_buf_read(self.devq_id(), mem);
        }
        if state.take_avail(sq) {
            Some(ProtoPermit::new(self, sq))
        } else {
            // No more spots available.
            None
        }
    }

    /// Add a new entry to the Completion Queue while consuming a `Permit`.
    ///
    /// The entry is annotated with the ID and current head of `sq` before
    /// being written to guest memory.
    ///
    /// Panics if the queue has no free slot for the entry.
    fn push(&self, comp: Completion, cid: u16, sq: &SubQueue) {
        let mut cqe = CompletionQueueEntry::new(comp, cid);
        sq.annotate_completion(&mut cqe);

        let mut state = self.state.lock();
        let (idx, phase) = state
            .push_tail()
            .expect("CQ should have available space for assigned permit");

        probes::nvme_cqe!(|| (self.devq_id(), idx, u8::from(phase)));

        // The only definite indicator that a CQE has become valid is the phase
        // bit being toggled.  Since the interface for writing to guest memory
        // cannot ensure that the other bits of the CQE are written before the
        // phase bit, we must proceed carefully:
        //
        // The CQE is first written with the opposite phase set, so it appears
        // to the guest OS as a to-be-filled entry.  With that write complete,
        // ensuring that all fields of the CQE are visible to the host, we write
        // it again, with the phase bit correctly set.
        //
        // XXX: handle a guest addr that becomes unmapped later
        let addr = self.base.offset::<CompletionQueueEntry>(idx as usize);
        // TODO: access disallowed?
        let Some(mem) = self.state.acc_mem.access_locked() else {
            // TODO: mark the queue/controller in error state?
            // Note that the tail has already been advanced at this point.
            return;
        };
        let mem = mem.view();
        cqe.set_phase(!phase);
        mem.write(addr, &cqe);
        cqe.set_phase(phase);
        mem.write(addr, &cqe);

        // Pick up any head update published via the doorbell buffer, then
        // refresh EventIdx to account for the entry just pushed.
        let devq_id = self.devq_id();
        state.db_buf_read(devq_id, &mem);
        state.db_buf_write(devq_id, &mem);
    }

    /// Set (or clear) the doorbell buffer used by this Completion Queue.
    ///
    /// Unless this is happening as part of a state import, the current head
    /// is written out to the shadow doorbell of a newly configured buffer.
    pub(super) fn set_db_buf(
        &self,
        db_buf: Option<DoorbellBuffer>,
        is_import: bool,
    ) {
        let mut state = self.state.lock();
        state.state.db_buf =
            db_buf.map(|dbb| dbb.offset_for_queue(true, self.id()));

        if !is_import {
            // Mimic qemu and sync out the CQ head during setup
            if let (Some(mem), Some(db_buf)) =
                (state.acc_mem.access(), state.state.db_buf)
            {
                mem.write(db_buf.shadow, &(state.state.head as u32));
            }
        }
    }

    /// Return a VM-unique identifier for this completion queue
    pub(crate) fn devq_id(&self) -> u64 {
        super::devq_id(self.device_id, self.id)
    }

    /// Export the state of this Completion Queue for migration.
    pub(super) fn export(&self) -> migrate::NvmeCompQueueV1 {
        let guard = self.state.lock();
        migrate::NvmeCompQueueV1 {
            id: self.id,
            size: self.state.size,
            head: guard.state.head,
            tail: guard.state.tail,
            avail: guard.state.inner.avail,
            phase: guard.state.inner.phase,
            base: self.base.0,
            iv: self.iv,
        }
    }

    /// Import previously exported Completion Queue state.
    ///
    /// Panics if the ID, interrupt vector, base address or size in `state` do
    /// not match those this queue was constructed with.
    pub(super) fn import(
        &self,
        state: migrate::NvmeCompQueueV1,
    ) -> Result<(), MigrateStateError> {
        // These must've been provided at construction
        assert_eq!(self.id, state.id);
        assert_eq!(self.iv, state.iv);
        assert_eq!(self.base.0, state.base);
        assert_eq!(self.state.size, state.size);

        let mut guard = self.state.lock();
        guard.state.head = state.head;
        guard.state.tail = state.tail;
        guard.state.inner.avail = state.avail;
        guard.state.inner.phase = state.phase;

        Ok(())
    }
}

/// "Proto" permit for a Completion Queue Entry.
///
/// This guarantees the holder capacity in the associated Completion Queue to
/// push their CQE.  A `ProtoPermit` is either promoted to a full [Permit] via
/// [promote()](Self::promote), or if no submissions were found to be available
/// after reserving the `ProtoPermit`, discarded to release its reservation via
/// [remit()](Self::remit).
pub struct ProtoPermit {
    /// The corresponding Completion Queue for which we have a permit.
    cq: Weak<CompQueue>,

    /// The Submission Queue for which this entry is reserved.
    sq: Weak<SubQueue>,

    /// The ID for the device and Submission Queue the command associated with
    /// this permit was submitted from. Stored separately to avoid going through
    /// the Weak ref.
    devsq_id: u64,
}
impl ProtoPermit {
    fn new(cq: &Arc<CompQueue>, sq: &Arc<SubQueue>) -> Self {
        Self {
            cq: Arc::downgrade(cq),
            sq: Arc::downgrade(sq),
            devsq_id: sq.devq_id(),
        }
    }

    /// Promote a "proto" permit to a [Permit].
    ///
    /// Once an entry has been read from the Submission Queue, the holder of a
    /// `ProtoPermit` promotes it to `Permit`, committing to use the reserved
    /// CQE capacity when the submission is processed.
    pub fn promote(self, cid: u16) -> Permit {
        Permit {
            cq: self.cq,
            sq: self.sq,
            devsq_id: self.devsq_id,
            cid,
            _nodrop: NoDropPermit,
        }
    }

    /// Return the permit without having actually used it.
    ///
    /// Frees up the space for someone else to grab it via
    /// `CompQueue::reserve_entry`.
    fn remit(self) {
        if let Some(cq) = self.cq.upgrade() {
            let mut state = cq.state.lock();
            state.release_avail();
        }
    }
}

/// A permit reserving capacity to push a [CompletionQueueEntry] into a
/// Completion Queue for a command submitted to the device.
///
/// A `Permit` must be consumed through [Permit::complete()] or
/// [Permit::abandon()]; dropping it any other way trips the [NoDropPermit]
/// marker it carries.
pub struct Permit {
    /// The corresponding Completion Queue for which we have a permit.
    cq: Weak<CompQueue>,

    /// The Submission Queue for which this entry is reserved.
    sq: Weak<SubQueue>,

    /// The Submission Queue and device ID the request came in on. Retained as a
    /// consistent source identifier for probes.
    devsq_id: u64,

    /// ID of command holding this permit.  Used to populate `cid` field in
    /// Completion Queue Entry.
    cid: u16,

    /// Marker to ensure holder calls [Permit::complete()] (or explicitly
    /// gives up via [Permit::abandon()]).
    _nodrop: NoDropPermit,
}

impl Permit {
    /// Consume the permit by placing an entry into the Completion Queue.
    ///
    /// If the Submission Queue still exists, `comp` is pushed onto the
    /// Completion Queue and an interrupt is fired.  Otherwise only the
    /// capacity reserved by this permit is released.  If the Completion Queue
    /// itself is gone, nothing is done.
    pub fn complete(self, comp: Completion) {
        let Permit { cq, sq, cid, _nodrop, .. } = self;
        // The permit is being consumed properly: defuse the drop check.
        std::mem::forget(_nodrop);

        let Some(cq) = cq.upgrade() else {
            // The CQ has since been deleted so no way to complete this
            // request nor to return the permit.
            debug_assert!(sq.upgrade().is_none());
            return;
        };

        match sq.upgrade() {
            Some(sq) => {
                cq.push(comp, cid, &sq);
                cq.fire_interrupt();
            }
            None => {
                // The SQ has since been deleted (so the request has already
                // implicitly been aborted by the prior Delete Queue command)
                // or the device currently lacks access to guest memory.
                //
                // Just make sure we return the "avail hold" from the permit
                let mut cq_state = cq.state.lock();
                cq_state.release_avail();
            }
        }
    }

    /// Get the ID of the submitted command associated with this permit.
    pub fn cid(&self) -> u16 {
        self.cid
    }

    /// Get the ID of the device and Submission Queue the command associated
    /// with this permit was submitted on.
    pub fn devsq_id(&self) -> u64 {
        self.devsq_id
    }

    /// Discard the permit without completing it.
    ///
    /// A device reset may cause us to abandon some in-flight I/O, dropping the
    /// request `Permit` without any kind of completion.  Additionally, some of
    /// the tests wish to acquire `Permit` entries with no intent to drive them
    /// through to completion.  This lets both bypass the
    /// ensure-this-permit-is-completed check in [`Drop`].
    pub fn abandon(self) {
        std::mem::forget(self._nodrop);
    }

    /// Consume the permit by placing an entry into the Completion Queue.
    ///
    /// This is a simpler version of [Self::complete()] for testing purposes
    /// which does not require passing in the actual completion data: a
    /// successful completion is pushed for `sq`, and no interrupt is fired.
    /// It is only to be used for exercising the Submission and Completion
    /// Queues in unit tests.
    #[cfg(test)]
    fn test_complete(self, sq: &SubQueue) {
        let Permit { cq, cid, _nodrop, .. } = self;
        std::mem::forget(_nodrop);

        let Some(cq) = cq.upgrade() else {
            return;
        };
        cq.push(Completion::success(), cid, sq);
    }
}
impl Debug for Permit {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("Permit")
            .field("sq", &self.sq)
            .field("cq", &self.cq)
            .field("devsq_id", &self.devsq_id)
            .field("cid", &self.cid)
            .finish()
    }
}

/// Marker struct to ensure that [Permit] consumers call
/// [complete()](Permit::complete()), rather than silently dropping it.
///
/// The legitimate ways of consuming a [Permit] ([Permit::complete()] and
/// [Permit::abandon()]) `std::mem::forget` this marker so that its [`Drop`]
/// implementation never runs.
struct NoDropPermit;
impl Drop for NoDropPermit {
    /// Unconditionally panics: reaching this means a [Permit] was dropped
    /// without being completed or abandoned.
    fn drop(&mut self) {
        panic!("Permit should be complete()-ed before drop");
    }
}

/// Convert IO SQID to block-layer [block::QueueId]
///
/// # Panics
///
/// Panics if `sqid` is 0, which is the ID of the admin SQ rather than of an
/// IO SQ.
pub fn sqid_to_block_qid(sqid: super::QueueId) -> block::QueueId {
    // With the admin SQ occupying ID 0, the corresponding block-layer queue IDs
    // are offset by 1
    let zero_based = sqid.checked_sub(1).expect("IO SQID is non-zero");
    zero_based.into()
}

/// Parameters used when translating NVMe I/O commands, which are expressed in
/// logical blocks, into offsets and sizes for the block layer.
#[derive(Copy, Clone, Debug, Default)]
pub struct TransferParams {
    /// Size of one logical block (presumably in bytes).  LBAs and block counts
    /// are multiplied by this to obtain request offsets and sizes.
    pub lba_data_size: u64,
    /// Upper bound on the size of a single read or write, in the same units
    /// as the computed request size.
    pub max_data_transfer_size: u64,
}

/// Configuration for Doorbell Buffer feature
#[derive(Copy, Clone, Debug)]
pub struct DoorbellBuffer {
    /// Guest address of the Shadow Doorbell entry (or of the start of the
    /// Shadow Doorbell buffer, prior to [Self::offset_for_queue]).
    pub shadow: GuestAddr,
    /// Guest address of the EventIdx entry (or of the start of the EventIdx
    /// buffer, prior to [Self::offset_for_queue]).
    pub eventidx: GuestAddr,
}
impl DoorbellBuffer {
    /// Determine Shadow Doorbell and EventIdx addresses for a specified
    /// [QueueId].
    ///
    /// Both buffers are indexed as arrays of `u32` entries, two per queue ID:
    /// the Submission Queue entry first (index `2 * qid`), followed by the
    /// Completion Queue entry (index `2 * qid + 1`).
    ///
    /// Returns a new `DoorbellBuffer` whose addresses point at the entries for
    /// the requested queue; `self` is expected to hold the base addresses.
    pub fn offset_for_queue(&self, is_cq: bool, qid: QueueId) -> Self {
        // `usize::from(bool)` yields 1 for `true` and 0 for `false`, selecting
        // the CQ slot which follows the SQ slot of the same queue ID.
        let idx = (qid as usize) * 2 + usize::from(is_cq);
        Self {
            shadow: self.shadow.offset::<u32>(idx),
            eventidx: self.eventidx.offset::<u32>(idx),
        }
    }
}

/// Serializable (version 1) representations of NVMe queue state, used when
/// exporting and importing queues for migration.
pub(super) mod migrate {
    use serde::{Deserialize, Serialize};

    /// Migration payload describing a Completion Queue.
    #[derive(Deserialize, Serialize)]
    pub struct NvmeCompQueueV1 {
        /// ID of the queue
        pub id: u16,

        /// Size of the queue
        pub size: u32,
        /// Head index of the queue at the time of export
        pub head: u16,
        /// Tail index of the queue at the time of export
        pub tail: u16,

        /// The `avail` counter of the queue at the time of export
        pub avail: u16,
        /// The `phase` flag of the queue at the time of export
        pub phase: bool,

        /// Raw guest address of the base of the queue
        pub base: u64,
        /// The queue's `iv` (presumably its interrupt vector)
        pub iv: u16,
    }

    /// Migration payload describing a Submission Queue.
    #[derive(Deserialize, Serialize)]
    pub struct NvmeSubQueueV1 {
        /// ID of the queue
        pub id: u16,

        /// Size of the queue
        pub size: u32,
        /// Head index of the queue
        pub head: u16,
        /// Tail index of the queue
        pub tail: u16,

        /// Raw guest address of the base of the queue
        pub base: u64,
        /// ID of a Completion Queue (presumably the one this Submission Queue
        /// posts its completions to)
        pub cq_id: u16,
    }
}

#[cfg(test)]
mod test {
    use rand::Rng;

    use super::*;

    use crate::{common::GuestAddr, vmm::Machine};
    use std::io::Error;
    use std::thread::{sleep, spawn};
    use std::time::Duration;

    /// Completion Queue creation accepts sizes within the limits of the queue
    /// kind (admin vs. I/O) and rejects everything else with `InvalidSize`.
    #[test]
    fn create_cqs() -> Result<(), Error> {
        let machine = Machine::new_test()?;
        let hdl = pci::MsixHdl::new_test();
        let acc_mem = || machine.acc_mem.child(None);

        // Every attempt shares the same device and base address; only the
        // queue ID and size vary.
        let try_create = |id, size| {
            CompQueue::new(
                CreateParams {
                    id,
                    device_id: DeviceId(0),
                    base: GuestAddr(1024 * 1024),
                    size,
                },
                0,
                hdl.clone(),
                acc_mem(),
            )
        };

        // Admin queues must be less than 4K
        let admin_ok = try_create(ADMIN_QUEUE_ID, 1024);
        assert!(admin_ok.is_ok());
        let admin_too_big = try_create(ADMIN_QUEUE_ID, 5 * 1024);
        assert!(matches!(admin_too_big, Err(QueueCreateErr::InvalidSize)));

        // I/O queues must be less than 64K
        let io_ok = try_create(1, 1024);
        assert!(io_ok.is_ok());
        let io_too_big = try_create(1, 65 * 1024);
        assert!(matches!(io_too_big, Err(QueueCreateErr::InvalidSize)));

        // Neither must be less than 2
        let admin_too_small = try_create(ADMIN_QUEUE_ID, 1);
        assert!(matches!(admin_too_small, Err(QueueCreateErr::InvalidSize)));
        let io_too_small = try_create(1, 1);
        assert!(matches!(io_too_small, Err(QueueCreateErr::InvalidSize)));

        Ok(())
    }

    /// Submission Queue creation accepts sizes within the limits of the queue
    /// kind (admin vs. I/O) and rejects everything else with `InvalidSize`.
    #[test]
    fn create_sqs() -> Result<(), Error> {
        let machine = Machine::new_test()?;
        let hdl = pci::MsixHdl::new_test();
        let acc_mem = || machine.acc_mem.child(None);

        // Create corresponding CQs
        let new_cq = |id, msix| {
            let params = CreateParams {
                id,
                device_id: DeviceId(0),
                base: GuestAddr(1024 * 1024),
                size: 1024,
            };
            Arc::new(CompQueue::new(params, 0, msix, acc_mem()).unwrap())
        };
        let admin_cq = new_cq(ADMIN_QUEUE_ID, hdl.clone());
        let io_cq = new_cq(1, hdl);

        // Every SQ attempt shares the same device and base address; only the
        // queue ID, size and associated CQ vary.
        let try_create = |id, size, cq: &Arc<CompQueue>| {
            SubQueue::new(
                CreateParams {
                    id,
                    device_id: DeviceId(0),
                    base: GuestAddr(0),
                    size,
                },
                cq.clone(),
                acc_mem(),
            )
        };

        // Admin queues must be less than 4K
        let admin_ok = try_create(ADMIN_QUEUE_ID, 1024, &admin_cq);
        assert!(admin_ok.is_ok());
        let admin_too_big = try_create(ADMIN_QUEUE_ID, 5 * 1024, &admin_cq);
        assert!(matches!(admin_too_big, Err(QueueCreateErr::InvalidSize)));

        // I/O queues must be less than 64K
        let io_ok = try_create(1, 1024, &io_cq);
        assert!(io_ok.is_ok());
        let io_too_big = try_create(1, 65 * 1024, &io_cq);
        assert!(matches!(io_too_big, Err(QueueCreateErr::InvalidSize)));

        // Neither must be less than 2
        let admin_too_small = try_create(ADMIN_QUEUE_ID, 1, &admin_cq);
        assert!(matches!(admin_too_small, Err(QueueCreateErr::InvalidSize)));
        let io_too_small = try_create(1, 1, &admin_cq);
        assert!(matches!(io_too_small, Err(QueueCreateErr::InvalidSize)));

        // Submission Queues must be mapped to readable memory
        //
        // This relied on the test machinery establishing the writable memory
        // region as write-only.  Until we expose such a region, it is not clear
        // how much value such a test brings to the table.
        //
        // let sq = SubQueue::new(
        //     ADMIN_QUEUE_ID,
        //     admin_cq.clone(),
        //     2,
        //     write_base,
        //     &mem,
        // );
        // assert!(matches!(sq, Err(QueueCreateErr::InvalidBaseAddr)));
        // let sq = SubQueue::new(1, admin_cq.clone(), 2, write_base, &mem);
        // assert!(matches!(sq, Err(QueueCreateErr::InvalidBaseAddr)));

        Ok(())
    }

    /// Doorbell updates which would overfill a queue, or which name an index
    /// outside of it, are rejected.
    #[test]
    fn push_failures() -> Result<(), Error> {
        let machine = Machine::new_test()?;
        let hdl = pci::MsixHdl::new_test();
        let acc_mem = || machine.acc_mem.child(None);

        // Create our queues
        let cq_params = CreateParams {
            id: 1,
            device_id: DeviceId(0),
            base: GuestAddr(1024 * 1024),
            size: 4,
        };
        let cq =
            Arc::new(CompQueue::new(cq_params, 0, hdl, acc_mem()).unwrap());
        let sq_params = CreateParams {
            id: 1,
            device_id: DeviceId(0),
            base: GuestAddr(0),
            size: 4,
        };
        let sq = Arc::new(
            SubQueue::new(sq_params, cq.clone(), acc_mem()).unwrap(),
        );

        let sq_size = sq.state.size;
        let cq_size = cq.state.size;

        // Replicate guest VM notifying us things were pushed to the SQ
        let mut sq_tail = 0;
        for _ in 0..sq_size - 1 {
            sq_tail = wrap_add(sq_size, sq_tail, 1);
            // These should all succeed
            assert!(sq.notify_tail(sq_tail).is_ok());
        }

        // But anything more should fail
        sq_tail = wrap_add(sq_size, sq_tail, 1);
        let overfull = sq.notify_tail(sq_tail);
        assert!(matches!(overfull, Err(QueueUpdateError::TooManyEntries)));

        // Also anything that falls outside the boundaries (i.e. we didn't
        // wrap properly)
        let out_of_bounds = sq.notify_tail(sq_size as u16);
        assert!(matches!(out_of_bounds, Err(QueueUpdateError::InvalidEntry)));

        // Now pop those SQ items and complete them in the CQ
        while let Some((_sub, permit, _idx)) = sq.pop() {
            permit.test_complete(&sq);
        }

        // Replicate guest VM notifying us things were consumed off the CQ
        let mut cq_head = 0;
        for _ in 0..sq_size - 1 {
            cq_head = wrap_add(cq_size, cq_head, 1);
            // These should all succeed
            assert!(cq.notify_head(cq_head).is_ok());
        }

        // There's nothing else to pop so this should fail
        cq_head = wrap_add(cq_size, cq_head, 1);
        let overdrawn = cq.notify_head(cq_head);
        assert!(matches!(overdrawn, Err(QueueUpdateError::TooManyEntries)));

        // Also anything that falls outside the boundaries (i.e. we didn't
        // wrap properly)
        let out_of_bounds = cq.notify_head(cq_size as u16);
        assert!(matches!(out_of_bounds, Err(QueueUpdateError::InvalidEntry)));

        Ok(())
    }

    /// With a CQ smaller than its SQ, popping from the SQ stalls once the CQ
    /// has no capacity left to reserve, and a kick is recorded so that work
    /// can resume after the guest consumes completions.
    #[test]
    fn cq_kicks() -> Result<(), Error> {
        let machine = Machine::new_test()?;
        let hdl = pci::MsixHdl::new_test();
        let acc_mem = || machine.acc_mem.child(None);

        // Create our queues
        // Purposely make the CQ smaller to test kicks
        let cq_params = CreateParams {
            id: 1,
            device_id: DeviceId(0),
            base: GuestAddr(1024 * 1024),
            size: 2,
        };
        let cq =
            Arc::new(CompQueue::new(cq_params, 0, hdl, acc_mem()).unwrap());
        let sq_params = CreateParams {
            id: 1,
            device_id: DeviceId(0),
            base: GuestAddr(0),
            size: 4,
        };
        let sq = Arc::new(
            SubQueue::new(sq_params, cq.clone(), acc_mem()).unwrap(),
        );

        let sq_size = sq.state.size;
        let cq_size = cq.state.size;

        // Replicate guest VM notifying us things were pushed to the SQ
        let mut sq_tail = 0;
        for _ in 0..sq_size - 1 {
            sq_tail = wrap_add(sq_size, sq_tail, 1);
            assert!(sq.notify_tail(sq_tail).is_ok());
        }

        // We should be able to pop based on how much space is in the CQ
        for _ in 0..cq_size - 1 {
            let popped = sq.pop();
            assert!(popped.is_some());

            // Complete these in the CQ (but note guest won't have
            // acknowledged them yet)
            let (_sub, permit, _idx) = popped.unwrap();
            permit.test_complete(&sq);
        }

        // But we can't pop anymore due to no more CQ space to reserve
        assert!(sq.pop().is_none());

        // The guest consuming things off the CQ should free us up
        assert!(cq.notify_head(1).is_ok());

        // Kick should've been set in the failed pop
        assert!(cq.kick().is_some());

        // We should have one more space now and should be able to pop 1 more.
        // The permit is abandoned so that it can be discarded when done.
        let last = sq.pop().map(|(_sub, permit, _idx)| permit.abandon());
        assert!(last.is_some());

        Ok(())
    }

    /// Multi-threaded exercise of a Submission/Completion Queue pair: a
    /// randomly chosen number of submissions is driven through simulated
    /// doorbell, worker and completion threads, and the number of submissions
    /// observed by the workers must match both the number generated and the
    /// number of completions handled.
    #[test]
    fn push_pop() -> Result<(), Error> {
        let machine = Machine::new_test()?;
        let hdl = pci::MsixHdl::new_test();
        let read_base = GuestAddr(0);
        let write_base = GuestAddr(1024 * 1024);

        let acc_mem = || machine.acc_mem.child(None);

        // Create a pair of Completion and Submission Queues
        // with a random size. We purposefully give the CQ a smaller
        // size to exercise the "kick" conditions where we have some
        // request available in the SQ but can't pop it until there's
        // space available in the CQ.
        let mut rng = rand::rng();
        let sq_size = rng.random_range(512..2048);
        let cq = Arc::new(
            CompQueue::new(
                CreateParams {
                    id: 1,
                    device_id: DeviceId(0),
                    base: write_base,
                    size: 4,
                },
                0,
                hdl,
                acc_mem(),
            )
            .unwrap(),
        );
        let sq = Arc::new(
            SubQueue::new(
                CreateParams {
                    id: 1,
                    device_id: DeviceId(0),
                    base: read_base,
                    size: sq_size,
                },
                cq.clone(),
                acc_mem(),
            )
            .unwrap(),
        );

        // We'll be generating a random number of submissions
        let submissions_rand = rng.random_range(2..sq.state.size - 1);

        // Channels connecting the simulated actors:
        // - doorbell: "guest" doorbell rings -> doorbell handler
        // - workers:  doorbell handler -> I/O workers (there may be work)
        // - comp:     I/O workers -> completion handler (a CQE was pushed)
        let (doorbell_tx, doorbell_rx) =
            crossbeam_channel::unbounded::<Doorbell>();
        let (workers_tx, workers_rx) = crossbeam_channel::unbounded();
        let (comp_tx, comp_rx) = crossbeam_channel::unbounded();

        // Create a thread to mimic the main device thread that
        // will handle "doorbell" read/write ops.
        enum Doorbell {
            Cq(u16),
            Sq(u16),
        }
        let (doorbell_cq, doorbell_sq) = (cq, sq.clone());
        let doorbell_handler = spawn(move || {
            // Keep track of the "host" side CQ head and SQ tail as
            // we receive "doorbell" hits.
            let mut cq_head = 0;
            let mut sq_tail = 0;
            loop {
                match doorbell_rx.recv() {
                    Ok(Doorbell::Cq(n)) => {
                        cq_head = wrap_add(doorbell_cq.state.size, cq_head, n);
                        assert!(matches!(
                            doorbell_cq.notify_head(cq_head),
                            Ok(_)
                        ));
                        // Space was freed in the CQ: if a worker previously
                        // stalled on it, poke the workers again.
                        if doorbell_cq.kick().is_some() {
                            assert!(workers_tx.send(()).is_ok());
                        }
                    }
                    Ok(Doorbell::Sq(n)) => {
                        sq_tail = wrap_add(doorbell_sq.state.size, sq_tail, n);
                        // The "doorbell" was rung and so let's have the SQ
                        // update its internal state before poking the workers
                        assert!(matches!(
                            doorbell_sq.notify_tail(sq_tail),
                            Ok(_)
                        ));
                        assert!(workers_tx.send(()).is_ok());
                    }
                    // All doorbell senders are gone: nothing more will arrive
                    Err(_) => break,
                }
            }
        });

        // Create a number of worker threads to simulate the block
        // dev backend workers that will be notified every time the
        // SQ "doorbell" is hit and will attempt to pull a new IO
        // request off the SQ. At the end, each will return a count
        // of how many requests they received and then completed.
        //
        // N.B. Clippy is too aggressive here; the `collect` is needed to
        //      ensure the closure is evaluated, which is what actually
        //      launches the worker threads.
        #[allow(clippy::needless_collect)]
        let io_workers = (0..4)
            .map(|_| {
                let worker_rx = workers_rx.clone();
                let worker_sq = sq.clone();
                let worker_comp_tx = comp_tx.clone();

                spawn(move || {
                    let mut submissions = 0;

                    let mut rng = rand::rng();
                    while let Ok(()) = worker_rx.recv() {
                        while let Some((_, cqe_permit, _)) = worker_sq.pop() {
                            submissions += 1;

                            // Sleep for a bit to mimic actually doing
                            // some work before we complete the IO
                            sleep(Duration::from_micros(
                                rng.random_range(0..500),
                            ));

                            cqe_permit.test_complete(&worker_sq);

                            // Signal "guest" side of completion handler
                            assert!(worker_comp_tx.send(()).is_ok());
                        }
                    }
                    submissions
                })
            })
            .collect::<Vec<_>>();

        // Create a thread to "consume" things off the Completion Queue. This
        // simulates the host reacting to our CQ pushes and "ringing" the CQ
        // doorbell. At the end, it returns how many completions were handled.
        // Regardless, it'll stop after the number of completions is at least as
        // many as the number of submissions we decided to generate.
        let comp_doorbell_tx = doorbell_tx.clone();
        let comp_handler = spawn(move || {
            let exit_after = submissions_rand;
            let mut completions = 0;
            loop {
                match comp_rx.recv() {
                    Ok(()) => {
                        // "Ring" the CQ doorbell
                        // TODO: test completing more than 1 at a time
                        assert!(comp_doorbell_tx.send(Doorbell::Cq(1)).is_ok());
                        completions += 1;
                    }
                    Err(_) => break completions,
                }
                if completions >= exit_after {
                    break completions;
                }
            }
        });

        // Now, start generating a random number of submissions
        for _ in 0..submissions_rand {
            // "Ring" the SQ doorbell
            // TODO: test submitting more than 1 at a time
            //let doorbell_tx = doorbell_tx.clone();
            assert!(doorbell_tx.send(Doorbell::Sq(1)).is_ok());

            // Sleep up to 100us in between
            sleep(Duration::from_micros(rng.random_range(0..100)));
        }
        drop(doorbell_tx);

        // Wait for the completion handler and its count
        let completions: u32 = comp_handler.join().unwrap();

        // Wait for doorbell handler
        doorbell_handler.join().unwrap();

        // Wait for the IO workers to complete and sum the total
        // number of submissions they received
        let submissions: u32 =
            io_workers.into_iter().map(|j| j.join().unwrap()).sum();

        // Make sure the number of submissions we received matched the number
        // we generated and completed
        assert_eq!(submissions, submissions_rand);
        assert_eq!(submissions, completions);

        Ok(())
    }
}


================================================
FILE: lib/propolis/src/hw/nvme/requests.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use itertools::Itertools;
use std::sync::Arc;
use std::time::Instant;

use super::{cmds::NvmCmd, queue::Permit, PciNvme};
use crate::accessors::MemAccessor;
use crate::block::{self, Operation, Request};
use crate::hw::nvme::cmds::ParseErr;
use crate::hw::nvme::{bits, cmds::Completion, queue::SubQueue};

#[usdt::provider(provider = "propolis")]
mod probes {
    // Note that unlike the probes in `queue.rs`, the probes here provide a
    // `devsq_id` for completion as well as enqueuement. The submission queue is
    // the one the command was originally submitted on.
    //
    // As long as queues are not destroyed (and the device is not reset), a
    // `(devsq_id, cid)` tuple seen in an `nvme_*_enqueue` probe will probably
    // not be reused before that same tuple is seen in the corresponding
    // `nvme_*_complete` probe. Such reuse is possible, but it would be a
    // guest error and is unlikely. From the NVM Express Base Specification:
    //
    // > The Command Identifier field in the SQE shall be unique among all
    // > outstanding commands associated with that queue.
    fn nvme_read_enqueue(devsq_id: u64, idx: u16, cid: u16, off: u64, sz: u64) {
    }
    fn nvme_read_complete(devsq_id: u64, cid: u16, res: u8) {}

    fn nvme_write_enqueue(
        devsq_id: u64,
        idx: u16,
        cid: u16,
        off: u64,
        sz: u64,
    ) {
    }
    fn nvme_write_complete(devsq_id: u64, cid: u16, res: u8) {}

    fn nvme_discard_enqueue(devsq_id: u64, idx: u16, cid: u16, nr: u16) {}
    fn nvme_discard_complete(devsq_id: u64, cid: u16, res: u8) {}

    fn nvme_flush_enqueue(devsq_id: u64, idx: u16, cid: u16) {}
    fn nvme_flush_complete(devsq_id: u64, cid: u16, res: u8) {}

    // Fired for every command popped off a Submission Queue, before it is
    // parsed.  `cdw0nsid` packs `cdw0` into the low 32 bits and `nsid` into
    // the high 32 bits; `cdw10cdw11` does likewise with `cdw10` and `cdw11`.
    fn nvme_raw_cmd(
        devsq_id: u64,
        cdw0nsid: u64,
        prp1: u64,
        prp2: u64,
        cdw10cdw11: u64,
    ) {
    }
}

impl block::Device for PciNvme {
    /// Expose the block-layer attachment state held by this NVMe device.
    fn attachment(&self) -> &block::DeviceAttachment {
        &self.block_attach
    }
}

/// Adapter presenting an NVMe Submission Queue to the block layer as a
/// [`block::DeviceQueue`].
pub(super) struct NvmeBlockQueue {
    /// Submission Queue from which I/O commands are popped.
    sq: Arc<SubQueue>,
    /// Accessor through which guest memory is reached while turning commands
    /// into block requests.
    acc_mem: MemAccessor,
}
impl NvmeBlockQueue {
    /// Bundle `sq` and `acc_mem` into a new, reference-counted
    /// `NvmeBlockQueue`.
    pub(super) fn new(sq: Arc<SubQueue>, acc_mem: MemAccessor) -> Arc<Self> {
        let queue = Self { sq, acc_mem };
        Arc::new(queue)
    }
}
impl block::DeviceQueue for NvmeBlockQueue {
    type Token = Permit;

    /// Pop an available I/O request off of the Submission Queue for hand-off to
    /// the underlying block backend
    fn next_req(&self) -> Option<(Request, Self::Token, Option<Instant>)> {
        let sq = &self.sq;
        let mem = self.acc_mem.access_locked()?;
        let mem = mem.view();
        let params = self.sq.params();

        while let Some((sub, permit, idx)) = sq.pop() {
            let devsq_id = sq.devq_id();
            probes::nvme_raw_cmd!(|| {
                (
                    devsq_id,
                    u64::from(sub.cdw0) | (u64::from(sub.nsid) << 32),
                    sub.prp1,
                    sub.prp2,
                    (u64::from(sub.cdw10) | (u64::from(sub.cdw11) << 32)),
                )
            });
            let cid = sub.cid();
            let cmd = NvmCmd::parse(sub);

            match cmd {
                Ok(NvmCmd::Write(cmd)) => {
                    let off = params.lba_data_size * cmd.slba;
                    let size = params.lba_data_size * (cmd.nlb as u64);

                    if size > params.max_data_transfer_size {
                        permit.complete(
                            Completion::generic_err(bits::STS_INVAL_FIELD)
                                .dnr(),
                        );
                        continue;
                    }

                    probes::nvme_write_enqueue!(|| (
                        sq.devq_id(),
                        idx,
                        cid,
                        off,
                        size
                    ));

                    let bufs = cmd.data(size, &mem).collect();
                    let req =
                        Request::new_write(off as usize, size as usize, bufs);
                    return Some((req, permit, None));
                }
                Ok(NvmCmd::Read(cmd)) => {
                    let off = params.lba_data_size * cmd.slba;
                    let size = params.lba_data_size * (cmd.nlb as u64);

                    if size > params.max_data_transfer_size {
                        permit.complete(
                            Completion::generic_err(bits::STS_INVAL_FIELD)
                                .dnr(),
                        );
                        continue;
                    }

                    probes::nvme_read_enqueue!(|| (
                        sq.devq_id(),
                        idx,
                        cid,
                        off,
                        size
                    ));

                    let bufs = cmd.data(size, &mem).collect();
                    let req =
                        Request::new_read(off as usize, size as usize, bufs);
                    return Some((req, permit, None));
                }
                Ok(NvmCmd::DatasetManagement(cmd)) => {
                    // We only support the "deallocate" (discard/trim) operation of Dataset
                    // Management.
                    if !cmd.is_deallocate() {
                        permit.complete(
                            Completion::generic_err(bits::STS_INVAL_FIELD)
                                .dnr(),
                        );
                        continue;
                    }
                    probes::nvme_discard_enqueue!(|| (
                        sq.devq_id(),
                        idx,
                        cid,
                        cmd.nr,
                    ));
                    let Ok(ranges): Result<Vec<_>, _> =
                        cmd.ranges(&mem).try_collect()
                    else {
                        // If we couldn't read the ranges, fail the command
                        permit.complete(
                            Completion::generic_err(bits::STS_DATA_XFER_ERR)
                                .dnr(),
                        );
                        continue;
                    };
                    let Ok(ranges) = ranges
                        .into_iter()
                        .map(|r| r.offset_len(params.lba_data_size))
                        .try_collect()
                    else {
                        // If the ranges were invalid (e.g. arithmetic overflow), fail the command
                        permit.complete(
                            Completion::generic_err(bits::STS_INVAL_FIELD)
                                .dnr(),
                        );
                        continue;
                    };

                    let req = Request::new_discard(ranges);
                    return Some((req, permit, None));
                }
                Ok(NvmCmd::Flush) => {
                    probes::nvme_flush_enqueue!(|| (sq.devq_id(), idx, cid));
                    let req = Request::new_flush();
                    return Some((req, permit, None));
                }
                Err(ParseErr::ReservedFuse) | Err(ParseErr::Reserved) => {
                    // For commands that fail parsing due to reserved fields being set,
                    // complete with an invalid field error
                    let comp =
                        Completion::generic_err(bits::STS_INVAL_FIELD).dnr();
                    permit.complete(comp);
                }
                Ok(NvmCmd::Unknown(_)) | Err(_) => {
                    // For any other unrecognized or malformed command,
                    // just immediately complete it with an error
                    let comp = Completion::generic_err(bits::STS_INTERNAL_ERR);
                    permit.complete(comp);
                }
            }
        }
        None
    }

    /// Place the operation result (success or failure) onto the corresponding
    /// Completion Queue.
    fn complete(
        &self,
        op: block::Operation,
        result: block::Result,
        permit: Self::Token,
    ) {
        let devsq_id = permit.devsq_id();
        let cid = permit.cid();
        let resnum = result as u8;
        match op {
            Operation::Read(..) => {
                probes::nvme_read_complete!(|| (devsq_id, cid, resnum));
            }
            Operation::Write(..) => {
                probes::nvme_write_complete!(|| (devsq_id, cid, resnum));
            }
            Operation::Flush => {
                probes::nvme_flush_complete!(|| (devsq_id, cid, resnum));
            }
            Operation::Discard => {
                probes::nvme_discard_complete!(|| (devsq_id, cid, resnum));
            }
        }

        permit.complete(Completion::from(result));
    }

    /// In the unlikely case we must give up on an in-flight I/O, tear it down
    /// without triggering the no-drop check on NVMe request permits.
    fn abandon(&self, token: Self::Token) {
        token.abandon();
    }
}


================================================
FILE: lib/propolis/src/hw/pci/bar.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::convert::TryFrom;
use std::convert::TryInto;

use crate::migrate::MigrateStateError;

use super::bits;
use super::BarN;

/// Number of BAR slots tracked per device; `Bars` holds one entry per slot.
pub const BAR_COUNT: usize = 6;

/// Static description of a single BAR: the kind of region backing it and the
/// size of that region.
#[derive(Eq, PartialEq, Clone, Copy, Debug)]
pub enum BarDefine {
    /// PIO-backed BAR of the given size.
    Pio(u16),
    /// 32-bit MMIO-backed BAR of the given size.
    Mmio(u32),
    /// 64-bit MMIO-backed BAR of the given size.
    Mmio64(u64),
}
impl BarDefine {
    /// Whether this definition represents a PIO-backed BAR.
    pub fn is_pio(&self) -> bool {
        match self {
            BarDefine::Pio(_) => true,
            BarDefine::Mmio(_) | BarDefine::Mmio64(_) => false,
        }
    }
    /// Whether this definition represents an MMIO-backed (32-bit or 64-bit)
    /// BAR.
    pub fn is_mmio(&self) -> bool {
        match self {
            BarDefine::Mmio(_) | BarDefine::Mmio64(_) => true,
            BarDefine::Pio(_) => false,
        }
    }
    /// Get the size of the BAR definition, regardless of type, widened to
    /// `u64`.
    pub fn size(&self) -> u64 {
        match *self {
            BarDefine::Pio(sz) => sz.into(),
            BarDefine::Mmio(sz) => sz.into(),
            BarDefine::Mmio64(sz) => sz,
        }
    }
}
impl TryFrom<EntryKind> for BarDefine {
    type Error = ();

    /// Recover the public BAR definition from an internal slot kind.
    ///
    /// Fails with `Err(())` for slots that carry no definition of their own:
    /// empty slots and the high half of a 64-bit MMIO BAR.
    fn try_from(value: EntryKind) -> Result<Self, Self::Error> {
        let def = match value {
            EntryKind::Pio(sz) => BarDefine::Pio(sz),
            EntryKind::Mmio(sz) => BarDefine::Mmio(sz),
            EntryKind::Mmio64(sz) => BarDefine::Mmio64(sz),
            EntryKind::Empty | EntryKind::Mmio64High => return Err(()),
        };
        Ok(def)
    }
}

/// Internal classification of a BAR slot.
///
/// Unlike `BarDefine`, this also covers slots that hold no BAR (`Empty`) and
/// the slot following a 64-bit MMIO BAR, which carries that BAR's high half
/// (`Mmio64High`).
#[derive(Copy, Clone)]
enum EntryKind {
    /// No BAR is defined in this slot.
    Empty,
    /// PIO BAR of the given size.
    Pio(u16),
    /// 32-bit MMIO BAR of the given size.
    Mmio(u32),
    /// Low slot of a 64-bit MMIO BAR of the given size.
    Mmio64(u64),
    /// High slot of the 64-bit MMIO BAR defined in the preceding slot.
    Mmio64High,
}

/// State of one BAR slot: what occupies it and the value currently stored.
#[derive(Copy, Clone)]
struct Entry {
    kind: EntryKind,
    // For a 64-bit BAR the whole 64-bit value is kept in the low slot's
    // entry; the high slot reads and writes through to it.
    value: u64,
}
impl Default for Entry {
    /// An unpopulated slot: no BAR defined and a stored value of zero.
    fn default() -> Self {
        let kind = EntryKind::Empty;
        Entry { kind, value: 0 }
    }
}

/// The full set of BAR slots for a device, indexed by `BarN`.
pub(super) struct Bars {
    // One entry per BAR register; a 64-bit MMIO BAR occupies two consecutive
    // entries (`Mmio64` followed by `Mmio64High`).
    entries: [Entry; BAR_COUNT],
}

impl Bars {
    pub(super) fn new(defs: &[Option<BarDefine>; BAR_COUNT]) -> Self {
        let mut this = Self { entries: Default::default() };
        for (idx, def) in defs.iter().enumerate() {
            match def {
                None => continue,
                Some(def) => {
                    assert!(matches!(this.entries[idx].kind, EntryKind::Empty));
                    this.entries[idx].kind = match def {
                        BarDefine::Pio(sz) => EntryKind::Pio(*sz),
                        BarDefine::Mmio(sz) => EntryKind::Mmio(*sz),
                        BarDefine::Mmio64(sz) => {
                            // Make sure 64-bit BAR definitions are playing by
                            // the rules
                            assert!(idx < (BarN::BAR5 as usize));
                            this.entries[idx + 1].kind = EntryKind::Mmio64High;
                            EntryKind::Mmio64(*sz)
                        }
                    }
                }
            }
        }

        this
    }
    /// Produce the 32-bit value read back from the register for `bar`.
    ///
    /// Empty slots read as zero.  For populated slots the stored value is
    /// combined with the matching `bits::BAR_TYPE_*` bits.  The high slot of a
    /// 64-bit BAR yields the upper 32 bits of the value stored in the slot
    /// before it.
    pub(super) fn reg_read(&self, bar: BarN) -> u32 {
        let slot = bar as usize;
        let entry = self.entries[slot];
        match entry.kind {
            EntryKind::Empty => 0,
            EntryKind::Pio(_) => {
                // Only the low 16 bits of a PIO BAR value are reported.
                bits::BAR_TYPE_IO | u32::from(entry.value as u16)
            }
            EntryKind::Mmio(_) => bits::BAR_TYPE_MEM | (entry.value as u32),
            EntryKind::Mmio64(_) => {
                bits::BAR_TYPE_MEM64 | (entry.value as u32)
            }
            EntryKind::Mmio64High => {
                assert_ne!(slot, 0);
                let low = self.entries[slot - 1];
                assert!(matches!(low.kind, EntryKind::Mmio64(_)));

                (low.value >> 32) as u32
            }
        }
    }
    /// Apply a 32-bit write of `val` to the register for `bar`.
    ///
    /// The written value is masked with `!(size - 1)` for the BAR's size
    /// before being stored.  A write to the high slot of a 64-bit BAR updates
    /// the upper 32 bits of the value held in the preceding (low) slot, and
    /// is reported against that low slot's `BarN`.
    ///
    /// Returns `Some(WriteResult)` when the stored value changed, and `None`
    /// when the slot is empty or the write left the value unchanged.
    pub(super) fn reg_write(
        &mut self,
        bar: BarN,
        val: u32,
    ) -> Option<WriteResult> {
        let idx = bar as usize;
        let kind = self.entries[idx].kind;
        let (id, def, val_old, val_new) = match kind {
            EntryKind::Empty => return None,
            EntryKind::Pio(size) => {
                let addr_mask = u32::from(!(size - 1));
                let slot = &mut self.entries[idx];
                let prev = slot.value;
                slot.value = u64::from(val & addr_mask);
                (bar, BarDefine::Pio(size), prev, slot.value)
            }
            EntryKind::Mmio(size) => {
                let addr_mask = !(size - 1);
                let slot = &mut self.entries[idx];
                let prev = slot.value;
                slot.value = u64::from(val & addr_mask);
                (bar, BarDefine::Mmio(size), prev, slot.value)
            }
            EntryKind::Mmio64(size) => {
                // Only the low word is written here; the high word of the
                // stored value is carried over untouched.
                let low_mask = !(size - 1) as u32;
                let slot = &mut self.entries[idx];
                let prev = slot.value;
                let low = u64::from(val & low_mask);
                slot.value = (prev & 0xffff_ffff_0000_0000) | low;
                (bar, BarDefine::Mmio64(size), prev, slot.value)
            }
            EntryKind::Mmio64High => {
                // The value lives in the preceding slot, which must be the
                // low half of a 64-bit BAR.
                assert!(idx > 0);
                let low_idx = idx - 1;
                let id = BarN::from_repr(low_idx as u8).unwrap();
                let slot = &mut self.entries[low_idx];
                let size = match slot.kind {
                    EntryKind::Mmio64(sz) => sz,
                    _ => panic!(),
                };
                let addr_mask = !(size - 1);
                let prev = slot.value;
                let high =
                    ((u64::from(val) << 32) & addr_mask) & 0xffffffff00000000;
                slot.value = high | (prev & 0xffffffff);
                (id, BarDefine::Mmio64(size), prev, slot.value)
            }
        };

        if val_old == val_new {
            None
        } else {
            Some(WriteResult { id, def, val_old, val_new })
        }
    }

    /// Get BAR definition and current value
    pub fn get(&self, n: BarN) -> Option<(BarDefine, u64)> {
        let ent = &self.entries[n as usize];
        let def = BarDefine::try_from(ent.kind).ok()?;
        Some((def, ent.value))
    }

    /// Set BAR value directly
    ///
    /// May only be called on BARs which are defined (not on empty BARs or the
    /// high portions of 64-bit MMIO BARs).  Furthermore, the value must be
    /// valid for the BAR type  (ie. not > u32::MAX for 32-bit MMIO BAR).
    ///
    /// The value is stored as given; unlike `reg_write`, no size-based
    /// masking is applied.
    ///
    /// # Panics
    ///
    /// Panics if `n` refers to an empty slot or to the high slot of a 64-bit
    /// MMIO BAR, or if `value` exceeds `u16::MAX` for a PIO BAR or `u32::MAX`
    /// for a 32-bit MMIO BAR.
    pub fn set(&mut self, n: BarN, value: u64) {
        let ent = &mut self.entries[n as usize];
        // Each arm only validates; the store itself happens once below so
        // that every BAR kind takes the same path.
        match ent.kind {
            EntryKind::Empty => panic!("{:?} not defined", n),
            EntryKind::Mmio64High => {
                panic!("high BAR bits not to be set directly")
            }
            EntryKind::Pio(_) => {
                assert!(value <= u64::from(u16::MAX));
            }
            EntryKind::Mmio(_) => {
                assert!(value <= u64::from(u32::MAX));
            }
            // Every u64 is representable in a 64-bit BAR.
            EntryKind::Mmio64(_) => {}
        }
        ent.value = value;
    }

    pub(super) fn export(&self) -> migrate::BarStateV1 {
        let entries = self.entries.map(|entry| match entry.kind {
            EntryKind::Pio(sz) => migrate::BarEntryV1 {
                kind: migrate::BarKindV1::Pio,
                size: u64::from(sz),
                value: entry.value,
            },
            EntryKind::Mmio(sz) => migrate::BarEntryV1 {
                kind: migrate::BarKindV1::Mmio,
                size: u64::from(sz),
                value: entry.value,
            },
            EntryKind::Mmio64(sz) => migrate::BarEntryV1 {
                kind: migrate::BarKindV1::Mmio64,
                size: sz,
                value: entry.value,
            },
            // We encode `Mmio64High` as Empty here because it is always implied
            // by a preceding `Mmio64` entry.
            EntryKind::Mmio64High | EntryKind::Empty => migrate::BarEntryV1 {
                kind: migrate::BarKindV1::Empty,
                size: 0,
                value: 0,
            },
        });
        migrate::BarStateV1 { entries }
    }

    pub(super) fn import(
        &mut self,
        input_bars: migrate::BarStateV1,
    ) -> Result<(), MigrateStateError> {
        let mut entries = self.entries.iter_mut();
        let mut input_entries = IntoIterator::into_iter(input_bars.entries);

        while let (Some(entry), Some(input_entry)) =
            (entries.next(), input_entries.next())
        {
            let sz = input_entry.size;
            entry.kind = match input_entry.kind {
                migrate::BarKindV1::Empty => EntryKind::Empty,
                migrate::BarKindV1::Pio => {
                    let sz = sz.try_into().map_err(|_| {
                        MigrateStateError::ImportFailed(format!(
                            "Pio Bar: invalid entry size ({})",
                            sz
                        ))
                    })?;
                    EntryKind::Pio(sz)
                }
                migrate::BarKindV1::Mmio => {
                    let sz = sz.try_into().map_err(|_| {
                        MigrateStateError::ImportFailed(format!(
                            "Mmio Bar: invalid entry size ({})",
                            sz
                        ))
                    })?;
                    EntryKind::Mmio(sz)
                }
                migrate::BarKindV1::Mmio64 => {
                    // An `Mmio64` already implies the next should be `Mmio64High` so
                    // the export logic just leaves the slot empty.
                    match input_entries.next() {
                        Some(e) if e.kind == migrate::BarKindV1::Empty => {
                            // Verify there's an available next slot that will be
                            // set to Mmio64High
                            let next_entry = entries
                                .next()
                                .ok_or_else(|| {
                                    MigrateStateError::ImportFailed(
                                        "Mmio64 Bar: last entry cannot be set as Mmio64"
                                        .to_string()
                                    )
                                })?;
                            next_entry.kind = EntryKind::Mmio64High;
                        }
                        _ => return Err(MigrateStateError::ImportFailed(
                            "Mmio64 Bar: expected empty entry for Mmio64High"
                                .to_string(),
                        )),
                    }
                    EntryKind::Mmio64(sz)
                }
            };
            entry.value = input_entry.value;
        }

        Ok(())
    }
}

/// Result from a write to a BAR
///
/// Produced by `Bars::reg_write` only when the write changed the stored
/// value.
pub struct WriteResult {
    /// Identifier of the actual impacted BAR.
    ///
    /// If write was to the high word of a 64-bit BAR, this would hold the
    /// `BarN` for the lower word.
    pub id: BarN,
    /// Definition (kind and size) of the impacted BAR.
    pub def: BarDefine,
    /// Stored BAR value before the write.
    pub val_old: u64,
    /// Stored BAR value after the write (post-masking).
    pub val_new: u64,
}

/// Serializable (serde) representation of BAR state, as produced by
/// `Bars::export` and consumed by `Bars::import`.
pub mod migrate {
    use serde::{Deserialize, Serialize};

    use super::BAR_COUNT;

    /// Kind of BAR recorded for a slot.
    ///
    /// There is no variant for the high half of a 64-bit BAR: export records
    /// that slot as `Empty`, and import infers it from the preceding `Mmio64`.
    #[derive(Eq, PartialEq, Deserialize, Serialize)]
    pub enum BarKindV1 {
        Empty,
        Pio,
        Mmio,
        Mmio64,
    }

    /// Exported state of a single BAR slot.
    #[derive(Deserialize, Serialize)]
    pub struct BarEntryV1 {
        pub kind: BarKindV1,
        /// BAR size, widened to `u64` regardless of kind (0 for `Empty`).
        pub size: u64,
        /// Stored BAR value at export time (0 for `Empty`).
        pub value: u64,
    }

    /// Exported state of all BAR slots, in slot order.
    #[derive(Deserialize, Serialize)]
    pub struct BarStateV1 {
        pub entries: [BarEntryV1; BAR_COUNT],
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Build a table covering every BAR kind: one PIO BAR, one 32-bit MMIO
    /// BAR, and two 64-bit MMIO BARs (the second larger than 4 GiB).
    fn setup() -> Bars {
        Bars::new(&[
            Some(BarDefine::Pio(0x100)),
            Some(BarDefine::Mmio(0x20000)),
            Some(BarDefine::Mmio64(0x40000)),
            None, // high bits
            Some(BarDefine::Mmio64(0x200000000)),
            None, // high bits
        ])
    }

    /// Assert that reading each BAR register, in slot order, yields the
    /// corresponding element of `expected`.
    fn assert_regs(bars: &Bars, expected: [u32; BAR_COUNT]) {
        for (idx, want) in expected.iter().enumerate() {
            let bar = BarN::from_repr(idx as u8).unwrap();
            assert_eq!(bars.reg_read(bar), *want);
        }
    }

    #[test]
    fn init() {
        let _ = setup();
    }

    /// Values set directly read back with the type bits for their BAR kind.
    #[test]
    fn read_type() {
        let mut bars = setup();
        bars.set(BarN::BAR0, 0x1000);
        bars.set(BarN::BAR1, 0xc000000);
        bars.set(BarN::BAR2, 0xd000000);
        bars.set(BarN::BAR4, 0x800000000);

        assert_regs(&bars, [0x1001, 0xc000000, 0xd000004, 0, 0x4, 0x8]);
    }

    /// Register writes (including to the high word of a 64-bit BAR) place the
    /// BARs at the same addresses as the direct sets in `read_type`.
    #[test]
    fn write_place() {
        let mut bars = setup();
        bars.reg_write(BarN::BAR0, 0x1000);
        bars.reg_write(BarN::BAR1, 0xc000000);
        bars.reg_write(BarN::BAR2, 0xd000000);
        bars.reg_write(BarN::BAR5, 0x8);
        bars.reg_write(BarN::BAR4, 0x0);

        assert_regs(&bars, [0x1001, 0xc000000, 0xd000004, 0, 0x4, 0x8]);
    }

    /// Writing all-ones to every register reads back the size mask of each
    /// BAR.
    #[test]
    fn limits() {
        let mut bars = setup();
        for bar in (0..=5u8).map(|i| BarN::from_repr(i).unwrap()) {
            bars.reg_write(bar, 0xffffffff);
        }

        assert_regs(
            &bars,
            [
                0x0000ff01, 0xfffe0000, 0xfffc0004, 0xffffffff, 0x00000004,
                0xfffffffe,
            ],
        );
    }
}


================================================
FILE: lib/propolis/src/hw/pci/bits.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! PCI config registers.

#![allow(unused)]

/// Length of a function's config space.
/// NOTE(review): presumably bytes, for the non-ECAM access path -- confirm
/// against users of this constant.
pub const LEN_CFG: usize = 0x100;
/// Length of the standard config header; the bridge header register map is
/// built with exactly this size.
pub const LEN_CFG_STD: usize = 0x40;
/// Length of a function's config space under ECAM (one more than
/// `MASK_ECAM_CFG_OFFSET`).
pub const LEN_CFG_ECAM: usize = 0x1000;

bitflags! {
    #[derive(Copy, Clone)]
    pub struct RegCmd: u16 {
    const IO_EN = 1 << 0;
    const MMIO_EN = 1 << 1;
    const BUSMSTR_EN = 1 << 2;
    const INTX_DIS = 1 << 10;
    }
}
impl RegCmd {
    pub fn reset(&mut self) {
        *self = RegCmd::default()
    }
}
impl Default for RegCmd {
    fn default() -> Self {
        RegCmd::INTX_DIS
    }
}

bitflags! {
    /// Bits of the PCI status register that are modeled here.  The derived
    /// default is the empty set.
    #[derive(Default)]
    pub struct RegStatus: u16 {
        const INTR_STATUS = 1 << 3;
        const CAP_LIST = 1 << 4;
    }
}

// Type bits OR'd into the low bits of a BAR register when it is read back
// (see `Bars::reg_read`): I/O space, 32-bit memory, and 64-bit memory.
pub const BAR_TYPE_IO: u32 = 0b01;
pub const BAR_TYPE_MEM: u32 = 0b000;
pub const BAR_TYPE_MEM64: u32 = 0b100;

// Capability IDs for MSI, vendor-specific, and MSI-X capabilities.
// NOTE(review): values presumably follow the PCI capability ID assignments --
// confirm against the spec.
pub const CAP_ID_MSI: u8 = 0x05;
pub const CAP_ID_VENDOR: u8 = 0x09;
pub const CAP_ID_MSIX: u8 = 0x11;

// Device class codes.  `CLASS_BRIDGE` has the same value as
// `BRIDGE_PROG_CLASS`, which the PCI-PCI bridge reports as its class.
pub const CLASS_UNCLASSIFIED: u8 = 0;
pub const CLASS_STORAGE: u8 = 1;
pub const CLASS_NETWORK: u8 = 2;
pub const CLASS_DISPLAY: u8 = 3;
pub const CLASS_MULTIMEDIA: u8 = 4;
pub const CLASS_MEMORY: u8 = 5;
pub const CLASS_BRIDGE: u8 = 6;

// Sub-classes under CLASS_STORAGE
pub const SUBCLASS_STORAGE_NVM: u8 = 8;

// Sub-classes under CLASS_BRIDGE
pub const SUBCLASS_BRIDGE_HOST: u8 = 0;
pub const SUBCLASS_BRIDGE_ISA: u8 = 1;
pub const SUBCLASS_BRIDGE_OTHER: u8 = 0x80;

// Values for the header type register; the PCI-PCI bridge reports
// `HEADER_TYPE_BRIDGE`.
// NOTE(review): `HEADER_TYPE_MULTIFUNC` looks like a flag (bit 7) meant to be
// OR'd with one of the other two -- confirm against users.
pub const HEADER_TYPE_DEVICE: u8 = 0b0;
pub const HEADER_TYPE_BRIDGE: u8 = 0b1;
pub const HEADER_TYPE_MULTIFUNC: u8 = 0b1000_0000;

// Programming Interfaces for SUBCLASS_STORAGE_NVM
pub const PROGIF_ENTERPRISE_NVME: u8 = 2;

// Masks covering the valid bits of a function (3 bits), device (5 bits) and
// bus (8 bits) number.
pub(super) const MASK_FUNC: u8 = 0x07;
pub(super) const MASK_DEV: u8 = 0x1f;
pub(super) const MASK_BUS: u8 = 0xff;

// I/O port numbers and access lengths for the PCI config address and data
// registers.
// NOTE(review): presumably the port-based (0xcf8/0xcfc) config access
// mechanism -- confirm against where these are registered.
pub const PORT_PCI_CONFIG_ADDR: u16 = 0xcf8;
pub const LEN_PCI_CONFIG_ADDR: u16 = 4;
pub const PORT_PCI_CONFIG_DATA: u16 = 0xcfc;
pub const LEN_PCI_CONFIG_DATA: u16 = 4;

/// The minimum number of buses a single ECAM region can address. The PCIe spec
/// requires that at least one bit of the ECAM address space be used to specify
/// a bus number (see PCIe base spec rev 5.0 table 7-1).
pub const PCIE_MIN_BUSES_PER_ECAM_REGION: u16 = 2;

/// The maximum number of buses a single ECAM region can address.
pub const PCIE_MAX_BUSES_PER_ECAM_REGION: u16 = 256;

/// Bitwise AND'ing an ECAM MMIO access address with this mask produces an
/// offset in bytes at which to access the target BDF's configuration region.
pub const MASK_ECAM_CFG_OFFSET: usize = 0xfff;

/// Class code identifiers required by SS3.2.4.6 of the PCI bridge spec rev 1.2.
pub const BRIDGE_PROG_CLASS: u8 = 0x06;
pub const BRIDGE_PROG_SUBCLASS: u8 = 0x04;
pub const BRIDGE_PROG_IF: u8 = 0x00;

/// Clear all reserved bits and decline to emulate error reporting bits in the
/// bridge secondary status register (SS3.2.5.7).
pub const BRIDGE_SECONDARY_STATUS: u16 = 0x0000;

/// Mask for the reserved bottom bits of the memory base and memory limit
/// registers (SS3.2.5.8).
pub const BRIDGE_MEMORY_REG_MASK: u16 = 0xfff0;


================================================
FILE: lib/propolis/src/hw/pci/bridge.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Support for PCI bridges.

use std::num::NonZeroU8;
use std::sync::{Arc, Mutex, Weak};

use super::bus::Attachment;
use super::cfgspace::{CfgBuilder, CfgReg};
use super::topology::{LogicalBusId, RoutedBusId, Topology};
use super::{bits::*, Endpoint, Ident};
use super::{BarN, BusNum, StdCfgReg};
use crate::common::{Lifecycle, RWOp, ReadOp, WriteOp};
use crate::migrate::Migrator;
use crate::util::regmap::RegMap;

use lazy_static::lazy_static;

// Bridge configuration space header registers.
//
// `Common` wraps registers shared with the standard (non-bridge) header; the
// remaining variants are specific to the bridge header layout laid out in
// `CFG_HEADER_MAP`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum BridgeReg {
    Common(StdCfgReg),
    PrimaryBus,
    SecondaryBus,
    SubordinateBus,
    SecondaryLatencyTimer,
    IoBase,
    IoLimit,
    SecondaryStatus,
    MemoryBase,
    MemoryLimit,
    PrefetchableMemoryBase,
    PrefetchableMemoryLimit,
    PrefetchableMemoryBaseUpper,
    PrefetchableMemoryLimitUpper,
    IoBaseUpper,
    IoLimitUpper,
    BridgeControl,
}

lazy_static! {
    // Register map for the bridge's 0x40-byte config header.
    //
    // Each tuple is (register, width in bytes); the widths sum to
    // `LEN_CFG_STD` (0x40).  The order of the entries defines the layout, so
    // they must not be reordered.  The offsets noted below assume
    // `create_packed` places entries back-to-back starting at offset 0.
    static ref CFG_HEADER_MAP: RegMap<BridgeReg> = {
        let layout = [
            // 0x00
            (BridgeReg::Common(StdCfgReg::VendorId), 2),
            (BridgeReg::Common(StdCfgReg::DeviceId), 2),
            // 0x04
            (BridgeReg::Common(StdCfgReg::Command), 2),
            (BridgeReg::Common(StdCfgReg::Status), 2),
            // 0x08
            (BridgeReg::Common(StdCfgReg::RevisionId), 1),
            (BridgeReg::Common(StdCfgReg::ProgIf), 1),
            (BridgeReg::Common(StdCfgReg::Subclass), 1),
            (BridgeReg::Common(StdCfgReg::Class), 1),
            // 0x0c
            (BridgeReg::Common(StdCfgReg::CacheLineSize), 1),
            (BridgeReg::Common(StdCfgReg::LatencyTimer), 1),
            (BridgeReg::Common(StdCfgReg::HeaderType), 1),
            (BridgeReg::Common(StdCfgReg::Bist), 1),
            // 0x10
            (BridgeReg::Common(StdCfgReg::Bar(BarN::BAR0)), 4),
            (BridgeReg::Common(StdCfgReg::Bar(BarN::BAR1)), 4),
            // 0x18
            (BridgeReg::PrimaryBus, 1),
            (BridgeReg::SecondaryBus, 1),
            (BridgeReg::SubordinateBus, 1),
            (BridgeReg::SecondaryLatencyTimer, 1),
            // 0x1c
            (BridgeReg::IoBase, 1),
            (BridgeReg::IoLimit, 1),
            (BridgeReg::SecondaryStatus, 2),
            // 0x20
            (BridgeReg::MemoryBase, 2),
            (BridgeReg::MemoryLimit, 2),
            // 0x24
            (BridgeReg::PrefetchableMemoryBase, 2),
            (BridgeReg::PrefetchableMemoryLimit, 2),
            // 0x28
            (BridgeReg::PrefetchableMemoryBaseUpper, 4),
            // 0x2c
            (BridgeReg::PrefetchableMemoryLimitUpper, 4),
            // 0x30
            (BridgeReg::IoBaseUpper, 2),
            (BridgeReg::IoLimitUpper, 2),
            // 0x34
            (BridgeReg::Common(StdCfgReg::CapPtr), 1),
            (BridgeReg::Common(StdCfgReg::Reserved), 3),
            // 0x38
            (BridgeReg::Common(StdCfgReg::ExpansionRomAddr), 4),
            // 0x3c
            (BridgeReg::Common(StdCfgReg::IntrLine), 1),
            (BridgeReg::Common(StdCfgReg::IntrPin), 1),
            (BridgeReg::BridgeControl, 2),
        ];
        // NOTE(review): the final argument is presumably the register used
        // for any space not covered by `layout` -- confirm in `RegMap`.
        RegMap::create_packed(
            LEN_CFG_STD,
            &layout,
            Some(BridgeReg::Common(StdCfgReg::Reserved)),
        )
    };
}

/// A PCI-PCI bridge.
///
/// The bridge exposes a bridge-type config header with read-only (zero) BARs,
/// and writable command, bus-number and memory base/limit registers.
pub struct Bridge {
    // Identity values (vendor/device IDs, class codes) reported through the
    // config header.
    ident: Ident,

    // The common PCI state has its own synchronization. Accesses to it are
    // currently mutually exclusive with accesses to the bridge state (i.e. no
    // single config transaction is expected to access both common state and
    // bridge state).
    cfg_map: RegMap<CfgReg>,
    // Mutable register state and the bridge's link to the bus topology.
    inner: Mutex<Inner>,
}

impl Bridge {
    /// Construct a new PCI bridge with the supplied downstream bus. Updating
    /// the bridge's secondary bus number will update the supplied router such
    /// that it maps the new bus number to the bridge's downstream bus.
    ///
    /// `vendor` and `device` are reported as both the primary and subsystem
    /// IDs; the class, subclass and programming interface are fixed to the
    /// `BRIDGE_PROG_*` values.
    pub fn new(
        vendor: u16,
        device: u16,
        topology: &Arc<Topology>,
        downstream_bus_id: LogicalBusId,
    ) -> Arc<Self> {
        // The bridge adds nothing to the config space builder, so its map is
        // whatever an untouched builder produces.
        let cfg_map = CfgBuilder::new().finish().0;
        let ident = Ident {
            vendor_id: vendor,
            device_id: device,
            sub_vendor_id: vendor,
            sub_device_id: device,
            device_class: BRIDGE_PROG_CLASS,
            device_subclass: BRIDGE_PROG_SUBCLASS,
            prog_if: BRIDGE_PROG_IF,
            ..Default::default()
        };
        let inner = Mutex::new(Inner::new(topology, downstream_bus_id));

        Arc::new(Self { ident, cfg_map, inner })
    }

    /// Dispatch an access to the bridge header through `CFG_HEADER_MAP`,
    /// routing each register touched to the read or write handler.
    fn cfg_header_rw(&self, mut rwo: RWOp) {
        CFG_HEADER_MAP.process(&mut rwo, |reg, op| {
            match op {
                RWOp::Read(ro) => self.cfg_std_read(reg, ro),
                RWOp::Write(wo) => self.cfg_std_write(reg, wo),
            }
        })
    }

    fn cfg_std_read(&self, id: &BridgeReg, ro: &mut ReadOp) {
        match id {
            BridgeReg::Common(id) => match id {
                StdCfgReg::VendorId => ro.write_u16(self.ident.vendor_id),
                StdCfgReg::DeviceId => ro.write_u16(self.ident.device_id),
                StdCfgReg::Class => ro.write_u8(self.ident.device_class),
                StdCfgReg::Subclass => ro.write_u8(self.ident.device_subclass),
                StdCfgReg::SubVendorId => {
                    ro.write_u16(self.ident.sub_vendor_id)
                }
                StdCfgReg::SubDeviceId => {
                    ro.write_u16(self.ident.sub_device_id)
                }
                StdCfgReg::ProgIf => ro.write_u8(self.ident.prog_if),
                StdCfgReg::RevisionId => ro.write_u8(0),
                StdCfgReg::HeaderType => ro.write_u8(HEADER_TYPE_BRIDGE),
                StdCfgReg::Reserved => ro.fill(0),
                StdCfgReg::Command => {
                    let guard = self.inner.lock().unwrap();
                    ro.write_u16(guard.reg_command.bits());
                }

                // The bridge never generates its own interrupts and currently
                // has no capabilities, so set both of those bits to 0.
                StdCfgReg::Status => ro.write_u16(0),

                // Disable interrupts from the bridge device itself (SS3.2.5.16
                // and 17).
                StdCfgReg::IntrLine => ro.write_u8(0xFF),
                StdCfgReg::IntrPin => ro.write_u8(0),

                // The bridge has no internal resources, so disable its BARs.
                // This doesn't affect transactions that cross the bridge
                // (SS3.2.5.1).
                StdCfgReg::Bar(_) => ro.write_u32(0),

                // Expansion ROMs are not supported.
                StdCfgReg::ExpansionRomAddr => ro.write_u32(0),

                // No capabilities for now.
                StdCfgReg::CapPtr => ro.write_u8(0),

                // Other registers defined to be optional in SS3.2.4.
                StdCfgReg::CacheLineSize => ro.write_u8(0),
                StdCfgReg::LatencyTimer => ro.write_u8(0),
                StdCfgReg::Bist => ro.write_u8(0),

                // These registers appear in type-0 PCI headers, but not bridge
                // headers.
                StdCfgReg::MaxLatency
                | StdCfgReg::MinGrant
                | StdCfgReg::CardbusPtr => {
                    panic!("Unexpected register type {:?}", id);
                }
            },
            BridgeReg::PrimaryBus => {
                let guard = self.inner.lock().unwrap();
                ro.write_u8(guard.primary_bus.get());
            }
            BridgeReg::SecondaryBus => {
                let guard = self.inner.lock().unwrap();
                ro.write_u8(guard.secondary_bus.get());
            }
            BridgeReg::SubordinateBus => {
                let guard = self.inner.lock().unwrap();
                ro.write_u8(guard.subordinate_bus.get());
            }
            BridgeReg::SecondaryLatencyTimer => ro.write_u8(0),
            BridgeReg::IoBase | BridgeReg::IoLimit => ro.write_u8(0),
            BridgeReg::SecondaryStatus => ro.write_u16(BRIDGE_SECONDARY_STATUS),
            BridgeReg::MemoryBase => {
                let guard = self.inner.lock().unwrap();
                ro.write_u16(guard.memory_base & BRIDGE_MEMORY_REG_MASK);
            }
            BridgeReg::MemoryLimit => {
                let guard = self.inner.lock().unwrap();
                ro.write_u16(guard.memory_limit & BRIDGE_MEMORY_REG_MASK);
            }
            BridgeReg::PrefetchableMemoryBase
            | BridgeReg::PrefetchableMemoryLimit => ro.write_u16(0),
            BridgeReg::PrefetchableMemoryBaseUpper
            | BridgeReg::PrefetchableMemoryLimitUpper => ro.write_u32(0),
            BridgeReg::IoBaseUpper | BridgeReg::IoLimitUpper => ro.write_u16(0),
            BridgeReg::BridgeControl => ro.write_u16(0),
        }
    }

    /// Service a write to one bridge header register.
    ///
    /// Only the command register, the three bus-number registers and the
    /// memory base/limit registers accept writes; writes to every other
    /// register are ignored.  Updating the secondary bus number goes through
    /// `Inner::set_secondary_bus`.
    ///
    /// # Panics
    ///
    /// Panics if asked for a register that exists only in type-0 headers.
    fn cfg_std_write(&self, id: &BridgeReg, wo: &mut WriteOp) {
        match id {
            BridgeReg::Common(std_reg) => match std_reg {
                StdCfgReg::Command => {
                    let requested = RegCmd::from_bits_truncate(wo.read_u16());
                    self.inner.lock().unwrap().reg_command = requested;
                }

                // These registers appear in type-0 PCI headers, but not bridge
                // headers.
                StdCfgReg::MaxLatency
                | StdCfgReg::MinGrant
                | StdCfgReg::CardbusPtr => {
                    panic!("Unexpected register type {:?}", std_reg);
                }

                // Ignore writes to read-only standard registers.
                StdCfgReg::VendorId
                | StdCfgReg::DeviceId
                | StdCfgReg::Class
                | StdCfgReg::Subclass
                | StdCfgReg::SubVendorId
                | StdCfgReg::SubDeviceId
                | StdCfgReg::ProgIf
                | StdCfgReg::RevisionId
                | StdCfgReg::HeaderType
                | StdCfgReg::CapPtr
                | StdCfgReg::CacheLineSize
                | StdCfgReg::LatencyTimer
                | StdCfgReg::Bist
                | StdCfgReg::Reserved => {}

                // Writes to the status register are supported to clear certain
                // error bits, but the virtual bridge doesn't report these
                // errors, so just ignore writes to this register.
                StdCfgReg::Status => {}

                // The bridge has no interrupt signal pin, so treat its line
                // register as read-only.
                StdCfgReg::IntrLine | StdCfgReg::IntrPin => {}

                // The bridge has no internal resources, so disable its BARs.
                // This doesn't affect transactions that cross the bridge
                // (SS3.2.5.1).  Expansion ROMs are not supported.
                StdCfgReg::Bar(_) | StdCfgReg::ExpansionRomAddr => {}
            },

            // Writable bridge registers.
            BridgeReg::PrimaryBus => {
                let mut inner = self.inner.lock().unwrap();
                inner.primary_bus = BusNum::new(wo.read_u8());
            }
            BridgeReg::SecondaryBus => {
                let mut inner = self.inner.lock().unwrap();
                inner.set_secondary_bus(BusNum::new(wo.read_u8()));
            }
            BridgeReg::SubordinateBus => {
                let mut inner = self.inner.lock().unwrap();
                inner.subordinate_bus = BusNum::new(wo.read_u8());
            }
            BridgeReg::MemoryBase => {
                let mut inner = self.inner.lock().unwrap();
                inner.memory_base = wo.read_u16();
            }
            BridgeReg::MemoryLimit => {
                let mut inner = self.inner.lock().unwrap();
                inner.memory_limit = wo.read_u16();
            }

            // Read-only bridge registers.
            BridgeReg::SecondaryLatencyTimer
            | BridgeReg::IoBase
            | BridgeReg::IoLimit
            | BridgeReg::SecondaryStatus
            | BridgeReg::PrefetchableMemoryBase
            | BridgeReg::PrefetchableMemoryLimit
            | BridgeReg::PrefetchableMemoryBaseUpper
            | BridgeReg::PrefetchableMemoryLimitUpper
            | BridgeReg::IoBaseUpper
            | BridgeReg::IoLimitUpper
            | BridgeReg::BridgeControl => {}
        }
    }
}

impl Endpoint for Bridge {
    /// Record the bus attachment for this bridge.
    ///
    /// Panics if the bridge has already been attached.
    fn attach(&self, attachment: Attachment) {
        let mut inner = self.inner.lock().unwrap();
        let previous = inner.attachment.replace(attachment);
        assert!(previous.is_none());
    }

    /// Handle a config space access.  Only the standard header region is
    /// expected; anything else panics.
    fn cfg_rw(&self, mut rwo: RWOp) {
        self.cfg_map.process(&mut rwo, |id, rwo| {
            if let CfgReg::Std = id {
                self.cfg_header_rw(rwo);
            } else {
                panic!(
                    "Unexpected read of bridge config space with ID {:?}",
                    id
                )
            }
        });
    }

    /// Always panics.
    fn bar_rw(&self, _bar: BarN, _rwo: RWOp) {
        // Bridges don't consume any additional I/O or memory space that would
        // be described in a BAR (and indeed their BARs are read-only), so this
        // routine should never be reached.
        panic!("unexpected BAR-defined region I/O in PCI bridge");
    }
}

impl Lifecycle for Bridge {
    /// Name reported for this device type.
    fn type_name(&self) -> &'static str {
        "pci-bridge"
    }

    /// Reset the bridge's register state under its lock.
    fn reset(&self) {
        let mut inner = self.inner.lock().unwrap();
        inner.reset();
    }

    /// Bridges are currently reported as non-migratable.
    fn migrate(&self) -> Migrator<'_> {
        // TODO Should be migratable in theory: copy all the register state,
        // then enumerate bridges on the target and reconstruct the routing
        // table from their bus registers' values.
        Migrator::NonMigratable
    }
}

// Mutable bridge state, guarded by the mutex in `Bridge`.
struct Inner {
    // Set once by `Endpoint::attach`; `None` until the bridge is attached.
    attachment: Option<Attachment>,

    // This reference must be weak to avoid a topology -> bus -> attached
    // bridge -> topology reference cycle.
    topology: Weak<Topology>,
    // Logical bus that the secondary bus number is routed to.
    downstream_bus_id: LogicalBusId,

    // Register values written through the config header.
    reg_command: RegCmd,
    primary_bus: BusNum,
    secondary_bus: BusNum,
    subordinate_bus: BusNum,
    memory_base: u16,
    memory_limit: u16,
}

impl Inner {
    /// Creates the initial (power-on) register state for a bridge whose
    /// downstream side is the logical bus `downstream_bus_id`.
    ///
    /// Only a weak reference to `topology` is retained.
    fn new(topology: &Arc<Topology>, downstream_bus_id: LogicalBusId) -> Self {
        Self {
            attachment: None,
            topology: Arc::downgrade(topology),
            downstream_bus_id,
            reg_command: RegCmd::empty(),
            primary_bus: BusNum::new(0),
            secondary_bus: BusNum::new(0),
            subordinate_bus: BusNum::new(0),
            memory_base: 0,
            memory_limit: 0,
        }
    }

    /// Updates the secondary bus number and keeps the topology's routing
    /// table in sync with it.
    ///
    /// Any route registered for the previous (non-zero) secondary bus number
    /// is removed, and a route from the new number to this bridge's downstream
    /// logical bus is installed if the new number is non-zero. A value of 0
    /// therefore means "no downstream bus routed".
    ///
    /// If the topology has already been dropped, only the local register
    /// value is updated.
    fn set_secondary_bus(&mut self, n: BusNum) {
        let topology = self.topology.upgrade();
        if let Some(bus) = NonZeroU8::new(self.secondary_bus.get()) {
            if let Some(topology) = &topology {
                topology.set_bus_route(RoutedBusId(bus.get()), None);
            }
        }
        self.secondary_bus = n;
        if let Some(bus) = NonZeroU8::new(self.secondary_bus.get()) {
            if let Some(topology) = &topology {
                topology.set_bus_route(
                    RoutedBusId(bus.get()),
                    Some(self.downstream_bus_id),
                );
            }
        }
    }

    /// Returns all register state to the values established by [`Inner::new`].
    ///
    /// The bus attachment, topology reference and downstream bus ID are
    /// properties of the device's placement rather than register state, so
    /// they are left untouched.
    fn reset(&mut self) {
        // The command register must be cleared as well; otherwise the
        // enable bits programmed by the guest would survive a reset.
        self.reg_command = RegCmd::empty();
        self.primary_bus = BusNum::new(0);
        // Goes through the setter so that the stale route is torn down.
        self.set_secondary_bus(BusNum::new(0));
        self.subordinate_bus = BusNum::new(0);
        self.memory_base = 0;
        self.memory_limit = 0;
    }
}

#[cfg(test)]
mod test {
    use crate::hw::ids;
    use crate::hw::pci::topology::{
        BridgeDescription, Builder, FinishedTopology, LogicalBusId,
    };
    use crate::hw::pci::{Bdf, Endpoint};
    use crate::vmm::Machine;

    use super::*;

    // Byte offsets of the configuration header registers these tests access.
    const OFFSET_VENDOR_ID: usize = 0x00;
    const OFFSET_DEVICE_ID: usize = 0x02;
    const OFFSET_HEADER_TYPE: usize = 0x0E;
    const OFFSET_SECONDARY_BUS: usize = 0x19;

    /// Test fixture: a PCI topology built on a test machine.
    struct Env {
        // Held only to keep the machine alive for the fixture's lifetime.
        _machine: Machine,
        topology: Arc<Topology>,
    }

    impl Env {
        /// Builds a topology containing the given bridges (if any).
        fn new(bridges: Option<Vec<BridgeDescription>>) -> Self {
            let mut builder = Builder::new();
            if let Some(bridges) = bridges {
                for bridge in bridges {
                    builder.add_bridge(bridge).unwrap();
                }
            }

            let machine = Machine::new_test().unwrap();
            let FinishedTopology { topology, bridges: _bridges } =
                builder.finish(&machine).unwrap();
            Self { _machine: machine, topology }
        }

        /// Creates a standalone bridge that is not attached to any bus in the
        /// topology.
        fn make_bridge(&self) -> Arc<Bridge> {
            Bridge::new(
                ids::pci::VENDOR_OXIDE,
                ids::pci::PROPOLIS_BRIDGE_DEV_ID,
                &self.topology,
                LogicalBusId(0xFF),
            )
        }

        /// Reads one configuration byte from `target` through the topology's
        /// routing. The buffer starts zeroed, so the result is 0 unless the
        /// access writes something into it.
        fn read_header_byte(&self, target: Bdf, offset: usize) -> u8 {
            let mut buf = [0u8; 1];
            let mut ro = ReadOp::from_buf(offset, &mut buf);
            self.topology.pci_cfg_rw(
                RoutedBusId(target.bus.get()),
                target.location,
                RWOp::Read(&mut ro),
            );
            buf[0]
        }

        fn read_header_type(&self, target: Bdf) -> u8 {
            self.read_header_byte(target, OFFSET_HEADER_TYPE)
        }

        fn read_secondary_bus(&self, target: Bdf) -> u8 {
            self.read_header_byte(target, OFFSET_SECONDARY_BUS)
        }

        /// Writes one configuration byte to `target` through the topology's
        /// routing.
        fn write_header_byte(&self, target: Bdf, offset: usize, val: u8) {
            let mut buf = [val; 1];
            let mut wo = WriteOp::from_buf(offset, &mut buf);
            self.topology.pci_cfg_rw(
                RoutedBusId(target.bus.get()),
                target.location,
                RWOp::Write(&mut wo),
            );
        }

        fn write_secondary_bus(&self, target: Bdf, val: u8) {
            self.write_header_byte(target, OFFSET_SECONDARY_BUS, val);
        }
    }

    /// A freshly created bridge reports the bridge header type and the
    /// vendor/device IDs it was constructed with.
    #[test]
    fn bridge_properties() {
        let env = Env::new(None);
        let bridge = env.make_bridge();
        // Buffers are pre-filled with 0xff so a read that leaves them
        // untouched cannot pass by accident.
        let mut buf = [0xffu8; 1];
        let mut ro = ReadOp::from_buf(OFFSET_HEADER_TYPE, &mut buf);
        Endpoint::cfg_rw(bridge.as_ref(), RWOp::Read(&mut ro));
        assert_eq!(buf[0], HEADER_TYPE_BRIDGE);

        let mut buf = [0xffu8; 2];
        let mut ro = ReadOp::from_buf(OFFSET_VENDOR_ID, &mut buf);
        Endpoint::cfg_rw(bridge.as_ref(), RWOp::Read(&mut ro));
        assert_eq!(u16::from_le_bytes(buf), ids::pci::VENDOR_OXIDE);

        let mut buf = [0xffu8; 2];
        let mut ro = ReadOp::from_buf(OFFSET_DEVICE_ID, &mut buf);
        Endpoint::cfg_rw(bridge.as_ref(), RWOp::Read(&mut ro));
        assert_eq!(u16::from_le_bytes(buf), ids::pci::PROPOLIS_BRIDGE_DEV_ID);
    }

    /// Programming and clearing secondary bus numbers adds and removes routes
    /// to the bridges' downstream buses.
    #[test]
    fn bridge_routing() {
        // Two bridges on bus 0 (0.1.0 and 0.2.0), each with one child bridge
        // at device 1 on its downstream logical bus.
        let env = Env::new(Some(vec![
            BridgeDescription::new(LogicalBusId(1), Bdf::new(0, 1, 0).unwrap()),
            BridgeDescription::new(LogicalBusId(2), Bdf::new(0, 2, 0).unwrap()),
            BridgeDescription::new(LogicalBusId(3), Bdf::new(1, 1, 0).unwrap()),
            BridgeDescription::new(LogicalBusId(4), Bdf::new(2, 1, 0).unwrap()),
        ]));

        // Set the first test bridge's downstream bus to 81, then verify that
        // 81.1.0 is a valid bridge device.
        env.write_secondary_bus(Bdf::new(0, 1, 0).unwrap(), 81);
        assert_eq!(
            env.read_header_type(Bdf::new(81, 1, 0).unwrap()),
            HEADER_TYPE_BRIDGE
        );

        // Write bus 83 to the newly-connected downstream bridge's secondary
        // bus number to distinguish it from the other, uninitialized bridge.
        env.write_secondary_bus(Bdf::new(81, 1, 0).unwrap(), 83);

        // Clear the test bridge's secondary bus register and verify the routing
        // is removed.
        env.write_secondary_bus(Bdf::new(0, 1, 0).unwrap(), 0);
        assert_eq!(env.read_secondary_bus(Bdf::new(81, 1, 0).unwrap()), 0);

        // Set the second parent bridge's downstream bus to 81. The downstream
        // bridge's secondary bus should not be set.
        env.write_secondary_bus(Bdf::new(0, 2, 0).unwrap(), 81);
        assert_eq!(
            env.read_header_type(Bdf::new(81, 1, 0).unwrap()),
            HEADER_TYPE_BRIDGE
        );
        assert_eq!(env.read_secondary_bus(Bdf::new(81, 1, 0).unwrap()), 0);

        // Route the first parent bridge to downstream bus 82 and verify the
        // child bridge with bus 83 is still there.
        env.write_secondary_bus(Bdf::new(0, 1, 0).unwrap(), 82);
        assert_eq!(env.read_secondary_bus(Bdf::new(82, 1, 0).unwrap()), 83);

        // Clear the second bridge's routing and verify that the first bridge's
        // routing is left alone.
        env.write_secondary_bus(Bdf::new(0, 2, 0).unwrap(), 0);
        assert_eq!(env.read_secondary_bus(Bdf::new(82, 1, 0).unwrap()), 83);

        // Clear the first bridge's routing and verify that its entry is also
        // safely removed.
        env.write_secondary_bus(Bdf::new(0, 1, 0).unwrap(), 0);
        assert_eq!(env.read_secondary_bus(Bdf::new(82, 1, 0).unwrap()), 0);
    }
}


================================================
FILE: lib/propolis/src/hw/pci/bus.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::BTreeMap;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Mutex, Weak};

use super::bar::BarDefine;
use super::{BarN, BusLocation, Endpoint, LintrCfg};
use crate::accessors::*;
use crate::common::RWOp;
use crate::mmio::{MmioBus, MmioFn};
use crate::pio::{PioBus, PioFn};

/// A single PCI bus: the endpoints attached to its slots plus the BAR
/// registrations made on their behalf.
pub struct Bus {
    // Shared so that each `Attachment` can hold a weak reference back to the
    // bus state.
    inner: Arc<Mutex<Inner>>,
}

impl Bus {
    /// Creates an empty bus.
    ///
    /// BARs registered by attached devices are forwarded to `pio` and `mmio`;
    /// `acc_mem` and `acc_msi` are the parents of the accessors handed to each
    /// attached device.
    pub fn new(
        pio: &Arc<PioBus>,
        mmio: &Arc<MmioBus>,
        acc_mem: MemAccessor,
        acc_msi: MsiAccessor,
    ) -> Self {
        let state = Inner::new(pio, mmio, acc_mem, acc_msi);
        Self { inner: Arc::new(Mutex::new(state)) }
    }

    /// Places `dev` at `location` and hands it an [`Attachment`] describing
    /// its position on the bus.
    ///
    /// # Panics
    ///
    /// Panics if `location` is already occupied.
    pub fn attach(
        &self,
        location: BusLocation,
        dev: Arc<dyn Endpoint>,
        lintr_cfg: Option<LintrCfg>,
    ) {
        // The bus lock stays held until the device's own attach hook returns.
        let mut guard = self.inner.lock().unwrap();
        let (slot_state, acc_msi, acc_mem) =
            guard.attach(location, Arc::clone(&dev));

        dev.attach(Attachment {
            inner: Arc::downgrade(&self.inner),
            location,
            lintr_cfg,
            slot_state,
            acc_msi,
            acc_mem,
        });
    }

    /// Returns the endpoint attached at `location`, if there is one.
    pub fn device_at(
        &self,
        location: BusLocation,
    ) -> Option<Arc<dyn Endpoint>> {
        self.inner.lock().unwrap().device_at(location)
    }
}

/// Handle given to an endpoint when it is attached to a [`Bus`]. It records
/// where the endpoint sits and lets it register BARs with the bus.
pub struct Attachment {
    // Weak so that an attachment held by a device does not keep the bus
    // alive; BAR (un)registration is a no-op once the bus is gone.
    inner: Weak<Mutex<Inner>>,
    location: BusLocation,
    lintr_cfg: Option<LintrCfg>,
    // Shared with every other function attached in the same slot.
    slot_state: Arc<SlotState>,
    // Children of the bus-level accessors, created at attach time.
    pub acc_msi: MsiAccessor,
    pub acc_mem: MemAccessor,
}
impl Attachment {
    /// Registers BAR `n` of this endpoint, defined by `def`, at address
    /// `addr`. Does nothing if the bus no longer exists.
    pub fn bar_register(&self, n: BarN, def: BarDefine, addr: u64) {
        if let Some(bus) = self.inner.upgrade() {
            bus.lock().unwrap().bar_register(self.location, n, def, addr);
        }
    }

    /// Removes the registration for BAR `n` of this endpoint. Does nothing if
    /// the bus no longer exists.
    pub fn bar_unregister(&self, n: BarN) {
        if let Some(bus) = self.inner.upgrade() {
            bus.lock().unwrap().bar_unregister(self.location, n);
        }
    }

    /// Legacy interrupt configuration supplied at attach time, if any.
    pub fn lintr_cfg(&self) -> Option<&LintrCfg> {
        self.lintr_cfg.as_ref()
    }

    /// Slot/function at which the endpoint is attached.
    pub fn location(&self) -> BusLocation {
        self.location
    }

    /// Whether more than one function has been attached to this endpoint's
    /// slot.
    pub fn is_multifunc(&self) -> bool {
        let flag = &self.slot_state.is_multifunc;
        flag.load(Ordering::Acquire)
    }
}

/// Per-slot state shared (via `Arc`) between the bus and the attachments of
/// every function in that slot.
#[derive(Default)]
struct SlotState {
    // Set once a second function is attached to the slot; never cleared by
    // the code in this file.
    is_multifunc: AtomicBool,
}

/// Number of device slots on a bus; sizes the `Inner::slots` array.
const SLOTS_PER_BUS: usize = 32;
/// Number of functions per slot; sizes the `Slot::funcs` array.
const FUNCS_PER_SLOT: usize = 8;

/// One device slot on the bus.
#[derive(Default)]
struct Slot {
    // Attached endpoints, indexed by function number.
    funcs: [Option<Arc<dyn Endpoint>>; FUNCS_PER_SLOT],
    // Handed out (cloned) to every attachment made in this slot.
    state: Arc<SlotState>,
}
impl Slot {
    /// Installs `dev` as the function named by `location.func` and returns
    /// the slot's shared state.
    ///
    /// # Panics
    ///
    /// Panics if that function is already occupied.
    fn attach(
        &mut self,
        location: BusLocation,
        dev: Arc<dyn Endpoint>,
    ) -> Arc<SlotState> {
        let func_idx = location.func.get() as usize;
        let _old = self.funcs[func_idx].replace(dev);

        // XXX be strict for now
        assert!(matches!(_old, None));

        // Keep multi-func state updated: the flag is only ever raised, and
        // only once a second function is present.
        let multifunc = &self.state.is_multifunc;
        if !multifunc.load(Ordering::Acquire)
            && self.funcs.iter().flatten().count() > 1
        {
            multifunc.store(true, Ordering::Release);
        }
        Arc::clone(&self.state)
    }
}

/// Bookkeeping for one registered BAR.
struct BarState {
    // Kind and size of the BAR as supplied at registration.
    def: BarDefine,
    // Address the BAR was registered at.
    value: u64,
    // Whether registration with the PIO/MMIO bus succeeded. When false there
    // is nothing to undo at unregistration time.
    live: bool,
}

/// Lock-protected state of a [`Bus`].
struct Inner {
    // Device slots, indexed by device number.
    slots: [Slot; SLOTS_PER_BUS],
    // Currently registered BARs, keyed by owning location and BAR number.
    bar_state: BTreeMap<(BusLocation, BarN), BarState>,
    // Weak handles to the buses that BAR handlers are registered with.
    bus_pio: Weak<PioBus>,
    bus_mmio: Weak<MmioBus>,

    // Parents of the per-device accessors created in `attach`.
    acc_msi: MsiAccessor,
    acc_mem: MemAccessor,
}
impl Inner {
    /// Creates the state for an empty bus, retaining only weak references to
    /// the port-I/O and MMIO buses.
    fn new(
        pio: &Arc<PioBus>,
        mmio: &Arc<MmioBus>,
        acc_mem: MemAccessor,
        acc_msi: MsiAccessor,
    ) -> Self {
        let bus_pio = Arc::downgrade(pio);
        let bus_mmio = Arc::downgrade(mmio);
        Self {
            slots: Default::default(),
            bar_state: BTreeMap::new(),
            bus_pio,
            bus_mmio,

            acc_msi,
            acc_mem,
        }
    }

    /// Returns the endpoint attached at `location`, if any.
    fn device_at(&self, location: BusLocation) -> Option<Arc<dyn Endpoint>> {
        let slot = &self.slots[location.dev.get() as usize];
        slot.funcs[location.func.get() as usize].clone()
    }

    /// Installs `dev` in its slot and creates child accessors named after its
    /// location.
    ///
    /// # Panics
    ///
    /// Panics if `location` is already occupied.
    fn attach(
        &mut self,
        location: BusLocation,
        dev: Arc<dyn Endpoint>,
    ) -> (Arc<SlotState>, MsiAccessor, MemAccessor) {
        let slot = &mut self.slots[location.dev.get() as usize];
        let slot_state = slot.attach(location, dev);

        let acc_name = format!(
            "PCI dev:{} func:{}",
            location.dev.get(),
            location.func.get()
        );
        let acc_msi = self.acc_msi.child(Some(acc_name.clone()));
        let acc_mem = self.acc_mem.child(Some(acc_name));
        (slot_state, acc_msi, acc_mem)
    }

    /// Registers an MMIO handler (start `base`, size `len`) that forwards
    /// accesses to BAR `n` of `dev`. Returns `false` if the MMIO bus is gone
    /// or the registration is rejected.
    fn register_mmio(
        &self,
        dev: Arc<dyn Endpoint>,
        n: BarN,
        base: usize,
        len: usize,
    ) -> bool {
        match self.bus_mmio.upgrade() {
            Some(mmio) => {
                let func = Arc::new(move |_addr: usize, rwo: RWOp| {
                    dev.bar_rw(n, rwo)
                }) as Arc<MmioFn>;
                mmio.register(base, len, func).is_ok()
            }
            None => false,
        }
    }

    /// Registers BAR `n` of the endpoint at `location` at address `value`.
    ///
    /// The registration is recorded even when the underlying bus refuses it;
    /// in that case it is marked as not live.
    ///
    /// # Panics
    ///
    /// Panics if no endpoint is attached at `location`, or if the BAR is
    /// already registered.
    fn bar_register(
        &mut self,
        location: BusLocation,
        n: BarN,
        def: BarDefine,
        value: u64,
    ) {
        let dev = self.device_at(location).unwrap();

        let live = match def {
            BarDefine::Pio(sz) => match self.bus_pio.upgrade() {
                Some(pio) => {
                    let func = Arc::new(move |_port: u16, rwo: RWOp| {
                        dev.bar_rw(n, rwo)
                    }) as Arc<PioFn>;
                    pio.register(value as u16, sz, func).is_ok()
                }
                None => false,
            },
            BarDefine::Mmio(sz) => {
                self.register_mmio(dev, n, value as usize, sz as usize)
            }
            BarDefine::Mmio64(sz) => {
                self.register_mmio(dev, n, value as usize, sz as usize)
            }
        };
        let _old =
            self.bar_state.insert((location, n), BarState { def, value, live });
        // XXX be strict for now
        assert!(_old.is_none());
    }

    /// Forgets BAR `n` of the endpoint at `location` and, if it was live,
    /// removes its handler from the PIO or MMIO bus. Unknown BARs are ignored.
    fn bar_unregister(&mut self, location: BusLocation, n: BarN) {
        let state = match self.bar_state.remove(&(location, n)) {
            // when BAR was registered, it conflicted with something else on
            // the bus, so no further action is necessary
            Some(state) if state.live => state,
            _ => return,
        };
        match state.def {
            BarDefine::Pio(_) => {
                if let Some(pio) = self.bus_pio.upgrade() {
                    pio.unregister(state.value as u16).unwrap();
                }
            }
            BarDefine::Mmio(_) | BarDefine::Mmio64(_) => {
                if let Some(mmio) = self.bus_mmio.upgrade() {
                    mmio.unregister(state.value as usize).unwrap();
                }
            }
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::hw::pci::test::Scaffold;

    /// Minimal endpoint that only remembers the attachment it was given.
    #[derive(Default)]
    struct TestDev {
        inner: Mutex<Option<Attachment>>,
    }
    impl Endpoint for TestDev {
        fn attach(&self, attachment: Attachment) {
            let mut attach = self.inner.lock().unwrap();
            attach.replace(attachment);
        }
        fn cfg_rw(&self, _op: RWOp) {}
        fn bar_rw(&self, _bar: BarN, _rwo: RWOp) {}
    }
    impl TestDev {
        /// Returns the multi-function flag seen through the attachment, or
        /// `None` if the device has not been attached.
        fn check_multifunc(&self) -> Option<bool> {
            self.inner.lock().unwrap().as_ref().map(Attachment::is_multifunc)
        }
    }

    /// A freshly created bus has no device at any slot/function.
    #[test]
    fn empty() {
        let scaffold = Scaffold::new();
        let bus = scaffold.create_bus();

        // Walk every slot and function the bus can hold, including the last
        // of each (slot 31 and function 7).
        for slot in 0..SLOTS_PER_BUS as u8 {
            for func in 0..FUNCS_PER_SLOT as u8 {
                let location = BusLocation::new(slot, func).unwrap();
                assert!(
                    matches!(bus.device_at(location), None),
                    "no device at 0.{:?}",
                    location
                );
            }
        }
    }

    /// The multi-function flag is raised for a slot only once a second
    /// function is attached to it, and other slots are unaffected.
    #[test]
    fn set_multifunc() {
        let scaffold = Scaffold::new();
        let bus = scaffold.create_bus();

        let first = Arc::new(TestDev::default());
        let other_slot = Arc::new(TestDev::default());
        let same_slot = Arc::new(TestDev::default());

        bus.attach(
            BusLocation::new(0, 0).unwrap(),
            Arc::clone(&first) as Arc<dyn Endpoint>,
            None,
        );
        assert_eq!(first.check_multifunc(), Some(false));

        bus.attach(
            BusLocation::new(1, 0).unwrap(),
            Arc::clone(&other_slot) as Arc<dyn Endpoint>,
            None,
        );
        assert_eq!(first.check_multifunc(), Some(false));
        assert_eq!(other_slot.check_multifunc(), Some(false));

        bus.attach(
            BusLocation::new(0, 1).unwrap(),
            Arc::clone(&same_slot) as Arc<dyn Endpoint>,
            None,
        );
        assert_eq!(first.check_multifunc(), Some(true));
        assert_eq!(same_slot.check_multifunc(), Some(true));
        assert_eq!(other_slot.check_multifunc(), Some(false));
    }
}


================================================
FILE: lib/propolis/src/hw/pci/cfgspace.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Helpers for dealing with configuration space.

use crate::common::RWOp;
use crate::common::ReadOp;
use crate::hw::pci::CapId;
use crate::util::regmap::Flags;
use crate::util::regmap::RegMap;

use super::bits::*;
use super::Cap;

/// The parts of a capability as laid out in configuration space. Each variant
/// carries the index of the capability in the builder's capability list.
#[derive(Debug)]
pub(super) enum CfgCapReg {
    /// The 1-byte capability ID register.
    Id(u8),
    /// The 1-byte next-capability pointer register.
    Next(u8),
    /// The variable-length remainder of the capability.
    Body(u8),
}

/// Identifies the kind of region a configuration-space access falls in.
#[derive(Debug)]
pub(super) enum CfgReg {
    /// The standard header at the start of configuration space.
    Std,
    /// An endpoint-defined region, identified by its starting offset.
    Custom(u8),
    /// Part of a capability.
    Cap(CfgCapReg),
}

/// A helper for building maps of PCI device configuration space.
pub(super) struct CfgBuilder {
    // Region map under construction.
    cfgmap: RegMap<CfgReg>,
    // Capabilities added so far, in allocation order.
    caps: Vec<Cap<u32>>,
    // Offset at which the next capability will be placed.
    cap_next_alloc: usize,
}

impl CfgBuilder {
    /// Creates a new PCI configuration space map builder.
    ///
    /// The standard header region is defined up front, and capability
    /// allocation starts immediately after it.
    pub fn new() -> Self {
        let mut cfgmap = RegMap::new(LEN_CFG_ECAM);
        cfgmap.define_with_flags(0, LEN_CFG_STD, CfgReg::Std, Flags::PASSTHRU);
        Self { cfgmap, caps: Vec::new(), cap_next_alloc: LEN_CFG_STD }
    }

    /// Panics if any already-defined region intersects the `len` bytes
    /// starting at `offset`.
    ///
    /// The check is performed by issuing a probe read over exactly that range:
    /// the map invokes the callback for every defined region the read touches,
    /// and any such invocation is a conflict.
    fn check_overlap(&self, offset: usize, len: usize) {
        if len == 0 {
            // An empty range cannot overlap anything.
            return;
        }
        // Size the probe to the candidate region. Probing a larger window
        // would report regions that merely follow the candidate as conflicts.
        let mut buf = vec![0u8; len];
        let mut ro = ReadOp::from_buf(offset, &mut buf[..]);
        self.cfgmap.read(&mut ro, &mut |region, rwo| {
            if let RWOp::Read(ro) = rwo {
                panic!(
                    "New config region at {} with length {} conflicts \
                       with existing region {:?} at offset {} with length {}",
                    offset,
                    len,
                    region,
                    ro.offset(),
                    ro.len()
                );
            } else {
                panic!("Unexpected write operation in check_overlap");
            }
        });
    }

    /// Adds a custom endpoint-defined configuration region at the supplied
    /// offset with the supplied length.
    ///
    /// # Panics
    ///
    /// Panics if the custom region overlaps an existing region in the space
    /// under construction.
    pub fn add_custom(&mut self, offset: u8, len: u8) {
        self.check_overlap(offset as usize, len as usize);
        self.cfgmap.define_with_flags(
            offset as usize,
            len as usize,
            CfgReg::Custom(offset),
            Flags::PASSTHRU,
        );
    }

    /// Adds a new capability region of the supplied length at the next
    /// available offset in configuration space.
    ///
    /// The `len` argument supplies the length of the variable-size portion of
    /// the capability (i.e., the length of the capability data exclusive of the
    /// 1-byte capability ID and next capability pointer registers).
    ///
    /// Note: The builder allocates capability regions in sequence starting
    /// immediately after the config space header. Allocating a custom region
    /// does not advance the builder's "next capability" pointer. The caller is
    /// responsible for arranging config space so that capabilities do not
    /// collide with any allocated custom regions; the builder will panic if it
    /// detects any overlapping regions.
    ///
    /// # Panics
    ///
    /// Panics if:
    ///
    /// - The capability (inclusive of the capability ID and capability pointer
    ///   registers) overlaps an existing region in the space under
    ///   construction;
    /// - The capability does not end on a 4-byte boundary; or
    /// - The offset just past the end of the capability is greater than 255.
    pub fn add_capability(&mut self, id: CapId<u32>, len: u8) {
        let start = self.cap_next_alloc;
        // ID register + next pointer + body.
        let total = 2 + len as usize;
        self.check_overlap(start, total);
        let end = start + total;
        // XXX: on the caller to size properly for alignment requirements
        assert_eq!(end % 4, 0);
        assert!(end <= u8::MAX as usize);
        let idx = self.caps.len() as u8;
        self.caps.push(Cap::new(id, start as u8));
        self.cfgmap.define(start, 1, CfgReg::Cap(CfgCapReg::Id(idx)));
        self.cfgmap.define(start + 1, 1, CfgReg::Cap(CfgCapReg::Next(idx)));
        self.cfgmap.define(
            start + 2,
            len as usize,
            CfgReg::Cap(CfgCapReg::Body(idx)),
        );
        self.cap_next_alloc = end;
    }

    /// Constructs the configuration space and a description of its
    /// capabilities.
    pub fn finish(self) -> (RegMap<CfgReg>, Vec<Cap<u32>>) {
        (self.cfgmap, self.caps)
    }
}


================================================
FILE: lib/propolis/src/hw/pci/device.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::{Arc, Condvar, Mutex, MutexGuard};

use super::bar::{BarDefine, Bars};
use super::bits::*;
use super::cfgspace::{CfgBuilder, CfgCapReg, CfgReg};
use super::{bus, BarN, Endpoint};
use crate::accessors::{MemAccessor, MsiAccessor};
use crate::common::*;
use crate::intr_pins::IntrPin;
use crate::migrate::*;
use crate::util::regmap::{Flags, RegMap};

use lazy_static::lazy_static;
use strum::IntoEnumIterator;

/// Returns the starting offset of `rwo` together with a human-readable name
/// for its direction (`"read"` or `"write"`), for use in diagnostics.
fn op_meta(rwo: &RWOp) -> (usize, &'static str) {
    let offset = match rwo {
        RWOp::Read(ro) => ro.offset(),
        RWOp::Write(wo) => wo.offset(),
    };
    let direction = if matches!(rwo, RWOp::Read(_)) { "read" } else { "write" };
    (offset, direction)
}

/// Represents behavior common across virtualized PCI(e) devices.
///
/// Every type implementing `Device` also gets an [`Endpoint`] implementation
/// through the blanket impl in this module, which handles the standard
/// configuration header and capability plumbing and calls back into the
/// methods below for everything device-specific.
pub trait Device: Send + Sync + 'static {
    /// Returns the device state of this device.
    fn device_state(&self) -> &DeviceState;

    /// Reads or writes a region described by a BAR.
    ///
    /// Accesses to a BAR claimed by the device's MSI-X configuration are
    /// handled before reaching this method. The default implementation
    /// panics via `unimplemented!`.
    fn bar_rw(&self, bar: BarN, rwo: RWOp) {
        let (offset, op) = op_meta(&rwo);
        unimplemented!("BAR {op} ({bar:?} @ {offset:x})")
    }

    /// Reads or writes a custom (device-defined) region of configuration
    /// space. `region` is the identifier carried by the region's
    /// `CfgReg::Custom` entry.
    ///
    /// The default implementation panics via `unimplemented!`.
    fn cfg_rw(&self, region: u8, rwo: RWOp) {
        let (offset, op) = op_meta(&rwo);
        unimplemented!("CFG {op} ({region:x} @ {offset:x})")
    }

    /// Reads or writes a capability in configuration space.
    ///
    /// The default implementation panics via `unimplemented!`.
    fn cap_rw(&self, id: CapId<u32>, rwo: RWOp) {
        let (offset, op) = op_meta(&rwo);
        unimplemented!("CAP {op} ({id:x?} @ {offset:x})")
    }

    /// Attaches the device to the virtual machine.
    ///
    /// Called after the bus attachment has been recorded in the device state.
    /// The default implementation does nothing.
    fn attach(&self) {}

    /// Notification that the interrupt mode has changed.  For
    /// example, we might change from MSI-X to MSI.
    ///
    /// The default implementation ignores the notification.
    fn interrupt_mode_change(&self, mode: IntrMode) {
        let _used = mode;
    }

    /// Notification that our MSI configuration has changed.
    ///
    /// The default implementation ignores the notification.
    fn msi_update(&self, info: MsiUpdate) {
        let _used = info;
    }

    /// Notification that configuration of BAR(s) has changed, either due to
    /// writes to the BARs themselves, or an overall status change (via the
    /// Command register or a device reset).
    ///
    /// The default implementation ignores the notification.
    fn bar_update(&self, bstate: BarState) {
        let _used = bstate;
    }
}

impl<D: Device + Send + Sync + 'static> Endpoint for D {
    /// Records the bus attachment in the device state, then runs the device's
    /// own attach hook.
    fn attach(&self, attachment: bus::Attachment) {
        self.device_state().attach(attachment);
        Device::attach(self);
    }

    /// Routes a configuration-space access to the standard header handling,
    /// the capability handling, or the device's custom-region handler,
    /// depending on the region it lands in.
    fn cfg_rw(&self, mut rwo: RWOp) {
        let ds = self.device_state();
        ds.cfg_space.process(&mut rwo, |region, mut op| match region {
            CfgReg::Cap(reg) => ds.cfg_cap_rw(self, reg, op),
            CfgReg::Custom(region) => Device::cfg_rw(self, *region, op),
            CfgReg::Std => {
                // The header is split further into individual registers.
                STD_CFG_MAP.process(&mut op, |std_reg, std_op| match std_op {
                    RWOp::Write(wo) => ds.cfg_std_write(self, std_reg, wo),
                    RWOp::Read(ro) => ds.cfg_std_read(std_reg, ro),
                });
            }
        });
    }

    /// Services a BAR access: the MSI-X machinery gets first refusal, and
    /// everything else goes to the device itself.
    fn bar_rw(&self, bar: BarN, rwo: RWOp) {
        let ds = self.device_state();
        match ds.msix_cfg.as_ref() {
            Some(msix) if msix.bar_match(bar) => {
                msix.bar_rw(rwo, |info| ds.notify_msi_update(self, info));
            }
            _ => Device::bar_rw(self, bar, rwo),
        }
    }
}

/// Registers of the standard configuration header, as mapped by
/// `STD_CFG_MAP`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub(super) enum StdCfgReg {
    VendorId,
    DeviceId,
    Command,
    Status,
    RevisionId,
    ProgIf,
    Subclass,
    Class,
    CacheLineSize,
    LatencyTimer,
    HeaderType,
    Bist,
    /// One of the six base address registers.
    Bar(BarN),
    CardbusPtr,
    SubVendorId,
    SubDeviceId,
    ExpansionRomAddr,
    CapPtr,
    /// Reserved bytes of the header; also the identifier `STD_CFG_MAP` is
    /// given for space not otherwise described.
    Reserved,
    IntrLine,
    IntrPin,
    MinGrant,
    MaxLatency,
}

lazy_static! {
    /// Register map of the standard configuration header.
    ///
    /// Each entry pairs a register with its size in bytes; the sizes sum to
    /// 64. NOTE(review): `create_packed` presumably lays the entries out
    /// back-to-back starting at offset 0 — confirm against `RegMap`.
    static ref STD_CFG_MAP: RegMap<StdCfgReg> = {
        let layout = [
            (StdCfgReg::VendorId, 2),
            (StdCfgReg::DeviceId, 2),
            (StdCfgReg::Command, 2),
            (StdCfgReg::Status, 2),
            (StdCfgReg::RevisionId, 1),
            (StdCfgReg::ProgIf, 1),
            (StdCfgReg::Subclass, 1),
            (StdCfgReg::Class, 1),
            (StdCfgReg::CacheLineSize, 1),
            (StdCfgReg::LatencyTimer, 1),
            (StdCfgReg::HeaderType, 1),
            (StdCfgReg::Bist, 1),
            (StdCfgReg::Bar(BarN::BAR0), 4),
            (StdCfgReg::Bar(BarN::BAR1), 4),
            (StdCfgReg::Bar(BarN::BAR2), 4),
            (StdCfgReg::Bar(BarN::BAR3), 4),
            (StdCfgReg::Bar(BarN::BAR4), 4),
            (StdCfgReg::Bar(BarN::BAR5), 4),
            (StdCfgReg::CardbusPtr, 4),
            (StdCfgReg::SubVendorId, 2),
            (StdCfgReg::SubDeviceId, 2),
            (StdCfgReg::ExpansionRomAddr, 4),
            (StdCfgReg::CapPtr, 1),
            (StdCfgReg::Reserved, 7),
            (StdCfgReg::IntrLine, 1),
            (StdCfgReg::IntrPin, 1),
            (StdCfgReg::MinGrant, 1),
            (StdCfgReg::MaxLatency, 1),
        ];
        RegMap::create_packed(LEN_CFG_STD, &layout, Some(StdCfgReg::Reserved))
    };
}

/// Identification values a device reports through its configuration header.
/// All fields default to zero.
#[derive(Default)]
pub struct Ident {
    pub vendor_id: u16,
    pub device_id: u16,
    pub device_class: u8,
    pub device_subclass: u8,
    pub prog_if: u8,
    pub revision_id: u8,
    pub sub_vendor_id: u16,
    pub sub_device_id: u16,
}

/// Mutable per-device PCI state, kept behind `DeviceState`'s mutex.
struct State {
    // Current value of the Command register; starts out empty.
    reg_command: RegCmd,
    // Current value of the Interrupt Line register; starts out as 0xff.
    reg_intr_line: u8,

    // Bus attachment; `None` until the device has been attached.
    attach: Option<bus::Attachment>,
    bars: Bars,

    // Set while an interrupt-mode change notification is being delivered
    // outside the lock, so that racing updates wait their turn.
    update_in_progress: bool,
}
impl State {
    /// Creates a new state structure for a device with the given BARs.
    fn new(bars: Bars) -> Self {
        Self {
            reg_command: RegCmd::empty(),
            reg_intr_line: 0xff,
            attach: None,
            bars,
            update_in_progress: false,
        }
    }

    /// Returns the bus attachment state.
    fn attached(&self) -> &bus::Attachment {
        self.attach.as_ref().unwrap()
    }

    /// Is MMIO access decoding enabled?
    fn mmio_en(&self) -> bool {
        self.reg_command.contains(RegCmd::MMIO_EN)
    }

    /// Is PIO access decoding enabled?
    fn pio_en(&self) -> bool {
        self.reg_command.contains(RegCmd::IO_EN)
    }

    /// Given the device state, is decoding enabled for a specified [BarDefine]
    fn decoding_active(&self, bar: &BarDefine) -> bool {
        (bar.is_pio() && self.pio_en()) || (bar.is_mmio() && self.mmio_en())
    }
}

/// A capability ID uniquely identifies a type of capability that may
/// be present in configuration space.  Vendor capabilities are generic
/// over some type T that is passed back to the device, allowing it to
/// identify which capability is being accessed.
#[derive(Clone, Copy, Debug)]
pub enum CapId<T: Clone + Copy> {
    /// The MSI-X capability.
    Msix,
    /// A vendor-specific capability, tagged with a device-chosen value.
    Vendor(T),
}

impl<T: Clone + Copy> CapId<T> {
    /// Returns the PCI-defined capability ID for this CapId.
    pub fn as_pci_cap_id(&self) -> u8 {
        match self {
            Self::Msix => CAP_ID_MSIX,
            Self::Vendor(_) => CAP_ID_VENDOR,
        }
    }
}

/// Represents a capability with its type and offset in configuration space.
pub struct Cap<T: Clone + Copy> {
    // Which capability this is.
    id: CapId<T>,
    // Offset of the capability's first byte (its ID register).
    offset: u8,
}

impl<T: Clone + Copy> Cap<T> {
    /// Creates a new CapId with the given type, at the given offset.
    pub(super) fn new(id: CapId<T>, offset: u8) -> Self {
        Self { id, offset }
    }
}

/// PCI state shared by every device built on the [`Device`] trait:
/// identification, the configuration-space layout, and the mutable register
/// state.
pub struct DeviceState {
    ident: Ident,
    lintr_support: bool,
    // Region map used to dispatch configuration-space accesses.
    cfg_space: RegMap<CfgReg>,
    // Present only for devices with MSI-X configured.
    msix_cfg: Option<Arc<MsixCfg>>,
    caps: Vec<Cap<u32>>,

    pub acc_mem: MemAccessor,
    // MSI accessor remains "hidden" behind MsixCfg machinery
    acc_msi: MsiAccessor,

    state: Mutex<State>,
    // Signalled when an in-progress interrupt-mode update completes; waited
    // on together with `State::update_in_progress`.
    cond: Condvar,
}

impl DeviceState {
    /// Assembles the device state from its already-built pieces.
    ///
    /// Both accessors start out as orphans.  When MSI-X is configured, its
    /// entries are handed children of the MSI accessor right away.
    fn new(
        ident: Ident,
        lintr_support: bool,
        cfg_space: RegMap<CfgReg>,
        msix_cfg: Option<Arc<MsixCfg>>,
        caps: Vec<Cap<u32>>,
        bars: Bars,
    ) -> Self {
        let acc_msi = MsiAccessor::new_orphan();
        if let Some(cfg) = &msix_cfg {
            cfg.attach(&acc_msi);
        }
        let acc_mem = MemAccessor::new_orphan();
        let state = Mutex::new(State::new(bars));

        Self {
            ident,
            lintr_support,
            cfg_space,
            msix_cfg,
            caps,

            acc_mem,
            acc_msi,

            state,
            cond: Condvar::new(),
        }
    }

    /// State changes which result in a new interrupt mode for the device incur
    /// a notification which could trigger deadlock if normal lock-ordering was
    /// used.  In such cases, the process is done in two stages: the state
    /// update (under lock) and the notification (outside the lock) with
    /// protection provided against other such updates which might race.
    ///
    /// `state` is the caller's already-held guard on `self.state`; `f` applies
    /// the actual state change while that lock is held.  The returned guard is
    /// a fresh lock on `self.state`, taken after the device was notified.
    fn affects_intr_mode(
        &self,
        dev: &dyn Device,
        mut state: MutexGuard<State>,
        f: impl FnOnce(&mut State),
    ) -> MutexGuard<'_, State> {
        // Wait out any other update which is between its two stages.
        state = self.cond.wait_while(state, |s| s.update_in_progress).unwrap();
        f(&mut state);
        // Compute the resulting mode while the state is still locked.
        let next_mode = self.which_intr_mode(&state);

        // Mark the update as in-flight so racing updaters block in the
        // `wait_while` above, then release the lock for the notification.
        state.update_in_progress = true;
        drop(state);
        // device is notified of mode change w/o state locked
        dev.interrupt_mode_change(next_mode);

        // Re-take the lock, clear the in-flight marker and wake any waiters.
        let mut state = self.state.lock().unwrap();
        assert!(state.update_in_progress);
        state.update_in_progress = false;
        self.cond.notify_all();
        state
    }

    /// Services a read of one register in the standard configuration header.
    ///
    /// Identity registers come from `ident`; command, interrupt line and BAR
    /// values come from the locked device state.  Registers which are not
    /// emulated read back as zero.
    ///
    /// Registers which depend on the bus attachment (Status, IntrPin,
    /// HeaderType) read as if no interrupt pin / multi-function bit were
    /// present when the device is not attached yet.
    ///
    /// # Panics
    ///
    /// If the read does not start at offset 0 of a non-`Reserved` register.
    fn cfg_std_read(&self, id: &StdCfgReg, ro: &mut ReadOp) {
        assert!(ro.offset() == 0 || *id == StdCfgReg::Reserved);

        match id {
            StdCfgReg::VendorId => ro.write_u16(self.ident.vendor_id),
            StdCfgReg::DeviceId => ro.write_u16(self.ident.device_id),
            StdCfgReg::Class => ro.write_u8(self.ident.device_class),
            StdCfgReg::Subclass => ro.write_u8(self.ident.device_subclass),
            StdCfgReg::SubVendorId => ro.write_u16(self.ident.sub_vendor_id),
            StdCfgReg::SubDeviceId => ro.write_u16(self.ident.sub_device_id),
            StdCfgReg::ProgIf => ro.write_u8(self.ident.prog_if),
            StdCfgReg::RevisionId => ro.write_u8(self.ident.revision_id),

            StdCfgReg::Command => {
                let val = self.state.lock().unwrap().reg_command.bits();
                ro.write_u16(val);
            }
            StdCfgReg::Status => {
                let mut val = RegStatus::empty();
                if self.lintr_support {
                    let state = self.state.lock().unwrap();
                    // Tolerate a missing attachment, same as the IntrPin and
                    // HeaderType registers do.
                    let asserted = state
                        .attach
                        .as_ref()
                        .and_then(bus::Attachment::lintr_cfg)
                        .map_or(false, |(_id, pin)| pin.is_asserted());
                    if asserted {
                        val.insert(RegStatus::INTR_STATUS);
                    }
                }
                if !self.caps.is_empty() {
                    val.insert(RegStatus::CAP_LIST);
                }
                ro.write_u16(val.bits());
            }
            StdCfgReg::IntrLine => {
                ro.write_u8(self.state.lock().unwrap().reg_intr_line)
            }
            StdCfgReg::IntrPin => {
                if self.lintr_support {
                    let state = self.state.lock().unwrap();
                    let pin_ident = state
                        .attach
                        .as_ref()
                        .and_then(bus::Attachment::lintr_cfg)
                        .map(|(id, _pin)| *id as u8)
                        .unwrap_or(0);
                    ro.write_u8(pin_ident)
                } else {
                    ro.write_u8(0);
                }
            }
            StdCfgReg::Bar(bar) => {
                let state = self.state.lock().unwrap();
                ro.write_u32(state.bars.reg_read(*bar))
            }
            StdCfgReg::ExpansionRomAddr => {
                // no rom for now
                ro.write_u32(0);
            }
            StdCfgReg::CapPtr => {
                // Offset of the first capability, or 0 for an empty list
                ro.write_u8(self.caps.first().map_or(0, |cap| cap.offset));
            }
            StdCfgReg::HeaderType => {
                let mut val = HEADER_TYPE_DEVICE;
                let state = self.state.lock().unwrap();
                if state
                    .attach
                    .as_ref()
                    .map_or(false, bus::Attachment::is_multifunc)
                {
                    val |= HEADER_TYPE_MULTIFUNC;
                }
                ro.write_u8(val);
            }
            StdCfgReg::Reserved => {
                ro.fill(0);
            }
            StdCfgReg::CacheLineSize
            | StdCfgReg::LatencyTimer
            | StdCfgReg::MaxLatency
            | StdCfgReg::Bist
            | StdCfgReg::MinGrant
            | StdCfgReg::CardbusPtr => {
                // XXX: zeroed for now
                ro.fill(0);
            }
        }
    }
    /// Services a write to one register in the standard configuration header.
    ///
    /// Only the command register, the interrupt line and the BARs react to
    /// writes; everything else is silently ignored.
    ///
    /// # Panics
    ///
    /// If the write does not start at offset 0 of a non-`Reserved` register.
    fn cfg_std_write(
        &self,
        dev: &dyn Device,
        id: &StdCfgReg,
        wo: &mut WriteOp,
    ) {
        assert!(wo.offset() == 0 || *id == StdCfgReg::Reserved);

        match id {
            StdCfgReg::Command => {
                let requested = RegCmd::from_bits_truncate(wo.read_u16());
                self.reg_cmd_write(dev, requested);
            }
            StdCfgReg::IntrLine => {
                let line = wo.read_u8();
                self.state.lock().unwrap().reg_intr_line = line;
            }
            StdCfgReg::Bar(bar) => {
                let raw = wo.read_u32();
                let mut state = self.state.lock().unwrap();
                let res = match state.bars.reg_write(*bar, raw) {
                    Some(res) => res,
                    // Nothing further to do for this write
                    None => return,
                };
                let attach = state.attached();
                if !state.decoding_active(&res.def) {
                    return;
                }
                // The BAR is being decoded: move its registration to the new
                // address and let the device know.
                attach.bar_unregister(res.id);
                attach.bar_register(res.id, res.def, res.val_new);
                dev.bar_update(BarState {
                    id: res.id,
                    def: res.def,
                    value: res.val_new,
                    decode_en: true,
                });
            }

            // ignore writes to RO fields
            StdCfgReg::VendorId
            | StdCfgReg::DeviceId
            | StdCfgReg::Class
            | StdCfgReg::Subclass
            | StdCfgReg::SubVendorId
            | StdCfgReg::SubDeviceId
            | StdCfgReg::HeaderType
            | StdCfgReg::ProgIf
            | StdCfgReg::RevisionId
            | StdCfgReg::CapPtr
            | StdCfgReg::IntrPin
            | StdCfgReg::Reserved => {}

            // no expansion rom for now
            StdCfgReg::ExpansionRomAddr => {}

            // Treat status register as RO until there is a need for guests
            // to clear bits within it
            StdCfgReg::Status => {}

            // XXX: ignored for now
            StdCfgReg::CacheLineSize
            | StdCfgReg::LatencyTimer
            | StdCfgReg::MaxLatency
            | StdCfgReg::Bist
            | StdCfgReg::MinGrant
            | StdCfgReg::CardbusPtr => {}
        }
    }
    fn reg_cmd_write(&self, dev: &dyn Device, val: RegCmd) {
        let mut state = self.state.lock().unwrap();
        let attach = state.attached();
        let diff = val ^ state.reg_command;

        // Update BAR registrations
        if diff.intersects(RegCmd::IO_EN | RegCmd::MMIO_EN) {
            let pio_en = val.contains(RegCmd::IO_EN);
            let mmio_en = val.contains(RegCmd::MMIO_EN);
            for n in BarN::iter() {
                let bar = state.bars.get(n);
                if bar.is_none() {
                    continue;
                }
                let (def, v) = bar.unwrap();

                if diff.contains(RegCmd::IO_EN) && def.is_pio() {
                    if pio_en {
                        attach.bar_register(n, def, v);
                    } else {
                        attach.bar_unregister(n);
                    }
                    dev.bar_update(BarState {
                        id: n,
                        def,
                        value: v,
                        decode_en: pio_en,
                    });
                }
                if diff.contains(RegCmd::MMIO_EN) && def.is_mmio() {
                    if mmio_en {
                        attach.bar_register(n, def, v);
                    } else {
                        attach.bar_unregister(n);
                    }
                    dev.bar_update(BarState {
                        id: n,
                        def,
                        value: v,
                        decode_en: mmio_en,
                    });
                }
            }
        }

        if diff.intersects(RegCmd::INTX_DIS) {
            // special handling required for INTx enable/disable
            let _state = self.affects_intr_mode(dev, state, |state| {
                state.reg_command = val;
            });
        } else {
            state.reg_command = val;
        }
        // TODO: disable memory and MSI access when busmastering is disabled
    }

    /// Determines the interrupt mode implied by the given (locked) state.
    ///
    /// Enabled MSI-X takes precedence.  Otherwise pin-based interrupts are in
    /// effect when the attachment provides a pin and the command register
    /// does not have `INTX_DIS` set.  In every other case interrupts are
    /// disabled.
    fn which_intr_mode(&self, state: &State) -> IntrMode {
        if let Some(msix) = self.msix_cfg.as_ref() {
            if msix.is_enabled() {
                return IntrMode::Msix;
            }
        }
        if let Some(attach) = state.attach.as_ref() {
            if attach.lintr_cfg().is_some()
                && !state.reg_command.contains(RegCmd::INTX_DIS)
            {
                return IntrMode::INTxPin;
            }
        }

        IntrMode::Disabled
    }

    /// Returns the interrupt mode currently in effect, taking the state lock
    /// for the duration of the query.
    pub(crate) fn get_intr_mode(&self) -> IntrMode {
        self.which_intr_mode(&self.state.lock().unwrap())
    }

    /// Looks up the definition, current value and decode-enable status of the
    /// BAR `id`.  Returns `None` when no BAR data is available for that slot.
    pub(crate) fn bar(&self, id: BarN) -> Option<BarState> {
        let state = self.state.lock().unwrap();
        let (def, value) = state.bars.get(id)?;
        let decode_en = match def {
            BarDefine::Mmio(_) | BarDefine::Mmio64(_) => state.mmio_en(),
            BarDefine::Pio(_) => state.pio_en(),
        };
        Some(BarState { id, def, value, decode_en })
    }

    /// Services an access to a capability in configuration space.
    ///
    /// The ID and next-pointer bytes of each capability header are read-only
    /// (writes are ignored); accesses to the capability body are forwarded to
    /// `cap_rw_body`.
    fn cfg_cap_rw(&self, dev: &dyn Device, id: &CfgCapReg, rwo: RWOp) {
        match id {
            CfgCapReg::Id(idx) => {
                if let RWOp::Read(ro) = rwo {
                    let cap = &self.caps[*idx as usize];
                    ro.write_u8(cap.id.as_pci_cap_id());
                }
            }
            CfgCapReg::Next(i) => {
                if let RWOp::Read(ro) = rwo {
                    // Offset of the following capability, or 0 to terminate
                    // the list
                    let next = *i as usize + 1;
                    if next < self.caps.len() {
                        ro.write_u8(self.caps[next].offset);
                    } else {
                        ro.write_u8(0);
                    }
                }
            }
            CfgCapReg::Body(i) => self.cap_rw_body(dev, *i, rwo),
        }
    }

    /// Services an access to the body of the capability at index `idx`.
    ///
    /// MSI-X accesses are handled by the MSI-X configuration; vendor
    /// capabilities are passed through to the device.
    ///
    /// # Panics
    ///
    /// If `idx` is not a valid capability index, or if an MSI-X capability is
    /// present without an MSI-X configuration.
    fn cap_rw_body(&self, dev: &dyn Device, idx: u8, rwo: RWOp) {
        // Widen the index rather than narrowing the length, which could
        // truncate.
        assert!(usize::from(idx) < self.caps.len());
        // XXX: no fancy capability support for now
        let cap = &self.caps[usize::from(idx)];
        match cap.id {
            CapId::Msix => {
                let msix_cfg = self.msix_cfg.as_ref().unwrap();
                if let RWOp::Write(_) = rwo {
                    // MSI-X cap writes may result in a change to the interrupt
                    // mode of the device which requires extra locking concerns.
                    let state = self.state.lock().unwrap();
                    let _state = self.affects_intr_mode(dev, state, |_state| {
                        msix_cfg.cfg_rw(rwo, |info| {
                            self.notify_msi_update(dev, info)
                        });
                    });
                } else {
                    msix_cfg
                        .cfg_rw(rwo, |info| self.notify_msi_update(dev, info));
                }
            }
            CapId::Vendor(_) => dev.cap_rw(cap.id, rwo),
        }
    }
    /// Forwards an MSI(-X) configuration change to the device.
    fn notify_msi_update(&self, dev: &dyn Device, info: MsiUpdate) {
        dev.msi_update(info);
    }
    /// Resets the PCI state of the device.
    ///
    /// The command register and any MSI-X configuration are reset as an
    /// interrupt-mode-affecting update (so the device is notified of the
    /// resulting mode), after which every defined BAR is zeroed and
    /// unregistered from the bus attachment.
    ///
    /// # Panics
    ///
    /// If the device has BARs but is not attached to a bus.
    pub fn reset(&self, dev: &dyn Device) {
        let state = self.state.lock().unwrap();

        let mut state = self.affects_intr_mode(dev, state, |state| {
            state.reg_command.reset();
            if let Some(msix) = &self.msix_cfg {
                msix.reset();
            }
        });

        // Both IO and MMIO BARs should be disabled at this point
        debug_assert!(!state
            .reg_command
            .intersects(RegCmd::IO_EN | RegCmd::MMIO_EN));
        for n in BarN::iter() {
            if state.bars.get(n).is_some() {
                state.bars.set(n, 0);
                state.attached().bar_unregister(n);
                // TODO: notify device of zeroed BARs
            }
        }
    }
    /// Records the bus attachment for this device and makes the attachment's
    /// memory and MSI accessors adopt the device's own accessors.
    ///
    /// # Panics
    ///
    /// If the device was already attached.
    fn attach(&self, attachment: bus::Attachment) {
        let mut state = self.state.lock().unwrap();
        let _old = state.attach.replace(attachment);
        assert!(_old.is_none());

        // The attachment was stored just above, so this cannot fail.
        let bus_attach = state.attached();
        bus_attach.acc_mem.adopt(&self.acc_mem, None);
        bus_attach.acc_msi.adopt(&self.acc_msi, None);
    }

    pub fn lintr_pin(&self) -> Option<Arc<dyn IntrPin>> {
        let state = self.state.lock().unwrap();
        let attach = state.attach.as_ref()?;
        let (_id, pin) = attach.lintr_cfg()?;
        Some(Arc::clone(pin))
    }

    /// Returns a handle onto the MSI-X configuration, or `None` if the device
    /// has no MSI-X capability.
    pub fn msix_hdl(&self) -> Option<MsixHdl> {
        self.msix_cfg.as_ref().map(MsixHdl::new)
    }

    pub fn export(&self) -> migrate::PciStateV1 {
        let state = self.state.lock().unwrap();
        let msix = self.msix_cfg.as_ref().map(|cfg| cfg.export());
        migrate::PciStateV1 {
            reg_command: state.reg_command.bits(),
            reg_intr_line: state.reg_intr_line,
            bars: state.bars.export(),
            msix,
        }
    }

    /// Restores PCI state captured by `export`.
    ///
    /// BARs whose decoding is active after the import are registered with the
    /// bus attachment again.
    ///
    /// # Errors
    ///
    /// Fails if the saved command register holds unknown bits, if the BAR
    /// import fails, if the payload and the device disagree about the
    /// presence of MSI-X, or if the MSI-X import itself fails.
    ///
    /// # Panics
    ///
    /// If the device is not attached to a bus.
    pub fn import(
        &self,
        state: migrate::PciStateV1,
    ) -> Result<(), MigrateStateError> {
        let mut inner = self.state.lock().unwrap();

        let command = match RegCmd::from_bits(state.reg_command) {
            Some(command) => command,
            None => {
                return Err(MigrateStateError::ImportFailed(format!(
                    "PciState reg_command: failed to import saved value {:#x}",
                    state.reg_command
                )))
            }
        };
        inner.reg_command = command;
        inner.reg_intr_line = state.reg_intr_line;
        inner.bars.import(state.bars)?;

        // Reattach any imported Bars to their respective handlers (pio, mmio)
        let attach = inner.attached();
        for n in BarN::iter() {
            match inner.bars.get(n) {
                Some((def, addr)) if inner.decoding_active(&def) => {
                    attach.bar_register(n, def, addr);
                }
                _ => {}
            }
        }

        match (self.msix_cfg.as_ref(), state.msix) {
            (None, None) => Ok(()),
            (Some(msix_cfg), Some(saved_cfg)) => msix_cfg.import(saved_cfg),
            (None, Some(_)) => Err(MigrateStateError::ImportFailed(
                "PciState: device has no MSI-X config".to_string(),
            )),
            (Some(_), None) => Err(MigrateStateError::ImportFailed(
                "PciState: device has MSI-X config but none in payload"
                    .to_string(),
            )),
        }
    }
}

impl MigrateMulti for DeviceState {
    fn export(
        &self,
        output: &mut PayloadOutputs,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        output.push(self.export().into())
    }

    fn import(
        &self,
        offer: &mut PayloadOffers,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        self.import(offer.take()?)
    }
}

/// The interrupt delivery mechanism currently in effect for a device.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum IntrMode {
    /// Neither pin-based nor MSI-X interrupts are usable.
    Disabled,
    /// Legacy pin-based (INTx) interrupts.
    INTxPin,
    /// MSI-X interrupts.
    Msix,
}

/// Describes a change to the MSI-X configuration which a device is notified
/// about.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MsiUpdate {
    /// The function-wide mask was set while MSI-X is enabled.
    MaskAll,
    /// The function-wide mask was cleared while MSI-X is enabled.
    UnmaskAll,
    /// The table entry with the given index was written.
    Modify(u16),
}

/// Registers found in the BAR which holds the MSI-X table and PBA.  The
/// `u16` payload is the index of the table entry being accessed.
#[derive(Debug)]
enum MsixBarReg {
    /// 64-bit message address of a table entry.
    Addr(u16),
    /// 32-bit message data of a table entry.
    Data(u16),
    /// Vector control word of a table entry (holds the per-vector mask bit).
    VecCtrl(u16),
    /// Padding; reads as zero and ignores writes.
    Reserved,
    /// The pending bit array.
    Pba,
}
/// Registers making up the body of the MSI-X capability in config space.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum MsixCapReg {
    /// Message control: table size, function mask and enable bits.
    MsgCtrl,
    /// Location of the MSI-X table.
    TableOff,
    /// Location of the pending bit array.
    PbaOff,
}
lazy_static! {
    /// Packed layout of the MSI-X capability body: a 2-byte message control
    /// register followed by the 4-byte table and PBA offset registers.
    static ref CAP_MSIX_MAP: RegMap<MsixCapReg> = RegMap::create_packed(
        10,
        &[
            (MsixCapReg::MsgCtrl, 2),
            (MsixCapReg::TableOff, 4),
            (MsixCapReg::PbaOff, 4),
        ],
        None,
    );
}

/// Per-vector mask bit in the vector control word of an MSI-X table entry.
const MSIX_VEC_MASK: u32 = 1 << 0;

/// MSI-X enable bit in the message control register.
const MSIX_MSGCTRL_ENABLE: u16 = 1 << 15;
/// Function mask bit in the message control register.
const MSIX_MSGCTRL_FMASK: u16 = 1 << 14;

/// State of a single MSI-X table entry.
#[derive(Debug, Default)]
struct MsixEntry {
    /// Message address programmed by the guest.
    addr: u64,
    /// Message data programmed by the guest.
    data: u32,
    /// Per-vector mask bit from the entry's vector control word.
    mask_vec: bool,
    /// Copy of the function-wide mask from the message control register.
    mask_func: bool,
    /// Copy of the MSI-X enable bit from the message control register.
    enabled: bool,
    /// Set when the entry was fired while masked; delivered on unmask.
    pending: bool,
    /// Accessor used to deliver the message; populated by `MsixCfg::attach`.
    acc_msi: Option<MsiAccessor>,
}
impl MsixEntry {
    /// Raises the interrupt for this entry.  Nothing happens while MSI-X is
    /// disabled; a masked entry is only marked pending; otherwise the message
    /// is sent right away.
    fn fire(&mut self) {
        let masked = self.mask_func || self.mask_vec;
        match (self.enabled, masked) {
            (false, _) => {}
            (true, true) => self.pending = true,
            (true, false) => self.send(),
        }
    }

    /// Delivers a pending interrupt if the entry is no longer masked.
    fn check_mask(&mut self) {
        let masked = self.mask_vec || self.mask_func;
        if masked || !self.pending {
            return;
        }
        self.pending = false;
        self.send();
    }

    /// Sends the message through the accessor, if there is one.  The outcome
    /// of the send is deliberately ignored.
    fn send(&self) {
        if let Some(acc) = &self.acc_msi {
            let _ = acc.send(self.addr, u64::from(self.data));
        }
    }

    /// Returns every guest-visible field to its default while holding on to
    /// the accessor.
    fn reset(&mut self) {
        *self = Self { acc_msi: self.acc_msi.take(), ..Default::default() };
    }
}

/// MSI-X configuration of a device: the table entries, the layout of the BAR
/// holding them, and the function-wide enable/mask state.
#[derive(Debug)]
struct MsixCfg {
    /// Number of table entries (vectors).
    count: u16,
    /// BAR which holds the MSI-X table and PBA.
    bar: BarN,
    /// Offset of the PBA within that BAR.
    pba_off: u32,
    /// Register layout of the BAR contents.
    map: RegMap<MsixBarReg>,
    /// Per-vector state, each entry behind its own lock.
    entries: Vec<Mutex<MsixEntry>>,
    /// Function-wide enable and mask state.
    state: Mutex<MsixCfgState>,
}
/// Function-wide MSI-X state, as controlled through the message control
/// register.
#[derive(Debug, Default)]
struct MsixCfgState {
    /// MSI-X enable bit.
    enabled: bool,
    /// Function mask bit.
    func_mask: bool,
}
impl MsixCfg {
    fn new(count: u16, bar: BarN) -> (Arc<Self>, usize) {
        assert!(count > 0 && count <= 2048);

        // Pad table so PBA is on a separate page.  This will allow the guest
        // to map it separately, should it so choose.
        let table_size = count as usize * 16;
        let table_pad = match table_size % PAGE_SIZE {
            0 => 0,
            a => PAGE_SIZE - a,
        };

        // With a maximum vector count, the PBA will not require more than a
        // page.  For convenience, pad it out to that size.
        let pba_size = PAGE_SIZE;

        let pba_off = table_size + table_pad;
        let bar_size = (pba_off + pba_size).next_power_of_two();

        let mut map = RegMap::new(bar_size);
        let mut off = 0;
        for i in 0..count {
            map.define(off, 8, MsixBarReg::Addr(i));
            map.define(off + 8, 4, MsixBarReg::Data(i));
            map.define(off + 12, 4, MsixBarReg::VecCtrl(i));
            off += 16;
        }
        if table_pad != 0 {
            map.define_with_flags(
                off,
                table_pad,
                MsixBarReg::Reserved,
                Flags::PASSTHRU,
            );
        }
        off += table_pad;
        map.define_with_flags(off, pba_size, MsixBarReg::Pba, Flags::PASSTHRU);
        off += pba_size;

        // If table sizing leaves space after the PBA in order to pad the BAR
        // out to the next power of 2, cover it with Reserved handling.
        if off < bar_size {
            let pba_pad = bar_size - off;
            map.define_with_flags(
                off,
                pba_pad,
                MsixBarReg::Reserved,
                Flags::PASSTHRU,
            );
        }

        let mut entries = Vec::with_capacity(count as usize);
        entries.resize_with(count as usize, Default::default);

        let this = Self {
            count,
            bar,
            pba_off: pba_off as u32,
            map,
            entries,
            state: Default::default(),
        };

        (Arc::new(this), bar_size)
    }
    /// Is `bar` the BAR which holds the MSI-X table and PBA?
    fn bar_match(&self, bar: BarN) -> bool {
        bar == self.bar
    }
    /// Services an access to the BAR holding the MSI-X table and PBA.
    ///
    /// Reads return the current entry contents (or the pending bits for the
    /// PBA).  Writes update the addressed table entry and then invoke
    /// `updatef` with `MsiUpdate::Modify` for that entry; writes to the
    /// reserved areas and the PBA are ignored.
    fn bar_rw(&self, mut rwo: RWOp, updatef: impl Fn(MsiUpdate)) {
        self.map.process(&mut rwo, |id, rwo| match rwo {
            RWOp::Read(ro) => match id {
                MsixBarReg::Addr(i) => {
                    let ent = self.entries[*i as usize].lock().unwrap();
                    ro.write_u64(ent.addr);
                }
                MsixBarReg::Data(i) => {
                    let ent = self.entries[*i as usize].lock().unwrap();
                    ro.write_u32(ent.data);
                }
                MsixBarReg::VecCtrl(i) => {
                    let ent = self.entries[*i as usize].lock().unwrap();
                    // Only the per-vector mask bit is reported
                    let mut val = 0;
                    if ent.mask_vec {
                        val |= MSIX_VEC_MASK;
                    }
                    ro.write_u32(val);
                }
                MsixBarReg::Reserved => {
                    ro.fill(0);
                }
                MsixBarReg::Pba => {
                    self.read_pba(ro);
                }
            },
            RWOp::Write(wo) => {
                // If modifying an individual entry, its lock needs to be dropped before making
                // the `updatef` callback, since it may attempt to access the entry itself.  To
                // synchronize access, hold on to the state lock across that call.
                let state = self.state.lock().unwrap();
                match id {
                    MsixBarReg::Addr(i) => {
                        let mut ent = self.entries[*i as usize].lock().unwrap();
                        ent.addr = wo.read_u64();
                        drop(ent);
                        updatef(MsiUpdate::Modify(*i));
                    }
                    MsixBarReg::Data(i) => {
                        let mut ent = self.entries[*i as usize].lock().unwrap();
                        ent.data = wo.read_u32();
                        drop(ent);
                        updatef(MsiUpdate::Modify(*i));
                    }
                    MsixBarReg::VecCtrl(i) => {
                        let mut ent = self.entries[*i as usize].lock().unwrap();
                        let val = wo.read_u32();
                        ent.mask_vec = val & MSIX_VEC_MASK != 0;
                        // Unmasking may release a pending interrupt
                        ent.check_mask();
                        drop(ent);
                        updatef(MsiUpdate::Modify(*i));
                    }
                    MsixBarReg::Reserved | MsixBarReg::Pba => {}
                }
                drop(state);
            }
        });
    }
    /// Fills a read of the pending bit array: bit `n` of the array reflects
    /// the pending flag of entry `n`, and bits beyond the last entry read as
    /// zero.
    fn read_pba(&self, ro: &mut ReadOp) {
        let first_byte = ro.offset();
        let vectors = self.count as usize;

        for byte in 0..ro.avail() {
            let base = (byte + first_byte) * 8;
            let bits = (0..8)
                .filter(|bit| {
                    let idx = base + bit;
                    idx < vectors && self.entries[idx].lock().unwrap().pending
                })
                .fold(0u8, |acc, bit| acc | (1 << bit));
            ro.write_u8(bits);
        }
    }
    /// Services an access to the body of the MSI-X capability.
    ///
    /// Reads report the message control register and the table/PBA locations.
    /// Only the message control register reacts to writes: the enable and
    /// function mask bits are stored and pushed down to every table entry.
    /// `updatef` is invoked with `MaskAll`/`UnmaskAll` when the function mask
    /// changes while MSI-X stays enabled.
    fn cfg_rw(&self, mut rwo: RWOp, updatef: impl Fn(MsiUpdate)) {
        CAP_MSIX_MAP.process(&mut rwo, |id, rwo| {
            match rwo {
                RWOp::Read(ro) => {
                    match id {
                        MsixCapReg::MsgCtrl => {
                            let state = self.state.lock().unwrap();
                            // low 10 bits hold `count - 1`
                            let mut val = self.count - 1;
                            if state.enabled {
                                val |= MSIX_MSGCTRL_ENABLE;
                            }
                            if state.func_mask {
                                val |= MSIX_MSGCTRL_FMASK;
                            }
                            ro.write_u16(val);
                        }
                        MsixCapReg::TableOff => {
                            // table always at offset 0 for now
                            ro.write_u32(u32::from(self.bar as u8));
                        }
                        MsixCapReg::PbaOff => {
                            ro.write_u32(
                                self.pba_off | u32::from(self.bar as u8),
                            );
                        }
                    }
                }
                RWOp::Write(wo) => {
                    match id {
                        MsixCapReg::MsgCtrl => {
                            let val = wo.read_u16();
                            let mut state = self.state.lock().unwrap();
                            let new_ena = val & MSIX_MSGCTRL_ENABLE != 0;
                            let old_ena = state.enabled;
                            let new_mask = val & MSIX_MSGCTRL_FMASK != 0;
                            let old_mask = state.func_mask;
                            if old_ena != new_ena || old_mask != new_mask {
                                self.each_entry(|ent| {
                                    ent.mask_func = new_mask;
                                    ent.enabled = new_ena;
                                    ent.check_mask();
                                });
                            }
                            state.enabled = new_ena;
                            state.func_mask = new_mask;

                            // Notify when the MSI-X function mask is changing.
                            // Changes to enable/disable state is already
                            // covered by the logic for interrupt_mode_change
                            // updates
                            if old_mask != new_mask
                                && old_ena == new_ena
                                && new_ena
                            {
                                updatef(if new_mask {
                                    MsiUpdate::MaskAll
                                } else {
                                    MsiUpdate::UnmaskAll
                                });
                            }
                        }
                        // only msgctrl can be written; the remaining
                        // registers are listed explicitly so that adding a
                        // register forces a decision here
                        MsixCapReg::TableOff | MsixCapReg::PbaOff => {}
                    }
                }
            }
        });
    }
    /// Runs `cb` on every table entry in index order, locking each entry only
    /// for the duration of its own callback.
    fn each_entry(&self, mut cb: impl FnMut(&mut MsixEntry)) {
        self.entries.iter().for_each(|slot| {
            let mut guard = slot.lock().unwrap();
            cb(&mut guard)
        });
    }
    /// Raises the interrupt for the table entry `idx`.
    ///
    /// # Panics
    ///
    /// If `idx` is not below the vector count.
    fn fire(&self, idx: u16) {
        assert!(idx < self.count);
        self.entries[usize::from(idx)].lock().unwrap().fire();
    }
    /// Is the MSI-X enable bit currently set?
    fn is_enabled(&self) -> bool {
        self.state.lock().unwrap().enabled
    }
    fn read(&self, idx: u16) -> MsiEnt {
        assert!(idx < self.count);
        let ent = self.entries[idx as usize].lock().unwrap();
        MsiEnt {
            addr: ent.addr,
            data: ent.data,
            masked: ent.mask_vec || ent.mask_func,
            pending: ent.pending,
        }
    }
    fn reset(&self) {
        let mut state = self.state.lock().unwrap();
        state.enabled = false;
        state.func_mask = false;
        drop(state);
        self.each_entry(|ent| ent.reset());
    }
    /// Gives every table entry its own child of the provided MSI accessor.
    fn attach(&self, msi_acc: &MsiAccessor) {
        self.each_entry(|ent| ent.acc_msi = Some(msi_acc.child(None)));
    }
    /// Captures the migratable MSI-X state.  The function-wide state stays
    /// locked while the entries are collected.
    fn export(&self) -> migrate::MsixStateV1 {
        let state = self.state.lock().unwrap();
        let entries = self
            .entries
            .iter()
            .map(|slot| {
                let ent = slot.lock().unwrap();
                migrate::MsixEntryV1 {
                    addr: ent.addr,
                    data: ent.data,
                    is_vec_masked: ent.mask_vec,
                    is_pending: ent.pending,
                }
            })
            .collect();
        migrate::MsixStateV1 {
            count: self.count,
            is_enabled: state.enabled,
            is_func_masked: state.func_mask,
            entries,
        }
    }
    fn import(
        &self,
        state: migrate::MsixStateV1,
    ) -> Result<(), MigrateStateError> {
        let mut inner = self.state.lock().unwrap();

        if self.count != state.count {
            return Err(MigrateStateError::ImportFailed(format!(
                "MsixCfg: count mismatch {} vs {}",
                self.count, state.count
            )));
        }
        if self.entries.len() != state.entries.len() {
            return Err(MigrateStateError::ImportFailed(format!(
                "MsixCfg: entry count mismatch {} vs {}",
                self.entries.len(),
                state.entries.len()
            )));
        }
        inner.enabled = state.is_enabled;
        inner.func_mask = state.is_func_masked;
        for (entry, saved) in self.entries.iter().zip(state.entries) {
            let mut entry = entry.lock().unwrap();
            entry.addr = saved.addr;
            entry.data = saved.data;
            entry.mask_vec = saved.is_vec_masked;
            entry.mask_func = state.is_func_masked;
            entry.enabled = state.is_enabled;
            entry.pending = saved.is_pending;
        }

        Ok(())
    }
}

// public struct for exposing MSI(-X) values
/// Snapshot of a single MSI-X table entry.
pub struct MsiEnt {
    /// Message address.
    pub addr: u64,
    /// Message data.
    pub data: u32,
    /// True if the entry is masked, either individually or function-wide.
    pub masked: bool,
    /// True if an interrupt is waiting to be delivered once unmasked.
    pub pending: bool,
}

/// Handle through which MSI-X interrupts can be fired and table entries
/// inspected.
#[derive(Debug)]
pub struct MsixHdl {
    /// Shared MSI-X configuration this handle operates on.
    cfg: Arc<MsixCfg>,
}
impl MsixHdl {
    /// Creates a handle sharing the given MSI-X configuration.
    fn new(cfg: &Arc<MsixCfg>) -> Self {
        let cfg = Arc::clone(cfg);
        Self { cfg }
    }

    /// Creates a stand-alone handle with 2048 vectors for use in tests.
    #[cfg(test)]
    pub(crate) fn new_test() -> Self {
        let (cfg, _bar_size) = MsixCfg::new(2048, BarN::BAR0);
        Self { cfg }
    }

    /// Raises the interrupt for vector `idx`.
    ///
    /// # Panics
    ///
    /// If `idx` is not below [`MsixHdl::count`].
    pub fn fire(&self, idx: u16) {
        let cfg: &MsixCfg = &self.cfg;
        cfg.fire(idx);
    }

    /// Returns a snapshot of the table entry for vector `idx`.
    ///
    /// # Panics
    ///
    /// If `idx` is not below [`MsixHdl::count`].
    pub fn read(&self, idx: u16) -> MsiEnt {
        let cfg: &MsixCfg = &self.cfg;
        cfg.read(idx)
    }

    /// Number of vectors in the MSI-X table.
    pub fn count(&self) -> u16 {
        let cfg: &MsixCfg = &self.cfg;
        cfg.count
    }
}
impl Clone for MsixHdl {
    fn clone(&self) -> Self {
        Self { cfg: Arc::clone(&self.cfg) }
    }
}

/// Describes the state of a BAR
pub struct BarState {
    /// Which BAR is being described.
    pub id: BarN,
    /// Kind and size of the BAR.
    pub def: BarDefine,
    /// Current value of the BAR.
    pub value: u64,
    /// Is decoding for this BAR enabled in the device control?
    pub decode_en: bool,
}

/// Incrementally collects the description of a PCI device (identity, BARs,
/// interrupt support, configuration space).
pub struct Builder {
    /// Identity values for the device.
    ident: Ident,
    /// Whether legacy (pin-based) interrupt support was requested.
    lintr_support: bool,
    /// MSI-X configuration, if any has been added.
    msix_cfg: Option<Arc<MsixCfg>>,
    /// BAR definitions, indexed by BAR number.
    bars: [Option<BarDefine>; 6],
    /// Builder for the device's configuration space.
    cfg_builder: CfgBuilder,
}

impl Builder {
    /// Creates a builder for a PCI device with the given identity.
    ///
    /// The builder starts out with no BARs, no MSI-X configuration and
    /// without legacy interrupt support.
    pub fn new(ident: Ident) -> Self {
        // NOTE(review): `cfgmap` is populated and then dropped without ever
        // being read; the config map handed to the device comes from
        // `cfg_builder` in `finish`.  It looks like a leftover and is kept
        // only until it is confirmed that removing it orphans no imports.
        let mut cfgmap = RegMap::new(LEN_CFG_ECAM);
        cfgmap.define_with_flags(0, LEN_CFG_STD, CfgReg::Std, Flags::PASSTHRU);
        Self {
            ident,
            lintr_support: false,
            msix_cfg: None,
            bars: [None; 6],
            cfg_builder: CfgBuilder::new(),
        }
    }

    /// Add a BAR which is accessible via IO ports
    ///
    /// # Panics
    ///
    /// If `size` is < 4 or not a power of 2, or if `bar` has already been
    /// defined.
    pub fn add_bar_io(mut self, bar: BarN, size: u16) -> Self {
        assert!(size.is_power_of_two());
        assert!(size >= 4);

        let idx = bar as usize;
        assert!(self.bars[idx].is_none());

        self.bars[idx] = Some(BarDefine::Pio(size));
        self
    }

    /// Add a BAR which is accessible via MMIO.  The size and placement of the
    /// BAR is limited to the 32-bit address space.
    ///
    /// # Panics
    ///
    /// If `size` is < 16 or not a power of 2, or if `bar` has already been
    /// defined.
    pub fn add_bar_mmio(mut self, bar: BarN, size: u32) -> Self {
        assert!(size.is_power_of_two());
        assert!(size >= 16);

        let idx = bar as usize;
        assert!(self.bars[idx].is_none());

        self.bars[idx] = Some(BarDefine::Mmio(size));
        self
    }

    /// Add a BAR which is accessible via MMIO.  As a 64-bit BAR, its size can
    /// be >= 4G, and it is expected to be placed above the 32-bit address
    /// limit.
    ///
    /// A 64-bit BAR also occupies the BAR slot following `bar` (for its high
    /// word), so it cannot be placed in the last slot.
    ///
    /// # Panics
    ///
    /// If:
    /// - `size` is < 16 or not a power of 2
    /// - `bar` is the last BAR (BAR5), leaving no slot for the high word
    /// - `bar` or the BAR following it has already been defined
    pub fn add_bar_mmio64(mut self, bar: BarN, size: u64) -> Self {
        assert!(size.is_power_of_two());
        assert!(size >= 16);

        let idx = bar as usize;
        // The high word lives in slot `idx + 1`, which must exist.  Checking
        // this explicitly yields a meaningful assertion failure rather than
        // an out-of-bounds index panic on the lookup below.
        assert!(
            idx + 1 < self.bars.len(),
            "64-bit BAR cannot be placed in the last BAR slot"
        );
        assert!(self.bars[idx].is_none());
        assert!(self.bars[idx + 1].is_none());

        self.bars[idx] = Some(BarDefine::Mmio64(size));
        // TODO: prevent later BAR definition from occupying high word
        self
    }

    /// Add a legacy (pin-based) interrupt
    pub fn add_lintr(mut self) -> Self {
        self.lintr_support = true;
        self
    }

    /// Add a region of the PCI config space for the device which has custom
    /// handling.
    pub fn add_custom_cfg(mut self, offset: u8, len: u8) -> Self {
        self.cfg_builder.add_custom(offset, len);
        self
    }

    /// Registers a capability of the given ID and length with the config
    /// space builder.
    fn add_cap_raw(&mut self, id: CapId<u32>, len: u8) {
        self.cfg_builder.add_capability(id, len);
    }

    /// Add MSI-X interrupt functionality.
    ///
    /// The MSI-X structures are exposed through a 32-bit MMIO BAR at `bar`,
    /// sized as reported by [`MsixCfg::new`].
    ///
    /// # Panics
    ///
    /// If:
    /// - `count` is 0 or > 2048
    /// - `bar` conflicts (overlaps) with other defined BAR for the device
    /// - MSI-X has already been added to this builder
    pub fn add_cap_msix(mut self, bar: BarN, count: u16) -> Self {
        assert!(self.msix_cfg.is_none());

        let (cfg, bar_size) = MsixCfg::new(count, bar);

        // The BAR is registered as 32-bit MMIO, so its size must fit.
        assert!(bar_size < u32::MAX as usize);
        self = self.add_bar_mmio(bar, bar_size as u32);
        self.msix_cfg = Some(cfg);
        self.add_cap_raw(CapId::Msix, 10);

        self
    }

    /// Add a "Vendor" capability.
    pub fn add_cap_vendor(mut self, tag: u32, len: u8) -> Self {
        self.add_cap_raw(CapId::Vendor(tag), len);
        self
    }

    /// Consumes the builder, producing the [`DeviceState`] for the device
    /// described so far.
    pub fn finish(self) -> DeviceState {
        let (cfgmap, caps) = self.cfg_builder.finish();
        DeviceState::new(
            self.ident,
            self.lintr_support,
            cfgmap,
            self.msix_cfg,
            caps,
            Bars::new(&self.bars),
        )
    }
}

/// Serialized (migration) representations of PCI device state.
pub mod migrate {
    use crate::hw::pci::bar;
    use crate::migrate::*;

    use serde::{Deserialize, Serialize};

    /// Saved state of a single MSI-X table entry.
    #[derive(Deserialize, Serialize)]
    pub struct MsixEntryV1 {
        /// Message address of the entry.
        pub addr: u64,
        /// Message data of the entry.
        pub data: u32,
        /// Whether this individual vector is masked.
        pub is_vec_masked: bool,
        /// Whether the entry is pending.
        pub is_pending: bool,
    }

    /// Saved state of a device's MSI-X configuration.
    #[derive(Deserialize, Serialize)]
    pub struct MsixStateV1 {
        /// Number of MSI-X entries; checked against the device's own count on
        /// import.
        pub count: u16,
        /// Whether MSI-X is enabled.
        pub is_enabled: bool,
        /// Whether the function-wide mask is set.
        pub is_func_masked: bool,
        /// Per-entry state; its length is checked against the device's entry
        /// count on import.
        pub entries: Vec<MsixEntryV1>,
    }

    /// Saved state of a PCI device.
    #[derive(Deserialize, Serialize)]
    pub struct PciStateV1 {
        /// Saved command register.
        pub reg_command: u16,
        /// Saved interrupt line register.
        pub reg_intr_line: u8,
        /// Saved BAR state.
        pub bars: bar::migrate::BarStateV1,
        /// Saved MSI-X state, if any was recorded for the device.
        pub msix: Option<MsixStateV1>,
    }
    impl Schema<'_> for PciStateV1 {
        // Schema identifier: name "pci-device", version 1.
        fn id() -> SchemaId {
            ("pci-device", 1)
        }
    }
}

#[cfg(test)]
pub(crate) mod test {
    use super::*;

    /// Creating an MSI-X configuration with zero entries must panic.
    #[test]
    #[should_panic]
    fn msix_cfg_zero() {
        let (_cfg, _bsize) = MsixCfg::new(0, BarN::BAR1);
    }
    /// Creating an MSI-X configuration with more than 2048 entries must panic.
    #[test]
    #[should_panic]
    fn msix_cfg_too_big() {
        let (_cfg, _bsize) = MsixCfg::new(2049, BarN::BAR1);
    }
    /// Checks the BAR size reported for a given number of MSI-X entries.
    #[test]
    fn msix_cfg_sizing() {
        let (_cfg, bar_size) = MsixCfg::new(2048, BarN::BAR1);
        // 32k for entries + 4k PBA -> 64k (rounded to next pow2)
        assert_eq!(bar_size, 65536);

        // 4k for entries + 4k PBA
        let (_cfg, bar_size) = MsixCfg::new(256, BarN::BAR1);
        assert_eq!(bar_size, 8192);
    }

    /// For a given [Endpoint], perform reads of the first 256 bytes of its PCI
    /// cfg space, 4-bytes at a time.
    pub(crate) fn cfg_read(dev: &dyn Endpoint) {
        // Read offsets 0 through 255 of the device's config space, 4 bytes at
        // a time.  The data read is discarded.
        let mut buf = [0u8; 4];

        for off in (0..=255).step_by(4) {
            let mut op = ReadOp::from_buf(off, &mut buf[..]);
            dev.cfg_rw(RWOp::Read(&mut op));
        }
    }

    /// For a given [Endpoint], perform writes (of all-1s) to the first 256
    /// bytes of its PCI cfg space, 4-bytes at a time.
    ///
    /// The device is not expected to function well in the face of such abusive
    /// writes, but it should not blow any assertions.
    pub(crate) fn cfg_write(dev: &dyn Endpoint) {
        // Write all-1s to offsets 0 through 255 of the device's config space,
        // 4 bytes at a time
        let buf = [0xffu8; 4];

        for off in (0..=255).step_by(4) {
            let mut op = WriteOp::from_buf(off, &buf[..]);
            dev.cfg_rw(RWOp::Write(&mut op));
        }
    }
}


================================================
FILE: lib/propolis/src/hw/pci/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::fmt::Result as FmtResult;
use std::fmt::{Display, Formatter};
use std::io::{Error, ErrorKind};
use std::str::FromStr;
use std::sync::{Arc, Mutex};

use crate::common::*;
use crate::intr_pins::IntrPin;

use strum::{EnumIter, FromRepr};

pub mod bar;
pub mod bits;
pub mod bridge;
pub mod bus;
mod cfgspace;
pub(crate) mod device;
pub mod topology;

#[cfg(test)]
pub(crate) mod test;

pub use bus::Bus;
pub use device::*;

/// A PCI bus number.
///
/// Every `u8` value is accepted, so construction cannot fail.
#[derive(Copy, Clone, Eq, PartialEq, Debug, Ord, PartialOrd)]
pub struct BusNum(u8);
impl BusNum {
    /// Wraps `n` as a bus number.
    pub const fn new(n: u8) -> Self {
        BusNum(n)
    }
    /// Returns the raw bus number.
    pub const fn get(&self) -> u8 {
        let BusNum(raw) = *self;
        raw
    }
}

impl From<BusNum> for u8 {
    /// Unwraps the raw bus number.
    fn from(value: BusNum) -> Self {
        value.0
    }
}
#[derive(Copy, Clone, Eq, PartialEq, Debug, Ord, PartialOrd)]
pub struct DevNum(u8);
impl DevNum {
    /// Attempts to make a new PCI [DevNum]
    ///
    /// Returns [`Option::None`] if `n` is outside the range for a valid PCI
    /// device.
    pub const fn new(n: u8) -> Option<Self> {
        if n <= bits::MASK_DEV {
            Some(Self(n))
        } else {
            None
        }
    }
    /// Create a new [DevNum]
    ///
    /// # Panics
    ///
    /// If `n` is outside the range for a valid PCI device
    pub const fn new_unchecked(n: u8) -> Self {
        if n <= bits::MASK_DEV {
            Self(n)
        } else {
            panic!("device number exceeds max");
        }
    }
    pub const fn get(&self) -> u8 {
        self.0
    }
}

impl From<DevNum> for u8 {
    fn from(value: DevNum) -> Self {
        value.get()
    }
}

#[derive(Copy, Clone, Eq, PartialEq, Debug, Ord, PartialOrd)]
pub struct FuncNum(u8);
impl FuncNum {
    /// Attempts to make a new PCI [FuncNum]
    ///
    /// Returns [`Option::None`] if `n` is outside the range for a valid PCI
    /// function.
    pub const fn new(n: u8) -> Option<Self> {
        if n <= bits::MASK_FUNC {
            Some(Self(n))
        } else {
            None
        }
    }
    /// Create a new [FuncNum]
    ///
    /// # Panics
    ///
    /// If `n` is outside the range for a valid PCI function
    pub const fn new_unchecked(n: u8) -> Self {
        if n <= bits::MASK_FUNC {
            Self(n)
        } else {
            panic!("function number exceeds max");
        }
    }
    pub const fn get(&self) -> u8 {
        self.0
    }
}

impl From<FuncNum> for u8 {
    fn from(value: FuncNum) -> Self {
        value.get()
    }
}

/// The device and function numbers locating an endpoint on one PCI bus.
#[derive(Copy, Clone, Eq, PartialEq, Debug, Ord, PartialOrd)]
pub struct BusLocation {
    pub dev: DevNum,
    pub func: FuncNum,
}

impl BusLocation {
    /// Attempts to make a new PCI [BusLocation]
    ///
    /// Returns [`Option::None`] if the `dev` or `func` are outside their
    /// respective valid ranges.
    pub const fn new(dev: u8, func: u8) -> Option<Self> {
        // Nested `if let` keeps this usable in `const` contexts.
        if let Some(dev) = DevNum::new(dev) {
            if let Some(func) = FuncNum::new(func) {
                return Some(Self { dev, func });
            }
        }
        None
    }
    /// Create a new PCI [BusLocation]
    ///
    /// # Panics
    ///
    /// If `dev` or `func` are outside their respective valid ranges
    pub const fn new_unchecked(dev: u8, func: u8) -> Self {
        let dev = DevNum::new_unchecked(dev);
        let func = FuncNum::new_unchecked(func);
        Self { dev, func }
    }
}

/// Bus, Device, Function.
///
/// Acts as an address for PCI and PCIe device functionality.
#[derive(Copy, Clone, Eq, PartialEq, Debug, Ord, PartialOrd)]
pub struct Bdf {
    /// The bus on which the device/function resides.
    pub bus: BusNum,
    /// The device and function numbers on that bus.
    pub location: BusLocation,
}

impl FromStr for Bdf {
    type Err = std::io::Error;

    /// Parses a BDF written as three `.`-separated decimal fields, in the
    /// order bus, device, function.
    ///
    /// # Errors
    ///
    /// Returns an [`ErrorKind::InvalidInput`] error if a field is not a
    /// number, a field exceeds `u8::MAX`, there are not exactly three
    /// fields, or the values do not form a valid [Bdf].
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let invalid = |msg: String| Error::new(ErrorKind::InvalidInput, msg);

        // Fields are parsed left to right; the first bad field short-circuits
        // the collection.
        let fields = s
            .split('.')
            .map(|field| {
                let num = field
                    .parse::<usize>()
                    .map_err(|e| invalid(e.to_string()))?;
                if num > u8::MAX as usize {
                    Err(invalid(format!("Value too large: {}", num)))
                } else {
                    Ok(num as u8)
                }
            })
            .collect::<Result<Vec<u8>, Error>>()?;

        match fields.as_slice() {
            &[bus, dev, func] => Bdf::new(bus, dev, func)
                .ok_or_else(|| invalid("Failed to parse as BDF".to_string())),
            _ => Err(Error::new(
                ErrorKind::InvalidInput,
                "Wrong number of fields for BDF",
            )),
        }
    }
}

// This conversion is checked exhaustively by the
// `propolis_types_pcipath_is_always_valid_bdf` test.
impl From<propolis_types::PciPath> for Bdf {
    fn from(value: propolis_types::PciPath) -> Self {
        let bus = value.bus();
        let device = value.device();
        let function = value.function();
        // A PciPath is exactly as well-formed as a Bdf, so this does not
        // panic.
        Bdf::new_unchecked(bus, device, function)
    }
}

/// Exhaustively checks that `Bdf` and `PciPath` accept exactly the same
/// bus/device/function triples, and that converting an accepted `PciPath`
/// yields the matching `Bdf`.
#[test]
fn propolis_types_pcipath_is_always_valid_bdf() {
    fn check_bdf(bus: u8, device: u8, function: u8) -> bool {
        let bdf = Bdf::new(bus, device, function);
        let pci_path = propolis_types::PciPath::new(bus, device, function);

        match (bdf, pci_path) {
            // Both reject the triple: consistent.
            (None, Err(_)) => true,
            // Only one of the two accepts the triple: inconsistent.
            (Some(_), Err(_)) | (None, Ok(_)) => false,
            // Both accept: the conversion must reproduce the same Bdf.
            (Some(bdf), Ok(pci_path)) => Bdf::from(pci_path) == bdf,
        }
    }

    for bus in u8::MIN..=u8::MAX {
        for device in u8::MIN..=u8::MAX {
            for function in u8::MIN..=u8::MAX {
                let consistent = check_bdf(bus, device, function);
                assert!(
                    consistent,
                    "Bdf and PciPath did not match for \
                    bus/device/function {}/{}/{}",
                    bus,
                    device,
                    function
                );
            }
        }
    }
}

impl Bdf {
    /// Attempts to make a new PCI [Bdf].
    ///
    /// Returns [`Option::None`] if the values would not fit within a BDF.
    pub const fn new(bus: u8, dev: u8, func: u8) -> Option<Self> {
        // Spelled out as a `match` because the `?` operator cannot be used in
        // a `const fn`.
        match BusLocation::new(dev, func) {
            Some(location) => Some(Self { bus: BusNum::new(bus), location }),
            None => None,
        }
    }
    /// Create a new PCI [Bdf]
    ///
    /// # Panics
    ///
    /// If `dev` or `func` are outside their respective valid ranges
    pub const fn new_unchecked(bus: u8, dev: u8, func: u8) -> Self {
        let bus = BusNum::new(bus);
        let location = BusLocation::new_unchecked(dev, func);
        Self { bus, location }
    }
}
impl Display for Bdf {
    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
        write!(
            f,
            "{}.{}.{}",
            self.bus.0, self.location.dev.0, self.location.func.0
        )
    }
}

/// Identifies one of the six BARs (BAR0 through BAR5) of a PCI device.
///
/// The discriminant of each variant is the BAR's index (0 through 5).
#[derive(
    Copy, Clone, Eq, PartialEq, Debug, Ord, PartialOrd, FromRepr, EnumIter,
)]
#[repr(u8)]
pub enum BarN {
    BAR0 = 0,
    BAR1,
    BAR2,
    BAR3,
    BAR4,
    BAR5,
}

/// Identifies one of the four legacy INTx interrupt pins (INTA through INTD).
// NOTE(review): the 1-based discriminants presumably match the encoding of
// the PCI "Interrupt Pin" config register -- confirm against its users.
#[repr(u8)]
#[derive(Copy, Clone)]
pub enum INTxPinID {
    IntA = 1,
    IntB = 2,
    IntC = 3,
    IntD = 4,
}

/// Legacy (INTx) interrupt configuration for a device: the INTx pin ID paired
/// with the interrupt pin object it refers to.
pub type LintrCfg = (INTxPinID, Arc<dyn IntrPin>);

/// Interface implemented by devices which can be attached to a PCI [`Bus`].
pub trait Endpoint: Send + Sync {
    /// Hands the endpoint its bus attachment.
    // NOTE(review): presumably invoked when the device is attached to a bus
    // -- confirm against `Bus::attach`.
    fn attach(&self, attachment: bus::Attachment);
    /// Services a read or write of the endpoint's PCI configuration space.
    fn cfg_rw(&self, op: RWOp<'_, '_>);
    /// Services a read or write directed at the region backing BAR `bar`.
    fn bar_rw(&self, bar: BarN, rwo: RWOp);
}

/// Splits a latched PIO configuration address into the [Bdf] it targets and
/// the byte offset (low 8 bits of `addr`) into that function's config space.
///
/// Returns `None` when the enable bit (bit 31) of `addr` is clear.
fn cfg_addr_parse(addr: u32) -> Option<(Bdf, u8)> {
    const ENABLE_BIT: u32 = 0x80000000;

    if addr & ENABLE_BIT == 0 {
        // Enable bit not set
        return None;
    }

    let bus = (addr >> 16) as u8 & bits::MASK_BUS;
    let dev = (addr >> 11) as u8 & bits::MASK_DEV;
    let func = (addr >> 8) as u8 & bits::MASK_FUNC;
    let bdf = Bdf::new(bus, dev, func).unwrap();
    let cfg_off = (addr & 0xff) as u8;

    Some((bdf, cfg_off))
}

/// Decoder for port-IO based PCI configuration access: an address is latched
/// through one register ([`service_addr`](Self::service_addr)) and subsequent
/// data accesses ([`service_data`](Self::service_data)) are directed at the
/// function and offset which that address selects.
pub struct PioCfgDecoder {
    /// The most recently latched configuration address.
    addr: Mutex<u32>,
}
impl PioCfgDecoder {
    /// Creates a decoder with a latched address of 0.
    pub fn new() -> Self {
        let addr = Mutex::new(0);
        Self { addr }
    }
    /// Services an access to the address register: reads return the latched
    /// address, writes replace it.
    ///
    /// Accesses which are not exactly 4 bytes at offset 0 are ignored.
    pub fn service_addr(&self, rwop: RWOp) {
        let well_formed = rwop.len() == 4 && rwop.offset() == 0;
        if !well_formed {
            // XXX expect aligned/sized reads
            return;
        }
        let mut latched = self.addr.lock().unwrap();
        match rwop {
            RWOp::Read(ro) => ro.write_u32(*latched),
            RWOp::Write(wo) => *latched = wo.read_u32(),
        }
    }
    /// Services an access to the data register by handing `cb` the [Bdf]
    /// selected by the latched address, along with an operation offset into
    /// that function's config space.
    ///
    /// Nothing happens if the latched address does not have its enable bit
    /// set.  A read for which `cb` returns `None` is filled with `0xff`.
    pub fn service_data<F>(&self, rwop: RWOp, mut cb: F)
    where
        F: FnMut(&Bdf, RWOp) -> Option<()>,
    {
        // Copy the address out so the lock is not held while calling `cb`.
        let addr = self.addr();

        let (bdf, cfg_off) = match cfg_addr_parse(addr) {
            Some(decoded) => decoded,
            None => return,
        };

        let off = usize::from(cfg_off) + rwop.offset();
        match rwop {
            RWOp::Read(ro) => {
                let mut cro = ReadOp::new_child(off, ro, ..);
                if cb(&bdf, RWOp::Read(&mut cro)).is_none() {
                    cro.fill(0xff);
                }
            }
            RWOp::Write(wo) => {
                let mut cwo = WriteOp::new_child(off, wo, ..);
                let _ = cb(&bdf, RWOp::Write(&mut cwo));
            }
        }
    }
    /// Returns the currently latched address.
    pub fn addr(&self) -> u32 {
        *self.addr.lock().unwrap()
    }

    /// Overwrites the latched address.
    pub(super) fn set_addr(&self, addr: u32) {
        *self.addr.lock().unwrap() = addr;
    }
}

/// Decoder for accesses to a PCIe enhanced configuration (ECAM) region.
pub struct PcieCfgDecoder {
    /// Mask applied to the bus bits of a region offset.
    bus_mask: u8,
}

impl PcieCfgDecoder {
    /// Creates a PCIe config space access decoder that can address the supplied
    /// number of buses.
    ///
    /// The supplied bus count must be a power of 2 between
    /// [`bits::PCIE_MIN_BUSES_PER_ECAM_REGION`] and
    /// [`bits::PCIE_MAX_BUSES_PER_ECAM_REGION`] inclusive.
    ///
    /// # Panics
    ///
    /// If `bus_count` does not meet those requirements.
    pub fn new(bus_count: u16) -> Self {
        assert!(bus_count.is_power_of_two());
        assert!(bus_count >= bits::PCIE_MIN_BUSES_PER_ECAM_REGION);
        assert!(bus_count <= bits::PCIE_MAX_BUSES_PER_ECAM_REGION);

        let bus_mask = (bus_count - 1) as u8;
        Self { bus_mask }
    }

    /// Decodes a request to access PCIe configuration space and dispatches the
    /// resulting BDF and device-relative configuration space offset to a
    /// caller-supplied completion function.
    ///
    /// An access whose first and last bytes decode to different BDFs is not
    /// dispatched; if it is a read, it is filled with `0xff`.  A read for
    /// which `cb` returns `None` is likewise filled with `0xff`.
    ///
    /// # Panics
    ///
    /// If the access has a length of 0.
    pub fn service<F>(&self, rwop: RWOp, mut cb: F)
    where
        F: FnMut(&Bdf, RWOp) -> Option<()>,
    {
        assert_ne!(rwop.len(), 0);

        let first_byte = rwop.offset();
        let (bdf, cfg_off) = self.decode_enhanced_cfg_offset(first_byte);

        // Ensure the access is addressed to a single device.
        let last_byte = rwop.offset() + rwop.len() - 1;
        let (end_bdf, _) = self.decode_enhanced_cfg_offset(last_byte);
        if bdf != end_bdf {
            if let RWOp::Read(ro) = rwop {
                ro.fill(0xff);
            }
            return;
        }

        match rwop {
            RWOp::Read(ro) => {
                let mut cro = ReadOp::new_child(cfg_off, ro, ..);
                if cb(&bdf, RWOp::Read(&mut cro)).is_none() {
                    cro.fill(0xff);
                }
            }
            RWOp::Write(wo) => {
                let mut cwo = WriteOp::new_child(cfg_off, wo, ..);
                let _ = cb(&bdf, RWOp::Write(&mut cwo));
            }
        }
    }

    /// Decodes an offset into a PCIe ECAM region into a bus/device/function and
    /// an offset into that function's configuration space.
    fn decode_enhanced_cfg_offset(&self, region_offset: usize) -> (Bdf, usize) {
        let cfg_offset = region_offset & bits::MASK_ECAM_CFG_OFFSET;
        let func = (region_offset >> 12) as u8 & bits::MASK_FUNC;
        let dev = (region_offset >> 15) as u8 & bits::MASK_DEV;
        let bus = (region_offset >> 20) as u8 & self.bus_mask;

        let bdf = Bdf::new(bus, dev, func).unwrap();
        (bdf, cfg_offset)
    }
}

/// Re-exports the PCI device migration types from the `device` module.
pub mod migrate {
    pub use super::device::migrate::*;
}


================================================
FILE: lib/propolis/src/hw/pci/test.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;

use crate::accessors::*;
use crate::common::{RWOp, ReadOp, WriteOp};
use crate::mmio::MmioBus;
use crate::pio::PioBus;

use super::bus::Bus;
use super::topology::{self, Topology};
use super::{bits, Bdf, PcieCfgDecoder};

// Common test prep setup

pub(crate) struct Scaffold {
    pub bus_mmio: Arc<MmioBus>,
    pub bus_pio: Arc<PioBus>,
    pub acc_mem: MemAccessor,
    pub acc_msi: MsiAccessor,
}
impl Scaffold {
    pub(crate) fn new() -> Self {
        Self {
            bus_mmio: Arc::new(MmioBus::new(u32::MAX as usize)),
            bus_pio: Arc::new(PioBus::new()),
            acc_mem: MemAccessor::new_orphan(),
            acc_msi: MsiAccessor::new_orphan(),
        }
    }

    pub(crate) fn create_bus(&self) -> Bus {
        Bus::new(
            &self.bus_pio,
            &self.bus_mmio,
            self.acc_mem.child(None),
            self.acc_msi.child(None),
        )
    }

    pub(crate) fn basic_topo(&self) -> Arc<Topology> {
        topology::Topology::new_test(self.create_bus())
    }
}

// PCI-generic tests

/// Accesses at the start of the ECAM region decode to BDF 0.0.0, with the
/// offset and length of the access passed through to the callback.
#[test]
fn pcie_decoder() {
    let pcie = PcieCfgDecoder::new(bits::PCIE_MAX_BUSES_PER_ECAM_REGION);
    // A 4-byte read at region offset 0.
    let mut buf = [0u8; 4];
    let mut ro = ReadOp::from_buf(0, &mut buf);
    pcie.service(RWOp::Read(&mut ro), |bdf, rwo| {
        assert_eq!(*bdf, Bdf::new(0, 0, 0).unwrap());
        assert!(matches!(rwo, RWOp::Read(_)));
        assert_eq!(rwo.offset(), 0);
        assert_eq!(rwo.len(), 4);
        Some(())
    });

    // A 16-byte write at offset 0x400, still within function 0.0.0.
    let buf = [0u8; 16];
    let mut wo = WriteOp::from_buf(0x400, &buf);
    pcie.service(RWOp::Write(&mut wo), |bdf, rwo| {
        assert_eq!(*bdf, Bdf::new(0, 0, 0).unwrap());
        assert!(matches!(rwo, RWOp::Write(_)));
        assert_eq!(rwo.offset(), 0x400);
        assert_eq!(rwo.len(), 16);
        Some(())
    })
}

/// Region offsets decode into the expected bus (bits 20 and up), device
/// (bits 15 and up), function (bits 12 and up) and config offset (low bits).
#[test]
fn pcie_decoder_multiple_bdfs() {
    let pcie = PcieCfgDecoder::new(bits::PCIE_MAX_BUSES_PER_ECAM_REGION);
    let mut buf = [0u8; 4];
    // Function 1 of device 0 on bus 0.
    let mut ro = ReadOp::from_buf(1_usize << 12, &mut buf);
    pcie.service(RWOp::Read(&mut ro), |bdf, rwo| {
        assert_eq!(*bdf, Bdf::new(0, 0, 1).unwrap());
        assert_eq!(rwo.offset(), 0);
        Some(())
    });

    // Device 4, function 3, with a non-zero config offset.
    let mut ro =
        ReadOp::from_buf((4_usize << 15) | (3_usize << 12) | 0x123, &mut buf);
    pcie.service(RWOp::Read(&mut ro), |bdf, rwo| {
        assert_eq!(*bdf, Bdf::new(0, 4, 3).unwrap());
        assert_eq!(rwo.offset(), 0x123);
        Some(())
    });

    // Non-zero bus, device, function and config offset all at once.
    let mut ro = ReadOp::from_buf(
        (133_usize << 20) | (7_usize << 15) | (1_usize << 12) | 0x337,
        &mut buf,
    );
    pcie.service(RWOp::Read(&mut ro), |bdf, rwo| {
        assert_eq!(*bdf, Bdf::new(133, 7, 1).unwrap());
        assert_eq!(rwo.offset(), 0x337);
        Some(())
    });
}

/// With a 4-bus decoder, only the low two bits of the bus portion of the
/// offset select the bus: higher bits (the `seg_group` here) are masked off.
#[test]
fn pcie_decoder_min_buses() {
    let pcie = PcieCfgDecoder::new(4);
    let mut buf = [0u8; 4];
    for seg_group in 0..4 {
        for bus in 0..4 {
            let mut ro =
                ReadOp::from_buf((seg_group * 4 + bus) << 20, &mut buf);
            pcie.service(RWOp::Read(&mut ro), |bdf, rwo| {
                // Regardless of `seg_group`, the decoded bus must be `bus`.
                assert_eq!(
                    *bdf,
                    Bdf::new(bus as u8, 0, 0).unwrap(),
                    "group {}, bus {}",
                    seg_group,
                    bus
                );
                assert_eq!(rwo.offset(), 0);
                Some(())
            });
        }
    }
}

/// An access which straddles the boundary between two functions is not
/// dispatched to the callback, and (being a read) is filled with 0xff.
#[test]
fn pcie_decoder_access_spans_multiple_devs() {
    let pcie = PcieCfgDecoder::new(bits::PCIE_MAX_BUSES_PER_ECAM_REGION);
    let mut buf = [0u8; 8];
    // 8 bytes starting 4 bytes short of the end of function 0's 4k region.
    let mut ro = ReadOp::from_buf(0xffc, &mut buf);

    // This access spans multiple functions, so the decoder can't
    // meaningfully address a single BDF and should therefore not
    // invoke the closure.
    pcie.service(RWOp::Read(&mut ro), |_bdf, _rwo| panic!());
    assert_eq!(buf, [0xffu8; 8]);
}


================================================
FILE: lib/propolis/src/hw/pci/topology.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! A PCI topology containing one or more PCI buses.

use std::collections::{BTreeMap, BTreeSet};
use std::io::{Error as IoError, ErrorKind};
use std::sync::{Arc, Mutex};

use crate::common::RWOp;
use crate::hw::ids;
use crate::vmm::Machine;

use super::bridge::Bridge;
use super::{Bdf, Bus, BusLocation, Endpoint, LintrCfg};

use thiserror::Error;

/// A logical identifier for a bus in the topology. A bus's logical identifier
/// is stable irrespective of the way the topology's bridges are configured.
#[derive(Clone, Copy, Debug, Ord, PartialOrd, Eq, PartialEq)]
pub struct LogicalBusId(pub u8);

/// A "routing" identifier for a bus in the topology. The topology considers
/// bridge configurations when deciding what bus will receive messages directed
/// using this kind of ID.
///
/// The mapping from routed IDs to buses is changed at runtime through
/// `Topology::set_bus_route`.
#[derive(Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
pub struct RoutedBusId(pub u8);

/// Index of a bus within a [`Topology`]'s list of buses.
#[derive(Clone, Copy)]
struct BusIndex(usize);

/// Errors returned when manipulating PCI topology.
#[derive(Debug, Error)]
pub enum PciTopologyError {
    /// No bus with the given logical ID exists in the topology.
    #[error("The logical bus with ID {0:?} was not found")]
    LogicalBusNotFound(LogicalBusId),

    /// The given logical ID has already been claimed as a downstream bus.
    #[error("Downstream logical bus ID {0:?} was already registered")]
    LogicalBusAlreadyExists(LogicalBusId),

    /// Something has already been attached at the given address.
    #[error("A PCI device was already attached at {0:?}")]
    DeviceAlreadyAttached(Bdf),
}

impl From<PciTopologyError> for IoError {
    fn from(e: PciTopologyError) -> IoError {
        use PciTopologyError::*;
        match e {
            LogicalBusNotFound(b) => IoError::new(
                ErrorKind::NotFound,
                format!("Logical bus {} not found", b.0),
            ),
            LogicalBusAlreadyExists(b) => IoError::new(
                ErrorKind::AlreadyExists,
                format!("Logical bus {} already exists", b.0),
            ),
            DeviceAlreadyAttached(bdf) => IoError::new(
                ErrorKind::AlreadyExists,
                format!("Device at {} already attached", bdf),
            ),
        }
    }
}

/// A PCI topology manager.
pub struct Topology {
    /// All buses in the topology; `BusIndex` values index into this list.
    buses: Vec<Bus>,
    /// Maps each logical bus ID to its bus.
    logical_buses: BTreeMap<LogicalBusId, BusIndex>,
    /// State which can change after construction (the routed bus mapping).
    inner: Mutex<Inner>,
}

impl Topology {
    /// Attaches a device to a logical bus in this topology.
    ///
    /// # Errors
    ///
    /// Fails if the logical bus is not present in the topology.
    pub fn pci_attach(
        &self,
        bus: LogicalBusId,
        location: BusLocation,
        dev: Arc<dyn Endpoint>,
        lintr_cfg: Option<LintrCfg>,
    ) -> Result<(), PciTopologyError> {
        match self.logical_buses.get(&bus) {
            Some(index) => {
                self.buses[index.0].attach(location, dev, lintr_cfg);
                Ok(())
            }
            None => Err(PciTopologyError::LogicalBusNotFound(bus)),
        }
    }

    /// Issues a configuration space I/O to a device at the supplied location.
    ///
    /// Returns `None` if the routed bus ID is not currently routed to any bus
    /// or no device is present at `location` on that bus.
    pub fn pci_cfg_rw(
        &self,
        bus: RoutedBusId,
        location: BusLocation,
        rwo: RWOp,
    ) -> Option<()> {
        let guard = self.inner.lock().unwrap();
        let device = guard
            .routed_buses
            .get(&bus)
            .and_then(|index| self.buses[index.0].device_at(location));

        // The lock must be released before calling into the device: the
        // device may be a bridge, and servicing the operation may require
        // reconfiguring part of the topology (which takes this lock again).
        drop(guard);

        device.map(|device| device.cfg_rw(rwo))
    }

    /// Configures the topology so that routed traffic to the supplied routed
    /// bus ID will be directed to the supplied logical bus (if `logical_id` is
    /// Some) or to no logical bus (if it is None).
    ///
    /// # Panics
    ///
    /// If `logical_id` names a logical bus which is not in the topology, if a
    /// route is being added for a routed ID which already has one, or if a
    /// route is being removed for a routed ID which has none.
    pub(super) fn set_bus_route(
        &self,
        routed_id: RoutedBusId,
        logical_id: Option<LogicalBusId>,
    ) {
        // Only PCI topology elements like bridges, which know their own
        // logical bus numbers, call this.  Barring a code bug, the logical
        // bus should therefore always be found in the map.
        let target =
            logical_id.map(|logical_id| {
                match self.logical_buses.get(&logical_id) {
                    Some(index) => *index,
                    None => panic!(
                        "Failed to find logical bus {} while routing bus {}",
                        logical_id.0, routed_id.0
                    ),
                }
            });

        let mut guard = self.inner.lock().unwrap();
        match target {
            Some(bus_index) => {
                let _old = guard.routed_buses.insert(routed_id, bus_index);
                assert!(_old.is_none());
            }
            None => {
                let _old = guard.routed_buses.remove(&routed_id);
                assert!(_old.is_some());
            }
        }
    }

    /// Create a basic (bus 0 only) topology for unit tests
    #[cfg(test)]
    pub(crate) fn new_test(bus0: Bus) -> Arc<Self> {
        // Both logical bus 0 and routed bus 0 refer to the one and only bus.
        let logical_buses = BTreeMap::from([(LogicalBusId(0), BusIndex(0))]);
        let routed_buses = BTreeMap::from([(RoutedBusId(0), BusIndex(0))]);

        Arc::new(Self {
            buses: vec![bus0],
            logical_buses,
            inner: Mutex::new(Inner { routed_buses }),
        })
    }
}

/// The mutable portion of a [`Topology`], kept behind its mutex.
struct Inner {
    /// Maps each currently routed bus ID to the bus which receives its
    /// traffic.
    routed_buses: BTreeMap<RoutedBusId, BusIndex>,
}
impl Inner {
    /// Creates an `Inner` with no routed buses.
    fn new() -> Self {
        Self { routed_buses: BTreeMap::new() }
    }
}

/// Describes, in the abstract, a PCI bridge which is to be added to a
/// topology.
#[derive(Debug, Clone, Copy)]
pub struct BridgeDescription {
    downstream_bus_id: LogicalBusId,
    attachment_addr: Bdf,
    vendor_id: u16,
    device_id: u16,
}

impl BridgeDescription {
    /// Creates a new PCI bridge description using the Oxide PCI-PCI bridge
    /// vendor and device IDs.
    ///
    /// # Arguments
    ///
    /// - `downstream_bus_id`: The logical bus ID to associate with the bridge's
    ///   downstream bus.
    /// - `attachment_addr`: The bus/device/function at which to attach the
    ///   bridge, where the bus is a logical bus number. A bridge may attach to
    ///   the downstream bus of another bridge.
    pub fn new(downstream_bus_id: LogicalBusId, attachment_addr: Bdf) -> Self {
        Self {
            downstream_bus_id,
            attachment_addr,
            vendor_id: ids::pci::VENDOR_OXIDE,
            device_id: ids::pci::PROPOLIS_BRIDGE_DEV_ID,
        }
    }

    /// Creates a new PCI bridge description with an explicitly supplied vendor
    /// and device ID. See the documentation for [`new`](Self::new).
    pub fn with_pci_ids(
        downstream_bus_id: LogicalBusId,
        attachment_addr: Bdf,
        vendor_id: u16,
        device_id: u16,
    ) -> Self {
        BridgeDescription {
            downstream_bus_id,
            attachment_addr,
            vendor_id,
            device_id,
        }
    }
}

/// A builder used to construct a PCI topology incrementally.
pub struct Builder {
    /// Descriptions of the bridges requested so far.
    bridges: Vec<BridgeDescription>,
    /// Logical bus IDs already in use (bus 0, plus each bridge's downstream
    /// bus).
    downstream_buses: BTreeSet<LogicalBusId>,
    /// Addresses at which a bridge has already been requested.
    attachment_addrs: BTreeSet<Bdf>,
}

impl Builder {
    /// Creates a new topology builder with no bridges registered.
    ///
    /// Logical bus 0 is marked as claimed from the start, so a bridge cannot
    /// use it as its downstream bus.
    pub fn new() -> Self {
        Self {
            bridges: Vec::new(),
            downstream_buses: BTreeSet::from([LogicalBusId(0)]),
            attachment_addrs: BTreeSet::new(),
        }
    }

    /// Asks the builder to create a new PCI-PCI bridge.
    ///
    /// # Errors
    ///
    /// Fails if a bridge was already registered with the same logical bus or
    /// the same attachment address as the bridge being registered.
    pub fn add_bridge(
        &mut self,
        desc: BridgeDescription,
    ) -> Result<(), PciTopologyError> {
        if self.downstream_buses.contains(&desc.downstream_bus_id) {
            Err(PciTopologyError::LogicalBusAlreadyExists(
                desc.downstream_bus_id,
            ))
        } else if self.attachment_addrs.contains(&desc.attachment_addr) {
            Err(PciTopologyError::DeviceAlreadyAttached(desc.attachment_addr))
        } else {
            self.downstream_buses.insert(desc.downstream_bus_id);
            self.attachment_addrs.insert(desc.attachment_addr);
            self.bridges.push(desc);
            Ok(())
        }
    }

    /// Constructs a completed topology with the requested buses and bridges.
    ///
    /// Bus 0 is always created and routed to itself. Each registered bridge
    /// then gets a bus of its own, mapped from the bridge's downstream
    /// logical bus ID. Once every bus exists, the bridge devices are created
    /// and attached at their requested addresses, in registration order.
    ///
    /// # Arguments
    ///
    /// - `machine`: Supplies the port I/O and MMIO buses that every PCI bus
    ///   is created with, and the parents of each bus's memory and MSI
    ///   accessors.
    ///
    /// # Errors
    ///
    /// Fails if a bridge had an invalid attachment address (i.e. one whose
    /// logical bus number is invalid). The first attachment error is
    /// returned as-is.
    pub fn finish(
        self,
        machine: &Machine,
    ) -> Result<FinishedTopology, PciTopologyError> {
        let mut buses = Vec::new();
        let mut logical_buses = BTreeMap::new();
        let mut inner = Inner::new();

        let pio_bus = &machine.bus_pio;
        let mmio_bus = &machine.bus_mmio;

        // Bus 0 is always present and always routes to itself.
        buses.push(Bus::new(
            pio_bus,
            mmio_bus,
            machine.acc_mem.child(Some("PCI bus 0".to_string())),
            machine.acc_msi.child(Some("PCI bus 0".to_string())),
        ));
        logical_buses.insert(LogicalBusId(0), BusIndex(0));
        inner.routed_buses.insert(RoutedBusId(0), BusIndex(0));

        // Create one bus per bridge. A bus's index is its position in
        // `buses`, so it is captured before the push.
        for bridge in &self.bridges {
            let idx = buses.len();
            logical_buses.insert(bridge.downstream_bus_id, BusIndex(idx));
            // TODO: wire up accessors to mirror actual bus topology
            buses.push(Bus::new(
                pio_bus,
                mmio_bus,
                machine.acc_mem.child(Some(format!("PCI bus {idx}"))),
                machine.acc_msi.child(Some(format!("PCI bus {idx}"))),
            ));
        }

        let topology = Arc::new(Topology {
            buses,
            logical_buses,
            inner: Mutex::new(inner),
        });

        // All buses exist at this point, so a bridge may attach to the
        // downstream bus of any other bridge regardless of registration
        // order.
        let bridges = self
            .bridges
            .iter()
            .map(|bdesc| {
                let bridge = Bridge::new(
                    bdesc.vendor_id,
                    bdesc.device_id,
                    &topology,
                    bdesc.downstream_bus_id,
                );
                topology.pci_attach(
                    LogicalBusId(bdesc.attachment_addr.bus.get()),
                    bdesc.attachment_addr.location,
                    bridge.clone(),
                    None,
                )?;

                Ok((bdesc.attachment_addr, bridge))
            })
            .collect::<Result<Vec<(Bdf, Arc<Bridge>)>, _>>()?;

        Ok(FinishedTopology { topology, bridges })
    }
}

/// The result of [`Builder::finish`]: the constructed topology together with
/// the bridge devices that were created for it.
pub struct FinishedTopology {
    /// The completed topology, with every bridge already attached.
    pub topology: Arc<Topology>,
    /// Each created bridge paired with the address it was attached at, in the
    /// order the bridges were registered with the builder.
    pub bridges: Vec<(Bdf, Arc<Bridge>)>,
}

#[cfg(test)]
mod test {
    use crate::common::ReadOp;
    use crate::vmm::Machine;

    use super::*;

    /// A builder with no bridges still produces a topology.
    #[test]
    fn build_without_bridges() {
        let machine = Machine::new_test().unwrap();
        let result = Builder::new().finish(&machine);

        assert!(result.is_ok());
    }

    /// Two bridges with distinct buses and addresses build successfully.
    #[test]
    fn build_with_bridges() {
        let machine = Machine::new_test().unwrap();
        let mut builder = Builder::new();

        for (bus, dev) in [(1, 1), (4, 4)] {
            let desc = BridgeDescription::new(
                LogicalBusId(bus),
                Bdf::new(0, dev, 0).unwrap(),
            );
            assert!(builder.add_bridge(desc).is_ok());
        }

        assert!(builder.finish(&machine).is_ok());
    }

    /// Logical bus 0 cannot be claimed as a bridge's downstream bus.
    #[test]
    fn builder_bus_zero_reserved() {
        let mut builder = Builder::new();
        let desc = BridgeDescription::new(
            LogicalBusId(0),
            Bdf::new(0, 3, 0).unwrap(),
        );

        assert!(builder.add_bridge(desc).is_err());
    }

    /// Reusing either a downstream bus ID or an attachment address fails.
    #[test]
    fn builder_conflicts() {
        let mut builder = Builder::new();

        // (downstream bus, attachment device, whether registration succeeds)
        let cases = [(7, 7, true), (7, 4, false), (4, 7, false)];
        for (bus, dev, expect_ok) in cases {
            let desc = BridgeDescription::new(
                LogicalBusId(bus),
                Bdf::new(0, dev, 0).unwrap(),
            );
            assert_eq!(builder.add_bridge(desc).is_ok(), expect_ok);
        }
    }

    /// A config read reaches the bridge on routed bus 0, but finds nothing
    /// on routed bus 1.
    #[test]
    fn cfg_read() {
        let machine = Machine::new_test().unwrap();
        let mut builder = Builder::new();
        let desc = BridgeDescription::new(
            LogicalBusId(1),
            Bdf::new(0, 1, 0).unwrap(),
        );
        assert!(builder.add_bridge(desc).is_ok());

        let topology = builder.finish(&machine).unwrap().topology;
        let mut buf = [0u8; 1];
        let mut ro = ReadOp::from_buf(0, &mut buf);

        for (bus, expect_hit) in [(0, true), (1, false)] {
            let result = topology.pci_cfg_rw(
                RoutedBusId(bus),
                BusLocation::new(1, 0).unwrap(),
                RWOp::Read(&mut ro),
            );
            assert_eq!(result.is_some(), expect_hit);
        }
    }

    /// The finished topology reports each created bridge with its address.
    #[test]
    fn created_bridges() {
        let machine = Machine::new_test().unwrap();

        let mut builder = Builder::new();
        let desc = BridgeDescription::new(
            LogicalBusId(1),
            Bdf::new(0, 1, 0).unwrap(),
        );
        assert!(builder.add_bridge(desc).is_ok());

        let finished = builder.finish(&machine).unwrap();

        assert_eq!(finished.bridges.len(), 1);
        assert_eq!(finished.bridges[0].0, Bdf::new(0, 1, 0).unwrap());
    }
}


================================================
FILE: lib/propolis/src/hw/ps2/ctrl.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::VecDeque;
use std::convert::TryFrom;
use std::mem::replace;
use std::sync::{Arc, Mutex};

use crate::common::*;
use crate::hw::ibmpc;
use crate::intr_pins::IntrPin;
use crate::migrate::*;
use crate::pio::{PioBus, PioFn};

use rfb::proto::KeyEvent;

use super::keyboard::KeyEventRep;

/// PS/2 Controller (Intel 8042) Emulation
///
/// Here we emulate both the PS/2 controller and a virtual keyboard device.
///
/// CONTROLLER OVERVIEW
///
///     I/O PORTS
///
///     There are two I/O ports: a control port (0x64) and a data port (0x60).
///     These ports may be used by the OS to send commands to the controller
///     and PS/2 devices; the controller can respond to commands; and the
///     controller can send device data such as keyboard data to the OS.
///
///     Control Port:
///     - Reads will read from the Controller Status Register ([CtrlStatus]).
///     - Writes will be interpreted as commands to the controller. See
///     `PS2C_CMD_*` for example commands. Some commands require an additional
///     byte, which should be written to the data port.
///
///     Data Port:
///     - Reads may return responses to commands, or device data, depending on
///     the state of the controller.
///     - Writes are interpreted as command data for outstanding commands, or as
///     commands to devices. Commands to specific devices are issued by the OS
///     writing a `PS2C_CMD_WRITE_{PRI,AUX}_OUT` command to the Control port,
///     then writing the device-specific command on the Data port.
///
///     REGISTERS
///
///     There are 3 8-bit registers involved in communicating with the CPU:
///     - input buffer: can be written by the CPU by writing to either port
///     - output buffer: can be read by the CPU by reading the data port
///     - controller status register ([CtrlStatus])
///
///     The input buffer is not directly represented, as our hardware is virtual
///     as well. See [CtrlOutPort] for the output buffer representation.
///
///     CONFIGURATION
///
///     The OS may read and write from the Controller Configuration Byte, which
///     lives at byte 0 in the PS/2 internal RAM. See [CtrlCfg].
///
/// INTERRUPTS
///
/// Interrupts are edge-triggered, so we pulse the interrupt to notify the guest
/// of outgoing data.
///
/// INTERACTION WITH VNC
///
/// Instead of reacting to an input buffer, the controller is notified of input
/// keyboard data via the VNC server. VNC uses keysyms to represent keys;
/// internally, we must convert these to scan codes from a keyboard.
///
/// The flow looks like this. An end user interacting with a guest over VNC
/// presses a key. The local VNC client sends its associated keysym and whether
/// the key is pressed in a VNC key event message. The propolis VNC server for
/// the guest receives the message, and will pass the key event to the
/// controller, which translates the keysym into a scan code representation,
/// then places the scan code in the output buffer. The controller also notifies
/// the guest via a keyboard interrupt.

#[usdt::provider(provider = "propolis")]
mod probes {
    // Controller Configuration Byte updates written by the OS; `ctrl_cfg` is
    // the value after truncation to the known configuration bits.
    fn ps2ctrl_ctrlcfg_update(ctrl_cfg: u8) {}

    // reads/writes on control/data ports
    // (`ps2ctrl_data_read` fires for controller command responses;
    // `ps2ctrl_data_read_empty` fires when nothing was available to read)
    fn ps2ctrl_data_read(val: u8) {}
    fn ps2ctrl_data_read_empty() {}
    fn ps2ctrl_data_write(v: u8) {}
    fn ps2ctrl_cmd_write(v: u8) {}
    fn ps2ctrl_unknown_cmd(v: u8) {}

    // reads of Controller Status register
    fn ps2ctrl_status_read(status: u8) {}

    // interrupts: fire when the controller issues pri/aux interrupts
    fn ps2ctrl_pulse_pri() {}
    fn ps2ctrl_pulse_aux() {}

    // keyboard event probes
    // `ps2ctrl_keyevent` reports the raw keysym, the scan code set in use
    // (1 or 2) and up to the first four bytes of the generated scan code
    // (zero-filled when the scan code is shorter).
    fn ps2ctrl_keyevent(
        keysym_raw: u32,
        scan_code_set: u8,
        s0: u8,
        s1: u8,
        s2: u8,
        s3: u8,
    ) {
    }
    // `ps2ctrl_keyevent_dropped` fires for key events that could not be
    // converted to a key representation and were ignored.
    fn ps2ctrl_keyevent_dropped(
        keysym_raw: u32,
        is_pressed: u8,
        scan_code_set: u8,
    ) {
    }

    // internal device buffer writes
    fn ps2ctrl_keyboard_data(v: u8) {}
    fn ps2ctrl_mouse_data(v: u8) {}
    fn ps2ctrl_keyboard_overflow(v: u8) {}
    fn ps2ctrl_mouse_overflow(v: u8) {}

    // internal device buffer reads
    fn ps2ctrl_keyboard_data_read(v: u8) {}
    fn ps2ctrl_mouse_data_read(v: u8) {}

    // device commands
    fn ps2ctrl_keyboard_cmd(v: u8) {}
    fn ps2ctrl_mouse_cmd(v: u8) {}
    fn ps2ctrl_unknown_keyboard_cmd(v: u8) {}
    fn ps2ctrl_unknown_mouse_cmd(v: u8) {}
}

bitflags! {
    /// Controller Status Register
    ///
    /// An 8-bit register indicating the status of the controller, accessed by
    /// reading from the Control Port.
    #[derive(Default)]
    pub struct CtrlStatus: u8 {
        /// Output Buffer Status
        /// This bit must be set to 1 (indicating the buffer is full) before the
        /// OS attempts to read data from the data port.
        const OUT_FULL = 1 << 0;

        /// Input Buffer Status
        /// 0 if the input buffer is empty; 1 if the input buffer is full and
        /// shouldn't be written to by the OS.
        /// XXX(JPH): Not sure if this should be used somewhere.
        const IN_FULL = 1 << 1;

        /// System Flag
        /// This bit should be cleared to 0 by the controller on reset, and set
        /// to 1 if the system passes self tests.
        const SYS_FLAG = 1 << 2;

        /// Command/Data
        /// 1 if the last write to the input buffer was a command; 0 if the
        /// last write to the input buffer was data.
        const CMD_DATA = 1 << 3;

        /// Keyboard Lock (1 = unlocked, 0 = locked)
        /// The emulated controller always reports the keyboard as unlocked.
        const UNLOCKED = 1 << 4;

        /// Auxiliary Output Buffer contains data
        const AUX_FULL = 1 << 5;

        /// Timeout Error (0 = no error, 1 = timeout error)
        const TMO = 1 << 6;

        /// Parity Error with last byte (0 = no error, 1 = parity error)
        const PARITY = 1 << 7;
    }
}

bitflags! {
    /// Controller Configuration Byte
    /// The OS can read and write this byte to configure the controller.
    #[derive(Default)]
    pub struct CtrlCfg: u8 {
        /// Primary Port Interrupt (1 = enabled, 0 = disabled)
        const PRI_INTR_EN = 1 << 0;

        /// Auxiliary Port Interrupt (1 = enabled, 0 = disabled)
        const AUX_INTR_EN = 1 << 1;

        /// System Flag (1 = system tests passed)
        /// Mirrored into the System Flag bit of the status register.
        const SYS_FLAG = 1 << 2;

        // bit 3: must be 0

        /// Primary Port Clock (1 = disabled, 0 = enabled)
        /// Also set and cleared by the primary port disable/enable
        /// controller commands.
        const PRI_CLOCK_DIS = 1 << 4;

        /// Auxiliary Port Clock (1 = disabled, 0 = enabled)
        /// Also set and cleared by the auxiliary port disable/enable
        /// controller commands.
        const AUX_CLOCK_DIS = 1 << 5;

        /// Primary Port Translation (1 = enabled, 0 = disabled)
        /// If enabled, the controller should translate keyboard data to scan
        /// code set 1.
        const PRI_XLATE_EN = 1 << 6;

        // bit 7: must be 0
    }
}

bitflags! {
    /// Controller Output Port
    ///
    /// Read and written by the OS through the read/write Controller Output
    /// Port commands.
    #[derive(Default, Copy, Clone)]
    pub struct CtrlOutPort: u8 {
        // bit 0: system reset
        // should always be set to 1
        // XXX(JPH): why does this work
        // NOTE(review): bit 0 has no flag defined here, so it is dropped
        // whenever a written value is truncated to the known bits.

        /// A20 Gate
        const A20 = 1 << 1;

        /// Auxiliary Port Clock
        const AUX_CLOCK = 1 << 2;

        /// Auxiliary Port Data
        const AUX_DATA = 1 << 3;

        /// Primary Port Output Buffer Full
        const PRI_FULL = 1 << 4;

        /// Auxiliary Port Output Buffer Full
        const AUX_FULL = 1 << 5;

        /// Primary Port Clock
        const PRI_CLOCK = 1 << 6;

        /// Primary Port Data
        const PRI_DATA = 1 << 7;

        // PRI_FULL and AUX_FULL are dynamic: they are stripped from values
        // the OS writes and recomputed from the port buffers on every read.
        const DYN_FLAGS = (1 << 4) | (1 << 5);
    }
}

// Controller Commands
//
// These are written by the OS to the control port. Commands that take an
// argument are completed by a following write to the data port.

// Read/write Controller Configuration Byte (byte 0 of controller internal RAM)
const PS2C_CMD_READ_CTRL_CFG: u8 = 0x20;
const PS2C_CMD_WRITE_CTRL_CFG: u8 = 0x60;

// Read a byte from controller internal RAM. The command byte, in the range
// 0x21-0x3f, selects the byte at offset (command - 0x21).
const PS2C_CMD_READ_RAM_START: u8 = 0x21;
const PS2C_CMD_READ_RAM_END: u8 = 0x3f;

// Write a byte to controller internal RAM. The command byte is in the range
// 0x61-0x7f, and the value to store follows on the data port.
const PS2C_CMD_WRITE_RAM_START: u8 = 0x61;
const PS2C_CMD_WRITE_RAM_END: u8 = 0x7f;

// Disable/enable auxiliary port
const PS2C_CMD_AUX_PORT_DIS: u8 = 0xa7;
const PS2C_CMD_AUX_PORT_ENA: u8 = 0xa8;

// Test auxiliary port (responds with PS2C_R_PORT_TEST_PASS)
const PS2C_CMD_AUX_PORT_TEST: u8 = 0xa9;

// Test controller (and response, if test passes)
const PS2C_CMD_CTRL_TEST: u8 = 0xaa;
const PS2C_R_CTRL_TEST_PASS: u8 = 0x55;

// Test primary port (and response, if test passes)
const PS2C_CMD_PRI_PORT_TEST: u8 = 0xab;
const PS2C_R_PORT_TEST_PASS: u8 = 0x00;

// Disable/enable primary port
const PS2C_CMD_PRI_PORT_DIS: u8 = 0xad;
const PS2C_CMD_PRI_PORT_ENA: u8 = 0xae;

// Read/write next byte to the Controller Output Port
const PS2C_CMD_READ_CTLR_OUT: u8 = 0xd0;
const PS2C_CMD_WRITE_CTLR_OUT: u8 = 0xd1;

// Write next byte to the primary port output
const PS2C_CMD_WRITE_PRI_OUT: u8 = 0xd2;

// Write next byte to the auxiliary port output
const PS2C_CMD_WRITE_AUX_OUT: u8 = 0xd3;

// Write next byte to the auxiliary port input
const PS2C_CMD_WRITE_AUX_IN: u8 = 0xd4;

// Pulse output line low. Only command 0xfe has an effect in this emulation:
// it pulses the reset pin.
const PS2C_CMD_PULSE_START: u8 = 0xf0;
const PS2C_CMD_PULSE_END: u8 = 0xff;

// Size of the emulated controller internal RAM.
// NOTE(review): the RAM command ranges above are inclusive and so select
// END - START + 1 distinct offsets, one more than this length; the highest
// offset does not fit in the array. Confirm whether the length can be
// changed without affecting the migration payload.
const PS2C_RAM_LEN: usize =
    (PS2C_CMD_WRITE_RAM_END - PS2C_CMD_WRITE_RAM_START) as usize;

/// Mutable state of the emulated controller and the devices on its two ports.
#[derive(Default)]
struct PS2State {
    /// Pending response to a controller command. It is returned by the next
    /// data port read, ahead of any device output.
    resp: Option<u8>,
    /// Controller command that is waiting for its argument byte on the data
    /// port.
    cmd_prefix: Option<u8>,
    /// Controller Configuration Byte.
    ctrl_cfg: CtrlCfg,
    /// Controller Output Port, without its dynamically computed bits.
    ctrl_out_port: CtrlOutPort,
    /// Controller internal RAM accessed by the read/write RAM commands.
    ram: [u8; PS2C_RAM_LEN],

    /// Keyboard on the primary port.
    pri_port: PS2Kbd,
    /// Mouse on the auxiliary port.
    aux_port: PS2Mouse,

    /// Interrupt pin pulsed when the primary port has output for the guest.
    /// `None` until the controller is attached.
    pri_pin: Option<Box<dyn IntrPin>>,
    /// Interrupt pin pulsed when the auxiliary port has output for the guest.
    /// `None` until the controller is attached.
    aux_pin: Option<Box<dyn IntrPin>>,
    /// Pin pulsed when the guest issues the reset pulse command. `None` until
    /// the controller is attached.
    reset_pin: Option<Arc<dyn IntrPin>>,
}

/// Emulated PS/2 controller, together with the keyboard and mouse on its
/// ports.
pub struct PS2Ctrl {
    /// All controller and device state, behind a single lock.
    state: Mutex<PS2State>,
}

impl PS2Ctrl {
    /// Creates a controller in its default state. No interrupt pins are set
    /// until [`attach`](Self::attach) is called.
    pub fn create() -> Arc<Self> {
        let state = Mutex::new(PS2State::default());
        Arc::new(Self { state })
    }
    /// Registers the controller's data and command/status ports on `bus` and
    /// stores the interrupt and reset pins it will drive.
    ///
    /// # Panics
    ///
    /// Panics if either port cannot be registered on `bus`.
    pub fn attach(
        self: &Arc<Self>,
        bus: &PioBus,
        pri_pin: Box<dyn IntrPin>,
        aux_pin: Box<dyn IntrPin>,
        reset_pin: Arc<dyn IntrPin>,
    ) {
        // Both ports are served by a single handler that dispatches on the
        // port number.
        let ctrl = Arc::clone(self);
        let handler =
            Arc::new(move |port: u16, rwo: RWOp| ctrl.pio_rw(port, rwo))
                as Arc<PioFn>;

        for port in [ibmpc::PORT_PS2_DATA, ibmpc::PORT_PS2_CMD_STATUS] {
            bus.register(port, 1, Arc::clone(&handler)).unwrap();
        }

        let mut guard = self.state.lock().unwrap();
        guard.pri_pin = Some(pri_pin);
        guard.aux_pin = Some(aux_pin);
        guard.reset_pin = Some(reset_pin);
    }

    /// Delivers a key event from the remote console to the emulated keyboard.
    ///
    /// The event is converted to a scan code and queued on the primary port,
    /// after which the interrupt state is re-evaluated. Events that cannot be
    /// converted to a key representation are dropped.
    pub fn key_event(&self, ke: KeyEvent) {
        let mut state = self.state.lock().unwrap();
        let translate = state.ctrl_cfg.contains(CtrlCfg::PRI_XLATE_EN);
        // Scan code set number, as reported to the probes.
        let set_num: u8 = if translate { 1 } else { 2 };

        let key_rep = match KeyEventRep::try_from(ke) {
            Ok(rep) => rep,
            Err(_) => {
                // ignore any unrecognized keys
                probes::ps2ctrl_keyevent_dropped!(|| {
                    (ke.keysym_raw, u8::from(ke.is_pressed), set_num)
                });
                return;
            }
        };

        // If the translation bit is set, the guest expects Scan Code Set 1;
        // otherwise, the general default is set 2.
        let code_set = if translate {
            PS2ScanCodeSet::Set1
        } else {
            PS2ScanCodeSet::Set2
        };
        let scan_code = key_rep.to_scan_code(code_set);

        // In the event that we need to debug why a specific key isn't
        // behaving as expected, it might be nice to have access to the
        // underlying keysym value that was sent and what scan code was
        // produced. Bytes past the end of the scan code are reported as 0.
        probes::ps2ctrl_keyevent!(|| {
            let byte_at = |idx: usize| -> u8 {
                if idx < scan_code.len() {
                    scan_code[idx]
                } else {
                    0
                }
            };

            (
                key_rep.keysym_raw,
                set_num,
                byte_at(0),
                byte_at(1),
                byte_at(2),
                byte_at(3),
            )
        });

        state.pri_port.recv_scancode(scan_code);
        self.update_intr(&mut state);
    }

    /// Port I/O handler for the data and command/status ports.
    ///
    /// # Panics
    ///
    /// Panics if the access is not exactly one byte wide, or if `port` is
    /// neither of the two PS/2 ports.
    fn pio_rw(&self, port: u16, rwo: RWOp) {
        assert_eq!(rwo.len(), 1);
        match (port, rwo) {
            (ibmpc::PORT_PS2_DATA, RWOp::Read(ro)) => {
                ro.write_u8(self.data_read())
            }
            (ibmpc::PORT_PS2_DATA, RWOp::Write(wo)) => {
                self.data_write(wo.read_u8())
            }
            (ibmpc::PORT_PS2_CMD_STATUS, RWOp::Read(ro)) => {
                ro.write_u8(self.status_read())
            }
            (ibmpc::PORT_PS2_CMD_STATUS, RWOp::Write(wo)) => {
                self.cmd_write(wo.read_u8())
            }
            _ => {
                panic!("unexpected pio in {:x}", port);
            }
        }
    }

    /// Handles a byte written by the guest to the data port.
    ///
    /// If a controller command is waiting for its argument, `v` completes
    /// that command. Otherwise `v` is treated as a command for the keyboard
    /// on the primary port. Interrupt state is re-evaluated afterwards.
    ///
    /// # Panics
    ///
    /// Panics if the pending command prefix is not one of the commands that
    /// take an argument byte.
    fn data_write(&self, v: u8) {
        let mut state = self.state.lock().unwrap();
        let cmd_prefix = replace(&mut state.cmd_prefix, None);

        probes::ps2ctrl_data_write!(|| v);

        // If there's an outstanding command (written to the Control Port),
        // then the remaining part of the command will be on the data port.
        //
        // Otherwise, assume the value is a command for the keyboard.
        if let Some(prefix) = cmd_prefix {
            match prefix {
                PS2C_CMD_WRITE_CTRL_CFG => {
                    let cfg = CtrlCfg::from_bits_truncate(v);
                    probes::ps2ctrl_ctrlcfg_update!(|| cfg.bits());
                    state.ctrl_cfg = cfg;
                }
                PS2C_CMD_WRITE_RAM_START..=PS2C_CMD_WRITE_RAM_END => {
                    // The command byte selects the RAM offset; `v` is the
                    // value to store. Deriving the offset from `v` would let
                    // the guest underflow the subtraction or index past the
                    // end of the RAM.
                    let off = usize::from(prefix - PS2C_CMD_WRITE_RAM_START);
                    // The command range covers one more offset than the RAM
                    // holds, so a write to the last offset is discarded
                    // rather than allowed to panic.
                    if let Some(slot) = state.ram.get_mut(off) {
                        *slot = v;
                    }
                }
                PS2C_CMD_WRITE_CTLR_OUT => {
                    // The buffer-full bits are computed on read, so they are
                    // never stored.
                    state.ctrl_out_port = CtrlOutPort::from_bits_truncate(v);
                    state.ctrl_out_port.remove(CtrlOutPort::DYN_FLAGS);
                }
                PS2C_CMD_WRITE_PRI_OUT => {
                    state.pri_port.loopback(v);
                }
                PS2C_CMD_WRITE_AUX_OUT => {
                    state.aux_port.loopback(v);
                }
                PS2C_CMD_WRITE_AUX_IN => {
                    state.aux_port.cmd_input(v);
                }
                _ => {
                    panic!("unexpected chain cmd {:x}", prefix);
                }
            }
        } else {
            state.pri_port.cmd_input(v);
        }
        self.update_intr(&mut state);
    }
    /// Handles a guest read of the data port.
    ///
    /// A pending controller response takes priority, then keyboard output,
    /// then mouse output. Returns 0 when nothing is available. Interrupt
    /// state is re-evaluated after device output is consumed.
    fn data_read(&self) -> u8 {
        let mut state = self.state.lock().unwrap();

        if let Some(rval) = state.resp.take() {
            probes::ps2ctrl_data_read!(|| rval);
            return rval;
        }

        if state.pri_port.has_output() {
            let rval = state.pri_port.read_output().unwrap();
            probes::ps2ctrl_keyboard_data_read!(|| rval);
            self.update_intr(&mut state);
            return rval;
        }

        if state.aux_port.has_output() {
            let rval = state.aux_port.read_output().unwrap();
            probes::ps2ctrl_mouse_data_read!(|| rval);
            self.update_intr(&mut state);
            return rval;
        }

        probes::ps2ctrl_data_read_empty!(|| {});
        0
    }
    /// Handles a controller command written by the guest to the control port.
    ///
    /// Commands that produce a value store it as the pending response, which
    /// the guest collects from the data port. Commands that take an argument
    /// are recorded as the pending prefix and completed by the next data port
    /// write. Unrecognized commands are ignored.
    ///
    /// # Panics
    ///
    /// Panics if the guest issues the reset pulse command before a reset pin
    /// has been attached.
    fn cmd_write(&self, v: u8) {
        let mut state = self.state.lock().unwrap();
        probes::ps2ctrl_cmd_write!(|| v);
        match v {
            PS2C_CMD_READ_CTRL_CFG => {
                state.resp = Some(state.ctrl_cfg.bits());
            }
            PS2C_CMD_READ_RAM_START..=PS2C_CMD_READ_RAM_END => {
                let off = usize::from(v - PS2C_CMD_READ_RAM_START);
                // The command range covers one more offset than the RAM
                // holds. Reading the last offset yields 0 rather than
                // indexing out of bounds, which would let the guest trigger
                // a panic.
                state.resp = Some(state.ram.get(off).copied().unwrap_or(0));
            }
            PS2C_CMD_CTRL_TEST => {
                state.resp = Some(PS2C_R_CTRL_TEST_PASS);
            }

            PS2C_CMD_PRI_PORT_TEST => {
                state.resp = Some(PS2C_R_PORT_TEST_PASS);
            }
            PS2C_CMD_AUX_PORT_TEST => {
                state.resp = Some(PS2C_R_PORT_TEST_PASS);
            }
            PS2C_CMD_PRI_PORT_ENA | PS2C_CMD_PRI_PORT_DIS => {
                // Enabling or disabling a port is reflected in its
                // clock-disable configuration bit.
                state
                    .ctrl_cfg
                    .set(CtrlCfg::PRI_CLOCK_DIS, v == PS2C_CMD_PRI_PORT_DIS);
            }
            PS2C_CMD_AUX_PORT_ENA | PS2C_CMD_AUX_PORT_DIS => {
                state
                    .ctrl_cfg
                    .set(CtrlCfg::AUX_CLOCK_DIS, v == PS2C_CMD_AUX_PORT_DIS);
            }

            PS2C_CMD_READ_CTLR_OUT => {
                // The buffer-full bits are not stored; derive them from the
                // current state of the port buffers.
                let mut val = state.ctrl_out_port;
                val.set(CtrlOutPort::PRI_FULL, state.pri_port.has_output());
                val.set(CtrlOutPort::AUX_FULL, state.aux_port.has_output());
                state.resp = Some(val.bits());
            }

            // commands with a following byte to complete
            PS2C_CMD_WRITE_CTRL_CFG
            | PS2C_CMD_WRITE_CTLR_OUT
            | PS2C_CMD_WRITE_PRI_OUT
            | PS2C_CMD_WRITE_AUX_OUT
            | PS2C_CMD_WRITE_AUX_IN
            | PS2C_CMD_WRITE_RAM_START..=PS2C_CMD_WRITE_RAM_END => {
                state.cmd_prefix = Some(v)
            }

            PS2C_CMD_PULSE_START..=PS2C_CMD_PULSE_END => {
                // Only the pulse value that maps to the reset line (command
                // 0xfe) has an effect; all other pulses are ignored.
                let to_pulse = v - PS2C_CMD_PULSE_START;
                if to_pulse == 0xe {
                    state.reset_pin.as_ref().unwrap().pulse();
                }
            }

            _ => {
                // ignore all other unrecognized commands
                probes::ps2ctrl_unknown_cmd!(|| v);
            }
        }
    }
    /// Computes the Controller Status Register value for a guest read of the
    /// control port.
    fn status_read(&self) -> u8 {
        let state = self.state.lock().unwrap();

        let aux_pending = state.aux_port.has_output();
        let any_pending = state.resp.is_some()
            || state.pri_port.has_output()
            || aux_pending;

        // Always report unlocked
        let mut status = CtrlStatus::UNLOCKED;
        status.set(CtrlStatus::OUT_FULL, any_pending);
        status.set(CtrlStatus::AUX_FULL, aux_pending);
        status.set(CtrlStatus::CMD_DATA, state.cmd_prefix.is_some());
        status.set(
            CtrlStatus::SYS_FLAG,
            state.ctrl_cfg.contains(CtrlCfg::SYS_FLAG),
        );

        let bits = status.bits();
        probes::ps2ctrl_status_read!(|| bits);
        bits
    }
    /// Pulses the primary and/or auxiliary interrupt pin if the matching port
    /// has output pending and its interrupt is enabled.
    ///
    /// # Panics
    ///
    /// Panics if the interrupt pins have not been attached yet.
    fn update_intr(&self, state: &mut PS2State) {
        let cfg = &state.ctrl_cfg;

        // We currently choose to mimic qemu, which gates the keyboard interrupt
        // with the keyboard-clock-disable in addition to the interrupt enable.
        let pri_pin = state.pri_pin.as_ref().unwrap();
        let pri_unmasked = cfg.contains(CtrlCfg::PRI_INTR_EN)
            && !cfg.contains(CtrlCfg::PRI_CLOCK_DIS);
        if pri_unmasked && state.pri_port.has_output() {
            probes::ps2ctrl_pulse_pri!(|| {});
            pri_pin.pulse();
        }

        let aux_pin = state.aux_pin.as_ref().unwrap();
        let aux_unmasked = cfg.contains(CtrlCfg::AUX_INTR_EN);
        if aux_unmasked && state.aux_port.has_output() {
            probes::ps2ctrl_pulse_aux!(|| {});
            aux_pin.pulse();
        }
    }
    /// Resets both ports and returns the controller registers and internal
    /// RAM to their default (zeroed) values, then re-evaluates interrupts.
    fn reset(&self) {
        let mut guard = self.state.lock().unwrap();

        guard.pri_port.reset();
        guard.aux_port.reset();

        guard.resp = None;
        guard.cmd_prefix = None;
        guard.ctrl_cfg = CtrlCfg::default();
        guard.ctrl_out_port = CtrlOutPort::default();
        guard.ram.fill(0);

        self.update_intr(&mut guard);
    }
}
// Lifecycle hooks for the PS/2 controller.
impl Lifecycle for PS2Ctrl {
    /// Name identifying this device type.
    fn type_name(&self) -> &'static str {
        "lpc-ps2ctrl"
    }
    /// Resets the controller and both ports by delegating to the inherent
    /// `PS2Ctrl::reset`. The call is spelled with the type path so it is
    /// visibly not a recursive call to this trait method.
    fn reset(&self) {
        PS2Ctrl::reset(self);
    }
    /// The controller, keyboard and mouse state are migrated together as a
    /// single payload.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Single(self)
    }
}
impl MigrateSingle for PS2Ctrl {
    /// Captures the controller, keyboard and mouse state as a version 1
    /// migration payload.
    fn export(
        &self,
        _ctx: &MigrateCtx,
    ) -> Result<PayloadOutput, MigrateStateError> {
        let state = self.state.lock().unwrap();

        let ctrl = migrate::PS2CtrlStateV1 {
            response: state.resp,
            cmd_prefix: state.cmd_prefix,
            ctrl_cfg: state.ctrl_cfg.bits(),
            ctrl_out_port: state.ctrl_out_port.bits(),
            ram: state.ram,
        };

        let pri = &state.pri_port;
        let kbd = migrate::PS2KbdV1 {
            buf: pri.buf.clone().into(),
            current_cmd: pri.cur_cmd,
            enabled: pri.enabled,
            led_status: pri.led_status,
            typematic: pri.typematic,
            scan_code_set: pri.scan_code_set.as_byte(),
        };

        let aux = &state.aux_port;
        let mouse = migrate::PS2MouseV1 {
            buf: aux.buf.clone().into(),
            current_cmd: aux.cur_cmd,
            status: aux.status.bits(),
            resolution: aux.resolution,
            sample_rate: aux.sample_rate,
        };

        Ok(migrate::PS2CtrlV1 { ctrl, kbd, mouse }.into())
    }

    /// Restores the controller, keyboard and mouse state from a version 1
    /// migration payload.
    ///
    /// Every field that can be rejected is validated before any state is
    /// modified, so a failed import leaves the controller untouched.
    ///
    /// # Errors
    ///
    /// Fails if the payload cannot be parsed, or if it holds a configuration
    /// byte, output port value, keyboard scan code set or mouse status that
    /// is not a valid value for its type.
    fn import(
        &self,
        mut offer: PayloadOffer,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let migrate::PS2CtrlV1 {
            ctrl: saved_ctrl,
            kbd: saved_kbd,
            mouse: saved_mouse,
        } = offer.parse()?;

        // Validate first, commit afterwards.
        let ctrl_cfg =
            CtrlCfg::from_bits(saved_ctrl.ctrl_cfg).ok_or_else(|| {
                MigrateStateError::ImportFailed(format!(
                    "PS2 ctrl_cfg: failed to import value {:#x}",
                    saved_ctrl.ctrl_cfg
                ))
            })?;
        let ctrl_out_port = CtrlOutPort::from_bits(saved_ctrl.ctrl_out_port)
            .ok_or_else(|| {
                MigrateStateError::ImportFailed(format!(
                    "PS2 ctrl_out_port: failed to import value {:#x}",
                    saved_ctrl.ctrl_out_port
                ))
            })?;
        let scan_code_set = PS2ScanCodeSet::from_byte(saved_kbd.scan_code_set)
            .ok_or_else(|| {
                MigrateStateError::ImportFailed(format!(
                    "PS2 kbd scan code: failed to import value {}",
                    saved_kbd.scan_code_set
                ))
            })?;
        let mouse_status =
            PS2MStatus::from_bits(saved_mouse.status).ok_or_else(|| {
                MigrateStateError::ImportFailed(format!(
                    "PS2 mouse status: failed to import value {:#x}",
                    saved_mouse.status
                ))
            })?;

        let mut inner = self.state.lock().unwrap();

        inner.resp = saved_ctrl.response;
        inner.cmd_prefix = saved_ctrl.cmd_prefix;
        inner.ctrl_cfg = ctrl_cfg;
        inner.ctrl_out_port = ctrl_out_port;
        inner.ram = saved_ctrl.ram;

        let kbd = &mut inner.pri_port;
        kbd.cur_cmd = saved_kbd.current_cmd;
        kbd.enabled = saved_kbd.enabled;
        kbd.led_status = saved_kbd.led_status;
        kbd.typematic = saved_kbd.typematic;
        kbd.scan_code_set = scan_code_set;
        kbd.buf = VecDeque::from(saved_kbd.buf);

        let mouse = &mut inner.aux_port;
        mouse.cur_cmd = saved_mouse.current_cmd;
        mouse.status = mouse_status;
        mouse.resolution = saved_mouse.resolution;
        mouse.sample_rate = saved_mouse.sample_rate;
        mouse.buf = VecDeque::from(saved_mouse.buf);

        Ok(())
    }
}

// Keyboard-specific commands

// Set LEDs: bit 0 is ScrollLock, bit 1 is NumLock, bit 2 is CapsLock
// (two-byte command: the LED bits follow in the next byte)
const PS2K_CMD_SET_LEDS: u8 = 0xed;

// Get/set scan code set (two-byte command: the next byte is 0 to query the
// current set, or 1 or 2 to select that set)
const PS2K_CMD_SCAN_CODE: u8 = 0xf0;
// Set typematic byte (two-byte command: the value follows in the next byte)
const PS2K_CMD_TYPEMATIC: u8 = 0xf3;

// Single-byte commands
const PS2K_CMD_ECHO: u8 = 0xee;
const PS2K_CMD_IDENT: u8 = 0xf2;
const PS2K_CMD_SCAN_EN: u8 = 0xf4;
const PS2K_CMD_SCAN_DIS: u8 = 0xf5;
const PS2K_CMD_SET_DEFAULT: u8 = 0xf6;
const PS2K_CMD_RESEND: u8 = 0xfe;
const PS2K_CMD_RESET: u8 = 0xff;

// Keyboard responses
const PS2K_R_ACK: u8 = 0xfa;
const PS2K_R_ECHO: u8 = 0xee;
const PS2K_R_SELF_TEST_PASS: u8 = 0xaa;

// Bits of the typematic byte that are retained when it is set
const PS2K_TYPEMATIC_MASK: u8 = 0x7f;

// Capacity the keyboard output buffer is created with
const PS2_KBD_BUFSZ: usize = 16;

/// Keyboard scan code sets understood by the emulated keyboard.
pub(crate) enum PS2ScanCodeSet {
    Set1,
    Set2,
    // ignoring fancy set3
}

impl PS2ScanCodeSet {
    /// Returns the number of this scan code set (1 or 2).
    fn as_byte(&self) -> u8 {
        match *self {
            Self::Set1 => 0x1,
            Self::Set2 => 0x2,
        }
    }

    /// Returns the scan code set numbered `b`, or `None` if `b` is neither 1
    /// nor 2.
    fn from_byte(b: u8) -> Option<Self> {
        let set = match b {
            1 => Self::Set1,
            2 => Self::Set2,
            _ => return None,
        };
        Some(set)
    }
}

/// Emulated PS/2 keyboard attached to the controller's primary port.
// TODO: wire up remote console to enabled/led_status/typematic
#[allow(unused)]
struct PS2Kbd {
    /// Bytes queued for the guest to read.
    buf: VecDeque<u8>,
    /// First byte of a multi-part keyboard command whose second byte has not
    /// arrived yet.
    cur_cmd: Option<u8>,
    /// Whether the keyboard is enabled. NOTE(review): presumably toggled by
    /// the scan enable/disable commands; confirm in `cmd_input`.
    enabled: bool,
    /// LED state set by the guest (low three bits: scroll/num/caps lock).
    led_status: u8,
    /// Typematic byte set by the guest, masked with `PS2K_TYPEMATIC_MASK`.
    typematic: u8,
    /// Scan code set most recently selected through the keyboard's scan code
    /// command.
    scan_code_set: PS2ScanCodeSet,
}
impl PS2Kbd {
    fn new() -> Self {
        Self {
            buf: VecDeque::with_capacity(PS2_KBD_BUFSZ),
            cur_cmd: None,
            enabled: true,
            led_status: 0,
            typematic: 0,
            scan_code_set: PS2ScanCodeSet::Set1,
        }
    }
    fn cmd_input(&mut self, v: u8) {
        probes::ps2ctrl_keyboard_cmd!(|| v);
        if let Some(cmd) = self.cur_cmd {
            self.cur_cmd = None;
            match cmd {
                PS2K_CMD_SET_LEDS => {
                    // low three bits set scroll/num/caps lock
                    self.led_status = v & 0b111;
                    self.resp(PS2K_R_ACK);
                }
                PS2K_CMD_SCAN_CODE => {
                    match v {
                        0 => {
                            // get scan code set
                            self.resp(PS2K_R_ACK);
                            self.resp(self.scan_code_set.as_byte());
                        }
                        1 => {
                            self.resp(PS2K_R_ACK);
                            self.scan_code_set = PS2ScanCodeSet::Set1;
                        }
                        2 => {
                            self.resp(PS2K_R_ACK);
                            self.scan_code_set = PS2ScanCodeSet::Set2;
                        }
                        _ => {}
                    }
                }
                PS2K_CMD_TYPEMATIC => {
                    self.typematic = v & PS2K_TYPEMATIC_MASK;
                    self.resp(PS2K_R_ACK);
                }
                _ => {
                    panic!("bad multi-part ps2 cmd {}", cmd);
                }
            }
        } else {
            match v {
                PS2K_CMD_SET_LEDS | PS2K_CMD_SCAN_CODE | PS2K_CMD_TYPEMATIC => {
                    // multi-part command, wait for next byte
                    self.cur_cmd = Some(v);
                    self.resp(PS2K_R_ACK);
                }
                PS2K_CMD_ECHO => {
                    self.resp(PS2K_R_ECHO);
                }
                PS2K_CMD_IDENT => {
                    self.resp(PS2K_R_ACK);
                    // MF2 keyboard
                    self.resp(0xab);
                    self.resp(0x83);
                }
                PS2K_CMD_SCAN_EN => {
                    self.enabled = true;
                    self.resp(PS2K_R_ACK);
                }
                PS2K_CMD_SCAN_DIS => {
                    self.enabled = false;
                    self.resp(PS2K_R_ACK);
                }
                PS2K_CMD_SET_DEFAULT => {
                    // XXX which things to reset?
                    self.resp(PS2K_R_ACK);
                }
                PS2K_CMD_RESEND => {
                    // XXX we do not track last byte for now
                    self.resp(PS2K_R_ACK);
                }
                PS2K_CMD_RESET => {
                    self.reset();
                    // Even for reset, ack is expected
                    self.resp(PS2K_R_ACK);
                    self.resp(PS2K_R_SELF_TEST_PASS);
                }
                _ => {
                    // ignore unrecognized cmds
                    probes::ps2ctrl_unknown_keyboard_cmd!(|| v);
                }
            }
        }
    }
    fn resp(&mut self, v: u8) {
        let remain = PS2_KBD_BUFSZ - self.buf.len();
        match remain {
            0 => {
                // overrun already in progress, do nothing
                probes::ps2ctrl_keyboard_overflow!(|| v);
            }
            1 => {
                // indicate overflow instead
                probes::ps2ctrl_keyboard_overflow!(|| v);
                self.buf.push_back(0xff)
            }
            _ => {
                probes::ps2ctrl_keyboard_data!(|| v);
                self.buf.push_back(v);
            }
        }
    }
    fn reset(&mut self) {
        // XXX  what should the defaults be?
        self.cur_cmd = None;
        self.enabled = true;
        self.led_status = 0;
        self.typematic = 0;
        self.scan_code_set = PS2ScanCodeSet::Set1;
        self.buf.clear();
    }
    fn has_output(&self) -> bool {
        !self.buf.is_empty()
    }
    fn read_output(&mut self) -> Option<u8> {
        self.buf.pop_front()
    }
    fn loopback(&mut self, v: u8) {
        self.resp(v);
    }

    fn recv_scancode(&mut self, scan_code: Vec<u8>) {
        for s in scan_code.into_iter() {
            self.resp(s);
        }
    }
}
impl Default for PS2Kbd {
    fn default() -> Self {
        Self::new()
    }
}

// Mouse-specific commands
//
// PS2M_CMD_SET_SAMP_RATE and PS2M_CMD_RESOLUTION_SET are two-byte commands:
// the byte following them is stored as the new sample rate / resolution.
const PS2M_CMD_RESET: u8 = 0xff;
const PS2M_CMD_RESEND: u8 = 0xfe;
const PS2M_CMD_SET_DEFAULTS: u8 = 0xf6;
// Disable / enable data reporting (PS2MStatus::ENABLE)
const PS2M_CMD_DATA_REP_DIS: u8 = 0xf5;
const PS2M_CMD_DATA_REP_ENA: u8 = 0xf4;
const PS2M_CMD_SET_SAMP_RATE: u8 = 0xf3;
const PS2M_CMD_GET_DEVID: u8 = 0xf2;
// Select remote mode (PS2MStatus::REMOTE)
const PS2M_CMD_REMOTE_MODE_SET: u8 = 0xf0;
const PS2M_CMD_WRAP_MODE_SET: u8 = 0xee;
const PS2M_CMD_WRAP_MODE_RESET: u8 = 0xec;
// Request a single movement packet
const PS2M_CMD_READ_DATA: u8 = 0xeb;
const PS2M_CMD_STREAM_MODE_SET: u8 = 0xea;
// Request the three-byte status report: status, resolution, sample rate
const PS2M_CMD_STATUS_REQ: u8 = 0xe9;
const PS2M_CMD_RESOLUTION_SET: u8 = 0xe8;
// Select 1:1 or 2:1 scaling (PS2MStatus::SCALE2)
const PS2M_CMD_SCALING1_SET: u8 = 0xe7;
const PS2M_CMD_SCALING2_SET: u8 = 0xe6;

// Response bytes emitted by the mouse
const PS2M_R_ACK: u8 = 0xfa;
const PS2M_R_SELF_TEST_PASS: u8 = 0xaa;
// basic mouse device ID
const PS2M_R_DEVID: u8 = 0x00;

bitflags! {
    // Mouse status byte.  Its raw bits are the first byte of the reply to
    // PS2M_CMD_STATUS_REQ and are what gets restored (via `from_bits`) when
    // importing migrated state.
    #[derive(Default)]
    pub struct PS2MStatus: u8 {
        // Button state flags (left / right / middle, going by the names)
        const B_LEFT = 1 << 0;
        const B_RIGHT = 1 << 1;
        const B_MID = 1 << 2;

        // Bit 3 is not defined here.

        // Set by PS2M_CMD_SCALING2_SET, cleared by PS2M_CMD_SCALING1_SET
        const SCALE2 = 1 << 4;
        // Set by PS2M_CMD_DATA_REP_ENA, cleared by PS2M_CMD_DATA_REP_DIS
        const ENABLE = 1 << 5;
        // Set by PS2M_CMD_REMOTE_MODE_SET
        const REMOTE = 1 << 6;
    }
}

struct PS2Mouse {
    buf: VecDeque<u8>,
    cur_cmd: Option<u8>,
    status: PS2MStatus,
    resolution: u8,
    sample_rate: u8,
}
impl PS2Mouse {
    fn new() -> Self {
        Self {
            buf: VecDeque::with_capacity(PS2_KBD_BUFSZ),
            cur_cmd: None,
            status: PS2MStatus::empty(),
            resolution: 0,
            sample_rate: 10,
        }
    }
    fn cmd_input(&mut self, v: u8) {
        probes::ps2ctrl_mouse_cmd!(|| v);
        if let Some(cmd) = self.cur_cmd {
            self.cur_cmd = None;
            match cmd {
                PS2M_CMD_SET_SAMP_RATE => {
                    // XXX: check for valid values?
                    self.sample_rate = v;
                }
                PS2M_CMD_RESOLUTION_SET => {
                    // XXX: check for valid values?
                    self.resolution = v;
                }
                _ => {
                    panic!("bad multi-part ps2 cmd {}", cmd);
                }
            }
        } else {
            match v {
                PS2M_CMD_RESET => {
                    self.reset();
                    self.resp(PS2M_R_ACK);
                    self.resp(PS2M_R_SELF_TEST_PASS);
                    self.resp(PS2M_R_DEVID);
                }
                PS2M_CMD_RESEND => {
                    // XXX: no last byte tracking for now
                    self.resp(PS2M_R_ACK);
                }
                PS2M_CMD_SET_DEFAULTS => {
                    // XXX: set which defaults?
                    self.resp(PS2M_R_ACK);
                }
                PS2M_CMD_DATA_REP_DIS => {
                    self.resp(PS2M_R_ACK);
                    self.status.remove(PS2MStatus::ENABLE);
                }
                PS2M_CMD_DATA_REP_ENA => {
                    self.resp(PS2M_R_ACK);
                    self.status.insert(PS2MStatus::ENABLE);
                }
                PS2M_CMD_GET_DEVID => {
                    self.resp(PS2M_R_ACK);
                    // standard ps2 mouse dev id
                    self.resp(PS2M_R_DEVID);
                }
                PS2M_CMD_REMOTE_MODE_SET => {
                    self.resp(PS2M_R_ACK);
                    self.status.insert(PS2MStatus::REMOTE);
                }
                PS2M_CMD_WRAP_MODE_SET | PS2M_CMD_WRAP_MODE_RESET => {
                    self.resp(PS2M_R_ACK);
                }

                PS2M_CMD_READ_DATA => {
                    self.resp(PS2M_R_ACK);
                    self.movement();
                }
                PS2M_CMD_STREAM_MODE_SET => {
                    // XXX wire to what?
                }
                PS2M_CMD_STATUS_REQ => {
                    // status, resolution, sample rate
                    self.resp(PS2M_R_ACK);
                    self.resp(self.status.bits());
                    self.resp(self.resolution);
                    self.resp(self.sample_rate);
                }

                PS2M_CMD_SET_SAMP_RATE | PS2M_CMD_RESOLUTION_SET => {
                    self.cur_cmd = Some(v);
                    self.resp(PS2M_R_ACK);
                }
                PS2M_CMD_SCALING1_SET | PS2M_CMD_SCALING2_SET => {
                    self.status
                        .set(PS2MStatus::SCALE2, v == PS2M_CMD_SCALING2_SET);
                    self.resp(PS2M_R_ACK);
                }

                _ => {
                    // ignore unrecognized cmds
                    probes::ps2ctrl_unknown_mouse_cmd!(|| v);
                }
            }
        }
    }
    fn resp(&mut self, v: u8) {
        let remain = PS2_KBD_BUFSZ - self.buf.len();
        match remain {
            0 => {
                probes::ps2ctrl_mouse_overflow!(|| v);
                // overrun already in progress, do nothing
            }
            1 => {
                // indicate overflow instead
                probes::ps2ctrl_mouse_overflow!(|| v);
                self.buf.push_back(0xff)
            }
            _ => {
                probes::ps2ctrl_mouse_data!(|| v);
                self.buf.push_back(v);
            }
        }
    }
    fn reset(&mut self) {
        // XXX  what should the defaults be?
        self.buf.clear();
        self.cur_cmd = None;
        self.status = PS2MStatus::empty();
        self.resolution = 0;
        self.sample_rate = 10;
    }
    fn has_output(&self) -> bool {
        !self.buf.is_empty()
    }
    fn read_output(&mut self) -> Option<u8> {
        self.buf.pop_front()
    }
    fn loopback(&mut self, v: u8) {
        self.resp(v);
    }
    fn movement(&mut self) {
        // no buttons, just the always-one bit
        self.resp(0b00001000);
        // no X movement
        self.resp(0x00);
        // no Y movement
        self.resp(0x00);
    }
}
impl Default for PS2Mouse {
    fn default() -> Self {
        Self::new()
    }
}

/// Serialized (migration) representation of the PS/2 controller and its two
/// attached devices.
pub mod migrate {
    use crate::migrate::*;

    use super::PS2C_RAM_LEN;
    use serde::{Deserialize, Serialize};

    /// Top-level payload: controller state plus keyboard and mouse state.
    #[derive(Deserialize, Serialize)]
    pub struct PS2CtrlV1 {
        pub ctrl: PS2CtrlStateV1,
        pub kbd: PS2KbdV1,
        pub mouse: PS2MouseV1,
    }
    impl Schema<'_> for PS2CtrlV1 {
        // Schema name and version under which this payload is identified.
        fn id() -> SchemaId {
            ("ps2-ctrl", 1)
        }
    }

    /// State of the controller proper.
    #[derive(Deserialize, Serialize)]
    pub struct PS2CtrlStateV1 {
        pub response: Option<u8>,
        pub cmd_prefix: Option<u8>,
        pub ctrl_cfg: u8,
        pub ctrl_out_port: u8,
        pub ram: [u8; PS2C_RAM_LEN],
    }
    /// State of the keyboard device.
    #[derive(Deserialize, Serialize)]
    pub struct PS2KbdV1 {
        // Pending output bytes, oldest first
        pub buf: Vec<u8>,
        // First byte of a two-byte command awaiting its data byte, if any
        pub current_cmd: Option<u8>,
        pub enabled: bool,
        pub led_status: u8,
        pub typematic: u8,
        // Raw scan code set identifier; import rejects values it cannot map
        pub scan_code_set: u8,
    }
    /// State of the mouse device.
    #[derive(Deserialize, Serialize)]
    pub struct PS2MouseV1 {
        // Pending output bytes, oldest first
        pub buf: Vec<u8>,
        // First byte of a two-byte command awaiting its data byte, if any
        pub current_cmd: Option<u8>,
        // Raw PS2MStatus bits; import fails if they do not form a valid value
        pub status: u8,
        pub resolution: u8,
        pub sample_rate: u8,
    }
}


================================================
FILE: lib/propolis/src/hw/ps2/keyboard/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

mod scan_code_1;
mod scan_code_2;

use std::convert::TryFrom;

use anyhow::{anyhow, Result};
use rfb::keysym::AsciiChar;
use rfb::keysym::KeySym::*;
use rfb::proto::KeyEvent;

use scan_code_1::*;
use scan_code_2::*;

use super::ctrl::PS2ScanCodeSet;

/// A struct that contains all information necessary to construct a scan code
/// for multiple scan code sets, as scan codes from all sets have a similar
/// structure.
///
/// Keyboards send a "make" code when a key is pressed, and a "break" code when
/// a key is released.
///
/// A key's make code is generally 1 byte, which here is referred to as the
/// "base value" of the scan code. Some keys are represented by the "extended"
/// set of codes, which have a prefix code indicating to the controller that the
/// next byte is also relevant. A handful of keys, for backwards compatibility
/// reasons, are represented by multiple scan codes.
///
/// So possible make codes are:
/// - 1 byte: [base value]
/// - 2 bytes: [extended prefix] + [base value]
/// - multibyte sequences composed of 2 or more make codes
///
/// A key's break code varies by scan code set. For Set 1, the highest bit of
/// the make code base value is set. For Set 2, an additional release code is
/// sent ahead of the make code, but after the extended prefix if one is used.
///
/// So possible break codes for Scan Code Set 1 are:
/// - 1 byte: [base value | release flag]
/// - 2 bytes: [extended prefix] + [base value | release flag]
/// - multibyte sequences composed of 2 or more break codes
///
/// And possible break codes for Scan Code Set 2 are:
/// - 2 bytes: [release code] + [base value]
/// - 3 bytes: [extended prefix] + [release code] + [base value]
/// - multibyte sequences composed of 2 or more break codes
///
/// Exceptions:
/// - In Set 2, the pause key does not have a break code.
///
#[derive(Debug)]
pub struct ScanCodeBase {
    // The make code's final byte; break codes are derived from it.
    base_val: u8,
    // Bytes emitted ahead of the base value (e.g. the extended prefix), if any.
    prefix: Option<Vec<u8>>,
}

// A representation of a key event that allows us to map keysyms, which are a
// much richer diversity of keys, to representation in scan codes, the set of
// which is more limited.
//
// The scan code data for both supported sets is computed up front, so the
// encoding for whichever set the keyboard is currently using can be produced
// later by `to_scan_code`.
#[derive(Debug)]
pub struct KeyEventRep {
    // Raw keysym value of the originating key event
    pub keysym_raw: u32,
    // true for a key press (make code), false for a release (break code)
    pub is_pressed: bool,
    // Scan code data for Scan Code Set 1
    pub scan_code_1: ScanCodeBase,
    // Scan code data for Scan Code Set 2
    pub scan_code_2: ScanCodeBase,
}

impl KeyEventRep {
    /// Encode this key event as the byte sequence for the requested scan code
    /// set.
    ///
    /// The result is the key's prefix bytes (if it has any) followed by:
    /// - the base value, for a press;
    /// - the base value with the release flag set, for a Set 1 release;
    /// - the release code and then the base value, for a Set 2 release.
    pub(crate) fn to_scan_code(&self, sc_set: PS2ScanCodeSet) -> Vec<u8> {
        let code = match sc_set {
            PS2ScanCodeSet::Set1 => &self.scan_code_1,
            PS2ScanCodeSet::Set2 => &self.scan_code_2,
        };

        // Start from the prefix (empty when the key has none).
        let mut out: Vec<u8> = match code.prefix.as_ref() {
            Some(lead) => lead.clone(),
            None => Vec::new(),
        };

        match (self.is_pressed, sc_set) {
            (true, _) => out.push(code.base_val),
            (false, PS2ScanCodeSet::Set1) => {
                out.push(code.base_val | SC1_RELEASE_FLAG);
            }
            (false, PS2ScanCodeSet::Set2) => {
                out.extend_from_slice(&[SC2_RELEASE_CODE, code.base_val]);
            }
        }

        out
    }
}

impl TryFrom<KeyEvent> for KeyEventRep {
    type Error = anyhow::Error;

    /// Translate a key event into its scan code representation for both
    /// supported scan code sets.
    ///
    /// Shifted and unshifted symbols which live on the same physical key map
    /// to the same scan code.
    ///
    /// # Errors
    ///
    /// Returns an error for keysyms that have no scan code assigned here
    /// (control characters, `Pause`, `Menu`, function keys beyond F12).
    fn try_from(keyevent: KeyEvent) -> Result<Self, Self::Error> {
        let (base_val_1, base_val_2) = match keyevent.keysym {
            Ascii(ascii_char) => match ascii_char {
                AsciiChar::BackSpace => (SC1_BACKSPACE, SC2_BACKSPACE),
                AsciiChar::Tab => (SC1_TAB, SC2_TAB),
                AsciiChar::ESC => (SC1_ESCAPE, SC2_ESCAPE),
                AsciiChar::Space => (SC1_SPACE, SC2_SPACE),
                AsciiChar::Apostrophe | AsciiChar::Quotation => {
                    (SC1_QUOTE_DBLQUOTE, SC2_QUOTE_DBLQUOTE)
                }
                AsciiChar::_1 | AsciiChar::Exclamation => {
                    (SC1_1_EXCLAMATION, SC2_1_EXCLAMATION)
                }
                AsciiChar::_2 | AsciiChar::At => (SC1_2_AT, SC2_2_AT),
                AsciiChar::_3 | AsciiChar::Hash => (SC1_3_HASH, SC2_3_HASH),
                AsciiChar::_4 | AsciiChar::Dollar => {
                    (SC1_4_DOLLAR, SC2_4_DOLLAR)
                }
                AsciiChar::_5 | AsciiChar::Percent => {
                    (SC1_5_PERCENT, SC2_5_PERCENT)
                }
                AsciiChar::_6 | AsciiChar::Caret => (SC1_6_CARET, SC2_6_CARET),
                AsciiChar::_7 | AsciiChar::Ampersand => {
                    (SC1_7_AMPERSAND, SC2_7_AMPERSAND)
                }
                AsciiChar::_9 | AsciiChar::ParenOpen => {
                    (SC1_9_LPAREN, SC2_9_LPAREN)
                }
                AsciiChar::_0 | AsciiChar::ParenClose => {
                    (SC1_0_RPAREN, SC2_0_RPAREN)
                }
                AsciiChar::_8 | AsciiChar::Asterisk => {
                    (SC1_8_ASTERISK, SC2_8_ASTERISK)
                }
                AsciiChar::Equal | AsciiChar::Plus => {
                    (SC1_EQUALS_PLUS, SC2_EQUALS_PLUS)
                }
                AsciiChar::Comma | AsciiChar::LessThan => {
                    (SC1_COMMA_LESSTHAN, SC2_COMMA_LESSTHAN)
                }
                AsciiChar::Minus | AsciiChar::UnderScore => {
                    (SC1_DASH_UNDERSCORE, SC2_DASH_UNDERSCORE)
                }
                AsciiChar::Dot | AsciiChar::GreaterThan => {
                    (SC1_PERIOD_GREATERTHAN, SC2_PERIOD_GREATERTHAN)
                }
                AsciiChar::Slash | AsciiChar::Question => {
                    (SC1_SLASH_QUESTIONMARK, SC2_SLASH_QUESTIONMARK)
                }
                AsciiChar::Semicolon | AsciiChar::Colon => {
                    (SC1_SEMICOLON_COLON, SC2_SEMICOLON_COLON)
                }
                AsciiChar::A | AsciiChar::a => (SC1_A, SC2_A),
                AsciiChar::B | AsciiChar::b => (SC1_B, SC2_B),
                AsciiChar::C | AsciiChar::c => (SC1_C, SC2_C),
                AsciiChar::D | AsciiChar::d => (SC1_D, SC2_D),
                AsciiChar::E | AsciiChar::e => (SC1_E, SC2_E),
                AsciiChar::F | AsciiChar::f => (SC1_F, SC2_F),
                AsciiChar::G | AsciiChar::g => (SC1_G, SC2_G),
                AsciiChar::H | AsciiChar::h => (SC1_H, SC2_H),
                AsciiChar::I | AsciiChar::i => (SC1_I, SC2_I),
                AsciiChar::J | AsciiChar::j => (SC1_J, SC2_J),
                AsciiChar::K | AsciiChar::k => (SC1_K, SC2_K),
                AsciiChar::L | AsciiChar::l => (SC1_L, SC2_L),
                AsciiChar::M | AsciiChar::m => (SC1_M, SC2_M),
                AsciiChar::N | AsciiChar::n => (SC1_N, SC2_N),
                AsciiChar::O | AsciiChar::o => (SC1_O, SC2_O),
                AsciiChar::P | AsciiChar::p => (SC1_P, SC2_P),
                AsciiChar::Q | AsciiChar::q => (SC1_Q, SC2_Q),
                AsciiChar::R | AsciiChar::r => (SC1_R, SC2_R),
                AsciiChar::S | AsciiChar::s => (SC1_S, SC2_S),
                AsciiChar::T | AsciiChar::t => (SC1_T, SC2_T),
                AsciiChar::U | AsciiChar::u => (SC1_U, SC2_U),
                AsciiChar::V | AsciiChar::v => (SC1_V, SC2_V),
                AsciiChar::W | AsciiChar::w => (SC1_W, SC2_W),
                AsciiChar::X | AsciiChar::x => (SC1_X, SC2_X),
                AsciiChar::Y | AsciiChar::y => (SC1_Y, SC2_Y),
                AsciiChar::Z | AsciiChar::z => (SC1_Z, SC2_Z),
                AsciiChar::BracketOpen | AsciiChar::CurlyBraceOpen => {
                    (SC1_LBRACKET_LCURLY, SC2_LBRACKET_LCURLY)
                }
                AsciiChar::BackSlash | AsciiChar::VerticalBar => {
                    (SC1_BACKSLASH_PIPE, SC2_BACKSLASH_PIPE)
                }
                AsciiChar::BracketClose | AsciiChar::CurlyBraceClose => {
                    (SC1_RBRACKET_RCURLY, SC2_RBRACKET_RCURLY)
                }
                AsciiChar::Grave | AsciiChar::Tilde => {
                    (SC1_BACKTICK_TILDE, SC2_BACKTICK_TILDE)
                }
                AsciiChar::DEL => (SC1_DELETE, SC2_DELETE),

                // values that are unrepresentable, or we are explicitly ignoring
                AsciiChar::Null
                | AsciiChar::SOH
                | AsciiChar::SOX
                | AsciiChar::ETX
                | AsciiChar::EOT
                | AsciiChar::ENQ
                | AsciiChar::ACK
                | AsciiChar::Bell
                | AsciiChar::LineFeed
                | AsciiChar::VT
                | AsciiChar::FF
                | AsciiChar::CarriageReturn
                | AsciiChar::SI
                | AsciiChar::SO
                | AsciiChar::DLE
                | AsciiChar::DC1
                | AsciiChar::DC2
                | AsciiChar::DC3
                | AsciiChar::DC4
                | AsciiChar::NAK
                | AsciiChar::SYN
                | AsciiChar::ETB
                | AsciiChar::CAN
                | AsciiChar::EM
                | AsciiChar::SUB
                | AsciiChar::FS
                | AsciiChar::GS
                | AsciiChar::RS
                | AsciiChar::US => (0x0, 0x0),
            },
            Backspace => (SC1_BACKSPACE, SC2_BACKSPACE),
            Tab => (SC1_TAB, SC2_TAB),
            ReturnOrEnter => (SC1_ENTER, SC2_ENTER),
            Escape => (SC1_ESCAPE, SC2_ESCAPE),
            Insert => (SC1_INSERT, SC2_INSERT),
            Delete => (SC1_DELETE, SC2_DELETE),
            Home => (SC1_HOME, SC2_HOME),
            End => (SC1_END, SC2_END),
            PageUp => (SC1_PGUP, SC2_PGUP),
            PageDown => (SC1_PGDN, SC2_PGDN),
            Print => (SC1_PRINTSCREEN, SC2_PRINTSCREEN),
            CapsLock => (SC1_CAPS_LOCK, SC2_CAPS_LOCK),
            SuperLeft => (SC1_SUPER_LEFT, SC2_SUPER_LEFT),
            SuperRight => (SC1_SUPER_RIGHT, SC2_SUPER_RIGHT),

            Left => (SC1_LEFT, SC2_LEFT),
            Up => (SC1_UP, SC2_UP),
            Right => (SC1_RIGHT, SC2_RIGHT),
            Down => (SC1_DOWN, SC2_DOWN),

            FunctionKey(1) => (SC1_F1, SC2_F1),
            FunctionKey(2) => (SC1_F2, SC2_F2),
            FunctionKey(3) => (SC1_F3, SC2_F3),
            FunctionKey(4) => (SC1_F4, SC2_F4),
            FunctionKey(5) => (SC1_F5, SC2_F5),
            FunctionKey(6) => (SC1_F6, SC2_F6),
            FunctionKey(7) => (SC1_F7, SC2_F7),
            FunctionKey(8) => (SC1_F8, SC2_F8),
            FunctionKey(9) => (SC1_F9, SC2_F9),
            FunctionKey(10) => (SC1_F10, SC2_F10),
            FunctionKey(11) => (SC1_F11, SC2_F11),
            FunctionKey(12) => (SC1_F12, SC2_F12),

            ShiftLeft => (SC1_SHIFT_LEFT, SC2_SHIFT_LEFT),
            ShiftRight => (SC1_SHIFT_RIGHT, SC2_SHIFT_RIGHT),
            ControlLeft => (SC1_CTRL_LEFT, SC2_CTRL_LEFT),
            ControlRight => (SC1_CTRL_RIGHT, SC2_CTRL_RIGHT),
            AltLeft => (SC1_ALT_LEFT, SC2_ALT_LEFT),
            AltRight => (SC1_ALT_RIGHT, SC2_ALT_RIGHT),
            ScrollLock => (SC1_SCROLL_LOCK, SC2_SCROLL_LOCK),
            NumLock => (SC1_NUM_LOCK, SC2_NUM_LOCK),

            KeypadSlash => (SC1_KP_SLASH, SC2_KP_SLASH),
            KeypadAsterisk => (SC1_KP_ASTERISK, SC2_KP_ASTERISK),
            KeypadMinus => (SC1_KP_MINUS, SC2_KP_MINUS),
            KeypadPlus => (SC1_KP_PLUS, SC2_KP_PLUS),
            KeypadEnter => (SC1_KP_ENTER, SC2_KP_ENTER),
            KeypadPeriod | KeypadDelete => {
                (SC1_KP_DEL_PERIOD, SC2_KP_DEL_PERIOD)
            }
            Keypad0 | KeypadInsert => (SC1_0_INSERT, SC2_0_INSERT),
            Keypad1 | KeypadEnd => (SC1_1_END, SC2_1_END),
            Keypad2 | KeypadDown => (SC1_2_DOWN, SC2_2_DOWN),
            Keypad3 | KeypadPgDown => (SC1_3_PGDN, SC2_3_PGDN),
            Keypad4 | KeypadLeft => (SC1_4_LEFT, SC2_4_LEFT),
            Keypad5 | KeypadEmpty => (SC1_5_CENTER, SC2_5_CENTER),
            Keypad6 | KeypadRight => (SC1_6_RIGHT, SC2_6_RIGHT),
            Keypad7 | KeypadHome => (SC1_7_HOME, SC2_7_HOME),
            Keypad8 | KeypadUp => (SC1_8_UP, SC2_8_UP),
            Keypad9 | KeypadPgUp => (SC1_9_PGUP, SC2_9_PGUP),

            // Keys we're choosing to drop explicitly for now
            FunctionKey(_) | Pause | Menu => (0x0, 0x0),
        };

        // (0x0, 0x0) is the marker used above for "no scan code available".
        if matches!((base_val_1, base_val_2), (0x0, 0x0)) {
            return Err(anyhow!(
                "unrecognized keysym value: 0x{:x}",
                keyevent.keysym_raw
            ));
        }

        // Keys whose base value is shared with another key (e.g. the
        // navigation cluster vs. the keypad, right vs. left modifiers) are
        // distinguished by the extended prefix.  The same keys are extended in
        // both scan code sets.  `Delete` belongs here too: without the prefix
        // its base value is indistinguishable from the keypad Del/period key.
        let is_extended = matches!(
            keyevent.keysym,
            AltRight
                | ControlRight
                | Home
                | Insert
                | Delete
                | End
                | PageUp
                | PageDown
                | KeypadSlash
                | KeypadEnter
                | SuperLeft
                | SuperRight
                | Left
                | Right
                | Up
                | Down
        );

        let prefix_1 = is_extended.then(|| vec![SC1_EXTENDED_PREFIX_0]);
        let prefix_2 = is_extended.then(|| vec![SC2_EXTENDED_PREFIX_0]);

        let sc1 = ScanCodeBase { base_val: base_val_1, prefix: prefix_1 };
        let sc2 = ScanCodeBase { base_val: base_val_2, prefix: prefix_2 };

        Ok(Self {
            keysym_raw: keyevent.keysym_raw,
            is_pressed: keyevent.is_pressed,
            scan_code_1: sc1,
            scan_code_2: sc2,
        })
    }
}


================================================
FILE: lib/propolis/src/hw/ps2/keyboard/scan_code_1.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Scan Code Set 1 codes

// OR-ed into a key's base value to form its break (release) code
pub const SC1_RELEASE_FLAG: u8 = 0x80;
// Prefix byte sent ahead of the base value for "extended" keys
pub const SC1_EXTENDED_PREFIX_0: u8 = 0xe0;

// Letter keys
pub const SC1_A: u8 = 0x1e;
pub const SC1_B: u8 = 0x30;
pub const SC1_C: u8 = 0x2e;
pub const SC1_D: u8 = 0x20;
pub const SC1_E: u8 = 0x12;
pub const SC1_F: u8 = 0x21;
pub const SC1_G: u8 = 0x22;
pub const SC1_H: u8 = 0x23;
pub const SC1_I: u8 = 0x17;
pub const SC1_J: u8 = 0x24;
pub const SC1_K: u8 = 0x25;
pub const SC1_L: u8 = 0x26;
pub const SC1_M: u8 = 0x32;
pub const SC1_N: u8 = 0x31;
pub const SC1_O: u8 = 0x18;
pub const SC1_P: u8 = 0x19;
pub const SC1_Q: u8 = 0x10;
pub const SC1_R: u8 = 0x13;
pub const SC1_S: u8 = 0x1f;
pub const SC1_T: u8 = 0x14;
pub const SC1_U: u8 = 0x16;
pub const SC1_V: u8 = 0x2f;
pub const SC1_W: u8 = 0x11;
pub const SC1_X: u8 = 0x2d;
pub const SC1_Y: u8 = 0x15;
pub const SC1_Z: u8 = 0x2c;

// Number keys
pub const SC1_0_RPAREN: u8 = 0x0b;
pub const SC1_1_EXCLAMATION: u8 = 0x02;
pub const SC1_2_AT: u8 = 0x03;
pub const SC1_3_HASH: u8 = 0x04;
pub const SC1_4_DOLLAR: u8 = 0x05;
pub const SC1_5_PERCENT: u8 = 0x06;
pub const SC1_6_CARET: u8 = 0x07;
pub const SC1_7_AMPERSAND: u8 = 0x08;
pub const SC1_8_ASTERISK: u8 = 0x09;
pub const SC1_9_LPAREN: u8 = 0x0a;

// Numpad Keys
pub const SC1_1_END: u8 = 0x4f;
pub const SC1_2_DOWN: u8 = 0x50;
pub const SC1_3_PGDN: u8 = 0x51;
pub const SC1_4_LEFT: u8 = 0x4b;
pub const SC1_5_CENTER: u8 = 0x4c;
pub const SC1_6_RIGHT: u8 = 0x4d;
pub const SC1_7_HOME: u8 = 0x47;
pub const SC1_8_UP: u8 = 0x48;
pub const SC1_9_PGUP: u8 = 0x49;
pub const SC1_0_INSERT: u8 = 0x52;

// Symbols
pub const SC1_QUOTE_DBLQUOTE: u8 = 0x28;
pub const SC1_COMMA_LESSTHAN: u8 = 0x33;
pub const SC1_DASH_UNDERSCORE: u8 = 0x0c;
pub const SC1_PERIOD_GREATERTHAN: u8 = 0x34;
pub const SC1_SLASH_QUESTIONMARK: u8 = 0x35;
pub const SC1_SEMICOLON_COLON: u8 = 0x27;
pub const SC1_EQUALS_PLUS: u8 = 0x0d;
pub const SC1_LBRACKET_LCURLY: u8 = 0x1a;
pub const SC1_BACKSLASH_PIPE: u8 = 0x2b;
pub const SC1_RBRACKET_RCURLY: u8 = 0x1b;
pub const SC1_BACKTICK_TILDE: u8 = 0x29;

pub const SC1_BACKSPACE: u8 = 0x0e;
pub const SC1_TAB: u8 = 0x0f;

pub const SC1_SPACE: u8 = 0x39;
pub const SC1_DELETE: u8 = 0x53;

pub const SC1_ENTER: u8 = 0x1c;
pub const SC1_ESCAPE: u8 = 0x01;
pub const SC1_PRINTSCREEN: u8 = 0x37;
pub const SC1_F1: u8 = 0x3b;
pub const SC1_F2: u8 = 0x3c;
pub const SC1_F3: u8 = 0x3d;
pub const SC1_F4: u8 = 0x3e;
pub const SC1_F5: u8 = 0x3f;
pub const SC1_F6: u8 = 0x40;
pub const SC1_F7: u8 = 0x41;
pub const SC1_F8: u8 = 0x42;
pub const SC1_F9: u8 = 0x43;
pub const SC1_F10: u8 = 0x44;
pub const SC1_F11: u8 = 0x57;
pub const SC1_F12: u8 = 0x58;

pub const SC1_KP_ENTER: u8 = 0x1c;
pub const SC1_KP_DEL_PERIOD: u8 = 0x53;

// Gray keys
pub const SC1_KP_SLASH: u8 = 0x35;
pub const SC1_KP_ASTERISK: u8 = 0x37;
pub const SC1_KP_MINUS: u8 = 0x4a;
pub const SC1_KP_PLUS: u8 = 0x4e;
pub const SC1_DOWN: u8 = 0x50;
pub const SC1_LEFT: u8 = 0x4b;
pub const SC1_RIGHT: u8 = 0x4d;
pub const SC1_UP: u8 = 0x48;
pub const SC1_HOME: u8 = 0x47;
pub const SC1_INSERT: u8 = 0x52;
pub const SC1_END: u8 = 0x4f;
pub const SC1_PGUP: u8 = 0x49;
pub const SC1_PGDN: u8 = 0x51;

// Modifier and lock keys.  The left and right Ctrl (and Alt) keys share a
// base value; the right-hand variants are told apart by the extended prefix,
// which is added when the key event is converted.
pub const SC1_CTRL_LEFT: u8 = 0x1d;
pub const SC1_CTRL_RIGHT: u8 = 0x1d;
pub const SC1_SHIFT_LEFT: u8 = 0x2a;
pub const SC1_CAPS_LOCK: u8 = 0x3a;
pub const SC1_NUM_LOCK: u8 = 0x45;
pub const SC1_SCROLL_LOCK: u8 = 0x46;
pub const SC1_SHIFT_RIGHT: u8 = 0x36;
pub const SC1_ALT_LEFT: u8 = 0x38;
pub const SC1_ALT_RIGHT: u8 = 0x38;

// TODO(JPH): pause key

pub const SC1_SUPER_LEFT: u8 = 0x5b;
pub const SC1_SUPER_RIGHT: u8 = 0x5c;


================================================
FILE: lib/propolis/src/hw/ps2/keyboard/scan_code_2.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Scan Code Set 2 codes

// Byte sent ahead of a key's base value (after any extended prefix) to form
// its break (release) code
pub const SC2_RELEASE_CODE: u8 = 0xf0;
// Prefix byte sent ahead of the base value for "extended" keys
pub const SC2_EXTENDED_PREFIX_0: u8 = 0xe0;

// Letter keys
pub const SC2_A: u8 = 0x1c;
pub const SC2_B: u8 = 0x32;
pub const SC2_C: u8 = 0x21;
pub const SC2_D: u8 = 0x23;
pub const SC2_E: u8 = 0x24;
pub const SC2_F: u8 = 0x2b;
pub const SC2_G: u8 = 0x34;
pub const SC2_H: u8 = 0x33;
pub const SC2_I: u8 = 0x43;
pub const SC2_J: u8 = 0x3b;
pub const SC2_K: u8 = 0x42;
pub const SC2_L: u8 = 0x4b;
pub const SC2_M: u8 = 0x3a;
pub const SC2_N: u8 = 0x31;
pub const SC2_O: u8 = 0x44;
pub const SC2_P: u8 = 0x4d;
pub const SC2_Q: u8 = 0x15;
pub const SC2_R: u8 = 0x2d;
pub const SC2_S: u8 = 0x1b;
pub const SC2_T: u8 = 0x2c;
pub const SC2_U: u8 = 0x3c;
pub const SC2_V: u8 = 0x2a;
pub const SC2_W: u8 = 0x1d;
pub const SC2_X: u8 = 0x22;
pub const SC2_Y: u8 = 0x35;
pub const SC2_Z: u8 = 0x1a;

// Number keys
pub const SC2_0_RPAREN: u8 = 0x45;
pub const SC2_1_EXCLAMATION: u8 = 0x16;
pub const SC2_2_AT: u8 = 0x1e;
pub const SC2_3_HASH: u8 = 0x26;
pub const SC2_4_DOLLAR: u8 = 0x25;
pub const SC2_5_PERCENT: u8 = 0x2e;
pub const SC2_6_CARET: u8 = 0x36;
pub const SC2_7_AMPERSAND: u8 = 0x3d;
pub const SC2_8_ASTERISK: u8 = 0x3e;
pub const SC2_9_LPAREN: u8 = 0x46;

// Numpad Keys
pub const SC2_1_END: u8 = 0x69;
pub const SC2_2_DOWN: u8 = 0x72;
pub const SC2_3_PGDN: u8 = 0x7a;
pub const SC2_4_LEFT: u8 = 0x6b;
pub const SC2_5_CENTER: u8 = 0x73;
pub const SC2_6_RIGHT: u8 = 0x74;
pub const SC2_7_HOME: u8 = 0x6c;
pub const SC2_8_UP: u8 = 0x75;
pub const SC2_9_PGUP: u8 = 0x7d;
pub const SC2_0_INSERT: u8 = 0x70;

// Symbols
pub const SC2_QUOTE_DBLQUOTE: u8 = 0x52;
pub const SC2_COMMA_LESSTHAN: u8 = 0x41;
pub const SC2_DASH_UNDERSCORE: u8 = 0x4e;
pub const SC2_PERIOD_GREATERTHAN: u8 = 0x49;
pub const SC2_SLASH_QUESTIONMARK: u8 = 0x4a;
pub const SC2_SEMICOLON_COLON: u8 = 0x4c;
pub const SC2_EQUALS_PLUS: u8 = 0x55;
pub const SC2_LBRACKET_LCURLY: u8 = 0x54;
pub const SC2_BACKSLASH_PIPE: u8 = 0x5d;
pub const SC2_RBRACKET_RCURLY: u8 = 0x5b;
pub const SC2_BACKTICK_TILDE: u8 = 0x0e;

pub const SC2_BACKSPACE: u8 = 0x66;
pub const SC2_TAB: u8 = 0x0d;

pub const SC2_SPACE: u8 = 0x29;
pub const SC2_DELETE: u8 = 0x71;

pub const SC2_ENTER: u8 = 0x5a;
pub const SC2_ESCAPE: u8 = 0x76;
pub const SC2_PRINTSCREEN: u8 = 0x7c;
pub const SC2_F1: u8 = 0x05;
pub const SC2_F2: u8 = 0x06;
pub const SC2_F3: u8 = 0x04;
pub const SC2_F4: u8 = 0x0c;
pub const SC2_F5: u8 = 0x03;
pub const SC2_F6: u8 = 0x0b;
pub const SC2_F7: u8 = 0x83;
pub const SC2_F8: u8 = 0x0a;
pub const SC2_F9: u8 = 0x01;
pub const SC2_F10: u8 = 0x09;
pub const SC2_F11: u8 = 0x78;
pub const SC2_F12: u8 = 0x07;

// Keypad keys whose byte value is shared with a key in the main block:
// SC2_KP_ENTER matches SC2_ENTER and SC2_KP_DEL_PERIOD matches SC2_DELETE.
// NOTE(review): whatever tells such pairs apart (presumably a prefix byte) is
// not part of these constants -- confirm against the code which emits them.
pub const SC2_KP_ENTER: u8 = 0x5a;
pub const SC2_KP_DEL_PERIOD: u8 = 0x71;

// Gray keys
//
// Several of these reuse byte values defined earlier: SC2_KP_SLASH matches
// SC2_SLASH_QUESTIONMARK, SC2_KP_ASTERISK matches SC2_PRINTSCREEN, and the
// arrow/navigation keys match the numpad digits which carry the same function
// (e.g. SC2_DOWN and SC2_2_DOWN are both 0x72).
pub const SC2_KP_SLASH: u8 = 0x4a;
pub const SC2_KP_ASTERISK: u8 = 0x7c;
pub const SC2_KP_MINUS: u8 = 0x7b;
pub const SC2_KP_PLUS: u8 = 0x79;
pub const SC2_DOWN: u8 = 0x72;
pub const SC2_LEFT: u8 = 0x6b;
pub const SC2_RIGHT: u8 = 0x74;
pub const SC2_UP: u8 = 0x75;
pub const SC2_HOME: u8 = 0x6c;
pub const SC2_INSERT: u8 = 0x70;
pub const SC2_END: u8 = 0x69;
pub const SC2_PGUP: u8 = 0x7d;
pub const SC2_PGDN: u8 = 0x7a;

// Modifier and lock keys.  The left/right variants of Ctrl and Alt share one
// byte value each, while the two Shift keys have distinct values.
pub const SC2_CTRL_LEFT: u8 = 0x14;
pub const SC2_CTRL_RIGHT: u8 = 0x14;
pub const SC2_SHIFT_LEFT: u8 = 0x12;
pub const SC2_CAPS_LOCK: u8 = 0x58;
pub const SC2_NUM_LOCK: u8 = 0x77;
pub const SC2_SCROLL_LOCK: u8 = 0x7e;
pub const SC2_SHIFT_RIGHT: u8 = 0x59;
pub const SC2_ALT_LEFT: u8 = 0x11;
pub const SC2_ALT_RIGHT: u8 = 0x11;

// TODO(JPH): pause key

// "Super" keys (named for the left and right variants)
pub const SC2_SUPER_LEFT: u8 = 0x1f;
pub const SC2_SUPER_RIGHT: u8 = 0x27;


================================================
FILE: lib/propolis/src/hw/ps2/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

pub mod ctrl;
mod keyboard;


================================================
FILE: lib/propolis/src/hw/qemu/debug.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;

use crate::chardev::{BlockingSource, BlockingSourceConsumer, ConsumerCell};
use crate::common::*;
use crate::pio::{PioBus, PioFn};

/// I/O port on which the debug device registers itself (a single byte wide).
const QEMU_DEBUG_IOPORT: u16 = 0x0402;
/// Byte returned for any read of the debug port.
const QEMU_DEBUG_IDENT: u8 = 0xe9;

/// Single-port debug output device.
///
/// Bytes written by the guest to `QEMU_DEBUG_IOPORT` are handed to the
/// attached consumer, while reads of the port yield `QEMU_DEBUG_IDENT`.
pub struct QemuDebugPort {
    consumer: ConsumerCell,
}
impl QemuDebugPort {
    /// Create the device and register its one-byte port on `pio`.
    ///
    /// # Panics
    ///
    /// Panics if the port cannot be registered on the bus.
    pub fn create(pio: &PioBus) -> Arc<Self> {
        let dev = Arc::new(Self { consumer: ConsumerCell::new() });

        // The port handler holds its own reference to the device.
        let handler = {
            let dev = Arc::clone(&dev);
            Arc::new(move |_port: u16, rwo: RWOp| dev.pio_rw(rwo))
                as Arc<PioFn>
        };
        pio.register(QEMU_DEBUG_IOPORT, 1, handler).unwrap();

        dev
    }

    /// Service a port access: reads report the identification byte, writes
    /// forward the written byte to the consumer.
    fn pio_rw(&self, rwo: RWOp) {
        match rwo {
            RWOp::Write(wo) => {
                let byte = wo.read_u8();
                self.consumer.consume(&[byte]);
            }
            RWOp::Read(ro) => {
                ro.write_u8(QEMU_DEBUG_IDENT);
            }
        }
    }
}

impl BlockingSource for QemuDebugPort {
    /// Install (`Some`) or clear (`None`) the consumer which receives the
    /// bytes written to the debug port.
    fn set_consumer(&self, consumer: Option<BlockingSourceConsumer>) {
        self.consumer.set(consumer);
    }
}

impl Lifecycle for QemuDebugPort {
    /// Name reported for this device through the [Lifecycle] interface.
    fn type_name(&self) -> &'static str {
        "qemu-lpc-debug"
    }
}


================================================
FILE: lib/propolis/src/hw/qemu/fwcfg.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::{btree_map, BTreeMap};
use std::io::Write;
use std::mem::size_of;
use std::ops::{Deref, DerefMut};
use std::sync::{Arc, Mutex, MutexGuard};

use crate::accessors::MemAccessor;
use crate::common::*;
use crate::hw::qemu::ramfb::RamFb;
use crate::migrate::*;
use crate::pio::{PioBus, PioFn};
use crate::vmm::MemCtx;
use bits::*;

use thiserror::Error;
use zerocopy::IntoBytes;

/// Contents of the [LegacyId::Signature] entry installed by [FwCfg::new].
const SIGNATURE_VALUE: &[u8; 4] = b"QEMU";

/// Selectors of the fixed ("legacy") `fw_cfg` entries.
///
/// The discriminant of each variant is the 16-bit selector value itself.
#[allow(unused)]
#[derive(Copy, Clone)]
#[repr(u16)]
pub enum LegacyId {
    Signature = 0x0000,
    Id = 0x0001,
    Uuid = 0x0002,
    RamSize = 0x0003,
    GraphicsEna = 0x0004,
    SmpCpuCount = 0x0005,
    MachineId = 0x0006,
    KernelAddr = 0x0007,
    KernelSize = 0x0008,
    KernelCmdline = 0x0009,
    InitrdAddr = 0x000a,
    InitrdSize = 0x000b,
    BootDevice = 0x000c,
    NumaData = 0x000d,
    BootMenu = 0x000e,
    MaxCpuCount = 0x000f,
    KernelEntry = 0x0010,
    KernelData = 0x0011,
    InitrdData = 0x0012,
    CmdlineAddr = 0x0013,
    CmdlineSize = 0x0014,
    CmdlineData = 0x0015,
    KernelSetupAddr = 0x0016,
    KernelSetupSize = 0x0017,
    KernelSetupData = 0x0018,
    FileDir = 0x0019,
}
impl LegacyId {
    /// Name under which the entry for this selector is recorded.
    const fn name(&self) -> &'static str {
        match *self {
            Self::Signature => "signature",
            Self::Id => "id",
            Self::Uuid => "uuid",
            Self::RamSize => "ram_size",
            Self::GraphicsEna => "nographic",
            Self::SmpCpuCount => "nb_cpus",
            Self::MachineId => "machine_id",
            Self::KernelAddr => "kernel_addr",
            Self::KernelSize => "kernel_size",
            Self::KernelCmdline => "kernel_cmdline",
            Self::InitrdAddr => "initrd_addr",
            Self::InitrdSize => "initrd_size",
            Self::BootDevice => "boot_device",
            Self::NumaData => "numa",
            Self::BootMenu => "boot_menu",
            Self::MaxCpuCount => "max_cpus",
            Self::KernelEntry => "kernel_entry",
            Self::KernelData => "kernel_data",
            Self::InitrdData => "initrd_data",
            Self::CmdlineAddr => "cmdline_addr",
            Self::CmdlineSize => "cmdline_size",
            Self::CmdlineData => "cmdline_data",
            Self::KernelSetupAddr => "setup_addr",
            Self::KernelSetupSize => "setup_size",
            Self::KernelSetupData => "setup_data",
            Self::FileDir => "file_dir",
        }
    }
}
/// Selectors of the x86-specific legacy `fw_cfg` entries.
///
/// As with [LegacyId], the discriminant of each variant is the 16-bit selector
/// value itself, so the type carries the same `u16` representation and is
/// freely copyable.
#[allow(unused)]
#[derive(Copy, Clone)]
#[repr(u16)]
pub enum LegacyX86Id {
    AcpiTables = 0x8000,
    SmbiosTables = 0x8001,
    Irq0Override = 0x8002,
    E820Table = 0x8003,
    HpetData = 0x8004,
}

/// Content backing a single `fw_cfg` entry.
#[derive(Debug)]
pub enum Entry {
    /// The file directory, rendered from the current set of entries
    FileDir,
    /// Entry whose accesses are forwarded to the attached RamFb device
    RamFb,
    /// Fixed, read-only byte payload
    Bytes(Vec<u8>),
}
impl Entry {
    /// Build an [Entry::Bytes] holding `value` as four little-endian bytes.
    pub fn fixed_u32(value: u32) -> Self {
        let raw: [u8; 4] = value.to_le_bytes();
        Self::Bytes(Vec::from(raw))
    }
}

/// Reasons for which adding an entry to the `fw_cfg` directory can fail.
#[derive(Debug, Error)]
pub enum InsertError {
    /// The requested selector is the reserved "invalid" value
    #[error("invalid selector")]
    InvalidSelector,

    /// An entry is already present at the requested selector
    #[error("selector {0} already exists")]
    SelectorExists(u16),

    /// An entry with the requested name is already present
    #[error("name {0:?} already in use")]
    NameExists(String),

    /// No free selector remains for a named entry
    #[error("no capacity")]
    NoCapacity,
}

/// Bookkeeping for every `fw_cfg` entry, indexed by selector and by name.
///
/// The two maps are kept in lock-step: every selector in `entries` has exactly
/// one matching record in `names`, and vice versa.
struct Directory {
    /// Entries keyed by selector, each stored alongside its name
    entries: BTreeMap<u16, (Entry, String)>,
    /// Reverse index from entry name to selector
    names: BTreeMap<String, u16>,
    /// Selector at which the search for a free named-entry slot begins
    next_named: u16,
}
impl Directory {
    /// Create an empty directory
    fn new() -> Self {
        Self {
            entries: BTreeMap::new(),
            names: BTreeMap::new(),
            next_named: ITEMS_FILE_START,
        }
    }
    /// Add `entry` under the given `selector` and `name`.
    ///
    /// # Errors
    ///
    /// Fails (leaving the directory untouched) if `selector` is
    /// [ITEM_INVALID], or if either the name or the selector is already taken.
    fn insert(
        &mut self,
        selector: u16,
        name: String,
        entry: Entry,
    ) -> Result<(), InsertError> {
        // The entry API does not fit here: both maps must be checked for
        // conflicts before either one is modified.
        #[allow(clippy::map_entry)]
        if selector == ITEM_INVALID {
            Err(InsertError::InvalidSelector)
        } else if self.names.contains_key(&name) {
            Err(InsertError::NameExists(name))
        } else if self.entries.contains_key(&selector) {
            Err(InsertError::SelectorExists(selector))
        } else {
            self.names.insert(name.clone(), selector);
            self.entries.insert(selector, (entry, name));
            // Keep the search hint moving when the hinted slot gets filled
            if selector == self.next_named {
                self.next_named += 1;
            }
            Ok(())
        }
    }
    /// Add `entry` at the selector (and under the name) dictated by `id`
    fn insert_legacy(
        &mut self,
        id: LegacyId,
        entry: Entry,
    ) -> Result<(), InsertError> {
        let name = id.name().to_owned();
        self.insert(id as u16, name, entry)
    }
    /// Remove (and return) the entry at `selector`, if one is present
    fn remove(&mut self, selector: u16) -> Option<Entry> {
        let (entry, name) = self.entries.remove(&selector)?;
        let name_to_selector = self.names.remove(&name);
        assert_eq!(Some(selector), name_to_selector);
        // A freed slot in the named range may precede the current hint
        if (ITEMS_FILE_START..ITEMS_FILE_END).contains(&selector) {
            self.next_named = u16::min(self.next_named, selector);
        }
        Some(entry)
    }
    /// Find the next available selector "slot" for a named entry
    fn next_named_selector(&self) -> Option<u16> {
        // Assume that consumers will be relatively well-behaved, and not be
        // adding/removing entries with reckless abandon.  This naive search can
        // be improved later if it becomes a problem.
        (self.next_named..ITEMS_FILE_END)
            .find(|selector| !self.entries.contains_key(selector))
    }
    /// Get mutable access to the entry at `selector`, if one is present
    fn entry(&mut self, selector: u16) -> Option<&mut Entry> {
        self.entries.get_mut(&selector).map(|(ent, _name)| ent)
    }
    /// Look up (by `name`) the selector for an entry, if present
    fn named_selector(&self, name: &str) -> Option<u16> {
        self.names.get(name).copied()
    }
    /// Iterate over all entries, ordered by name
    fn entries(&self) -> Entries<'_> {
        Entries { iter: self.names.iter(), entries: &self.entries }
    }
    /// Drop every entry and reset the named-entry search hint
    fn clear(&mut self) {
        self.entries.clear();
        self.names.clear();
        self.next_named = ITEMS_FILE_START;
    }
    /// Render the directory into the format expected by a guest reading it via
    /// the `fw_cfg` interface.
    ///
    /// The output is a big-endian `u32` count followed by one [FwCfgFile]
    /// record per entry, emitted in name order.
    ///
    /// # Panics
    ///
    /// Panics if any entry name is too long for [FwCfgFile::new].
    fn render(&self) -> Vec<u8> {
        let rendered_size =
            size_of::<u32>() + self.entries.len() * size_of::<FwCfgFile>();
        let mut buf: Vec<u8> = Vec::with_capacity(rendered_size);
        buf.write_all(&(self.entries.len() as u32).to_be_bytes()).unwrap();

        for (selector, name, entry) in self.entries() {
            let size = match entry {
                // The directory lists itself, at its own rendered size
                Entry::FileDir => rendered_size,
                Entry::RamFb => RamFb::FWCFG_ENTRY_SIZE,
                Entry::Bytes(bytes) => bytes.len(),
            };
            let record = FwCfgFile::new(size as u32, selector, name);
            buf.write_all(record.as_bytes()).unwrap();
        }
        assert_eq!(buf.len(), rendered_size);
        buf
    }
}

/// Failures which can occur while servicing a guest `fw_cfg` access.
#[derive(thiserror::Error, Debug)]
enum FwCfgErr {
    /// A write was attempted while no entry was selected
    #[error("No entry selected")]
    NoneSelected,
    /// A write was attempted against an entry which does not accept writes
    #[error("Entry is read-only")]
    ReadOnly,
    /// A guest address involved in a DMA request could not be accessed
    #[error("Bad DMA address")]
    BadAddr,
    /// The control field of a DMA request contained unknown bits
    #[error("DMA command not recognized")]
    UnrecognizedDmaCmd,
    /// The device backing the entry reported a failure
    #[error("Operation was not successful")]
    OpUnsuccessful,
}

/// Iterator over the contents of a [Directory], ordered by entry name.
///
/// Each item is a `(selector, name, entry)` tuple.
struct Entries<'a> {
    iter: btree_map::Iter<'a, String, u16>,
    entries: &'a BTreeMap<u16, (Entry, String)>,
}
impl<'a> Iterator for Entries<'a> {
    type Item = (u16, &'a str, &'a Entry);

    fn next(&mut self) -> Option<Self::Item> {
        let (name, &selector) = self.iter.next()?;
        // Every name recorded in the directory maps to a live entry
        let (entry, stored_name) = self.entries.get(&selector).unwrap();
        debug_assert_eq!(name, stored_name);
        Some((selector, name.as_str(), entry))
    }
}

/// Mutable state of the `fw_cfg` device, guarded by the mutex in [FwCfg].
struct State {
    /// All entries known to the device
    directory: Directory,

    /// Entry currently selected by the guest, if any
    selected: Option<SelectedEntry>,

    /// Upper 32 bits of the latched DMA request address
    dma_addr_high: u32,
    /// Lower 32 bits of the latched DMA request address
    dma_addr_low: u32,

    /// RamFB device associated with any [Entry::RamFb] type entry(s)
    ramfb: Option<Arc<RamFb>>,
}
impl State {
    /// Combine the two latched halves into the full 64-bit DMA address
    fn dma_addr(&self) -> u64 {
        let high = u64::from(self.dma_addr_high);
        let low = u64::from(self.dma_addr_low);
        (high << 32) | low
    }
    /// Clear the guest-driven state (selection and latched DMA address).
    ///
    /// The directory contents and RamFB attachment are left as they are.
    fn reset(&mut self) {
        self.dma_addr_low = 0;
        self.dma_addr_high = 0;
        self.selected = None;
    }
}
/// Record of the entry which the guest currently has selected.
struct SelectedEntry {
    /// Selector of the chosen entry
    selector: u16,
    /// Byte offset into the entry, advanced by reads, writes, and skips
    offset: u32,
    /// Snapshot of the rendered file directory, taken at selection time when
    /// the chosen entry is [Entry::FileDir] (and `None` for other entries)
    cached_value: Option<Vec<u8>>,
}

/// Emulated `fw_cfg` device, reachable by the guest through port I/O and DMA.
pub struct FwCfg {
    /// Directory contents and guest-driven selection/DMA state
    state: Mutex<State>,
    /// Accessor used to reach guest memory when servicing DMA requests
    acc_mem: MemAccessor,
}
impl FwCfg {
    /// Build a `fw_cfg` device whose directory holds only the baseline
    /// entries: [LegacyId::Signature], [LegacyId::Id], and
    /// [LegacyId::FileDir].
    ///
    /// The device starts with nothing selected, no DMA address latched, no
    /// RamFB attached, and a memory accessor which is not yet part of any
    /// hierarchy (see [FwCfg::attach]).
    pub fn new() -> Arc<Self> {
        let mut directory = Directory::new();

        let baseline = [
            (LegacyId::Signature, Entry::Bytes(SIGNATURE_VALUE.to_vec())),
            (
                LegacyId::Id,
                Entry::fixed_u32(FW_CFG_VER_BASE | FW_CFG_VER_DMA),
            ),
            (LegacyId::FileDir, Entry::FileDir),
        ];
        for (id, entry) in baseline {
            // Nothing can conflict in a freshly created directory
            directory.insert_legacy(id, entry).unwrap();
        }

        let state = State {
            directory,
            selected: None,
            dma_addr_high: 0,
            dma_addr_low: 0,
            ramfb: None,
        };
        Arc::new(Self {
            state: Mutex::new(state),
            acc_mem: MemAccessor::new_orphan(),
        })
    }

    /// Wire the device into a machine: its memory accessor is adopted by
    /// `acc_mem` (under the name "fw_cfg") and its I/O ports are registered on
    /// `pio`.
    ///
    /// # Panics
    ///
    /// Panics if any of the ports cannot be registered.
    pub fn attach(self: &Arc<Self>, pio: &PioBus, acc_mem: &MemAccessor) {
        acc_mem.adopt(&self.acc_mem, Some("fw_cfg".to_string()));

        // One handler, holding its own reference to the device, serves all of
        // the registered ports.
        let dev = Arc::clone(self);
        let handler = Arc::new(move |port: u16, rwo: RWOp| match rwo {
            RWOp::Read(ro) => dev.pio_read(port, ro),
            RWOp::Write(wo) => dev.pio_write(port, wo),
        }) as Arc<PioFn>;

        for &(port, len) in &[
            (FW_CFG_IOP_SELECTOR, 1),
            (FW_CFG_IOP_DATA, 1),
            (FW_CFG_IOP_DMA_HI, 4),
            (FW_CFG_IOP_DMA_LO, 4),
        ] {
            pio.register(port, len, Arc::clone(&handler)).unwrap();
        }
    }

    /// Set (or, with `None`, clear) the [RamFb] device which services any
    /// [Entry::RamFb] entry(s), returning the previous attachment.
    pub fn attach_ramfb(
        &self,
        ramfb: Option<Arc<RamFb>>,
    ) -> Option<Arc<RamFb>> {
        let mut state = self.state.lock().unwrap();
        let previous = state.ramfb.take();
        state.ramfb = ramfb;
        previous
    }

    /// Add `entry` at the selector named by `id`, recorded under the name
    /// derived from that [LegacyId].
    ///
    /// # Errors
    ///
    /// Fails if the selector or the derived name is already in use.
    pub fn insert_legacy(
        &self,
        id: LegacyId,
        entry: Entry,
    ) -> Result<(), InsertError> {
        self.state.lock().unwrap().directory.insert_legacy(id, entry)
    }
    /// Add `entry` under `name`, placing it at the first free selector in the
    /// named-entry range, and return the selector which was chosen.
    ///
    /// Note: Per the qemu docs for `fw_cfg`, the chosen `name` should be ASCII
    ///
    /// # Errors
    ///
    /// - [InsertError::NoCapacity] if no selector is free
    /// - [InsertError::NameExists] if `name` is already in use
    pub fn insert_named(
        &self,
        name: &str,
        entry: Entry,
    ) -> Result<u16, InsertError> {
        let mut state = self.state.lock().unwrap();
        let directory = &mut state.directory;

        match directory.next_named_selector() {
            None => Err(InsertError::NoCapacity),
            Some(selector) => {
                directory.insert(selector, name.to_owned(), entry)?;
                Ok(selector)
            }
        }
    }

    /// Remove (and return) the entry at `selector`, if one is present.
    ///
    /// Should the guest have had that entry selected, its selection is
    /// cleared.
    pub fn remove(&self, selector: u16) -> Option<Entry> {
        let mut state = self.state.lock().unwrap();
        let removed = state.directory.remove(selector);
        if removed.is_some() {
            Self::ensure_valid_selected(&mut state);
        }
        removed
    }
    /// Remove (and return) the entry recorded under `name`, if one is present.
    ///
    /// Should the guest have had that entry selected, its selection is
    /// cleared.
    pub fn remove_named(&self, name: &str) -> Option<Entry> {
        let mut state = self.state.lock().unwrap();

        let directory = &mut state.directory;
        let selector = directory.named_selector(name)?;
        let removed = directory
            .remove(selector)
            .expect("entry is present for translated selector");

        Self::ensure_valid_selected(&mut state);
        Some(removed)
    }

    /// Drop the guest's selection if the entry it refers to no longer exists.
    ///
    /// Meant to be called after the directory contents have changed.  Losing
    /// the selection this way should only happen when the VMM removes an item
    /// which the running guest had selected -- a rather unsporting thing to
    /// do.
    fn ensure_valid_selected(state: &mut MutexGuard<State>) {
        let Some(selector) = state.selected.as_ref().map(|sel| sel.selector)
        else {
            return;
        };
        if state.directory.entry(selector).is_none() {
            state.selected = None;
        }
    }

    /// Service a guest port read.
    ///
    /// An access of unexpected width to any of the ports yields zeroes.
    /// Otherwise:
    /// - the selector port reports the current selector (or [ITEM_INVALID]
    ///   when nothing is selected)
    /// - the data port yields the next byte of the selected entry (zero when
    ///   no byte could be produced)
    /// - the DMA ports report the latched address halves, in big-endian form
    ///
    /// # Panics
    ///
    /// Panics if `port` is not one of the ports registered by the device.
    fn pio_read(&self, port: u16, ro: &mut ReadOp) {
        let mut state = self.state.lock().unwrap();
        match port {
            FW_CFG_IOP_SELECTOR => match ro.len() {
                2 => {
                    let current = state
                        .selected
                        .as_ref()
                        .map_or(ITEM_INVALID, |sel| sel.selector);
                    ro.write_u16(current);
                }
                _ => {
                    ro.fill(0);
                }
            },
            FW_CFG_IOP_DATA => {
                if ro.len() != 1 {
                    ro.fill(0);
                    return;
                }

                let mut child = ReadOp::new_child(0, ro, 0..1);
                let outcome = self.read(&mut state, &mut child);
                // Anything short of exactly one emitted byte reads as zero
                if !matches!(outcome, Ok(1)) {
                    ro.write_u8(0);
                }
            }
            FW_CFG_IOP_DMA_HI => {
                if ro.len() != 4 {
                    ro.fill(0);
                } else {
                    ro.write_u32(state.dma_addr_high.to_be());
                }
            }
            FW_CFG_IOP_DMA_LO => {
                if ro.len() != 4 {
                    ro.fill(0);
                } else {
                    ro.write_u32(state.dma_addr_low.to_be());
                }
            }
            _ => {
                panic!("unexpected port {:x}", port);
            }
        }
    }

    /// Service a guest port write.
    ///
    /// Writes of unexpected width are ignored.  A 2-byte write to the selector
    /// port changes the selected entry, and 4-byte writes to the DMA ports
    /// latch the (big-endian) halves of the request address, with a write to
    /// the low half also kicking off the DMA operation.
    ///
    /// # Panics
    ///
    /// Panics if `port` is not one of the ports registered by the device.
    fn pio_write(&self, port: u16, wo: &mut WriteOp) {
        let mut state = self.state.lock().unwrap();
        let width = wo.len();
        match port {
            FW_CFG_IOP_SELECTOR => {
                if width == 2 {
                    let selector = wo.read_u16();
                    self.select(&mut state, selector);
                }
            }
            // Writes through the legacy (non-DMA) interface are not
            // supported, and thus ignored.
            FW_CFG_IOP_DATA => {}
            FW_CFG_IOP_DMA_HI => {
                if width == 4 {
                    state.dma_addr_high = u32::from_be(wo.read_u32());
                }
            }
            FW_CFG_IOP_DMA_LO => {
                if width == 4 {
                    state.dma_addr_low = u32::from_be(wo.read_u32());
                    // The outcome is not acted upon at this level
                    let _ = self.dma_initiate(&mut state);
                }
            }
            _ => {
                panic!("unexpected port {:x}", port);
            }
        }
    }

    /// Make `selector` the selected entry, with its offset reset to zero.
    ///
    /// Any existing selection is dropped first, so choosing a selector for
    /// which no entry exists leaves the device with nothing selected.
    fn select(&self, state: &mut MutexGuard<State>, selector: u16) {
        state.selected = None;

        let Some(entry) = state.directory.entry(selector) else {
            return;
        };
        let cached_value = match entry {
            // Cache the rendered file directory, if selected
            Entry::FileDir => Some(state.directory.render()),
            Entry::RamFb | Entry::Bytes(_) => None,
        };
        state.selected =
            Some(SelectedEntry { selector, offset: 0, cached_value });
    }

    /// Copy data from the selected entry (starting at its current offset) into
    /// `ro`, advancing the offset by the number of bytes emitted.
    ///
    /// Returns the number of bytes emitted, which is zero when nothing is
    /// selected or the offset lies at/beyond the end of the entry.
    ///
    /// # Errors
    ///
    /// [FwCfgErr::OpUnsuccessful] if the RamFB device rejects the access.
    ///
    /// # Panics
    ///
    /// Panics if the selection refers to a missing entry, if the file
    /// directory is selected without its cached rendering, or if the offset
    /// would overflow.
    fn read(
        &self,
        state: &mut MutexGuard<State>,
        ro: &mut ReadOp,
    ) -> Result<usize, FwCfgErr> {
        let state = state.deref_mut();

        // Reads to a non-existent entry result in no emitted bytes (and the
        // caller filling the remaining buffer with zeros).  This is in contrast
        // to attempted writes to missing entries resulting in a hard error.
        let Some(selected) = state.selected.as_mut() else {
            return Ok(0);
        };

        let entry = state
            .directory
            .entry(selected.selector)
            .expect("selected entry is present");
        // Encode the current offset into the ReadOp (as a child)
        let mut ro = ReadOp::new_child(selected.offset as usize, ro, ..);

        /// Emit as much of `src` (from the offset carried by `ro`) as fits
        fn copy_from_offset(src: &[u8], ro: &mut ReadOp) -> usize {
            match src.get(ro.offset()..) {
                Some(remain) if !remain.is_empty() => {
                    let copy_len = usize::min(remain.len(), ro.avail());
                    ro.write_bytes(&remain[..copy_len]);
                    copy_len
                }
                _ => 0,
            }
        }

        let len = match (selected.cached_value.as_deref(), entry) {
            (Some(cached), _) => copy_from_offset(cached, &mut ro),
            (None, Entry::Bytes(bytes)) => copy_from_offset(bytes, &mut ro),
            (None, Entry::RamFb) => match state.ramfb.as_ref() {
                Some(ramfb) => {
                    ramfb
                        .fwcfg_rw(RWOp::Read(&mut ro))
                        .map_err(|_| FwCfgErr::OpUnsuccessful)?;
                    ro.bytes_written()
                }
                None => 0,
            },
            (None, Entry::FileDir) => {
                panic!("expected intact cached buffer for static entry");
            }
        };

        selected.offset = selected
            .offset
            .checked_add(len as u32)
            .expect("offset does not overflow");
        Ok(len)
    }

    /// Pass the data in `wo` to the selected entry (at its current offset),
    /// advancing the offset by the number of bytes consumed.
    ///
    /// Only [Entry::RamFb] entries accept writes; without a RamFB device
    /// attached, such a write consumes nothing.
    ///
    /// # Errors
    ///
    /// - [FwCfgErr::NoneSelected] if no entry is selected
    /// - [FwCfgErr::ReadOnly] if the selected entry does not accept writes
    /// - [FwCfgErr::OpUnsuccessful] if the RamFB device rejects the access
    ///
    /// # Panics
    ///
    /// Panics if the selection refers to a missing entry, or if the offset
    /// would overflow.
    fn write(
        &self,
        state: &mut MutexGuard<State>,
        wo: &mut WriteOp,
    ) -> Result<usize, FwCfgErr> {
        let state = state.deref_mut();
        let selected = state.selected.as_mut().ok_or(FwCfgErr::NoneSelected)?;
        let entry = state
            .directory
            .entry(selected.selector)
            .expect("selected entry is present");
        // Encode the current offset into the WriteOp (as a child)
        let mut wo = WriteOp::new_child(selected.offset as usize, wo, ..);

        let len = match entry {
            Entry::FileDir | Entry::Bytes(_) => {
                return Err(FwCfgErr::ReadOnly);
            }
            Entry::RamFb => match state.ramfb.as_ref() {
                Some(ramfb) => {
                    ramfb
                        .fwcfg_rw(RWOp::Write(&mut wo))
                        .map_err(|_| FwCfgErr::OpUnsuccessful)?;
                    wo.bytes_read()
                }
                None => 0,
            },
        };

        selected.offset = selected
            .offset
            .checked_add(len as u32)
            .expect("offset does not overflow");
        Ok(len)
    }

    /// Run the DMA request whose descriptor sits at the guest-physical address
    /// currently latched in the DMA address registers.
    ///
    /// The latched address is cleared, the [FwCfgDmaAccess] descriptor is read
    /// from guest memory, and the requested operation is carried out.  The
    /// outcome is reported to the guest by overwriting the first four bytes of
    /// the descriptor (its control field): zero on success, or the big-endian
    /// `ERROR` bit on failure.
    ///
    /// # Errors
    ///
    /// - [FwCfgErr::BadAddr] if the descriptor cannot be read (in which case
    ///   nothing is written back to the guest)
    /// - [FwCfgErr::UnrecognizedDmaCmd] if the low 16 control bits contain
    ///   flags which are not recognized
    /// - any error produced by the requested operation itself
    ///
    /// # Panics
    ///
    /// Panics if the memory accessor is not usable.
    fn dma_initiate(
        &self,
        state: &mut MutexGuard<State>,
    ) -> Result<(), FwCfgErr> {
        // initiating a DMA transfer clears the addr contents
        let addr = state.dma_addr();
        state.dma_addr_high = 0;
        state.dma_addr_low = 0;

        let mem_guard = self.acc_mem.access().expect("usable mem accessor");
        let mem = mem_guard.deref();
        let req: GuestData<FwCfgDmaAccess> =
            mem.read(GuestAddr(addr)).ok_or(FwCfgErr::BadAddr)?;

        /// Report the outcome of the request by overwriting its control field
        fn dma_write_result(
            is_success: bool,
            req_addr: GuestAddr,
            mem: &MemCtx,
        ) {
            let result_val = match is_success {
                true => 0,
                false => u32::to_be(FwCfgDmaCtrl::ERROR.bits()),
            };
            mem.write(req_addr, &result_val);
        }

        // Do we recognize all of the control bits?
        //
        // The upper 16 bits are masked out, as they will contain the entry
        // selector when the SELECT function is specified.
        if FwCfgDmaCtrl::from_bits(req.ctrl.get() & 0xffff).is_none() {
            dma_write_result(false, GuestAddr(addr), mem);
            return Err(FwCfgErr::UnrecognizedDmaCmd);
        }

        let res = self.dma_operation(state, req, mem);

        dma_write_result(res.is_ok(), GuestAddr(addr), mem);
        // The guest has been told of the outcome; let the caller see any
        // failure too, rather than masking it as a success.
        res
    }

    /// Carry out the operation described by the DMA request `req`.
    ///
    /// If the SELECT bit is set, the selector held in the upper 16 bits of the
    /// control field is selected first.  After that, at most one of the
    /// READ, WRITE, or SKIP operations is performed (in that order of
    /// precedence) against the selected entry.
    ///
    /// # Errors
    ///
    /// - [FwCfgErr::BadAddr] if the guest buffer named by the request cannot
    ///   be mapped
    /// - any error from the underlying read or write of the entry
    fn dma_operation(
        &self,
        state: &mut MutexGuard<State>,
        req: GuestData<FwCfgDmaAccess>,
        mem: &MemCtx,
    ) -> Result<(), FwCfgErr> {
        let ctrl = req.ctrl.get();
        let opts = FwCfgDmaCtrl::from_bits_truncate(ctrl);
        if opts.contains(FwCfgDmaCtrl::SELECT) {
            self.select(state, (ctrl >> 16) as u16);
        }

        // The expressed precedence of the available operations here is entirely
        // intentional.  Per the (paraphrased) fw_cfg documentation in qemu:
        //
        // - If the READ bit is set, a read operation will be performed
        // - If the WRITE bit is set (and not READ), a write operation will
        //   be performed
        // - If the SKIP bit is set (and neither READ nor WRITE), a skip
        //   operation will be performed
        if opts.contains(FwCfgDmaCtrl::READ) {
            let buf_len = req.len.get() as usize;
            let region = GuestRegion(GuestAddr(req.addr.get()), buf_len);
            let map =
                mem.writable_region(&region).ok_or(FwCfgErr::BadAddr)?;
            let mut ro = ReadOp::from_mapping(0, map);

            let nread = self.read(state, &mut ro)?;
            if nread < buf_len {
                // If the item being read did not cover the entire DMA region in
                // the request, zero out the rest
                assert!(ro.avail() > 0);
                ro.fill(0);
            }
            return Ok(());
        }

        if opts.contains(FwCfgDmaCtrl::WRITE) {
            let buf_len = req.len.get() as usize;
            let region = GuestRegion(GuestAddr(req.addr.get()), buf_len);
            let map =
                mem.readable_region(&region).ok_or(FwCfgErr::BadAddr)?;
            let mut wo = WriteOp::from_mapping(0, map);
            self.write(state, &mut wo)?;
            return Ok(());
        }

        if opts.contains(FwCfgDmaCtrl::SKIP) {
            if let Some(selected) = state.selected.as_mut() {
                selected.offset = selected.offset.saturating_add(req.len.get());
            }
        }

        Ok(())
    }
}

impl Lifecycle for FwCfg {
    /// Name reported for this device through the [Lifecycle] interface
    fn type_name(&self) -> &'static str {
        "qemu-fwcfg"
    }
    /// The device migrates as a single payload (see its [MigrateSingle] impl)
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Single(self)
    }
    /// Clear the guest-driven state (selection and latched DMA address)
    fn reset(&self) {
        let mut state = self.state.lock().unwrap();
        state.reset();
    }
}
impl MigrateSingle for FwCfg {
    fn export(
        &self,
        _ctx: &MigrateCtx,
    ) -> Result<PayloadOutput, MigrateStateError> {
        let state = self.state.lock().unwrap();
        let selected =
            state.selected.as_ref().map(|sel| migrate::FwCfgSelectedV2 {
                selector: sel.selector,
                offset: sel.offset,
                cached_value: sel.cached_value.clone(),
            });
        let entries = state
            .directory
            .entries()
            .map(|(selector, name, entry)| migrate::FwCfgEntryV2 {
                selector,
                name: name.to_owned(),
                value: entry.into(),
            })
            .collect::<Vec<_>>();

        Ok(migrate::FwCfgV2 { dma_addr: state.dma_addr(), selected, entries }
            .into())
    }

    fn import(
        &self,
        mut offer: PayloadOffer,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let mut data: migrate::FwCfgV2 = offer.parse()?;

        let mut state = self.state.lock().unwrap();
        state.dma_addr_low = data.dma_addr as u32;
        state.dma_addr_high = (data.dma_addr >> 32) as u32;
        state.selected = data.selected.take().map(|s| SelectedEntry {
            selector: s.selector,
            offset: s.offset,
            cached_value: s.cached_value,
        });

        state.directory.clear();
        for migrate::FwCfgEntryV2 { selector, name, value } in data.entries {
            state.directory.insert(selector, name, value.into()).map_err(
                |e| {
                    MigrateStateError::ImportFailed(format!(
                        "error importing fwcfg entry: {e:?}"
                    ))
                },
            )?;
        }
        Self::ensure_valid_selected(&mut state);
        Ok(())
    }
}

/// Serialized forms of the `fw_cfg` device state, as used for migration.
pub mod migrate {
    use crate::migrate::*;

    use serde::{Deserialize, Serialize};

    /// Complete migrated state of the device
    #[derive(Deserialize, Serialize)]
    pub struct FwCfgV2 {
        /// Latched DMA request address (both halves combined)
        pub dma_addr: u64,
        /// Entry selected by the guest, if any
        pub selected: Option<FwCfgSelectedV2>,
        /// Every entry held in the directory
        pub entries: Vec<FwCfgEntryV2>,
    }
    /// Migrated form of the guest's current selection
    #[derive(Deserialize, Serialize)]
    pub struct FwCfgSelectedV2 {
        /// Selector of the chosen entry
        pub selector: u16,
        /// Byte offset of the guest within that entry
        pub offset: u32,
        /// Cached contents held for the selection, if any
        pub cached_value: Option<Vec<u8>>,
    }
    /// Migrated form of a single directory entry
    #[derive(Deserialize, Serialize)]
    pub struct FwCfgEntryV2 {
        /// Selector at which the entry resides
        pub selector: u16,
        /// Name under which the entry is recorded
        pub name: String,
        /// Content of the entry
        pub value: FwCfgEntryValueV2,
    }
    /// Migrated form of [super::Entry], mirroring it variant for variant
    #[derive(Deserialize, Serialize, Clone)]
    pub enum FwCfgEntryValueV2 {
        FileDir,
        RamFb,
        Bytes(Vec<u8>),
    }
    impl From<&super::Entry> for FwCfgEntryValueV2 {
        /// Convert for export; the payload of a `Bytes` entry is copied
        fn from(value: &super::Entry) -> Self {
            match value {
                super::Entry::FileDir => Self::FileDir,
                super::Entry::RamFb => Self::RamFb,
                super::Entry::Bytes(buf) => Self::Bytes(buf.clone()),
            }
        }
    }
    impl From<FwCfgEntryValueV2> for super::Entry {
        /// Convert for import; the payload of a `Bytes` entry is moved
        fn from(value: FwCfgEntryValueV2) -> Self {
            match value {
                FwCfgEntryValueV2::FileDir => Self::FileDir,
                FwCfgEntryValueV2::RamFb => Self::RamFb,
                FwCfgEntryValueV2::Bytes(buf) => Self::Bytes(buf),
            }
        }
    }

    impl Schema<'_> for FwCfgV2 {
        /// Identify the payload by device name and schema version
        fn id() -> SchemaId {
            ("qemu-fwcfg", 2)
        }
    }
}

/// Constants and guest-visible structure layouts of the `fw_cfg` interface.
mod bits {
    #![allow(unused)]

    use zerocopy::byteorder::big_endian::{
        U16 as BE16, U32 as BE32, U64 as BE64,
    };
    use zerocopy::{FromBytes, Immutable, IntoBytes};

    // I/O ports through which the guest reaches the device
    pub const FW_CFG_IOP_SELECTOR: u16 = 0x0510;
    pub const FW_CFG_IOP_DATA: u16 = 0x0511;
    pub const FW_CFG_IOP_DMA_HI: u16 = 0x0514;
    pub const FW_CFG_IOP_DMA_LO: u16 = 0x0518;

    /// Selector which never refers to an entry.  It is refused on insertion
    /// and reported to the guest when nothing is selected.
    pub const ITEM_INVALID: u16 = 0xffff;
    /// First selector handed out to named entries
    pub const ITEMS_FILE_START: u16 = 0x0020;
    /// End (exclusive) of the selector range handed out to named entries
    pub const ITEMS_FILE_END: u16 = 0x1000;
    // NOTE(review): these presumably bound the range of architecture-specific
    // selectors (the `LegacyX86Id` values start at 0x8000) -- confirm.
    pub const ITEMS_ARCH_START: u16 = 0x8000;
    pub const ITEMS_ARCH_END: u16 = 0x9000;

    // Feature bits which make up the value of the `Id` entry
    pub const FW_CFG_VER_BASE: u32 = 1 << 0;
    pub const FW_CFG_VER_DMA: u32 = 1 << 1;

    bitflags! {
        /// Flags found in the low bits of the control field of a DMA request.
        ///
        /// `ERROR` is what the device writes back to the guest upon failure,
        /// while the remaining flags are requests made by the guest.
        pub struct FwCfgDmaCtrl: u32 {
            const ERROR = 1 << 0;
            const READ = 1 << 1;
            const SKIP = 1 << 2;
            const SELECT = 1 << 3;
            const WRITE = 1 << 4;
        }
    }

    /// Size of the name field in a [FwCfgFile] record
    pub const FWCFG_FILENAME_LEN: usize = 56;

    /// One record of the rendered file directory
    #[derive(IntoBytes, Immutable)]
    #[repr(C)]
    pub struct FwCfgFile {
        /// Size of the entry (big-endian)
        size: BE32,
        /// Selector of the entry (big-endian)
        select: BE16,
        /// Always zero
        reserved: u16,
        /// Name of the entry, padded with zero bytes
        name: [u8; FWCFG_FILENAME_LEN],
    }
    impl FwCfgFile {
        /// Build the record for an entry of the given `size`, `select`or, and
        /// `name`.
        ///
        /// # Panics
        ///
        /// Panics unless `name` is shorter than [FWCFG_FILENAME_LEN] bytes,
        /// which guarantees at least one trailing zero byte in the name field.
        pub fn new(size: u32, select: u16, name: &str) -> Self {
            let name_len = name.len();
            assert!(name_len < FWCFG_FILENAME_LEN);

            let mut this = Self {
                size: BE32::new(size),
                select: BE16::new(select),
                reserved: 0,
                name: [0; FWCFG_FILENAME_LEN],
            };
            this.name[..name_len].copy_from_slice(name.as_bytes());
            this
        }
    }

    /// DMA request descriptor, as read from guest memory
    #[derive(IntoBytes, Default, Copy, Clone, Debug, FromBytes)]
    #[repr(C)]
    pub struct FwCfgDmaAccess {
        /// [FwCfgDmaCtrl] flags in the low 16 bits; the upper 16 bits hold the
        /// selector when `SELECT` is requested.  The device overwrites this
        /// field with the outcome of the request.
        pub ctrl: BE32,
        /// Length (in bytes) of the transfer or skip
        pub len: BE32,
        /// Guest-physical address of the data buffer for reads and writes
        pub addr: BE64,
    }
}

#[cfg(test)]
mod test {
    use super::*;

    use crate::accessors::MemAccessor;
    use crate::common::GuestAddr;
    use crate::vmm::Machine;

    use zerocopy::{FromBytes, Immutable, IntoBytes};

    fn pio_write<T: IntoBytes + Immutable>(dev: &FwCfg, port: u16, data: T) {
        let buf = data.as_bytes();
        let mut wo = WriteOp::from_buf(0, buf);
        dev.pio_write(port, &mut wo);
    }
    fn pio_read<T: IntoBytes + Immutable + FromBytes + Copy + Default>(
        dev: &FwCfg,
        port: u16,
    ) -> T {
        let mut val = T::default();
        let mut ro = ReadOp::from_buf(0, val.as_mut_bytes());
        dev.pio_read(port, &mut ro);
        val
    }
    fn pio_read_data<T: IntoBytes + FromBytes + Copy + Default>(
        dev: &FwCfg,
    ) -> T {
        let mut val = T::default();
        for c in val.as_mut_bytes().iter_mut() {
            *c = pio_read(dev, FW_CFG_IOP_DATA);
        }
        val
    }

    #[test]
    fn struct_sizing() {
        assert_eq!(std::mem::size_of::<FwCfgFile>(), 64);
        assert_eq!(std::mem::size_of::<FwCfgDmaAccess>(), 16);
    }

    /// Selecting the signature entry and reading the data port yields "QEMU";
    /// the ID entry can be selected and read as well.
    #[test]
    fn pio_read_basic() {
        let dev = FwCfg::new();

        pio_write(&dev, FW_CFG_IOP_SELECTOR, LegacyId::Signature as u16);
        let signature: [u8; 4] = pio_read_data(&dev);
        assert_eq!(&signature, b"QEMU");

        // Contents of the ID entry are not checked here, only that selecting
        // and reading it works.
        pio_write(&dev, FW_CFG_IOP_SELECTOR, LegacyId::Id as u16);
        let _id: [u8; 4] = pio_read_data(&dev);
    }
    /// Reading data after selecting an entry which does not exist produces
    /// nothing but zeroes.
    #[test]
    fn pio_read_missing() {
        let dev = FwCfg::new();

        pio_write(&dev, FW_CFG_IOP_SELECTOR, 0xfffe);
        let contents: [u8; 4] = pio_read_data(&dev);

        assert_eq!(contents, [0u8; 4]);
    }

    /// The ID entry advertises both the base interface and DMA support.
    #[test]
    fn read_version() {
        let dev = FwCfg::new();
        pio_write(&dev, FW_CFG_IOP_SELECTOR, LegacyId::Id as u16);

        // The four data bytes are interpreted in native byte order.
        let version = u32::from_ne_bytes(pio_read_data::<[u8; 4]>(&dev));
        assert_eq!(version, FW_CFG_VER_BASE | FW_CFG_VER_DMA);
    }

    /// Create a test machine with a fresh `FwCfg` attached to its PIO bus and
    /// memory accessor, returning them along with a child memory accessor for
    /// use by the test itself.
    fn machine_setup() -> (Machine, Arc<FwCfg>, MemAccessor) {
        let machine = Machine::new_test().unwrap();

        let fwcfg = FwCfg::new();
        fwcfg.attach(&machine.bus_pio, &machine.acc_mem);

        let test_acc = machine.acc_mem.child(None);
        (machine, fwcfg, test_acc)
    }

    /// Host-endian description of a DMA request, which `write_dma_req`
    /// converts to big-endian and places in guest memory.
    struct DmaReq {
        /// Control word, written at offset 0 of the request.
        ctrl: u32,
        /// Transfer length, written at offset 4 of the request.
        len: u32,
        /// Transfer address, written at offset 8 of the request.
        addr: u64,
    }
    fn write_dma_req(mem: &MemCtx, req_addr: u64, req: DmaReq) {
        mem.write(GuestAddr(req_addr), &u32::to_be(req.ctrl));
        mem.write(GuestAddr(req_addr + 4), &u32::to_be(req.len));
        mem.write(GuestAddr(req_addr + 8), &u64::to_be(req.addr));
    }
    /// Hand the guest address of a DMA request to the device by writing its
    /// upper 32 bits to the high DMA port and then its lower 32 bits to the
    /// low DMA port, each as a big-endian value.
    fn submit_dma_req(dev: &FwCfg, req_addr: u64) {
        let upper = (req_addr >> 32) as u32;
        let lower = req_addr as u32;

        pio_write(dev, FW_CFG_IOP_DMA_HI, upper.to_be());
        pio_write(dev, FW_CFG_IOP_DMA_LO, lower.to_be());
    }

    /// A DMA request selecting the signature entry and reading four bytes
    /// completes with a zeroed control word and "QEMU" at the destination.
    #[test]
    fn dma_read_basic() {
        let (_machine, dev, acc_mem) = machine_setup();
        let mem = acc_mem.access().unwrap();

        let req_addr: u64 = 0x10_1000;
        let dma_addr: u64 = 0x10_2000;

        // The item to select goes in the upper half of the control word, with
        // the operation flags in the lower half.
        let select = u32::from(LegacyId::Signature as u16) << 16;
        let req = DmaReq { ctrl: select | 0x000a, len: 4, addr: dma_addr };
        write_dma_req(&mem, req_addr, req);
        submit_dma_req(&dev, req_addr);

        // The control word reads back as zero once the request is done
        let ctrl_after = *mem.read::<u32>(GuestAddr(req_addr)).unwrap();
        assert_eq!(ctrl_after, 0);

        let data = mem.read::<[u8; 4]>(GuestAddr(dma_addr)).unwrap();
        assert_eq!(&*data, "QEMU".as_bytes());
    }

    /// A DMA read of an entry which does not exist still completes, and
    /// overwrites the destination with zeroes.
    #[test]
    fn dma_read_missing() {
        let (_machine, dev, acc_mem) = machine_setup();
        let mem = acc_mem.access().unwrap();

        let req_addr: u64 = 0x10_1000;
        let dma_addr: u64 = 0x10_2000;

        // Pre-fill the destination with a non-zero pattern so that the
        // zeroes checked for below must have come from the device.
        mem.write(GuestAddr(dma_addr), &[0xffu8; 4]);

        // Request a 4-byte read of an entry which is not present
        let req =
            DmaReq { ctrl: (0xfffe << 16) | 0x000a, len: 4, addr: dma_addr };
        write_dma_req(&mem, req_addr, req);
        submit_dma_req(&dev, req_addr);

        // The control word reads back as zero once the request is done
        let ctrl_after = *mem.read::<u32>(GuestAddr(req_addr)).unwrap();
        assert_eq!(ctrl_after, 0);

        let data = mem.read::<[u8; 4]>(GuestAddr(dma_addr)).unwrap();
        assert_eq!(*data, [0u8; 4]);
    }

    /// Resetting the device clears the selected item and the high DMA
    /// address field.
    #[test]
    fn state_cleared_on_reset() {
        let (_machine, dev, _acc_mem) = machine_setup();

        let selected = LegacyId::Id as u16;
        let dma_val: u32 = 0x1234_5678;

        // Dirty the device state.  Only the high DMA field is written, since
        // writing the low field would initiate an operation.
        pio_write(&dev, FW_CFG_IOP_SELECTOR, selected);
        pio_write(&dev, FW_CFG_IOP_DMA_HI, dma_val);

        // Both values must be visible prior to the reset...
        assert_eq!(selected, pio_read::<u16>(&dev, FW_CFG_IOP_SELECTOR));
        assert_eq!(dma_val, pio_read::<u32>(&dev, FW_CFG_IOP_DMA_HI));

        dev.reset();

        // ... and gone after it
        assert_eq!(
            bits::ITEM_INVALID,
            pio_read::<u16>(&dev, FW_CFG_IOP_SELECTOR)
        );
        assert_eq!(0, pio_read::<u32>(&dev, FW_CFG_IOP_DMA_HI));
    }
}

pub mod formats {
    use super::Entry;
    use crate::hw::pci;
    use zerocopy::{Immutable, IntoBytes};

    /// A type for a range described in an E820 map entry.
    ///
    /// This is canonically defined as the ACPI "Address Range Types", though we
    /// only define the types we use, which are a subset of the types that EDK2
    /// is known to care about, which itself is a subset of types that ACPI and
    /// OEMs define or guest OSes may care about.
    ///
    /// The discriminant is stored as a `u32` (`repr(u32)`), which is what ends
    /// up in the final four bytes of each serialized [`E820Entry64`].
    #[derive(IntoBytes, Immutable)]
    #[repr(u32)]
    enum EfiAcpiMemoryType {
        /// Usable memory; the type recorded by `E820Table::add_mem`.
        Memory = 1,
        /// Reserved range; the type recorded by `E820Table::add_reserved`.
        Reserved = 2,
        // For reference, though these types are unused.
        // Acpi = 3,
        // Nvs = 4,
    }

    /// One address/length/type entry in the E820 map.
    ///
    /// This is... almost defined by ACPI's "Address Range Descriptor Structure"
    /// table, under "INT 15H, E820". Critically, ACPI defines this structure
    /// with an additional "Extended Attributes" field which EDK2 does not know
    /// about and so we do not provide. Consequently the size of this struct is
    /// 20 bytes as defined in `OvmfPkg/Include/IndustryStandard/E820.h` rather
    /// than the ACPI definition's 24 bytes.
    ///
    /// The struct is `repr(C, packed)` so that no padding is added after the
    /// 4-byte type field, keeping the size at 8 + 8 + 4 = 20 bytes.
    #[derive(IntoBytes, Immutable)]
    #[repr(C, packed)]
    struct E820Entry64 {
        /// Address at which the described range starts.
        base_addr: u64,
        /// Length of the described range.
        length: u64,
        /// Kind of range being described.
        ty: EfiAcpiMemoryType,
    }

    /// A list of E820 memory map entries.
    ///
    /// This is not defined by ACPI, but is an EDK2 implementation of a QMEU
    /// construct to communicate an E820 map to the firmware. It is parsed by
    /// EDK2 and added to its EFI memory map; it is not, itself, the memory map
    /// that OVMF presents via UEFI services. It is not required to be sorted,
    /// and EDK2 ignores entries starting below 4 GiB. Adding additional
    /// low-memory entries is not harmful, but not valuable to EDK2 either.
    pub struct E820Table(Vec<E820Entry64>);
    impl E820Table {
        pub fn new() -> Self {
            Self(Vec::new())
        }

        /// Add an address range corresponding to usable memory.
        pub fn add_mem(&mut self, base_addr: u64, length: u64) {
            self.0.push(E820Entry64 {
                base_addr,
                length,
                ty: EfiAcpiMemoryType::Memory,
            });
        }

        /// Add a reserved address, not to be used by the guest OS.
        pub fn add_reserved(&mut self, base_addr: u64, length: u64) {
            self.0.push(E820Entry64 {
                base_addr,
                length,
                ty: EfiAcpiMemoryType::Reserved,
            });
        }

        pub fn finish(self) -> Entry {
            Entry::Bytes(self.0.as_bytes().to_vec())
        }
    }

    #[cfg(test)]
    mod test_e820 {
        use super::{E820Entry64, E820Table};
        use crate::hw::qemu::fwcfg::Entry;

        #[test]
        fn entry_size_is_correct() {
            // EDK2 decodes the bytes we provide according to its own
            // definition of an E820 entry, so the size of our definition has
            // to agree with it.
            let entry_size = std::mem::size_of::<E820Entry64>();
            assert_eq!(entry_size, 20);
        }

        #[test]
        fn basic() {
            // rustfmt::skip on the expected bytes because eight bytes per line
            // lines up with the 64-bit fields of each entry. rustfmt would
            // otherwise pack ten bytes per row to fill the 80-column width,
            // which is much harder to eyeball.
            #[rustfmt::skip]
            const FIRST_ENTRY: [u8; 20] = [
                0x10, 0x00, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
                0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11,
                0x01, 0x00, 0x00, 0x00,
            ];

            #[rustfmt::skip]
            const SECOND_ENTRY: [u8; 20] = [
                0xf0, 0xff, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
                0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
                0x02, 0x00, 0x00, 0x00,
            ];

            #[rustfmt::skip]
            const THIRD_ENTRY: [u8; 20] = [
                0x00, 0x00, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
                0x99, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11,
                0x01, 0x00, 0x00, 0x00,
            ];

            let mut table = E820Table::new();

            // The values are arbitrary bit patterns, picked so the layout is
            // easy to recognize in the expected bytes above.
            //
            // The table does not itself check for overlapping ranges. In
            // practice it is constructed directly from an ASpace, which does
            // perform those checks.
            table.add_mem(0x0102_0304_0506_0010, 0x1122_3344_5566_7788);
            table.add_reserved(0x0102_0304_0506_fff0, 0xffee_ddcc_bbaa_9988);

            // Ordering is not required either: ACPI does not imply that the
            // map should be sorted, nor do EDK2 or guest OSes, even though
            // entries are often enumerated in address order.
            table.add_mem(0x0102_0304_0506_0000, 0x1122_3344_5566_7799);

            let bytes = match table.finish() {
                Entry::Bytes(bytes) => bytes,
                other => {
                    panic!(
                        "entry did not produce bytes, but instead {:?}",
                        other
                    );
                }
            };

            let expected_size =
                FIRST_ENTRY.len() + SECOND_ENTRY.len() + THIRD_ENTRY.len();
            assert_eq!(bytes.len(), expected_size);

            let expected: [(&[u8; 20], &str); 3] = [
                (&FIRST_ENTRY, "First E820 entry"),
                (&SECOND_ENTRY, "Second E820 entry"),
                (&THIRD_ENTRY, "Third E820 entry"),
            ];

            for ((want, entry_name), got) in
                expected.iter().zip(bytes.chunks(20))
            {
                assert_eq!(
                    got,
                    &want[..],
                    "{} contents are incorrect",
                    entry_name
                );
            }
        }
    }

    /// Collect one or more device selections for use in generating a boot
    /// order `fw_cfg` entry, suitable for consumption by OVMF bootrom.
    pub struct BootOrder(Vec<String>);
    impl BootOrder {
        /// Create a boot order containing no device selections.
        pub fn new() -> Self {
            Self(Vec::new())
        }

        /// Add a generic disk
        pub fn add_disk(&mut self, loc: pci::BusLocation) {
            // "scsi" is the device name which the OVMF logic looks for
            let mut path = Self::format_pci(loc, "scsi");
            path.push_str("/disk@0,0");
            self.0.push(path);
        }

        /// Add generic PCI device
        ///
        /// For example, one might add an entry for an ethernet NIC as such:
        /// ```
        /// # use propolis::hw::qemu::fwcfg::formats::BootOrder;
        /// # use propolis::hw::pci::BusLocation;
        /// # let mut bootorder = BootOrder::new();
        /// # let bus_loc = BusLocation::new(1, 0).unwrap();
        /// bootorder.add_pci(bus_loc, "ethernet");
        /// ```
        pub fn add_pci(&mut self, loc: pci::BusLocation, kind: &str) {
            let path = Self::format_pci(loc, kind);
            self.0.push(path);
        }

        /// Add an NVMe disk.  This assumes namespace 1, as our NVMe emulation
        /// does not currently support multiple namespaces in a device.
        pub fn add_nvme(&mut self, loc: pci::BusLocation, eui: u64) {
            // OVMF only recognizes a bootorder entry as an NVMe device when
            // it is named for vendor=0x8086 and device=5845.  As long as that
            // logic is hardcoded there, the entry emitted here has to match
            // it, regardless of the identity the actual device presents.
            //
            // For more details, see the TranslatePciOfwNodes() function in
            // OvmfPkg/Library/QemuBootOrderLib/QemuBootOrderLib.c.
            let device = Self::format_pci(loc, "pci8086,5845");
            let namespace = 1;
            self.0.push(format!("{device}/namespace@{namespace:x},{eui:x}"));
        }

        /// Render the contained boot order selections into a `fw_cfg` [Entry]
        ///
        /// The selections are emitted one per line, in the order they were
        /// added, followed by a final NUL-terminated `HALT` line.
        pub fn finish(self) -> Entry {
            let mut lines = self.0;
            lines.push(String::from("HALT\0"));

            Entry::Bytes(lines.join("\n").into_bytes())
        }

        /// Format the path of the PCI device `name` at the device and function
        /// of `loc`, both of which are rendered in hexadecimal.
        fn format_pci(loc: pci::BusLocation, name: &str) -> String {
            let slot: u8 = loc.dev.into();
            let func: u8 = loc.func.into();
            format!("/pci@i0cf8/{name}@{slot:x},{func:x}")
        }
    }

    #[cfg(test)]
    mod test_bootorder {
        use super::BootOrder;
        use crate::hw::pci::BusLocation;
        use crate::hw::qemu::fwcfg;

        /// One device of each supported kind renders to the expected lines,
        /// followed by the terminating `HALT` entry.
        #[test]
        fn basic() {
            let expected = [
                "/pci@i0cf8/scsi@1,2/disk@0,0",
                "/pci@i0cf8/ethernet@a,3",
                "/pci@i0cf8/pci8086,5845@1f,4/namespace@1,123456789abcd",
                // Trailing NUL is load-bearing
                "HALT\0",
            ];

            let mut order = BootOrder::new();
            order.add_disk(BusLocation::new_unchecked(1, 2));
            order.add_pci(BusLocation::new_unchecked(10, 3), "ethernet");
            order.add_nvme(BusLocation::new_unchecked(31, 4), 0x123456789abcd);

            let raw = match order.finish() {
                fwcfg::Entry::Bytes(bytes) => bytes,
                other => {
                    panic!("Unexpected entry type: {other:?}");
                }
            };

            let text =
                std::str::from_utf8(&raw).expect("bootorder is valid utf8");
            let entries: Vec<&str> = text.split('\n').collect();
            assert_eq!(&expected[..], &entries[..]);
        }
    }
}


================================================
FILE: lib/propolis/src/hw/qemu/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

pub mod debug;
pub mod fwcfg;
pub mod pvpanic;
pub mod ramfb;


================================================
FILE: lib/propolis/src/hw/qemu/pvpanic.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::{Arc, Mutex};

use crate::common::*;
use crate::pio::{PioBus, PioFn};

/// Implements the QEMU [pvpanic device], which
/// may be used by guests to notify the host when a kernel panic has occurred.
///
/// QEMU exposes the pvpanic virtual device as a device on the ISA bus (I/O port
/// 0x505), a PCI device, and through ACPI. Currently, Propolis only implements
/// the ISA bus pvpanic device, but the PCI device may be implemented in the
/// future.
///
/// [pvpanic device]: https://www.qemu.org/docs/master/specs/pvpanic.html
#[derive(Debug)]
pub struct QemuPvpanic {
    /// Running totals of the panic events written to the device by the guest.
    counts: Mutex<PanicCounts>,
    /// Logger to which each guest write to the device is reported.
    log: slog::Logger,
}

/// Counts the number of guest kernel panics reported using the [`QemuPvpanic`]
/// virtual device.
///
/// A single write to the device increments both counters if it has both the
/// host-handled and guest-handled bits set.
#[derive(Copy, Clone, Debug)]
pub struct PanicCounts {
    /// Counts the number of guest kernel panics handled by the host.
    ///
    /// Incremented for every write to the device with bit 0 set.
    pub host_handled: usize,
    /// Counts the number of guest kernel panics handled by the guest.
    ///
    /// Incremented for every write to the device with bit 1 set.
    pub guest_handled: usize,
}

/// Type name which [`QemuPvpanic`] reports through `Lifecycle::type_name`.
pub const DEVICE_NAME: &str = "qemu-pvpanic";

/// Indicates that a guest panic has happened and should be processed by the
/// host
const HOST_HANDLED: u8 = 0b01;
/// Indicates a guest panic has happened and will be handled by the guest; the
/// host should record it or report it, but should not affect the execution of
/// the guest.
const GUEST_HANDLED: u8 = 0b10;

// USDT probes for the pvpanic device, under the "propolis" provider.
//
// `pvpanic_pio_write` is fired with the raw byte of every guest write to the
// device's IO port.
#[usdt::provider(provider = "propolis")]
mod probes {
    fn pvpanic_pio_write(value: u8) {}
}

impl QemuPvpanic {
    /// IO port on which the device is registered.
    const IOPORT: u16 = 0x505;

    /// Create a pvpanic device with both of its panic counts at zero.
    pub fn create(log: slog::Logger) -> Arc<Self> {
        let counts = PanicCounts { host_handled: 0, guest_handled: 0 };
        Arc::new(Self { counts: Mutex::new(counts), log })
    }

    /// Attaches this pvpanic device to the provided [`PioBus`].
    ///
    /// # Panics
    ///
    /// Panics if registering the device's single IO port with `pio` fails.
    pub fn attach_pio(self: &Arc<Self>, pio: &PioBus) {
        let this = Arc::clone(self);
        let handler = Arc::new(move |_port: u16, rwo: RWOp| this.pio_rw(rwo))
            as Arc<PioFn>;
        pio.register(Self::IOPORT, 1, handler).unwrap();
    }

    /// Returns the current panic counts reported by the guest.
    pub fn panic_counts(&self) -> PanicCounts {
        let counts = self.counts.lock().unwrap();
        *counts
    }

    /// Handle a guest access to the device's IO port.
    ///
    /// Reads report the event bits supported by the device.  Writes are
    /// logged, and bump the counter matching each event bit set in the
    /// written value.
    fn pio_rw(&self, rwo: RWOp) {
        let wo = match rwo {
            RWOp::Read(ro) => {
                ro.write_u8(HOST_HANDLED | GUEST_HANDLED);
                return;
            }
            RWOp::Write(wo) => wo,
        };

        let value = wo.read_u8();
        probes::pvpanic_pio_write!(|| value);
        let host_handled = value & HOST_HANDLED != 0;
        let guest_handled = value & GUEST_HANDLED != 0;
        slog::debug!(
            self.log,
            "guest kernel panic";
            "host_handled" => host_handled,
            "guest_handled" => guest_handled,
        );

        // A bool converts to 0 or 1, so each count only advances when its
        // corresponding bit was set.
        let mut counts = self.counts.lock().unwrap();
        counts.host_handled += usize::from(host_handled);
        counts.guest_handled += usize::from(guest_handled);
    }
}

// Only the device type name is provided here; no other `Lifecycle` methods
// are overridden for this device.
impl Lifecycle for QemuPvpanic {
    /// Returns [`DEVICE_NAME`].
    fn type_name(&self) -> &'static str {
        DEVICE_NAME
    }
}


================================================
FILE: lib/propolis/src/hw/qemu/ramfb.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::future::Future;
use std::num::NonZeroUsize;
use std::pin::Pin;
use std::sync::{Arc, Mutex};
use std::task::{Context, Poll};
use std::time::Instant;

use crate::accessors::MemAccessor;
use crate::common::*;
use crate::migrate::*;
use crate::util::regmap::RegMap;
use crate::vmm::mem::SubMapping;
use crate::vmm::MemCtx;

use lazy_static::lazy_static;
use pin_project_lite::pin_project;
use rgb_frame::{FourCC, Frame, Spec};
use tokio::sync::{futures::Notified, Notify};

/// Identifiers for the registers making up the ramfb configuration entry.
///
/// Each one corresponds to the like-named field of `Config`.
#[derive(Copy, Clone, Eq, PartialEq)]
enum Reg {
    /// Guest address of the framebuffer (8 bytes)
    Addr,
    /// Pixel format code (4 bytes)
    FourCC,
    /// Flags (4 bytes)
    Flags,
    /// Framebuffer width (4 bytes)
    Width,
    /// Framebuffer height (4 bytes)
    Height,
    /// Framebuffer stride (4 bytes)
    Stride,
}

lazy_static! {
    /// Register map for the ramfb configuration entry, used to dispatch
    /// `fw_cfg` accesses of that entry to individual registers.
    static ref CFG_REGS: RegMap<Reg> = {
        // Registers in entry order, each paired with its size in bytes.  The
        // sizes sum to 28, which is the value of `RamFb::FWCFG_ENTRY_SIZE`.
        let layout = [
            (Reg::Addr, 8),
            (Reg::FourCC, 4),
            (Reg::Flags, 4),
            (Reg::Width, 4),
            (Reg::Height, 4),
            (Reg::Stride, 4),
        ];
        RegMap::create_packed(RamFb::FWCFG_ENTRY_SIZE, &layout, None)
    };
}

/// Framebuffer configuration as programmed through the ramfb `fw_cfg` entry.
///
/// Values are held in host byte order; `RamFb::fwcfg_rw` converts to and from
/// big-endian when the entry is accessed.  The struct is `repr(C, packed)`,
/// and a unit test checks that its size equals `RamFb::FWCFG_ENTRY_SIZE`.
#[derive(Default, Debug)]
#[repr(C, packed)]
struct Config {
    /// Guest address at which the framebuffer starts.
    addr: u64,
    /// Pixel format code, decoded via `FourCC::from_repr`.
    fourcc: u32,
    /// Flags value.  It is stored, migrated, and read back, but not otherwise
    /// interpreted by the code in this file.
    flags: u32,
    /// Width of the framebuffer in pixels.
    width: u32,
    /// Height of the framebuffer in rows.
    height: u32,
    /// Bytes from the start of one row to the start of the next.  A value of
    /// zero means that rows are contiguous.
    stride: u32,
}
impl Config {
    /// Attempt to get a readable mapping to the guest memory backing this ramfb
    ///
    /// The mapping starts at the configured address and covers every row but
    /// the last at the full stride, plus one line's worth of pixel data for
    /// the final row.
    ///
    /// Returns `None` if the width or height is zero, the fourcc is not
    /// recognized, the size of the framebuffer does not fit in a `usize`, or
    /// the resulting region of guest memory is not readable.
    fn mapping<'a>(&self, mem: &'a MemCtx) -> Option<SubMapping<'a>> {
        if self.height == 0 || self.width == 0 {
            return None;
        }
        let bytepp = FourCC::from_repr(self.fourcc)?.bytes_per_pixel().get();

        let stride = self.stride(bytepp);
        let linesize = (self.width as usize).checked_mul(bytepp)?;
        // The height, stride, and width are all under the control of the
        // guest, so the addition is checked for overflow just like the
        // multiplication is.
        let len = ((self.height - 1) as usize)
            .checked_mul(stride)?
            .checked_add(linesize)?;
        mem.readable_region(&GuestRegion(GuestAddr(self.addr), len))
    }

    /// Get the effective stride (in bytes) of the framebuffer, given the
    /// number of bytes per pixel.
    ///
    /// A configured stride of zero means the rows are contiguous, making the
    /// stride equal to the width multiplied by `bytepp` (or zero, should that
    /// multiplication overflow).
    fn stride(&self, bytepp: usize) -> usize {
        if self.stride == 0 {
            (self.width as usize).checked_mul(bytepp).unwrap_or(0)
        } else {
            self.stride as usize
        }
    }
}

/// Reasons for which the device configuration cannot be expressed as a
/// [Spec].
pub enum ConfigError {
    /// The configured width is zero.
    ZeroWidth,
    /// The configured height is zero.
    ZeroHeight,
    /// The configured fourcc value (included here) is not a recognized one.
    UnknownFourCC(u32),
    /// The width multiplied by the bytes-per-pixel overflowed.
    SizeOverflow,
}
impl TryFrom<&Config> for Spec {
    type Error = ConfigError;

    /// Build a [Spec] from the device configuration.
    ///
    /// The width, height, and fourcc are validated (in that order), with the
    /// first failure determining the error which is returned.
    fn try_from(value: &Config) -> Result<Self, Self::Error> {
        let Some(width) = NonZeroUsize::new(value.width as usize) else {
            return Err(ConfigError::ZeroWidth);
        };
        let Some(height) = NonZeroUsize::new(value.height as usize) else {
            return Err(ConfigError::ZeroHeight);
        };

        let raw_fourcc = value.fourcc;
        let Some(fourcc) = FourCC::from_repr(raw_fourcc) else {
            return Err(ConfigError::UnknownFourCC(raw_fourcc));
        };

        // An appropriate stride will be chosen by the Frame initializer when
        // it allocates a buffer for the pixel data.  In the mean time, report
        // the stride which a contiguous buffer would have.
        let stride = match fourcc.bytes_per_pixel().checked_mul(width) {
            Some(stride) => stride,
            None => return Err(ConfigError::SizeOverflow),
        };

        Ok(Spec { width, height, stride, fourcc })
    }
}

/// Plain-data description of a framebuffer's geometry and pixel format.
///
/// NOTE(review): nothing in the visible portion of this file constructs or
/// consumes this type; the units noted below are assumed to mirror those of
/// the device configuration -- confirm against its users.
#[derive(Clone, Copy)]
pub struct FramebufferSpec {
    /// Width of the framebuffer (presumably in pixels).
    pub width: usize,
    /// Height of the framebuffer (presumably in rows).
    pub height: usize,
    /// Stride of the framebuffer (presumably in bytes per row).
    pub stride: usize,
    /// Pixel format code.
    pub fourcc: u32,
}

/// A copy of the framebuffer contents, and the time at which it was taken.
pub struct FrameSnap {
    /// Pixel data copied out of guest memory.
    pub frame: Frame,
    /// Time at which the copy was completed.
    pub when: Instant,
}
impl FrameSnap {
    /// Copy the framebuffer described by `config` out of guest memory.
    ///
    /// Returns `None` if the guest memory holding the framebuffer cannot be
    /// mapped for reading, or if `config` is not valid as a [Spec].
    ///
    /// When the stride of the framebuffer is equal to the size of one line of
    /// pixel data, its contents are contiguous and copied in one shot.  With
    /// any other stride, the pixel data is copied line by line.
    fn read_from(config: &Config, mem: &MemCtx) -> Option<Self> {
        let mapping = config.mapping(mem)?;

        // With a valid Spec for the frame, we know the bytes-per-pixel
        let spec: Spec = config.try_into().ok()?;
        let bytepp = spec.fourcc.bytes_per_pixel().get();

        let fb_linesize = (config.width as usize).checked_mul(bytepp)?;
        let fb_stride = config.stride(bytepp);
        // The single-copy path is only valid for an exact match.  A (guest
        // controlled) stride which is smaller than the line size must not be
        // treated as contiguous, as it would not hold up to the layout
        // assertions made below.
        if fb_stride == fb_linesize {
            // Pixel data is contiguous, so its a single big copy
            let len = mapping.len();
            let frame = Frame::new_uninit(spec, |buf, buf_stride| {
                // Expect that the Frame allocation layout matches that of the
                // framebuffer, since it is contiguous.
                assert_eq!(fb_stride, buf_stride.get());
                assert_eq!(len, buf.len());

                // Use raw pointer instead of MaybeUninit::write
                let buf_ptr = buf.as_mut_ptr() as *mut u8;
                // SAFETY: `len` is the length of the source mapping, and was
                // just asserted to be the length of the destination buffer
                // as well.  The destination is the buffer handed to this
                // initializer by the Frame, rather than guest memory.
                unsafe {
                    mapping
                        .raw_readable()
                        .unwrap()
                        .copy_to_nonoverlapping(buf_ptr, len);
                }
            });
            Some(Self { frame, when: Instant::now() })
        } else {
            // Pixel data is not contiguous, so it is copied line by line,
            // advancing through the source by its stride
            let frame = Frame::new_uninit(spec, |buf, buf_stride| {
                // While the framebuffer is non-contiguous, we still expect the
                // Frame allocation to be (at this time)
                assert_eq!(fb_linesize, buf_stride.get());

                // SAFETY: The final line read ends at
                // `(height - 1) * fb_stride + fb_linesize`, which is the
                // length of the region requested in `Config::mapping`.
                // Writes assume the Frame buffer holds `height` lines of
                // `buf_stride` bytes each.
                // NOTE(review): confirm that against `Frame::new_uninit`.
                unsafe {
                    // Use raw pointer instead of MaybeUninit::write
                    let write_ptr = buf.as_mut_ptr() as *mut u8;
                    let read_ptr = mapping.raw_readable().unwrap();
                    for n in 0..(config.height as usize) {
                        read_ptr.add(n * fb_stride).copy_to_nonoverlapping(
                            write_ptr.add(n * fb_linesize),
                            fb_linesize,
                        );
                    }
                };
            });

            Some(Self { frame, when: Instant::now() })
        }
    }
}

/// Mutable state of the ramfb device, kept behind the mutex in [RamFb].
struct Inner {
    /// Framebuffer configuration most recently set for the device.
    config: Config,
    /// Time of the most recent configuration write or migration import (or of
    /// device creation, if there have been none).
    last_update: Instant,
}

/// Emulated ramfb device: a framebuffer residing in guest memory, which the
/// guest configures through an entry exposed via the `fw_cfg` interface.
pub struct RamFb {
    /// Current configuration, and the time at which it last changed.
    state: Mutex<Inner>,
    /// Accessor for the guest memory from which the framebuffer is read.
    acc_mem: MemAccessor,
    /// Wakes `UpdatedSince` futures when the guest writes the configuration.
    notify: Notify,
    /// Logger to which configuration changes are reported.
    log: slog::Logger,
}
impl RamFb {
    /// Size of the entry exposed via `fw_cfg` interface
    pub const FWCFG_ENTRY_SIZE: usize = 28;

    /// Expected name of entry exposed via `fw_cfg` interface
    pub const FWCFG_ENTRY_NAME: &'static str = "etc/ramfb";

    /// Create a ramfb device with a default (all-zero) configuration, and a
    /// memory accessor which is not yet attached to any hierarchy.
    pub fn create(log: slog::Logger) -> Arc<Self> {
        let initial =
            Inner { config: Config::default(), last_update: Instant::now() };

        Arc::new(Self {
            state: Mutex::new(initial),
            notify: Notify::new(),
            acc_mem: MemAccessor::new_orphan(),
            log,
        })
    }
    /// Attach the device to guest memory by having `acc_mem` adopt the
    /// memory accessor of this device, under the name "ramfb".
    pub fn attach(&self, acc_mem: &MemAccessor) {
        let name = String::from("ramfb");
        acc_mem.adopt(&self.acc_mem, Some(name));
    }

    /// Attempt to read contents of framebuffer
    ///
    /// A [Spec] representing the current device configuration will
    /// be passed to the `validate_bpp` callback, which will determine if the
    /// [Frame] contents should be fetched, and if so, what bits-per-pixel
    /// should be used for the configured `fourcc` of the device.
    ///
    /// Returns a [Frame] if `validate_bpp` returned `Some(bpp)`, and the frame
    /// contents could be copied from the region of guest memory specified in
    /// the configuration register.
    pub fn read_framebuffer(
        &self,
        interested: impl FnOnce(&Spec) -> bool,
    ) -> Option<FrameSnap> {
        let state = self.state.lock().unwrap();
        let mem = self.acc_mem.access()?;

        // Is the configuration even remotely valid?
        let spec = (&state.config).try_into().ok()?;

        // Is the consumer interested in the buffer as configured?
        if !interested(&spec) {
            return None;
        }

        FrameSnap::read_from(&state.config, &mem)
    }

    /// Get [Spec] representing the current device configuration, if it happens
    /// to be valid for a [Frame].
    pub fn read_spec(&self) -> Result<Spec, ConfigError> {
        let state = self.state.lock().unwrap();
        Spec::try_from(&state.config)
    }

    /// Get a future which resolves once the last update time of the device
    /// configuration is later than `when`.
    pub fn updated_since(&self, when: Instant) -> UpdatedSince<'_> {
        let notified = self.notify.notified();
        UpdatedSince { ramfb: self, notified, since: when }
    }

    /// Handle a `fw_cfg` access to the ramfb configuration entry.
    ///
    /// Register contents are big-endian as seen through the entry, and are
    /// converted to/from the host-order values kept in the device state.
    ///
    /// After a write, the update time is refreshed and any waiters are
    /// notified.
    ///
    /// # Errors
    ///
    /// Fails (without changing any state) on a write which is not wholly
    /// contained within the entry.
    pub(crate) fn fwcfg_rw(&self, mut rwo: RWOp) -> Result<(), ()> {
        let mut state = self.state.lock().unwrap();

        // Writes must both start and end inside the config register
        if let RWOp::Write(wo) = &rwo {
            let first = wo.offset();
            let limit = first.saturating_add(wo.len());
            let contained = first < Self::FWCFG_ENTRY_SIZE
                && limit <= Self::FWCFG_ENTRY_SIZE;
            if !contained {
                return Err(());
            }
        }

        CFG_REGS.process(&mut rwo, |id, rwo| {
            let cfg = &mut state.config;
            match id {
                Reg::Addr => match rwo {
                    RWOp::Read(ro) => ro.write_u64(cfg.addr.to_be()),
                    RWOp::Write(wo) => cfg.addr = u64::from_be(wo.read_u64()),
                },
                Reg::FourCC => match rwo {
                    RWOp::Read(ro) => ro.write_u32(cfg.fourcc.to_be()),
                    RWOp::Write(wo) => {
                        cfg.fourcc = u32::from_be(wo.read_u32())
                    }
                },
                Reg::Flags => match rwo {
                    RWOp::Read(ro) => ro.write_u32(cfg.flags.to_be()),
                    RWOp::Write(wo) => cfg.flags = u32::from_be(wo.read_u32()),
                },
                Reg::Width => match rwo {
                    RWOp::Read(ro) => ro.write_u32(cfg.width.to_be()),
                    RWOp::Write(wo) => cfg.width = u32::from_be(wo.read_u32()),
                },
                Reg::Height => match rwo {
                    RWOp::Read(ro) => ro.write_u32(cfg.height.to_be()),
                    RWOp::Write(wo) => {
                        cfg.height = u32::from_be(wo.read_u32())
                    }
                },
                Reg::Stride => match rwo {
                    RWOp::Read(ro) => ro.write_u32(cfg.stride.to_be()),
                    RWOp::Write(wo) => {
                        cfg.stride = u32::from_be(wo.read_u32())
                    }
                },
            }
        });

        if rwo.is_write() {
            slog::debug!(self.log, "ramfb change"; "config" => ?state.config);
            state.last_update = Instant::now();
            self.notify.notify_waiters();
        }
        Ok(())
    }
}

pin_project! {
    /// Future (returned by `RamFb::updated_since`) which resolves once the
    /// last update time of the device is later than a given instant.
    pub struct UpdatedSince<'a> {
        // Device whose update time is being watched
        ramfb: &'a RamFb,
        // Pending notification from the device, replaced with a fresh one
        // each time it completes
        #[pin]
        notified: Notified<'a>,
        // Instant which the update time of the device must exceed
        since: Instant,
    }
}
impl Future for UpdatedSince<'_> {
    type Output = ();

    /// Resolve once the last update time of the device is later than the
    /// instant this future was created with.
    ///
    /// The update time is checked prior to polling for a notification, and
    /// again after every notification which is received.
    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
        let mut this = self.project();
        let since = *this.since;
        loop {
            let updated = this.ramfb.state.lock().unwrap().last_update > since;
            if updated {
                return Poll::Ready(());
            }

            match Notified::poll(this.notified.as_mut(), cx) {
                Poll::Ready(_) => {
                    // The completed Notified has been consumed, so swap in a
                    // fresh one prior to taking another lap to re-check the
                    // update time
                    this.notified.set(this.ramfb.notify.notified());
                }
                Poll::Pending => return Poll::Pending,
            }
        }
    }
}

impl Lifecycle for RamFb {
    /// Returns the device type name, "qemu-ramfb".
    fn type_name(&self) -> &'static str {
        "qemu-ramfb"
    }
    /// State of the device is migrated as a single payload, via its
    /// `MigrateSingle` implementation.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Single(self)
    }
}

impl MigrateSingle for RamFb {
    fn export(
        &self,
        _ctx: &MigrateCtx,
    ) -> Result<PayloadOutput, MigrateStateError> {
        let state = self.state.lock().unwrap();
        let config = &state.config;
        Ok(migrate::RamFbV1 {
            addr: config.addr,
            fourcc: config.fourcc,
            flags: config.flags,
            width: config.width,
            height: config.height,
            stride: config.stride,
        }
        .into())
    }

    fn import(
        &self,
        mut offer: PayloadOffer,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let data: migrate::RamFbV1 = offer.parse()?;

        let mut state = self.state.lock().unwrap();
        let config = &mut state.config;
        config.addr = data.addr;
        config.fourcc = data.fourcc;
        config.flags = data.flags;
        config.width = data.width;
        config.height = data.height;
        config.stride = data.stride;
        state.last_update = Instant::now();

        Ok(())
    }
}

/// Migration payload definitions for the ramfb device.
pub mod migrate {
    use crate::migrate::*;

    use serde::{Deserialize, Serialize};

    /// Version 1 of the migrated ramfb state, which carries every field of
    /// the device configuration.
    #[derive(Deserialize, Serialize)]
    pub struct RamFbV1 {
        /// Guest address of the framebuffer
        pub addr: u64,
        /// Pixel format code
        pub fourcc: u32,
        /// Flags value
        pub flags: u32,
        /// Framebuffer width
        pub width: u32,
        /// Framebuffer height
        pub height: u32,
        /// Framebuffer stride
        pub stride: u32,
    }
    impl Schema<'_> for RamFbV1 {
        /// Identifies this payload as version 1 of the "qemu-ramfb" schema.
        fn id() -> SchemaId {
            ("qemu-ramfb", 1)
        }
    }
}
#[cfg(test)]
mod test {
    use super::*;

    /// The in-memory configuration must be exactly as large as the entry
    /// which is exposed through `fw_cfg`.
    #[test]
    fn config_reg_size() {
        let config_size = std::mem::size_of::<Config>();
        assert_eq!(config_size, RamFb::FWCFG_ENTRY_SIZE);
    }
}


================================================
FILE: lib/propolis/src/hw/testdev.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Devices intended for testing purposes.
//!
//! These devices do things which are generally unwanted in real life, such as
//! "intentionally breaking Propolis", "intentionally breaking the guest OS", or
//! some combination of the two.
use std::sync::{
    atomic::{AtomicUsize, Ordering},
    Arc,
};

use crate::common::Lifecycle;
use crate::migrate::*;

use serde::{Deserialize, Serialize};
use slog::info;

/// A test device for simulating migration failures.
///
/// A configured number of its initial export and import attempts are made to
/// fail, after which such attempts succeed.
pub struct MigrationFailureDevice {
    /// Logger which records every export and import attempt.
    log: slog::Logger,
    /// Number of exports attempted so far.
    exports: AtomicUsize,
    /// Number of imports attempted so far.
    imports: AtomicUsize,
    /// How many exports and imports are to fail.
    fail: MigrationFailures,
}

/// Number of failures which a [`MigrationFailureDevice`] is to inject.
pub struct MigrationFailures {
    /// Number of export attempts, counted from the first, which will fail.
    pub exports: usize,
    /// Number of import attempts, counted from the first, which will fail.
    pub imports: usize,
}

/// Migration payload of [`MigrationFailureDevice`].  It carries no data, as
/// the device has no state worth migrating.
#[derive(Clone, Default, Deserialize, Serialize)]
struct MigrationFailurePayloadV1 {}

impl MigrationFailureDevice {
    /// Device type name, also used to tag the logger of the device.
    const NAME: &'static str = "test-migration-failure";

    /// Create a device which fails its first `fail.exports` exports and
    /// first `fail.imports` imports.
    ///
    /// The configured failure counts are logged via a child of `log`, which
    /// the device then holds on to for its own logging.
    pub fn create(log: &slog::Logger, fail: MigrationFailures) -> Arc<Self> {
        let log =
            log.new(slog::o!("component" => "testdev", "dev" => Self::NAME));
        info!(log,
            "Injecting simulated migration failures";
            "fail_exports" => %fail.exports,
            "fail_imports" => %fail.imports,
        );

        let exports = AtomicUsize::new(0);
        let imports = AtomicUsize::new(0);
        Arc::new(Self { log, exports, imports, fail })
    }
}

impl Lifecycle for MigrationFailureDevice {
    /// Reports the device's fixed name (`MigrationFailureDevice::NAME`).
    fn type_name(&self) -> &'static str {
        MigrationFailureDevice::NAME
    }
    /// Migration is handled through this type's `MigrateSingle`
    /// implementation.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Single(self)
    }
}

impl MigrateSingle for MigrationFailureDevice {
    /// Export the (empty) device payload.
    ///
    /// Every call bumps the export counter.  Calls whose pre-increment count
    /// is below `fail.exports` return an I/O error instead of a payload.
    fn export(
        &self,
        _ctx: &MigrateCtx,
    ) -> Result<PayloadOutput, MigrateStateError> {
        let attempt = self.exports.fetch_add(1, Ordering::Relaxed);
        let should_fail = attempt < self.fail.exports;

        if !should_fail {
            info!(
                self.log,
                "exporting device";
                "export_num" => %attempt,
            );
            return Ok(MigrationFailurePayloadV1 {}.into());
        }

        info!(
            self.log,
            "failing export";
            "export_num" => %attempt,
            "fail_exports" => %self.fail.exports
        );
        let err = std::io::Error::new(
            std::io::ErrorKind::Other,
            "somebody set up us the bomb",
        );
        Err(MigrateStateError::Io(err))
    }

    /// Import the (empty) device payload.
    ///
    /// Every call bumps the import counter.  The offered payload is always
    /// parsed; afterwards, calls whose pre-increment count is below
    /// `fail.imports` return `ImportFailed`.
    fn import(
        &self,
        mut offer: PayloadOffer,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let attempt = self.imports.fetch_add(1, Ordering::Relaxed);
        let should_fail = attempt < self.fail.imports;
        info!(
            self.log,
            "importing device";
            "import_num" => %attempt,
            "will_fail" => %should_fail,
        );

        // Parsing happens before the injected failure, so a parse error takes
        // precedence over it.
        let MigrationFailurePayloadV1 {} = offer.parse()?;

        if !should_fail {
            return Ok(());
        }
        info!(self.log, "failing import");
        Err(MigrateStateError::ImportFailed(
            "you have no chance to survive, make your time".to_string(),
        ))
    }
}

// Schema identifier for the test device payload: the name
// "testdev-migration-failure" paired with the number 1.
impl Schema<'_> for MigrationFailurePayloadV1 {
    fn id() -> SchemaId {
        ("testdev-migration-failure", 1)
    }
}


================================================
FILE: lib/propolis/src/hw/uart/lpc.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::{Arc, Mutex};

use super::uart16550::{migrate, Uart};
use crate::chardev::*;
use crate::common::*;
use crate::intr_pins::IntrPin;
use crate::migrate::*;
use crate::pio::{PioBus, PioFn};

// Low Pin Count UART

/// Number of consecutive I/O ports registered for the UART; port I/O offsets
/// handled by the device are always below this value.
pub const REGISTER_LEN: usize = 8;

/// Mutable state of an [`LpcUart`], kept behind its mutex.
struct UartState {
    /// Emulated 16550 register and FIFO state.
    uart: Uart,
    /// Interrupt pin which is asserted/deasserted to follow the UART's
    /// interrupt state.
    irq_pin: Box<dyn IntrPin>,
    /// When set, any data the UART has queued for transmission is drained and
    /// dropped after each port I/O access.
    auto_discard: bool,

    // In the absence of better interfaces for chardev save/restore behavior,
    // allow the device to be coarsely paused (dropping all reads and writes).
    paused: bool,
}

impl UartState {
    /// Drive the interrupt pin so that it matches the interrupt state
    /// currently reported by the UART.
    fn sync_intr_pin(&self) {
        match self.uart.intr_state() {
            true => self.irq_pin.assert(),
            false => self.irq_pin.deassert(),
        }
    }
}

/// A 16550 UART exposed to the guest through port I/O, and to the host as a
/// character device (`Source` for guest output, `Sink` for guest input).
pub struct LpcUart {
    /// UART registers, interrupt pin and pause/auto-discard flags.
    state: Mutex<UartState>,
    /// Notified when the UART transitions to having data to read.
    notify_readable: NotifierCell<dyn Source>,
    /// Notified when the UART transitions to being able to accept data.
    notify_writable: NotifierCell<dyn Sink>,
}

impl LpcUart {
    /// Create a UART which signals interrupts through `irq_pin`.
    ///
    /// The device starts unpaused and with auto-discard enabled.
    pub fn new(irq_pin: Box<dyn IntrPin>) -> Arc<Self> {
        let initial = UartState {
            uart: Uart::new(),
            irq_pin,
            auto_discard: true,
            paused: false,
        };
        Arc::new(Self {
            state: Mutex::new(initial),
            notify_readable: NotifierCell::new(),
            notify_writable: NotifierCell::new(),
        })
    }

    /// Register the UART's `REGISTER_LEN` ports on `bus`, starting at `port`.
    ///
    /// Panics if the registration is rejected by the bus.
    pub fn attach(self: &Arc<Self>, bus: &PioBus, port: u16) {
        let dev = Arc::clone(self);
        let handler = Arc::new(move |_port: u16, rwo: RWOp| dev.pio_rw(rwo))
            as Arc<PioFn>;
        bus.register(port, REGISTER_LEN as u16, handler).unwrap();
    }

    /// Service a single port I/O access against the UART registers.
    ///
    /// After the register access (and any auto-discard of transmitted data),
    /// the interrupt pin is synchronized and the readable/writable notifiers
    /// are fired for any false-to-true transition.
    fn pio_rw(&self, rwo: RWOp) {
        assert!(rwo.offset() < REGISTER_LEN);
        assert!(rwo.len() != 0);

        // The uart state lock cannot be held while dispatching notifications
        // since those callbacks could immediately attempt to read/write the
        // pending data.  The lock is therefore confined to this inner scope.
        let (read_notify, write_notify) = {
            let mut guard = self.state.lock().unwrap();
            let was_readable = guard.uart.is_readable();
            let was_writable = guard.uart.is_writable();

            match rwo {
                RWOp::Read(ro) => {
                    let val = guard.uart.reg_read(ro.offset() as u8);
                    ro.write_u8(val);
                }
                RWOp::Write(wo) => {
                    let off = wo.offset() as u8;
                    let val = wo.read_u8();
                    guard.uart.reg_write(off, val);
                }
            }
            if guard.auto_discard {
                while guard.uart.data_read().is_some() {}
            }

            guard.sync_intr_pin();

            let became_readable = !was_readable && guard.uart.is_readable();
            let became_writable = !was_writable && guard.uart.is_writable();
            (became_readable, became_writable)
        };

        if read_notify {
            self.notify_readable.notify(self as &dyn Source);
        }
        if write_notify {
            self.notify_writable.notify(self as &dyn Sink);
        }
    }

    /// Reset the UART registers and bring the interrupt pin in line with the
    /// post-reset state.
    fn reset(&self) {
        let mut guard = self.state.lock().unwrap();
        guard.uart.reset();
        guard.sync_intr_pin();
    }
}

impl Sink for LpcUart {
    /// Offer one byte of input to the UART.
    ///
    /// Returns `false` while the device is paused, otherwise whatever the
    /// UART reports for the write.
    fn write(&self, data: u8) -> bool {
        let mut guard = self.state.lock().unwrap();

        if !guard.paused {
            let accepted = guard.uart.data_write(data);
            guard.sync_intr_pin();
            accepted
        } else {
            false
        }
    }
    /// Install (or clear) the callback fired when the UART becomes writable.
    fn set_notifier(&self, f: Option<SinkNotifier>) {
        self.notify_writable.set(f);
    }
}
impl Source for LpcUart {
    /// Take one byte of output from the UART, if any is queued.
    ///
    /// Always yields `None` while the device is paused.
    fn read(&self) -> Option<u8> {
        let mut guard = self.state.lock().unwrap();

        if !guard.paused {
            let byte = guard.uart.data_read();
            guard.sync_intr_pin();
            byte
        } else {
            None
        }
    }
    /// Drop up to `count` bytes of queued output, returning how many were
    /// actually dropped.
    fn discard(&self, count: usize) -> usize {
        let mut guard = self.state.lock().unwrap();
        let mut dropped = 0;
        // Stop at the requested count or as soon as the UART runs dry.
        while dropped < count && guard.uart.data_read().is_some() {
            dropped += 1;
        }
        guard.sync_intr_pin();
        dropped
    }
    /// Install (or clear) the callback fired when the UART becomes readable.
    fn set_notifier(&self, f: Option<SourceNotifier>) {
        self.notify_readable.set(f);
    }
    /// Enable or disable automatic discarding of UART output.
    fn set_autodiscard(&self, active: bool) {
        self.state.lock().unwrap().auto_discard = active;
    }
}

impl Lifecycle for LpcUart {
    fn type_name(&self) -> &'static str {
        "lpc-uart"
    }
    /// Delegates to the inherent `LpcUart::reset`.
    fn reset(&self) {
        LpcUart::reset(self);
    }
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Single(self)
    }

    /// Mark the device paused; chardev reads and writes are refused until it
    /// is resumed.
    fn pause(&self) {
        self.state.lock().unwrap().paused = true;
    }

    /// Clear the paused flag set by `pause`.
    fn resume(&self) {
        self.state.lock().unwrap().paused = false;
    }
}
impl MigrateSingle for LpcUart {
    /// Export the UART register/FIFO state as the migration payload.
    fn export(
        &self,
        _ctx: &MigrateCtx,
    ) -> Result<PayloadOutput, MigrateStateError> {
        let payload = self.state.lock().unwrap().uart.export();
        Ok(payload.into())
    }

    /// Import UART state from `offer`, then load the interrupt pin with the
    /// interrupt state of the freshly imported UART.
    fn import(
        &self,
        mut offer: PayloadOffer,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let data: migrate::Uart16550V1 = offer.parse()?;

        let mut guard = self.state.lock().unwrap();
        guard.uart.import(&data)?;
        let intr = guard.uart.intr_state();
        guard.irq_pin.import_state(intr);
        Ok(())
    }
}


================================================
FILE: lib/propolis/src/hw/uart/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

mod lpc;
mod uart16550;

pub use lpc::*;
pub use uart16550::*;


================================================
FILE: lib/propolis/src/hw/uart/uart16550.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! 16550 UART
//!
//! Host -> Device Data Path:
//! The host writes data to the UART via its Transmitter Holding Register (THR),
//! which is backed by tx_fifo. After this data is received out of the THR, the
//! UART will raise the Transmitter Holding Register Empty interrupt, which
//! notifies the host that it can write more data.

use std::collections::VecDeque;
use std::convert::AsRef;

use crate::migrate::MigrateStateError;

use serde::{Deserialize, Serialize};
use strum::{AsRefStr, FromRepr};

// USDT probes (provider "propolis") for UART register accesses.  `is_dlab`
// is passed as 0/1 since it is converted from a bool at the call sites.
#[usdt::provider(provider = "propolis")]
mod probes {
    // A register read, along with the value it yielded.
    fn uart_reg_read(offset: u8, is_dlab: u8, val: u8) {}
    // A register write, along with the data written.
    fn uart_reg_write(offset: u8, is_dlab: u8, data: u8) {}
    // A byte written to THR which was dropped because the TX FIFO refused it.
    fn uart_tx_discard(data: u8) {}
    // Intended for a write to an offset which decodes to no writable register.
    fn uart_ign_write(offset: u8, is_dlab: u8, data: u8) {}
    // A read from an offset which decodes to no readable register.
    fn uart_ign_read(offset: u8, is_dlab: u8) {}
}

/// Register-level state of an emulated 16550 UART.
pub struct Uart {
    reg_intr_enable: IntrEnaReg,
    reg_intr_ident: IntrIdentReg,
    // TODO: add FIFO support
    // reg_fifo_ctrl: u8,
    reg_line_ctrl: LineCtrlReg,
    reg_line_status: LineStatusReg,
    reg_modem_ctrl: ModemCtrlReg,
    reg_modem_status: u8,
    reg_scratch: u8,
    reg_div_low: u8,
    reg_div_high: u8,

    /// Transmitter-Holding-Register-Empty interrupt status
    ///
    /// Since reading IIR while THRE is asserted will clear the THRE interrupt,
    /// but leave the associated THRE/TEMT bits asserted in RLS, we must track
    /// the interrupt state separately.
    thre_intr: bool,
    /// Whether any interrupt is currently pending (mirrors the IIR state).
    intr_pin: bool,

    /// Data waiting to be read by the guest through RHR.
    rx_fifo: Fifo,
    /// Data written by the guest through THR, waiting for `data_read`.
    tx_fifo: Fifo,
}
impl Uart {
    /// Create a UART with all registers at their default values, no
    /// interrupt pending, and empty RX/TX FIFOs holding one byte each.
    pub fn new() -> Self {
        Uart {
            reg_intr_enable: IntrEnaReg::default(),
            reg_intr_ident: IntrIdentReg::default(),
            // reg_fifo_ctrl: 0,
            reg_line_ctrl: LineCtrlReg::default(),
            reg_line_status: LineStatusReg::default(),
            reg_modem_ctrl: ModemCtrlReg::default(),
            reg_modem_status: 0,
            reg_scratch: 0,
            reg_div_low: 0,
            reg_div_high: 0,

            thre_intr: false,
            intr_pin: false,
            // TODO: Don't deal with "real" sized fifos for now
            rx_fifo: Fifo::new(1),
            tx_fifo: Fifo::new(1),
        }
    }
    /// Read UART register
    ///
    /// `offset` is decoded together with the current DLAB setting.  Some
    /// reads have side effects: RHR pops the RX FIFO, IIR clears a pending
    /// THRE interrupt, and LSR clears the overrun-error flag.  Offsets which
    /// decode to no register read as 0.
    pub fn reg_read(&mut self, offset: u8) -> u8 {
        let is_dlab = self.is_dlab();

        let val = match UartReg::for_read(offset, is_dlab) {
            Some(UartReg::DivisorLow) => self.reg_div_low,
            Some(UartReg::DivisorHigh) => self.reg_div_high,

            Some(UartReg::RecvHold) => {
                if let Some(d) = self.rx_fifo.read() {
                    // Consuming data may clear Data Ready and its interrupt
                    self.update_dr();
                    self.update_isr();
                    d
                } else {
                    0
                }
            }
            Some(UartReg::IntrEnable) => self.reg_intr_enable.bits(),
            Some(UartReg::IntrIdent) => {
                // Capture the value before any read-triggered clearing
                let val = self.reg_intr_ident;
                if let Some(IntrIdent::THRE) = val.get_intr() {
                    // Reading the ISR can clear the THRE interrupt.
                    // The flag will remain in RLS, though.
                    self.thre_intr = false;
                    self.update_isr();
                }
                val.bits()
            }
            Some(UartReg::LineCtrl) => self.reg_line_ctrl.bits(),
            Some(UartReg::ModemCtrl) => self.reg_modem_ctrl.bits(),
            Some(UartReg::LineStatus) => {
                // The guest sees the pre-read value; OE is cleared by the read
                let val = self.reg_line_status;
                self.reg_line_status.remove(LineStatusReg::OE);
                self.update_isr();

                val.bits()
            }
            Some(UartReg::ModemStatus) => self.reg_modem_status,
            Some(UartReg::Scratch) => self.reg_scratch,
            Some(reg) => {
                // Every readable register is handled above, so reaching this
                // arm means `for_read` decoded a write-only register.
                assert!(!reg.is_readable());
                panic!(
                    "uart reg {} should not decode to be readable",
                    reg.as_ref()
                );
            }
            None => {
                probes::uart_ign_read!(|| (offset, u8::from(is_dlab)));
                0
            }
        };

        probes::uart_reg_read!(|| (offset, u8::from(is_dlab), val));

        val
    }

    /// Write UART register
    ///
    /// `offset` is decoded together with the current DLAB setting.  A write
    /// to an offset which decodes to no writable register is ignored, firing
    /// only the `uart_ign_write` probe.
    pub fn reg_write(&mut self, offset: u8, data: u8) {
        let is_dlab = self.is_dlab();

        probes::uart_reg_write!(|| (offset, u8::from(is_dlab), data));
        match UartReg::for_write(offset, is_dlab) {
            Some(UartReg::DivisorLow) => {
                self.reg_div_low = data;
            }
            Some(UartReg::DivisorHigh) => {
                self.reg_div_high = data;
            }
            Some(UartReg::TransmitHold) => {
                if !self.is_loopback() {
                    if !self.tx_fifo.write(data) {
                        // There is no error flag for when the TX buffer is
                        // overrun, but we can at least fire a probe.
                        probes::uart_tx_discard!(|| { data });
                    }
                    self.set_thre(false);
                } else {
                    // In loopback the byte is delivered straight to the
                    // receive side, flagging an overrun if it does not fit.
                    if !self.rx_fifo.write(data) {
                        self.reg_line_status.insert(LineStatusReg::OE);
                    }
                    self.update_dr();
                    self.set_thre(true);
                }
            }
            Some(UartReg::IntrEnable) => {
                let old = self.reg_intr_enable;
                let new = IntrEnaReg::from_bits_truncate(data);
                self.reg_intr_enable = new;
                // Although not specified in the datasheet, some consumers
                // expect a THRE interrupt to be raised when toggling that on in
                // IER.
                if !old.contains(IntrEnaReg::ETBEI)
                    && new.contains(IntrEnaReg::ETBEI)
                    && self.tx_fifo.is_empty()
                {
                    self.thre_intr = true;
                }
                self.update_isr();
            }
            Some(UartReg::FifoCtrl) => {
                // TODO: add FIFO support
                // self.reg_fifo_ctrl = ?;
            }
            Some(UartReg::LineCtrl) => {
                // Accept any line control configuration.
                // We don't pay heed to anything but DLAB
                self.reg_line_ctrl = LineCtrlReg::from_bits_retain(data);
            }
            Some(UartReg::ModemCtrl) => {
                self.reg_modem_ctrl = ModemCtrlReg::from_bits_truncate(data);
            }
            Some(UartReg::Scratch) => {
                self.reg_scratch = data;
            }
            Some(reg) => {
                // Every writable register is handled above, so reaching this
                // arm means `for_write` decoded a read-only register.
                assert!(!reg.is_writable());
                panic!(
                    "uart reg {} should not decode to be writable",
                    reg.as_ref()
                );
            }
            None => {
                // Ignored *write*: report it through the write probe, which
                // (unlike `uart_ign_read`) carries the data argument.
                probes::uart_ign_write!(|| (offset, u8::from(is_dlab), data));
            }
        }
    }
    /// Read data transmitted from the uart
    ///
    /// Yields `None` when nothing is queued.  Otherwise the THRE state is
    /// updated to reflect whether the TX FIFO is now empty.
    pub fn data_read(&mut self) -> Option<u8> {
        let byte = self.tx_fifo.read()?;
        let drained = self.tx_fifo.is_empty();
        self.set_thre(drained);
        Some(byte)
    }
    /// Write data to be received by the uart
    ///
    /// Returns whether the byte was accepted.  In loopback mode the byte is
    /// dropped but still reported as accepted.
    pub fn data_write(&mut self, data: u8) -> bool {
        if self.is_loopback() {
            // Per the datasheet, the serial input pin is disconnected.
            // Simply discard all incoming data.
            return true;
        }

        let accepted = self.rx_fifo.write(data);
        self.update_dr();
        self.update_isr();
        accepted
    }
    /// Whether the UART currently has an interrupt pending.
    pub fn intr_state(&self) -> bool {
        self.intr_pin
    }
    /// True when transmitted data is queued for `data_read`.
    pub fn is_readable(&self) -> bool {
        !self.tx_fifo.is_empty()
    }
    /// True when `data_write` will accept a byte: either loopback is enabled
    /// (incoming data is discarded) or the RX FIFO is not full.
    pub fn is_writable(&self) -> bool {
        self.is_loopback() || !self.rx_fifo.is_full()
    }

    /// Return every register to the value assigned by `Uart::new`, clear the
    /// interrupt state, and empty both FIFOs (their capacity is unchanged).
    pub fn reset(&mut self) {
        self.reg_intr_enable = IntrEnaReg::default();
        self.reg_intr_ident = IntrIdentReg::default();
        // self.reg_fifo_ctrl = 0;
        self.reg_line_ctrl = LineCtrlReg::default();
        self.reg_line_status = LineStatusReg::default();
        self.reg_modem_ctrl = ModemCtrlReg::default();
        self.reg_modem_status = 0;
        self.reg_scratch = 0;
        self.reg_div_low = 0;
        self.reg_div_high = 0;

        self.thre_intr = false;
        self.intr_pin = false;

        self.rx_fifo.reset();
        self.tx_fifo.reset();
    }

    /// Whether the Divisor Latch Access Bit is set in the line control
    /// register (which changes how offsets 0 and 1 decode).
    #[inline(always)]
    fn is_dlab(&self) -> bool {
        self.reg_line_ctrl.contains(LineCtrlReg::DLAB)
    }
    /// Whether loopback mode is enabled in the modem control register.
    #[inline(always)]
    fn is_loopback(&self) -> bool {
        self.reg_modem_ctrl.contains(ModemCtrlReg::LOOP)
    }

    /// Pick the interrupt to report, checking sources in priority order:
    /// receiver line status, data ready, THRE, then modem status.  A source
    /// only counts when its enable bit is set in IER.
    fn next_intr(&self) -> Option<IntrIdent> {
        let ier = self.reg_intr_enable;
        let lsr = self.reg_line_status;

        // This ignores Parity Error, Framing Error, and Break
        if ier.contains(IntrEnaReg::ELSI) && lsr.contains(LineStatusReg::OE) {
            return Some(IntrIdent::RLS);
        }
        if ier.contains(IntrEnaReg::ERBFI) && lsr.contains(LineStatusReg::DR) {
            return Some(IntrIdent::DR);
        }
        if ier.contains(IntrEnaReg::ETBEI) && self.thre_intr {
            return Some(IntrIdent::THRE);
        }
        // This ignores that MSR is fixed to 0
        if ier.contains(IntrEnaReg::EDSSI) && self.reg_modem_status != 0 {
            return Some(IntrIdent::MDM);
        }
        None
    }

    /// Recompute the pending interrupt, storing it in IIR and mirroring its
    /// presence in `intr_pin`.
    fn update_isr(&mut self) {
        let pending = self.next_intr();
        self.intr_pin = pending.is_some();
        self.reg_intr_ident.set_intr(pending);
    }

    /// Set or clear the transmitter-empty indication: the THRE/TEMT bits in
    /// LSR and the tracked THRE interrupt state, followed by an IIR refresh.
    fn set_thre(&mut self, state: bool) {
        let tx_empty_bits = LineStatusReg::THRE | LineStatusReg::TEMT;
        self.reg_line_status.set(tx_empty_bits, state);
        self.thre_intr = state;
        self.update_isr();
    }
    /// Make the Data Ready bit in LSR reflect whether the RX FIFO holds data.
    fn update_dr(&mut self) {
        let has_data = !self.rx_fifo.is_empty();
        self.reg_line_status.set(LineStatusReg::DR, has_data);
    }

    /// Snapshot the register values, THRE interrupt state, and FIFO contents
    /// into a migration payload.  The `intr_pin` flag is not exported.
    pub(super) fn export(&self) -> migrate::Uart16550V1 {
        migrate::Uart16550V1 {
            intr_enable: self.reg_intr_enable.bits(),
            intr_status: self.reg_intr_ident.bits(),
            line_ctrl: self.reg_line_ctrl.bits(),
            line_status: self.reg_line_status.bits(),
            modem_ctrl: self.reg_modem_ctrl.bits(),
            modem_status: self.reg_modem_status,
            scratch: self.reg_scratch,
            div_low: self.reg_div_low,
            div_high: self.reg_div_high,
            thre_state: self.thre_intr,
            rx_fifo: self.rx_fifo.buf.clone().into(),
            tx_fifo: self.tx_fifo.buf.clone().into(),
        }
    }
    /// Load UART state from a migration payload.
    ///
    /// # Errors
    ///
    /// Returns `MigrateStateError::ImportFailed` (leaving the UART untouched)
    /// if either FIFO in `state` holds more bytes than the corresponding
    /// local FIFO can.
    pub(super) fn import(
        &mut self,
        state: &migrate::Uart16550V1,
    ) -> Result<(), MigrateStateError> {
        // Validate both FIFOs before modifying anything
        if self.rx_fifo.len < state.rx_fifo.len() {
            return Err(MigrateStateError::ImportFailed(format!(
                "RX FIFO contents too long: {}",
                state.rx_fifo.len()
            )));
        }
        if self.tx_fifo.len < state.tx_fifo.len() {
            return Err(MigrateStateError::ImportFailed(format!(
                "TX FIFO contents too long: {}",
                state.tx_fifo.len()
            )));
        }

        self.reg_intr_enable =
            IntrEnaReg::from_bits_truncate(state.intr_enable);
        self.reg_intr_ident =
            IntrIdentReg::from_bits_truncate(state.intr_status);
        self.reg_line_ctrl = LineCtrlReg::from_bits_retain(state.line_ctrl);
        self.reg_line_status =
            LineStatusReg::from_bits_truncate(state.line_status);
        self.reg_modem_ctrl =
            ModemCtrlReg::from_bits_truncate(state.modem_ctrl);
        self.reg_modem_status = state.modem_status;
        self.reg_scratch = state.scratch;
        self.reg_div_low = state.div_low;
        self.reg_div_high = state.div_high;
        self.thre_intr = state.thre_state;
        self.rx_fifo.buf = state.rx_fifo.clone().into();
        self.tx_fifo.buf = state.tx_fifo.clone().into();

        // synthesize interrupt pin state like update_isr()
        self.intr_pin = self.reg_intr_ident.get_intr().is_some();

        Ok(())
    }
}

/// Bounded byte queue used for the UART's RX and TX paths.
#[derive(Deserialize, Serialize, Clone)]
pub struct Fifo {
    /// Maximum number of bytes the FIFO will accept (not the current fill).
    len: usize,
    /// Queued bytes, oldest at the front.
    buf: VecDeque<u8>,
}

impl Fifo {
    /// Create an empty FIFO which accepts at most `max_len` bytes.
    fn new(max_len: usize) -> Self {
        let buf = VecDeque::with_capacity(max_len);
        Fifo { len: max_len, buf }
    }
    /// Queue `data`, returning `false` (and dropping it) if there is no room.
    fn write(&mut self, data: u8) -> bool {
        let has_room = self.buf.len() < self.len;
        if has_room {
            self.buf.push_back(data);
        }
        has_room
    }
    /// Remove and return the oldest queued byte, if any.
    fn read(&mut self) -> Option<u8> {
        self.buf.pop_front()
    }
    /// Drop all queued bytes; the size limit is unchanged.
    fn reset(&mut self) {
        self.buf.clear();
    }
    /// True when no bytes are queued.
    fn is_empty(&self) -> bool {
        self.buf.is_empty()
    }
    /// True when the number of queued bytes equals the size limit.
    fn is_full(&self) -> bool {
        self.len == self.buf.len()
    }
}

/// Migration payload definitions for the 16550 UART.
pub mod migrate {
    use crate::migrate::*;

    use serde::{Deserialize, Serialize};

    /// Serialized UART state: raw register values, the tracked THRE
    /// interrupt state, and the contents of both FIFOs.
    #[derive(Deserialize, Serialize)]
    pub struct Uart16550V1 {
        pub intr_enable: u8,
        pub intr_status: u8,
        pub line_ctrl: u8,
        pub line_status: u8,
        pub modem_ctrl: u8,
        pub modem_status: u8,
        pub scratch: u8,
        pub div_low: u8,
        pub div_high: u8,
        pub thre_state: bool,
        pub rx_fifo: Vec<u8>,
        pub tx_fifo: Vec<u8>,
    }
    // Schema identifier: the name "uart-16550" paired with the number 1.
    impl Schema<'_> for Uart16550V1 {
        fn id() -> SchemaId {
            ("uart-16550", 1)
        }
    }
}

bitflags! {
    /// Interrupt Enable Register (IER)
    #[derive(Default, Copy, Clone)]
    struct IntrEnaReg: u8 {
        /// Receiver Data Available interrupt enable
        const ERBFI = 1 << 0;
        /// Transmit Holding Register Empty interrupt enable
        const ETBEI = 1 << 1;
        /// Receiver Line Status interrupt enable
        const ELSI = 1 << 2;
        /// Modem Status interrupt enable
        const EDSSI = 1 << 3;
    }

    /// Interrupt Identification Register (IIR)
    #[derive(Copy, Clone)]
    struct IntrIdentReg: u8 {
        /// Interrupt Pending (clear when interrupt pending)
        const NOPEND = 1;

        /// Mask of potential interrupt IDs
        const INTID = 0b1110;
    }

    /// FIFO Control Register (FCR)
    #[derive(Default, Copy, Clone)]
    struct FifoCtrlReg: u8 {
        /// enable transmitter/receive FIFOs
        const ENA = 1 << 0;
        /// clear bytes and count in receiver FIFO
        const RXRST = 1 << 1;
        /// clear bytes and count in transmit FIFO
        const TXRST = 1 << 2;
        const DMAMD = 1 << 3;
        const TRGR = 0b11000000;
    }

    /// Modem Control Register (MCR)
    #[derive(Default, Copy, Clone)]
    struct ModemCtrlReg: u8 {
        /// Loopback enabled
        const LOOP = 1 << 4;
    }

    /// Line Status Register (LSR)
    #[derive(Copy, Clone)]
    struct LineStatusReg: u8 {
        /// Data Ready
        const DR = 1 << 0;
        /// Overrun Error
        const OE = 1 << 1;
        /// Transmit Hold Register Empty
        const THRE = 1 << 5;
        /// Transmitter Empty
        const TEMT = 1 << 6;
    }

    /// Line Control Register (LCR)
    #[derive(Default, Copy, Clone)]
    struct LineCtrlReg: u8 {
        /// Word Length Select (0b11 = 8 bits)
        const WLS = 0b11;
        /// Stop Bits
        const STB = 1 << 2;
        /// Parity Enable
        const PEN = 1 << 3;
        /// Even Parity Select
        const EPS = 1 << 4;
        /// Stick Parity
        const SP = 1 << 5;
        /// Break Condition
        const BC = 1 << 6;
        /// Divisor Latch Access Bit
        const DLAB = 1 << 7;
    }
}

/// The default line status reports the transmitter as empty (both TEMT and
/// THRE set), with no data ready and no overrun.
impl Default for LineStatusReg {
    fn default() -> Self {
        LineStatusReg::TEMT | LineStatusReg::THRE
    }
}

impl IntrIdentReg {
    /// Replace the interrupt ID held in the register with `intr_id`.
    ///
    /// `NOPEND` is cleared when an interrupt is supplied and set otherwise.
    fn set_intr(&mut self, intr_id: Option<IntrIdent>) {
        // Clear any existing interrupt bits
        self.remove(IntrIdentReg::INTID);
        if let Some(id) = intr_id {
            // OR the raw ID value directly into the underlying bits
            self.0 .0 |= id as u8;
            self.remove(IntrIdentReg::NOPEND);
        } else {
            self.insert(IntrIdentReg::NOPEND);
        }
    }
    /// Decode the pending interrupt, if any.
    ///
    /// Yields `None` when `NOPEND` is set, or when the ID bits do not match
    /// any `IntrIdent` variant.
    fn get_intr(&self) -> Option<IntrIdent> {
        if self.contains(IntrIdentReg::NOPEND) {
            None
        } else {
            IntrIdent::from_repr(self.intersection(IntrIdentReg::INTID).bits())
        }
    }
}

/// The default IIR value has only `NOPEND` set: no interrupt is pending.
impl Default for IntrIdentReg {
    fn default() -> Self {
        IntrIdentReg::NOPEND
    }
}

/// Interrupt identifiers as encoded in the ID bits of the IIR.
///
/// The discriminants are the raw register values (already positioned within
/// the `INTID` mask), so they can be ORed directly into the register.
#[repr(u8)]
#[derive(Copy, Clone, FromRepr)]
enum IntrIdent {
    /// MODEM Status, priority 4 (lowest)
    MDM = 0b0000,
    /// Transmitter Hold Register Empty, priority 3
    THRE = 0b0010,
    /// Data Ready, priority 2
    DR = 0b0100,
    /// Receiver Line Status, priority 1 (highest)
    RLS = 0b0110,
    /// Character Timeout, priority 2
    CTMO = 0b1100,
}

/// Registers of the UART, as selected by port offset, access direction, and
/// the DLAB setting.
#[derive(Clone, Copy, AsRefStr)]
enum UartReg {
    /// Receiver Holding Register (RHR), RO
    RecvHold,
    /// Transmitter Holding Register (THR), WO
    TransmitHold,
    /// Interrupt Enable Register (IER), RW
    IntrEnable,
    /// Interrupt Identification Register (IIR), RO
    IntrIdent,
    /// FIFO Control Register (FCR), WO
    FifoCtrl,
    /// Line Control Register (LCR), RW
    LineCtrl,
    /// Modem Control Register (MCR), RW
    ModemCtrl,
    /// Line Status Register (LSR), RO
    LineStatus,
    /// Modem Status Register (MSR), RO
    ModemStatus,
    /// Scratch Register (SPR), RW
    Scratch,
    /// Divisor Latch LSB (DLL), RW
    DivisorLow,
    /// Divisor Latch MSB (DLH), RW
    DivisorHigh,
}

impl UartReg {
    /// Decode the register targeted by a write to offset `off`.
    ///
    /// Yields `None` for the read-only offsets (5 and 6) and for anything
    /// beyond offset 7.
    const fn for_write(off: u8, dlab_status: bool) -> Option<Self> {
        let reg = match off {
            0 if dlab_status => Self::DivisorLow,
            0 => Self::TransmitHold,
            1 if dlab_status => Self::DivisorHigh,
            1 => Self::IntrEnable,
            2 => Self::FifoCtrl,
            3 => Self::LineCtrl,
            4 => Self::ModemCtrl,
            7 => Self::Scratch,
            _ => return None,
        };
        Some(reg)
    }

    /// Decode the register targeted by a read from offset `off`.
    ///
    /// Yields `None` for anything beyond offset 7.
    const fn for_read(off: u8, dlab_status: bool) -> Option<Self> {
        let reg = match off {
            0 if dlab_status => Self::DivisorLow,
            0 => Self::RecvHold,
            1 if dlab_status => Self::DivisorHigh,
            1 => Self::IntrEnable,
            2 => Self::IntrIdent,
            3 => Self::LineCtrl,
            4 => Self::ModemCtrl,
            5 => Self::LineStatus,
            6 => Self::ModemStatus,
            7 => Self::Scratch,
            _ => return None,
        };
        Some(reg)
    }

    /// Whether the register may be read.
    const fn is_readable(self) -> bool {
        matches!(
            self,
            UartReg::RecvHold
                | UartReg::IntrEnable
                | UartReg::IntrIdent
                | UartReg::LineCtrl
                | UartReg::ModemCtrl
                | UartReg::LineStatus
                | UartReg::ModemStatus
                | UartReg::Scratch
                | UartReg::DivisorLow
                | UartReg::DivisorHigh
        )
    }

    /// Whether the register may be written.
    const fn is_writable(self) -> bool {
        matches!(
            self,
            UartReg::TransmitHold
                | UartReg::IntrEnable
                | UartReg::FifoCtrl
                | UartReg::LineCtrl
                | UartReg::ModemCtrl
                | UartReg::Scratch
                | UartReg::DivisorLow
                | UartReg::DivisorHigh
        )
    }
}

#[cfg(test)]
mod test {

    // Register offsets and bit definitions used by the tests, spelled out
    // independently of the implementation's own types.
    mod bits {
        #![allow(unused)]

        // Register offsets from base
        pub const REG_RHR: u8 = 0b000; // Receiver Buffer Register (RO)
        pub const REG_THR: u8 = 0b000; // Transmitter Holding Register (WO)
        pub const REG_IER: u8 = 0b001; // Interrupt Enable Register (RW)
        pub const REG_ISR: u8 = 0b010; // Interrupt Ident Register (RO)
        pub const REG_FCR: u8 = 0b010; // FIFO Control Register (WO)
        pub const REG_LCR: u8 = 0b011; // Line Control Register (RW)
        pub const REG_MCR: u8 = 0b100; // Modem Control Register (RW)
        pub const REG_LSR: u8 = 0b101; // Line Status Register (RO)
        pub const REG_MSR: u8 = 0b110; // Modem Status Register (RO)
        pub const REG_SPR: u8 = 0b111; // Scratch Register (RW)
        pub const REG_DLL: u8 = 0b000; // Divisor Latch LSB (RW when DLAB=1)
        pub const REG_DLH: u8 = 0b001; // Divisor Latch MSB (RW when DLAB=1)

        // Interrupt Enable Register (IER) bits
        pub const IER_ERBFI: u8 = 1 << 0; // enable received-data-available-intr
        pub const IER_ETBEI: u8 = 1 << 1; // enable xmit-holding-reg-empty-intr
        pub const IER_ELSI: u8 = 1 << 2; // enable receiver-line-status
        pub const IER_EDSSI: u8 = 1 << 3; // enable modem-status-intr

        // Possible values of Interrupt Identification Register
        pub const ISRC_NONE: u8 = 0b0001; // no interrupt
        pub const ISRC_RLS: u8 = 0b0110; // receiver line status
        pub const ISRC_DR: u8 = 0b0100; // data ready
        pub const ISRC_TMO: u8 = 0b1100; // character timeout
        pub const ISRC_THRE: u8 = 0b0010; // transmitter holding register empty
        pub const ISRC_MDM: u8 = 0b0000; // modem status

        // FIFO Control Register (FCR) bits
        pub const FCR_ENA: u8 = 1 << 0; // enable xmit/receive FIFOs
        pub const FCR_RXRST: u8 = 1 << 1; // clear bytes/count in recv FIFO
        pub const FCR_TXRST: u8 = 1 << 2; // clear bytes/count in xmit FIFO
        pub const FCR_DMAMD: u8 = 1 << 3;
        pub const FCR_TRGR: u8 = 0b11000000;

        // Modem Control Register (MCR) bits
        pub const MCR_LOOP: u8 = 1 << 4; // loopback

        // Line Status Register (LSR) bits
        pub const LSR_DR: u8 = 1 << 0; // Data Ready
        pub const LSR_OE: u8 = 1 << 1; // Overrun Error
        pub const LSR_THRE: u8 = 1 << 5; // THRE indicator
        pub const LSR_TEMT: u8 = 1 << 6; // Transmitter Empty indicator

        // Line Control Register (LCR) bits
        pub const LCR_DLAB: u8 = 0b10000000; // Divisor Latch Access Bit

        // Masks covering the meaningful bits of each register
        pub const MASK_PCD: u8 = 0b00001111;
        pub const MASK_MCR: u8 = 0b00011111;
        pub const MASK_IER: u8 = 0b00001111;
        pub const MASK_FCR: u8 = 0b11001111;
        pub const MASK_ISRC: u8 = 0b00001111;
    }

    use super::*;
    use bits::*;

    /// A freshly created UART reports the expected register defaults.
    #[test]
    fn reset_state() {
        let mut uart = Uart::new();
        assert_eq!(uart.reg_read(REG_IER), 0);
        // Only the "no interrupt pending" bit is set
        assert_eq!(uart.reg_read(REG_ISR), 1);
        // TI datasheet notes the state of this register, despite it being WO
        // assert_eq!(uart.reg_fifo_ctrl, 0);
        assert_eq!(uart.reg_read(REG_LCR), 0);
        assert_eq!(uart.reg_read(REG_MCR), 0);
        // THRE and TEMT set: the transmitter starts out empty
        assert_eq!(uart.reg_read(REG_LSR), 0b01100000);
    }
    /// Enabling ETBEI while the transmitter is empty raises a THRE interrupt,
    /// which is then cleared by reading the ISR.
    #[test]
    fn intr_thre_on_etbei_toggle() {
        let mut uart = Uart::new();
        // start with no interrupts enabled, none should be asserted
        uart.reg_write(REG_IER, 0);
        assert_eq!(uart.reg_read(REG_LSR) & LSR_THRE, LSR_THRE);
        assert_eq!(uart.reg_read(REG_ISR) & MASK_ISRC, ISRC_NONE);
        assert_eq!(uart.intr_state(), false);
        // enable THRE interrupt
        uart.reg_write(REG_IER, IER_ETBEI);
        assert_eq!(uart.reg_read(REG_LSR) & LSR_THRE, LSR_THRE);
        assert_eq!(uart.intr_state(), true);
        assert_eq!(uart.reg_read(REG_ISR) & MASK_ISRC, ISRC_THRE);
        // after reading ISR, THRE interrupt should deassert
        assert_eq!(uart.intr_state(), false);
        assert_eq!(uart.reg_read(REG_ISR) & MASK_ISRC, ISRC_NONE);
        // should still be present in LSR, though
        assert_eq!(uart.reg_read(REG_LSR) & LSR_THRE, LSR_THRE);
    }
    /// Incoming data raises the data-ready interrupt, which clears once the
    /// data has been read from RHR.
    #[test]
    fn intr_dr_on_incoming() {
        let mut uart = Uart::new();
        let tval = 0x20;

        uart.reg_write(REG_IER, IER_ERBFI);
        assert_eq!(uart.intr_state(), false);
        assert_eq!(uart.reg_read(REG_ISR) & MASK_ISRC, ISRC_NONE);
        // deliver a byte from the host side
        uart.data_write(tval);
        assert_eq!(uart.intr_state(), true);
        assert_eq!(uart.reg_read(REG_ISR) & MASK_ISRC, ISRC_DR);
        // consuming the byte deasserts the interrupt
        assert_eq!(uart.reg_read(REG_RHR), tval);
        assert_eq!(uart.intr_state(), false);
        assert_eq!(uart.reg_read(REG_ISR) & MASK_ISRC, ISRC_NONE);
    }
    /// With data already queued for transmit, enabling ETBEI raises nothing;
    /// the THRE interrupt fires only once the data has been drained.
    #[test]
    fn intr_thre_on_outgoing() {
        let mut uart = Uart::new();
        let tval = 0x20;

        uart.reg_write(REG_IER, 0);
        assert_eq!(uart.intr_state(), false);
        // queue data before enabling the interrupt
        uart.reg_write(REG_THR, tval);
        uart.reg_write(REG_IER, IER_ETBEI);
        assert_eq!(uart.intr_state(), false);
        assert_eq!(uart.reg_read(REG_ISR) & MASK_ISRC, ISRC_NONE);
        // draining the transmitted byte raises THRE
        assert_eq!(uart.data_read(), Some(tval));
        assert_eq!(uart.intr_state(), true);
        assert_eq!(uart.reg_read(REG_ISR) & MASK_ISRC, ISRC_THRE);
        // cleared after read of ISR
        assert_eq!(uart.intr_state(), false);
        assert_eq!(uart.reg_read(REG_ISR) & MASK_ISRC, ISRC_NONE);
    }
    /// Reading and writing every register offset, with and without DLAB set,
    /// must not panic.
    #[test]
    fn safe_read_write_all() {
        let mut uart = Uart::new();

        // Read every offset first, then write every offset
        (0..=7).for_each(|off| {
            let _ = uart.reg_read(off);
        });
        (0..=7).for_each(|off| uart.reg_write(off, 0xff));

        // With DLAB=1 now true, make sure the divisor registers are fine
        for off in 0..=1 {
            let _ = uart.reg_read(off);
        }
        for off in 0..=1 {
            uart.reg_write(off, 0xff);
        }
    }
    /// Walk through the interrupt sources and check that the ISR always
    /// reports the highest-priority one which is pending.
    #[test]
    fn interrupt_codes() {
        let mut uart = Uart::new();

        // Enable interrupts we're going to test
        uart.reg_write(REG_IER, IER_ERBFI | IER_ETBEI | IER_ELSI | IER_EDSSI);

        // Since no data has been sent, the TX register is empty
        assert_eq!(uart.reg_read(REG_ISR), ISRC_THRE);

        // Since triggering overflow requires us to use loopback mode (since the
        // data_write() path refuses to allow overflow), configure the uart for
        // loopback.  That state can be used for the data-ready intr as well.
        uart.reg_write(REG_MCR, MCR_LOOP);

        // Loop back data to assert the data-ready interrupt
        let rval = 0x20;
        uart.reg_write(REG_THR, rval);
        // data-ready interrupt should take precedence
        assert_eq!(uart.reg_read(REG_ISR), ISRC_DR);

        // Now overrun the read register
        uart.reg_write(REG_THR, rval);
        // receiver-line-status interrupt should take precedence
        assert_eq!(uart.reg_read(REG_ISR), ISRC_RLS);

        // Read RLS to clear RLS intr
        assert!((uart.reg_read(REG_LSR) & LSR_OE) != 0);
        assert_eq!(uart.reg_read(REG_ISR), ISRC_DR);

        // Read pending data to clear DR intr
        assert_eq!(uart.reg_read(REG_RHR), rval);
        assert_eq!(uart.reg_read(REG_ISR), ISRC_THRE);

        // Clear loopback mode and queue outgoing data in TX register
        uart.reg_write(REG_MCR, 0);
        let tval = 0x40;
        uart.reg_write(REG_THR, tval);
        // With data queued, the transmitter is no longer empty
        assert_eq!(uart.reg_read(REG_ISR), ISRC_NONE);
        assert_eq!(uart.data_read(), Some(tval));
    }
}


================================================
FILE: lib/propolis/src/hw/virtio/bits.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// virtio-net feature bits
//
// Each constant is a single-bit mask within a 64-bit feature word.  Bit
// positions 4, 6, 20 and 21 have no constant defined here.
//
// NOTE(review): `VIRTIO_NET_F_MGR_RXBUF` looks like a transposition of the
// VirtIO spec's "MRG_RXBUF" name -- confirm before relying on the spelling.
pub const VIRTIO_NET_F_CSUM: u64 = 1 << 0;
pub const VIRTIO_NET_F_GUEST_CSUM: u64 = 1 << 1;
pub const VIRTIO_NET_F_CTRL_GUEST_OFFLOADS: u64 = 1 << 2;
pub const VIRTIO_NET_F_MTU: u64 = 1 << 3;
pub const VIRTIO_NET_F_MAC: u64 = 1 << 5;
pub const VIRTIO_NET_F_GUEST_TSO4: u64 = 1 << 7;
pub const VIRTIO_NET_F_GUEST_TSO6: u64 = 1 << 8;
pub const VIRTIO_NET_F_GUEST_ECN: u64 = 1 << 9;
pub const VIRTIO_NET_F_GUEST_UFO: u64 = 1 << 10;
pub const VIRTIO_NET_F_HOST_TSO4: u64 = 1 << 11;
pub const VIRTIO_NET_F_HOST_TSO6: u64 = 1 << 12;
pub const VIRTIO_NET_F_HOST_ECN: u64 = 1 << 13;
pub const VIRTIO_NET_F_HOST_UFO: u64 = 1 << 14;
pub const VIRTIO_NET_F_MGR_RXBUF: u64 = 1 << 15;
pub const VIRTIO_NET_F_STATUS: u64 = 1 << 16;
pub const VIRTIO_NET_F_CTRL_VQ: u64 = 1 << 17;
pub const VIRTIO_NET_F_CTRL_RX: u64 = 1 << 18;
pub const VIRTIO_NET_F_CTRL_VLAN: u64 = 1 << 19;
pub const VIRTIO_NET_F_MQ: u64 = 1 << 22;

// virtio-block feature bits
//
// Each constant is a single-bit mask within a 64-bit feature word.  Only a
// subset of bit positions is named here (1, 2, 4-6, 9-11, 13, 14).
pub const VIRTIO_BLK_F_SIZE_MAX: u64 = 1 << 1;
pub const VIRTIO_BLK_F_SEG_MAX: u64 = 1 << 2;
pub const VIRTIO_BLK_F_GEOMETRY: u64 = 1 << 4;
pub const VIRTIO_BLK_F_RO: u64 = 1 << 5;
pub const VIRTIO_BLK_F_BLK_SIZE: u64 = 1 << 6;
pub const VIRTIO_BLK_F_FLUSH: u64 = 1 << 9;
pub const VIRTIO_BLK_F_TOPOLOGY: u64 = 1 << 10;
pub const VIRTIO_BLK_F_CONFIG_WCE: u64 = 1 << 11;
pub const VIRTIO_BLK_F_DISCARD: u64 = 1 << 13;
pub const VIRTIO_BLK_F_WRITE_ZEROES: u64 = 1 << 14;


================================================
FILE: lib/propolis/src/hw/virtio/block.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::num::NonZeroUsize;
use std::sync::Arc;
use std::time::Instant;

use crate::accessors::MemAccessor;
use crate::block;
use crate::common::*;
use crate::hw::pci;
use crate::hw::virtio;
use crate::migrate::*;
use crate::util::regmap::RegMap;

use super::bits::*;
use super::pci::{PciVirtio, PciVirtioState};
use super::queue::{Chain, VirtQueue, VirtQueues};
use super::VirtioDevice;
use bits::*;

use futures::future::BoxFuture;
use lazy_static::lazy_static;
use zerocopy::FromBytes;

/// Sizing for virtio-block is specified in 512B sectors
const SECTOR_SZ: usize = 512;

/// Arbitrary limit to sectors permitted per discard request
///
/// Computed as 1 MiB divided by the sector size, i.e. 2048 sectors.
const MAX_DISCARD_SECTORS: u32 = ((1024 * 1024) / SECTOR_SZ) as u32;

/// A virtio-block device exposed to the guest over the virtio PCI transport.
pub struct PciVirtioBlock {
    /// Virtio transport state (queues, mode, ...) for this device.
    virtio_state: PciVirtioState,
    /// PCI-level state for this device.
    pci_state: pci::DeviceState,
    /// Attachment point through which requests popped from the virtqueue are
    /// handed to a block backend.
    pub block_attach: block::DeviceAttachment,
}
impl PciVirtioBlock {
    /// Creates a legacy-mode virtio-block device with a single virtqueue of
    /// `queue_size` entries, and associates that queue with the block
    /// attachment as queue 0.
    ///
    /// # Panics
    ///
    /// Panics if `queue_size` cannot be converted into the queue-size type
    /// accepted by `VirtQueues::new`.
    pub fn new(queue_size: u16) -> Arc<Self> {
        let queues = VirtQueues::new(&[queue_size.try_into().unwrap()]);
        // virtio-block only needs two MSI-X entries for its interrupt needs:
        // - device config changes
        // - queue 0 notification
        let msix_count = Some(2);
        let (virtio_state, pci_state) = PciVirtioState::new(
            virtio::Mode::Legacy,
            queues,
            msix_count,
            virtio::DeviceId::Block,
            VIRTIO_BLK_CFG_SIZE,
        );

        let block_attach = block::DeviceAttachment::new(
            NonZeroUsize::new(1).unwrap(),
            pci_state.acc_mem.child(Some("block backend".to_string())),
        );
        let bvq = BlockVq::new(
            virtio_state.queues.get(0).unwrap().clone(),
            pci_state.acc_mem.child(Some("block queue".to_string())),
        );
        block_attach.queue_associate(0usize.into(), bvq);

        Arc::new(Self { pci_state, virtio_state, block_attach })
    }

    /// Services a guest read of the device-specific config register `id`,
    /// emitting the value into `ro`.
    ///
    /// If no backend info is available, a default (all-zero) device is
    /// reported.
    fn block_cfg_read(&self, id: &BlockReg, ro: &mut ReadOp) {
        let info = self.block_attach.info().unwrap_or_default();

        // `total_size` is a count of `block_size`-byte blocks, while virtio
        // expresses sizes in 512-byte sectors; convert once and reuse.
        let total_bytes = info.total_size * u64::from(info.block_size);
        let total_sectors = total_bytes / SECTOR_SZ as u64;
        match id {
            BlockReg::Capacity => {
                ro.write_u64(total_sectors);
            }
            BlockReg::SegMax => {
                // XXX: Copy the static limit from qemu for now
                ro.write_u32(128 - 2);
            }
            BlockReg::BlockSize => ro.write_u32(info.block_size),
            BlockReg::Unused => {
                ro.fill(0);
            }
            BlockReg::MaxDiscardSectors => {
                // Arbitrarily limit to 1MiB (or the device size, if smaller).
                // Saturate rather than truncate so that very large devices
                // do not wrap around to a tiny (or zero) limit.
                let dev_sectors =
                    u32::try_from(total_sectors).unwrap_or(u32::MAX);
                let sz = u32::min(MAX_DISCARD_SECTORS, dev_sectors);
                ro.write_u32(if info.supports_discard { sz } else { 0 });
            }
            BlockReg::MaxDiscardSeg => {
                // If the device supports discard operations, only permit one
                // segment (LBA/size) per request.
                ro.write_u32(if info.supports_discard { 1 } else { 0 });
            }
            BlockReg::DiscardSectorAlign => {
                // Expect that discard operations are block-aligned
                ro.write_u32(if info.supports_discard {
                    info.block_size / SECTOR_SZ as u32
                } else {
                    0
                });
            }
            _ => {
                // XXX: all zeroes for now
                ro.fill(0);
            }
        }
    }
}

/// State carried from `next_req` to `complete` for one in-flight request.
struct CompletionToken {
    /// ID of original request.
    rid: u16,
    /// VirtIO chain in which we indicate the result.
    chain: Chain,
}

/// A virtqueue (field 0) together with the accessor (field 1) used to reach
/// guest memory when popping requests from it and pushing completions to it.
struct BlockVq(Arc<VirtQueue>, MemAccessor);
impl BlockVq {
    /// Wraps the queue and its memory accessor in a shared handle.
    fn new(vq: Arc<VirtQueue>, acc_mem: MemAccessor) -> Arc<Self> {
        let queue = Self(vq, acc_mem);
        Arc::new(queue)
    }
}
impl block::DeviceQueue for BlockVq {
    type Token = CompletionToken;

    fn next_req(
        &self,
    ) -> Option<(block::Request, Self::Token, Option<Instant>)> {
        let vq = &self.0;
        let mem = self.1.access()?;

        let mut chain = Chain::with_capacity(4);
        // Pop a request off the queue if there's one available.
        // For debugging purposes, we'll also use the returned index
        // as a psuedo-id for the request to associate it with its
        // subsequent completion
        let (rid, _clen) = vq.pop_avail(&mut chain, &mem)?;

        let mut breq = VbReq::default();
        if !chain.read(&mut breq, &mem) {
            todo!("error handling");
        }
        let off = breq.sector as usize * SECTOR_SZ;
        let req = match breq.rtype {
            VIRTIO_BLK_T_IN => {
                // should be (blocksize * 512) + 1 remaining writable byte for status
                // TODO: actually enforce block size
                let blocks = (chain.remain_write_bytes() - 1) / SECTOR_SZ;
                let sz = blocks * SECTOR_SZ;

                if let Some(regions) = chain.writable_bufs(sz) {
                    probes::vioblk_read_enqueue!(|| (
                        rid, off as u64, sz as u64
                    ));
                    Ok((
                        block::Request::new_read(off, sz, regions),
                        CompletionToken { rid, chain },
                        None,
                    ))
                } else {
                    Err(chain)
                }
            }
            VIRTIO_BLK_T_OUT => {
                // should be (blocksize * 512) remaining read bytes
                let blocks = chain.remain_read_bytes() / SECTOR_SZ;
                let sz = blocks * SECTOR_SZ;

                if let Some(regions) = chain.readable_bufs(sz) {
                    probes::vioblk_write_enqueue!(|| (
                        rid, off as u64, sz as u64
                    ));
                    Ok((
                        block::Request::new_write(off, sz, regions),
                        CompletionToken { rid, chain },
                        None,
                    ))
                } else {
                    Err(chain)
                }
            }
            VIRTIO_BLK_T_FLUSH => {
                probes::vioblk_flush_enqueue!(|| rid);
                Ok((
                    block::Request::new_flush(),
                    CompletionToken { rid, chain },
                    None,
                ))
            }
            VIRTIO_BLK_T_DISCARD => {
                let mut detail = DiscardWriteZeroes::default();
                if !chain.read(&mut detail, &mem) {
                    Err(chain)
                } else {
                    let off = detail.sector as usize * SECTOR_SZ;
                    let sz = detail.num_sectors as usize * SECTOR_SZ;
                    probes::vioblk_discard_enqueue!(|| (
                        rid, off as u64, sz as u64,
                    ));
                    Ok((
                        block::Request::new_discard(vec![(off, sz)]),
                        CompletionToken { rid, chain },
                        None,
                    ))
                }
            }
            _ => Err(chain),
        };
        match req {
            Err(mut chain) => {
                // try to set the status byte to failed
                let remain = chain.remain_write_bytes();
                if remain >= 1 {
                    chain.write_skip(remain - 1);
                    chain.write(&VIRTIO_BLK_S_UNSUPP, &mem);
                }
                vq.push_used(&mut chain, &mem);
                None
            }
            Ok(r) => Some(r),
        }
    }

    fn complete(
        &self,
        op: block::Operation,
        result: block::Result,
        mut token: Self::Token,
    ) {
        let CompletionToken { rid, ref mut chain } = token;
        if let Some(mem) = self.1.access() {
            let resnum = match result {
                block::Result::Success => VIRTIO_BLK_S_OK,
                block::Result::Failure => VIRTIO_BLK_S_IOERR,
                block::Result::ReadOnly => VIRTIO_BLK_S_IOERR,
                block::Result::Unsupported => VIRTIO_BLK_S_UNSUPP,
            };
            match op {
                block::Operation::Read(..) => {
                    probes::vioblk_read_complete!(|| (rid, resnum));
                }
                block::Operation::Write(..) => {
                    probes::vioblk_write_complete!(|| (rid, resnum));
                }
                block::Operation::Flush => {
                    probes::vioblk_flush_complete!(|| (rid, resnum));
                }
                block::Operation::Discard => {
                    probes::vioblk_discard_complete!(|| (rid, resnum));
                }
            }
            chain.write(&resnum, &mem);
            self.0.push_used(chain, &mem);
        }
    }

    fn abandon(&self, _token: Self::Token) {
        // Nothing necessary to safely abandon a `CompletionToken`.
    }
}

impl VirtioDevice for PciVirtioBlock {
    /// Dispatches device-config accesses through the register map.  Reads
    /// are answered by `block_cfg_read`; writes are dropped.
    fn rw_dev_config(&self, mut rwo: RWOp) {
        BLOCK_DEV_REGS.process(&mut rwo, |id, rwo| {
            // Writes to the device config space are ignored.
            if let RWOp::Read(ro) = rwo {
                self.block_cfg_read(id, ro);
            }
        });
    }

    /// Reports the virtio mode recorded in the shared virtio state.
    fn mode(&self) -> virtio::Mode {
        self.virtio_state().mode()
    }

    /// Builds the offered feature word: block size, segment max and flush
    /// are always offered, while read-only and discard follow the backend.
    fn features(&self) -> u64 {
        let info = self.block_attach.info().unwrap_or_default();

        let mut bits = VIRTIO_BLK_F_BLK_SIZE
            | VIRTIO_BLK_F_SEG_MAX
            | VIRTIO_BLK_F_FLUSH;
        if info.read_only {
            bits |= VIRTIO_BLK_F_RO;
        }
        if info.supports_discard {
            bits |= VIRTIO_BLK_F_DISCARD;
        }
        bits
    }

    /// Accepts whatever the guest negotiates without recording it.
    fn set_features(&self, _feat: u64) -> Result<(), ()> {
        // XXX: real features
        Ok(())
    }

    /// Pokes the block attachment for queue 0, the device's only queue.
    fn queue_notify(&self, _vq: &VirtQueue) {
        // TODO: provide proper hint
        self.block_attach.notify(0usize.into(), None);
    }
}

// Accessors giving the shared virtio-over-PCI code access to this device's
// state.
impl PciVirtio for PciVirtioBlock {
    /// Borrows the device's virtio transport state.
    fn virtio_state(&self) -> &PciVirtioState {
        &self.virtio_state
    }
    /// Borrows the device's PCI state.
    fn pci_state(&self) -> &pci::DeviceState {
        &self.pci_state
    }
}

impl block::Device for PciVirtioBlock {
    /// Borrows the attachment through which a block backend is connected.
    fn attachment(&self) -> &block::DeviceAttachment {
        &self.block_attach
    }
}

// Lifecycle hooks: reset is handled by the virtio state, while pause/resume
// are forwarded to the block attachment.
impl Lifecycle for PciVirtioBlock {
    fn type_name(&self) -> &'static str {
        "pci-virtio-block"
    }
    fn reset(&self) {
        self.virtio_state.reset(self);
    }
    fn pause(&self) {
        self.block_attach.pause()
    }
    fn resume(&self) {
        self.block_attach.resume();
    }
    /// Resolves when the block attachment reports nothing is being processed.
    fn paused(&self) -> BoxFuture<'static, ()> {
        Box::pin(self.block_attach.none_processing())
    }
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }
}
// Migration of this device is delegated entirely to the generic
// virtio-over-PCI export/import routines.
impl MigrateMulti for PciVirtioBlock {
    /// Exports device state via the shared `PciVirtio` implementation.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        <dyn PciVirtio>::export(self, output, ctx)
    }

    /// Imports device state via the shared `PciVirtio` implementation.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        <dyn PciVirtio>::import(self, offer, ctx)
    }
}

/// Request header read from the front of each descriptor chain.
#[derive(Copy, Clone, Debug, Default, FromBytes)]
#[repr(C)]
struct VbReq {
    /// Request type, matched against the `VIRTIO_BLK_T_*` constants.
    rtype: u32,
    /// Not consulted by this device.
    reserved: u32,
    /// Starting position of the request, in `SECTOR_SZ` units.
    sector: u64,
}

/// Extent descriptor read from the chain, after the request header, for
/// discard requests.
#[derive(Copy, Clone, Debug, Default, FromBytes)]
#[repr(C)]
struct DiscardWriteZeroes {
    /// Starting position of the extent, in `SECTOR_SZ` units.
    sector: u64,
    /// Length of the extent, in `SECTOR_SZ` units.
    num_sectors: u32,
    /// Not consulted by this device.
    flags: u32,
}

/// Identifiers for the fields of the virtio-block device-specific config
/// space.  Their sizes and ordering are given by `BLOCK_DEV_REGS`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum BlockReg {
    Capacity,
    SizeMax,
    SegMax,
    GeoCyl,
    GeoHeads,
    GeoSectors,
    BlockSize,
    TopoPhysExp,
    TopoAlignOff,
    TopoMinIoSz,
    TopoOptIoSz,
    Writeback,
    // Padding between fields; reads of it are zero-filled
    Unused,
    MaxDiscardSectors,
    MaxDiscardSeg,
    DiscardSectorAlign,
    MaxZeroSectors,
    MaxZeroSeg,
    ZeroMayUnmap,
}
lazy_static! {
    /// Packed layout of the device-specific config space, as (register,
    /// size-in-bytes) pairs.  The sizes sum to `VIRTIO_BLK_CFG_SIZE` (0x3c).
    static ref BLOCK_DEV_REGS: RegMap<BlockReg> = {
        let layout = [
            (BlockReg::Capacity, 8),
            (BlockReg::SizeMax, 4),
            (BlockReg::SegMax, 4),
            (BlockReg::GeoCyl, 2),
            (BlockReg::GeoHeads, 1),
            (BlockReg::GeoSectors, 1),
            (BlockReg::BlockSize, 4),
            (BlockReg::TopoPhysExp, 1),
            (BlockReg::TopoAlignOff, 1),
            (BlockReg::TopoMinIoSz, 2),
            (BlockReg::TopoOptIoSz, 4),
            (BlockReg::Writeback, 1),
            (BlockReg::Unused, 3),
            (BlockReg::MaxDiscardSectors, 4),
            (BlockReg::MaxDiscardSeg, 4),
            (BlockReg::DiscardSectorAlign, 4),
            (BlockReg::MaxZeroSectors, 4),
            (BlockReg::MaxZeroSeg, 4),
            (BlockReg::ZeroMayUnmap, 1),
            (BlockReg::Unused, 3),
        ];
        RegMap::create_packed(
            VIRTIO_BLK_CFG_SIZE,
            &layout,
            Some(BlockReg::Unused),
        )
    };
}

/// Protocol constants private to the virtio-block device.
mod bits {
    #![allow(unused)]

    // Request types, carried in `VbReq::rtype`
    pub const VIRTIO_BLK_T_IN: u32 = 0;
    pub const VIRTIO_BLK_T_OUT: u32 = 1;
    pub const VIRTIO_BLK_T_FLUSH: u32 = 4;
    pub const VIRTIO_BLK_T_DISCARD: u32 = 11;
    pub const VIRTIO_BLK_T_WRITE_ZEROES: u32 = 13;

    // Status values written into the final byte of a request chain
    pub const VIRTIO_BLK_S_OK: u8 = 0;
    pub const VIRTIO_BLK_S_IOERR: u8 = 1;
    pub const VIRTIO_BLK_S_UNSUPP: u8 = 2;

    // Size in bytes of the device-specific config space
    pub const VIRTIO_BLK_CFG_SIZE: usize = 0x3c;
}

// USDT probes fired when a request of each type is enqueued and completed.
// `id` is the request ID taken from the virtqueue pop, `off`/`sz` are in
// bytes, and `res` is the virtio status byte reported to the guest.
#[usdt::provider(provider = "propolis")]
mod probes {
    fn vioblk_read_enqueue(id: u16, off: u64, sz: u64) {}
    fn vioblk_read_complete(id: u16, res: u8) {}

    fn vioblk_write_enqueue(id: u16, off: u64, sz: u64) {}
    fn vioblk_write_complete(id: u16, res: u8) {}

    fn vioblk_flush_enqueue(id: u16) {}
    fn vioblk_flush_complete(id: u16, res: u8) {}

    fn vioblk_discard_enqueue(id: u16, off: u64, sz: u64) {}
    fn vioblk_discard_complete(id: u16, res: u8) {}
}


================================================
FILE: lib/propolis/src/hw/virtio/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Propolis implements VirtIO devices for guests with appropriate drivers.
//!
//! We model virtio devices as (virtual) PCI devices, using the virtio PCI
//! transport mechanism as defined in the VirtIO 1.2 specification.
//! Currently we expose device models for virtio-net, virtio-block, and
//! virtio-9pfs.

use bitflags::bitflags;

#[allow(unused)]
mod bits;

pub mod block;
#[cfg(feature = "falcon")]
pub mod p9fs;
pub mod pci;
mod queue;
#[cfg(feature = "falcon")]
pub mod softnpu;
pub mod viona;
pub mod vsock;

#[cfg(test)]
pub mod testutil;

use crate::common::RWOp;
use crate::hw::pci as pci_hw;
use crate::lifecycle::Lifecycle;
use queue::VirtQueue;

pub use block::PciVirtioBlock;
pub use viona::PciVirtioViona;
pub use vsock::PciVirtioSock;

bitflags! {
    /// Transport-level feature bits, named for the legacy interface, held as
    /// masks within the 64-bit feature word.
    pub struct LegacyFeatures: u64 {
        const NOTIFY_ON_EMPTY = 1 << 24;
        const ANY_LAYOUT = 1 << 27;
    }
}

/// Describes the VirtIO "mode" exposed by the device.
#[derive(Clone, Copy, Debug, Eq, PartialEq, strum::FromRepr)]
#[repr(u32)]
pub enum Mode {
    /// Legacy mode is pre-VirtIO 1.0.
    Legacy,

    /// Modern devices are those that implement and expose the VirtIO
    /// 1.0 and later specification.
    Modern,

    /// Transitional devices expose both the pre-VirtIO 1.0 "Legacy"
    /// interface and the VirtIO 1.0 and later "Modern" interface.
    Transitional,
}

impl Mode {
    /// Returns the PCI revision ID for the given mode: 1 for modern-only
    /// devices, 0 for anything exposing the legacy interface.
    pub fn pci_revision(self) -> u8 {
        match self {
            Mode::Modern => 1,
            Mode::Legacy => 0,
            Mode::Transitional => 0,
        }
    }
}

/// Recognized VirtIO Device IDs, as defined in the VirtIO 1.2 specification,
/// section 5, "Device Types".
///
/// The discriminant of each variant is its numeric VirtIO device ID; the
/// numbering is not contiguous (14, 15 and 37 are not assigned here).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DeviceId {
    Reserved = 0,
    Network = 1,
    Block = 2,
    Console = 3,
    Entropy = 4,
    TradMemBalloon = 5,
    IoMem = 6,
    RpMsg = 7,
    Scsi = 8,
    NineP = 9,
    Mac80211Wlan = 10,
    RprocSerial = 11,
    Caif = 12,
    MemBalloon = 13,
    Gpu = 16,
    Timer = 17,
    Input = 18,
    Socket = 19,
    Crypto = 20,
    SigDistMod = 21,
    Pstore = 22,
    Iommu = 23,
    Memory = 24,
    Audio = 25,
    Filesystem = 26,
    Pmem = 27,
    Rpmb = 28,
    Mac80211HWSim = 29,
    VideoEncoder = 30,
    VideoDecoder = 31,
    ArmScmi = 32,
    NitroSecureMod = 33,
    I2c = 34,
    Watchdog = 35,
    Can = 36,
    ParameterServer = 38,
    AudioPolicy = 39,
    Bluetooth = 40,
    Gpio = 41,
    Rdma = 42,
}

impl DeviceId {
    /// Translates this VirtIO device ID into the PCI device ID advertised
    /// for `mode`.
    ///
    /// VirtIO device IDs live in their own namespace, independent of the
    /// transport.  For modern (VirtIO 1.0+) devices the PCI ID is simply the
    /// VirtIO ID plus 0x1040.  Legacy and transitional devices instead use an
    /// irregular table from the VirtIO specification, which covers only a
    /// subset of device types.  Where the specification defines no
    /// transitional ID, we use the value that other implementations (QEMU in
    /// particular) have converged on.
    ///
    /// Only a handful of device models are exposed by us, so the gaps are
    /// not a practical concern.  A device type with no legacy/transitional
    /// mapping is handed back as the `Err` value.
    ///
    /// See VirtIO 1.2, sec 4.1.2.1 for the mapping from VirtIO device ID
    /// to PCI device ID.
    pub fn pci_dev_id(self, mode: Mode) -> Result<u16, Self> {
        match (mode, self) {
            (Mode::Modern, id) => Ok(id as u16 + 0x1040),
            (Mode::Legacy | Mode::Transitional, id) => match id {
                Self::Network => Ok(0x1000),
                Self::Block => Ok(0x1001),
                Self::TradMemBalloon => Ok(0x1002),
                Self::Console => Ok(0x1003),
                Self::Scsi => Ok(0x1004),
                Self::Entropy => Ok(0x1005),
                Self::NineP => Ok(0x1009),
                // Taken from QEMU, used by Linux
                Self::Socket => Ok(0x1012),
                unmapped => Err(unmapped),
            },
        }
    }

    /// Translates this VirtIO device ID into a PCI subsystem device ID: the
    /// PCI device ID for modern devices, the raw VirtIO ID otherwise.
    /// XXX: Check these mappings against some reference.
    pub fn pci_sub_dev_id(self, mode: Mode) -> Result<u16, Self> {
        match mode {
            Mode::Modern => self.pci_dev_id(mode),
            Mode::Legacy | Mode::Transitional => Ok(self as u16),
        }
    }

    /// Picks a PCI device class for this VirtIO device type.
    ///
    /// Sadly, these mappings are mostly arbitrary.  Device types without a
    /// chosen class are handed back as the `Err` value.
    pub fn pci_class(self) -> Result<u8, Self> {
        let class = match self {
            Self::Network => pci_hw::bits::CLASS_NETWORK,
            Self::Block | Self::NineP => pci_hw::bits::CLASS_STORAGE,
            Self::Socket => pci_hw::bits::CLASS_UNCLASSIFIED,
            _ => return Err(self),
        };
        Ok(class)
    }

    /// Assembles a `crate::hw::pci::Ident` for this VirtIO device ID and
    /// mode, failing with `Err(self)` if any component has no mapping.
    pub fn pci_ident(self, mode: Mode) -> Result<pci_hw::Ident, Self> {
        use crate::hw::ids::pci::VENDOR_VIRTIO;
        // Fields are evaluated in written order, so the fallible lookups run
        // as device ID, then sub-device ID, then class.
        Ok(pci_hw::Ident {
            vendor_id: VENDOR_VIRTIO,
            device_id: self.pci_dev_id(mode)?,
            sub_vendor_id: VENDOR_VIRTIO,
            sub_device_id: self.pci_sub_dev_id(mode)?,
            device_class: self.pci_class()?,
            revision_id: mode.pci_revision(),
            ..Default::default()
        })
    }
}

/// Device-specific behavior which each virtio device model supplies to the
/// shared virtio transport code.
pub trait VirtioDevice: Send + Sync + 'static + Lifecycle {
    /// Read/write device-specific virtio configuration space.
    fn rw_dev_config(&self, ro: RWOp);

    /// Returns the device virtio mode (Legacy, Transitional, Modern).
    fn mode(&self) -> Mode;

    /// Returns the device-specific virtio feature bits.
    fn features(&self) -> u64;

    /// Sets the device-specific virtio feature bits
    ///
    /// Returns `Err` if an error occurred while setting the features.  Doing so
    /// will transition the device to the Failed state.
    fn set_features(&self, feat: u64) -> Result<(), ()>;

    /// Service driver notification for a given virtqueue
    fn queue_notify(&self, vq: &VirtQueue);

    /// Notification of virtqueue configuration change
    ///
    /// Returns `Err` if an error occurred while handling the specified
    /// `VqChange`.  Doing so will transition the device to the Failed state.
    ///
    /// The default implementation ignores the change and returns `Ok`.
    fn queue_change(
        &self,
        _vq: &VirtQueue,
        _change: VqChange,
    ) -> Result<(), ()> {
        Ok(())
    }
}

/// Interrupt handle interface used by the virtio code.
pub trait VirtioIntr: Send + 'static {
    /// Signals the interrupt.
    fn notify(&self);
    /// Reports the interrupt's current configuration as a `VqIntr`.
    fn read(&self) -> VqIntr;
}

/// Kinds of virtqueue configuration change passed to
/// `VirtioDevice::queue_change`.
#[derive(Debug)]
pub enum VqChange {
    /// Underlying virtio device has been reset
    Reset,

    /// Physical address changed for VQ
    Address,

    /// MSI(-X) configuration changed for VQ
    IntrCfg,
}

/// Interrupt configuration as reported by `VirtioIntr::read`.
pub enum VqIntr {
    /// Pin (lintr) interrupt
    Pin,

    /// MSI(-X) with address, data, and masked state
    Msi(u64, u32, bool),
}

// USDT probes shared by the virtio transport and queue code.
#[usdt::provider(provider = "propolis")]
mod probes {
    // Virtqueue activity
    fn virtio_vq_notify(virtio_dev_addr: u64, virtqueue_id: u16) {}
    fn virtio_vq_pop(vq_addr: u64, desc_idx: u16, avail_idx: u16) {}
    fn virtio_vq_push(vq_addr: u64, used_idx: u16, used_len: u32) {}

    fn virtio_viona_mq_set_use_pairs(cause: u8, npairs: u16) {}

    // Device status transitions
    fn virtio_device_needs_reset() {}
    fn virtio_set_status(value: u8) {}
    fn virtio_state_reset() {}
}


================================================
FILE: lib/propolis/src/hw/virtio/p9fs.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::HashMap;
use std::convert::TryInto;
use std::fs::{self, File};
use std::mem::size_of;
use std::os::unix::ffi::OsStrExt;
use std::os::unix::fs::MetadataExt;
use std::path::PathBuf;
use std::sync::{Arc, Mutex};

use crate::common::*;
use crate::hw::{pci, virtio};
use crate::migrate::Migrator;
use crate::util::regmap::RegMap;
use crate::vmm::MemCtx;

use super::pci::{PciVirtio, PciVirtioState};
use super::queue::{write_buf, Chain, VirtQueue, VirtQueues, VqSize};
use super::VirtioDevice;

use ispf::WireSize;
use lazy_static::lazy_static;
use libc::{
    DT_DIR, DT_REG, EILSEQ, EINVAL, ENOENT, ENOLCK, ENOTSUP, EOVERFLOW, ERANGE,
};
use p9ds::proto::{
    self, Dirent, MessageType, P9Version, Qid, QidType, Rattach, Rclunk,
    Rgetattr, Rlerror, Rlopen, Rread, Rreaddir, Rstatfs, Rwalk, Tattach,
    Tgetattr, Tlopen, Tread, Treaddir, Tstatfs, Twalk, Version,
};
use slog::{warn, Logger};
use zerocopy::IntoBytes;

/// This const adds headroom to serialized P9 data packets. These packets
/// go through a virtio transport. It's been observed with Linux guests that we
/// cannot fill up an entire `msize` packet running through that transport for
/// RREAD message types, as the packet gets truncated by a small number of bytes
/// (13 is most often observed) and the PDU size will no longer match the
/// size stated in the RREAD header.
const P9FS_VIRTIO_READ_HEADROOM: usize = 20;

// Form the rread header. Unfortunately we can't do this with the Rread
// structure because the count is baked into the data field which is tied
// to the length of the vector and filling that vector is what we're
// explicitly trying to avoid here.
//
// The struct is `repr(C, packed)`, so its fields are laid out back-to-back
// with no padding (11 bytes in total).
#[repr(C, packed)]
#[derive(Copy, Clone, IntoBytes)]
struct RreadHeader {
    size: u32,
    typ: u8,
    tag: u16,
    count: u32,
}

// USDT probes for the 9pfs device.
#[usdt::provider(provider = "propolis")]
mod probes {
    // Fired on every read of the device-specific config space
    fn p9fs_cfg_read() {}
}

/// This is a work-in-progress P9 filesystem device. It's a minimum viable
/// implementation that provides a P9 filesystem to a guest. It's been tested
/// with illumos and Linux guests. There are many capabilities that are not yet
/// implemented.
///
/// The design centers around a P9Handler trait that allows various different
/// types of P9 devices to be implemented. This file includes a `HostFSHandler`
/// implementation that allows mounting host filesystems in the guest.
/// Currently filesystems can only be mounted as read-only. Another
/// implementation is in the SoftNpu device that supports P4 program transfer
/// via p9fs.
pub struct PciVirtio9pfs {
    /// Virtio transport state for this device.
    virtio_state: PciVirtioState,
    /// PCI-level state for this device.
    pci_state: pci::DeviceState,
    /// Handler which services requests arriving on the virtqueue and supplies
    /// the mount tag.
    handler: Arc<dyn P9Handler>,
}

impl PciVirtio9pfs {
    /// Builds a legacy-mode virtio-9p device with one virtqueue of
    /// `queue_size` entries, whose requests are serviced by `handler`.
    pub fn new(queue_size: u16, handler: Arc<dyn P9Handler>) -> Arc<Self> {
        let vq_sizes = [VqSize::new(queue_size)];
        let (virtio_state, pci_state) = PciVirtioState::new(
            virtio::Mode::Legacy,
            VirtQueues::new(&vq_sizes),
            Some(2), //guess
            virtio::DeviceId::NineP,
            VIRTIO_9P_CFG_SIZE,
        );
        let dev = Self { virtio_state, pci_state, handler };
        Arc::new(dev)
    }
}

impl VirtioDevice for PciVirtio9pfs {
    fn rw_dev_config(&self, mut rwo: RWOp) {
        P9FS_DEV_REGS.process(&mut rwo, |id, rwo| match rwo {
            RWOp::Read(ro) => {
                probes::p9fs_cfg_read!(|| ());
                match id {
                    P9fsReg::TagLen => {
                        ro.write_u16(self.handler.target().len() as u16);
                    }
                    P9fsReg::Tag => {
                        let mut bs = [0; 256];
                        for (i, x) in self.handler.target().bytes().enumerate()
                        {
                            if i == 256 {
                                break;
                            }
                            bs[i] = x;
                        }
                        ro.write_bytes(&bs);
                        ro.fill(0);
                    }
                }
            }
            RWOp::Write(_) => {}
        })
    }

    fn mode(&self) -> virtio::Mode {
        virtio::Mode::Legacy
    }

    fn features(&self) -> u64 {
        VIRTIO_9P_F_MOUNT_TAG
    }

    fn set_features(&self, _feat: u64) -> Result<(), ()> {
        Ok(())
    }

    fn queue_notify(&self, vq: &VirtQueue) {
        self.handler.handle_req(vq);
    }
}

// Lifecycle hooks: reset is handled by the virtio state, and the device does
// not support migration.
impl Lifecycle for PciVirtio9pfs {
    fn type_name(&self) -> &'static str {
        "pci-virtio-9pfs"
    }
    fn reset(&self) {
        self.virtio_state.reset(self);
    }
    fn migrate(&'_ self) -> Migrator<'_> {
        Migrator::NonMigratable
    }
}

// Accessors giving the shared virtio-over-PCI code access to this device's
// state.
impl PciVirtio for PciVirtio9pfs {
    /// Borrows the device's virtio transport state.
    fn virtio_state(&self) -> &PciVirtioState {
        &self.virtio_state
    }
    /// Borrows the device's PCI state.
    fn pci_state(&self) -> &pci::DeviceState {
        &self.pci_state
    }
}

/// Identifiers for the fields of the virtio-9p device-specific config space.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum P9fsReg {
    /// Length of the mount tag
    TagLen,
    /// Mount tag bytes
    Tag,
}

lazy_static! {
    /// Packed layout of the device-specific config space: a 2-byte tag
    /// length followed by a 256-byte tag.
    static ref P9FS_DEV_REGS: RegMap<P9fsReg> = {
        let layout = [(P9fsReg::TagLen, 2), (P9fsReg::Tag, 256)];
        RegMap::create_packed(VIRTIO_9P_CFG_SIZE, &layout, None)
    };
}

/// Per-fid state: a path plus, optionally, a file handle for it.
// NOTE(review): presumably `file` is populated once the fid is opened --
// confirm against the handler implementation.
struct Fid {
    pathbuf: PathBuf,
    file: Option<fs::File>,
}

/// Table of `Fid` state keyed by a `u32` identifier.
struct Fileserver {
    fids: HashMap<u32, Fid>,
}

/// Protocol constants for the virtio-9p device.
pub(crate) mod bits {
    use std::mem::size_of;

    // features
    pub const VIRTIO_9P_F_MOUNT_TAG: u64 = 0x1;

    // Maximum number of mount tag bytes held in the config space
    pub const VIRTIO_9P_MAX_TAG_SIZE: usize = 256;
    // Config space size: the tag bytes plus the u16 tag length
    pub const VIRTIO_9P_CFG_SIZE: usize =
        VIRTIO_9P_MAX_TAG_SIZE + size_of::<u16>();
}
use bits::*;

pub trait P9Handler: Sync + Send + 'static {
    fn source(&self) -> &str;
    fn target(&self) -> &str;
    fn msize(&self) -> u32;
    fn handle_version(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx);
    fn handle_attach(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx);
    fn handle_walk(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx);
    fn handle_open(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx);
    fn handle_readdir(
        &self,
        msg_buf: &[u8],
        chain: &mut Chain,
        mem: &MemCtx,
        msize: u32,
    );
    fn handle_read(
        &self,
        msg_buf: &[u8],
        chain: &mut Chain,
        mem: &MemCtx,
        msize: u32,
    );
    fn handle_write(
        &self,
        msg_buf: &[u8],
        chain: &mut Chain,
        mem: &MemCtx,
        msize: u32,
    );
    fn handle_clunk(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx);
    fn handle_getattr(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx);
    fn handle_statfs(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx);

    fn handle_req(&self, vq: &VirtQueue) {
        let mem = vq.acc_mem.access().unwrap();

        let mut chain = Chain::with_capacity(1);
        let (_idx, _clen) = vq.pop_avail(&mut chain, &mem).unwrap();

        //TODO better as uninitialized?
        let mut data = Vec::new();
        let msize = self.msize();
        data.resize(msize as usize, 0);
        let mut buf = GuestData::from(data.as_mut_slice());

        // TODO copy pasta from tail end of Chain::read function. Seemingly
        // cannot use Chain::read as-is because it expects a statically sized
        // type.
        let mut done = 0;
        let _total = chain.for_remaining_type(true, |addr, len| {
            let mut remain = GuestData::from(&mut buf[done..]);
            if let Some(copied) = mem.read_into(addr, &mut remain, len) {
                let need_more = copied != remain.len();
                done += copied;
                (copied, need_more)
            } else {
                (0, false)
            }
        });

        let len = u32::from_le_bytes(buf[0..4].try_into().unwrap()) as usize;
        let typ = MessageType::try_from(buf[4]).unwrap();

        match typ {
            MessageType::Tversion => {
                self.handle_version(&data[..len], &mut chain, &mem)
            }

            MessageType::Tattach => {
                self.handle_attach(&data[..len], &mut chain, &mem)
            }

            MessageType::Twalk => {
                self.handle_walk(&data[..len], &mut chain, &mem)
            }

            MessageType::Tlopen => {
                self.handle_open(&data[..len], &mut chain, &mem)
            }

            MessageType::Treaddir => {
                self.handle_readdir(&data[..len], &mut chain, &mem, msize)
            }

            MessageType::Tread => {
                self.handle_read(&data[..len], &mut chain, &mem, msize)
            }

            MessageType::Twrite => {
                self.handle_write(&data[..len], &mut chain, &mem, msize)
            }

            MessageType::Tclunk => {
                self.handle_clunk(&data[..len], &mut chain, &mem)
            }

            MessageType::Tgetattr => {
                self.handle_getattr(&data[..len], &mut chain, &mem)
            }

            MessageType::Tstatfs => {
                self.handle_statfs(&data[..len], &mut chain, &mem)
            }

            //TODO: There are many p9fs operations that are not implemented. If
            //      you hit an ENOTSUP, this is the place to start for adding a
            //      new message type handler.
            _ => {
                write_error(ENOTSUP as u32, &mut chain, &mem);
            }
        };

        vq.push_used(&mut chain, &mem);
    }
}

/// 9P handler that serves files from a host path.
pub struct HostFSHandler {
    /// Upper bound on the msize a client may request in `Tversion`; also the
    /// fallback when the `msize` lock cannot be taken.
    max_chunk_size: u32,
    /// Currently negotiated msize; initialized to `max_chunk_size`.
    msize: Mutex<u32>,
    /// Host path that an attached fid is bound to.
    source: String,
    /// Target name; the device serves these bytes as its tag.
    target: String,
    /// Table of fids known to the server.
    fileserver: Mutex<Box<Fileserver>>,
    /// Logger used for warnings.
    log: Logger,
}

impl HostFSHandler {
    /// Builds a handler serving the host path `source` under the tag
    /// `target`.
    ///
    /// `max_chunk_size` caps the msize a client may negotiate and is also the
    /// msize in effect until a `Tversion` request changes it. The fid table
    /// starts out empty.
    pub fn new(
        source: String,
        target: String,
        max_chunk_size: u32,
        log: Logger,
    ) -> Self {
        let empty_table = Fileserver { fids: HashMap::new() };
        Self {
            max_chunk_size,
            msize: Mutex::new(max_chunk_size),
            source,
            target,
            fileserver: Mutex::new(Box::new(empty_table)),
            log,
        }
    }

    fn do_read(
        &self,
        msg: &Tread,
        fid: &mut Fid,
        chain: &mut Chain,
        mem: &MemCtx,
        msize: u32,
    ) {
        let file = match fid.file {
            Some(ref f) => f,
            None => {
                warn!(self.log, "read: file not open: {:?}", &fid.pathbuf,);
                return write_error(EINVAL as u32, chain, mem);
            }
        };

        let metadata = match file.metadata() {
            Ok(m) => m,
            Err(e) => {
                let ecode = e.raw_os_error().unwrap_or(0);
                warn!(
                    self.log,
                    "read: metadata for {:?}: {:?}", &fid.pathbuf, e,
                );
                return write_error(ecode as u32, chain, mem);
            }
        };

        // bail with empty response if offset is greater than file size
        if metadata.len() < msg.offset {
            warn!(
                self.log,
                "read: offset > file size: {} > {}",
                msg.offset,
                metadata.len(),
            );
            let response = Rread::new(Vec::new());
            let mut out = ispf::to_bytes_le(&response).unwrap();
            let buf = out.as_mut_slice();
            return write_buf(buf, chain, mem);
        }

        let space_left = u64::from(msize)
            - (size_of::<RreadHeader>() + P9FS_VIRTIO_READ_HEADROOM) as u64;

        let msglen =
            std::cmp::min(u64::from(msg.count), metadata.len() - msg.offset);

        let buflen = std::cmp::min(space_left, msglen);

        p9_read_file(
            &file,
            chain,
            msg.tag,
            mem,
            buflen as usize,
            msg.offset as i64,
        );
    }

    fn do_statfs(&self, fid: &mut Fid, chain: &mut Chain, mem: &MemCtx) {
        let sfs = unsafe {
            let mut sfs: libc::statvfs = std::mem::zeroed::<libc::statvfs>();
            libc::statvfs(
                fid.pathbuf.as_path().as_os_str().as_bytes().as_ptr()
                    as *const i8,
                &mut sfs,
            );
            sfs
        };

        // fstype: u32
        let fstype = 0;
        // bsize: u32
        let bsize = sfs.f_bsize;
        // blocks: u64
        let blocks = sfs.f_blocks;
        // bfree: u64
        let bfree = sfs.f_bfree;
        // bavail: u64
        let bavail = sfs.f_bavail;
        // files: u64
        let files = sfs.f_files;
        // ffree: u64
        let ffree = sfs.f_ffree;
        // fsid: u64
        let fsid = sfs.f_fsid;
        // namelen: u32
        let namelen = sfs.f_namemax;

        let resp = Rstatfs::new(
            fstype,
            bsize as u32,
            blocks,
            bfree,
            bavail,
            files,
            ffree,
            fsid,
            namelen as u32,
        );

        let mut out = ispf::to_bytes_le(&resp).unwrap();
        let buf = out.as_mut_slice();
        write_buf(buf, chain, mem);
    }

    /// Serves a `Tgetattr` against an already looked-up `fid`.
    ///
    /// Stats the fid's path and replies with an `Rgetattr` carrying `tag` and
    /// echoing `valid`. If the path cannot be stat'ed, the OS error is sent
    /// back instead.
    fn do_getattr(
        &self,
        fid: &mut Fid,
        tag: u16,
        valid: u64,
        chain: &mut Chain,
        mem: &MemCtx,
    ) {
        let md = match fs::metadata(&fid.pathbuf) {
            Ok(md) => md,
            Err(e) => {
                let ecode = e.raw_os_error().unwrap_or(0);
                return write_error(ecode as u32, chain, mem);
            }
        };

        let typ = if md.is_dir() {
            QidType::Dir
        } else if md.is_symlink() {
            QidType::Link
        } else {
            QidType::File
        };
        let qid = Qid {
            typ,
            //todo something better from ufs?
            version: md.mtime() as u32,
            path: md.ino(),
        };

        // Regular files, directories, and symlinks should have rdev = 0;
        // only device files should have a non-zero rdev.
        let is_plain = md.is_file() || md.is_dir() || md.is_symlink();
        let rdev = if is_plain { 0 } else { md.rdev() };

        let mut resp = Rgetattr::new(
            valid,
            qid,
            md.mode(),
            0, // uid: squash for now
            0, // gid: squash for now
            md.nlink(),
            rdev,
            md.size(),
            md.blksize(),
            md.blocks(),
            md.atime() as u64,
            md.atime_nsec() as u64,
            md.mtime() as u64,
            md.mtime_nsec() as u64,
            md.ctime() as u64,
            md.ctime_nsec() as u64,
            0u64, // btime_sec: reserved for future use in spec
            0u64, // btime_nsec: reserved for future use in spec
            0,    // gen: reserved for future use in spec
            0,    // data_version: reserved for future use in spec
        );
        resp.tag = tag;

        let mut encoded = ispf::to_bytes_le(&resp).unwrap();
        write_buf(encoded.as_mut_slice(), chain, mem);
    }
}

impl P9Handler for HostFSHandler {
    /// Returns the source string this handler was created with.
    fn source(&self) -> &str {
        &self.source
    }

    /// Returns the target string this handler was created with.
    fn target(&self) -> &str {
        &self.target
    }

    /// Returns the msize currently in effect. If the lock guarding it cannot
    /// be taken, a warning is logged and `max_chunk_size` is returned.
    fn msize(&self) -> u32 {
        self.msize.lock().map(|current| *current).unwrap_or_else(|e| {
            warn!(self.log, "handle_req: failed to get msize lock: {}", e);
            self.max_chunk_size
        })
    }

    fn handle_version(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        let mut msg: Version = ispf::from_bytes_le(&msg_buf).unwrap();
        msg.version = P9Version::V2000L.to_string();
        msg.typ = MessageType::Rversion;
        if msg.msize > self.max_chunk_size {
            warn!(
                self.log,
                "request exceeds max chunk size {} > {}",
                msg.msize,
                self.max_chunk_size
            );
            return write_error(EOVERFLOW as u32, chain, mem);
        }
        // TODO this is likely bad for multiple clients with different msizes,
        // should be a session level variable.
        match self.msize.lock() {
            Ok(mut msize) => *msize = msg.msize,
            Err(e) => {
                warn!(
                    self.log,
                    "handle_version: failed to get msize lock: {}", e
                );
            }
        }
        let mut out = ispf::to_bytes_le(&msg).unwrap();
        let buf = out.as_mut_slice();
        write_buf(buf, chain, mem);
    }

    fn handle_attach(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        //NOTE:
        //  - multiple file trees not supported, aname is ignored
        //  - authentication not supported afid is ignored
        //  - users not tracked, uname is ignored

        // deserialize message
        let msg: Tattach = ispf::from_bytes_le(&msg_buf).unwrap();

        // grab inode number for qid uniqe file id
        let qpath = match fs::metadata(&self.source) {
            Err(e) => {
                let ecode = e.raw_os_error().unwrap_or(0);
                return write_error(ecode as u32, chain, mem);
            }
            Ok(m) => m.ino(),
        };

        match self.fileserver.lock() {
            Ok(mut fs) => {
                // check to see if fid is in use
                match fs.fids.get(&msg.fid) {
                    Some(_) => {
                        warn!(self.log, "attach fid in use: {}", msg.fid);
                        // The spec says to throw an error here, but in an
                        // effort to support clients who don't explicitly cluck
                        // fids, and considering the fact that we do not support
                        // multiple fs trees, just carry on
                        //return write_error(EEXIST as u32, chain, mem);
                    }
                    None => {
                        // create fid entry
                        fs.fids.insert(
                            msg.fid,
                            Fid {
                                pathbuf: PathBuf::from(self.source.clone()),
                                file: None,
                            },
                        );
                    }
                };
            }
            Err(_) => {
                return write_error(ENOLCK as u32, chain, mem);
            }
        }

        // send response
        let response =
            Rattach::new(Qid { typ: QidType::Dir, version: 0, path: qpath });
        let mut out = ispf::to_bytes_le(&response).unwrap();
        let buf = out.as_mut_slice();
        write_buf(buf, chain, mem);
    }

    fn handle_walk(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        let msg: Twalk = ispf::from_bytes_le(&msg_buf).unwrap();

        match self.fileserver.lock() {
            Ok(mut fs) => {
                // check to see if fid exists
                let fid = match fs.fids.get(&msg.fid) {
                    Some(p) => p,
                    None => {
                        warn!(self.log, "walk: fid {} not found", msg.fid);
                        return write_error(ENOENT as u32, chain, mem);
                    }
                };

                let mut qids = Vec::new();
                let mut newpath = fid.pathbuf.clone();
                if msg.wname.len() > 0 {
                    // create new sub path from referenced fid path and wname
                    // elements
                    for n in msg.wname {
                        newpath.push(n.value);
                    }

                    // check that new path is a thing
                    let (ino, qt) = match fs::metadata(&newpath) {
                        Err(e) => {
                            let ecode = e.raw_os_error().unwrap_or(0);
                            warn!(
                                self.log,
                                "walk: no metadata: {:?}: {:?}", newpath, e
                            );
                            return write_error(ecode as u32, chain, mem);
                        }
                        Ok(m) => {
                            let qt = if m.is_dir() {
                                QidType::Dir
                            } else {
                                QidType::File
                            };
                            (m.ino(), qt)
                        }
                    };
                    qids.push(Qid { typ: qt, version: 0, path: ino });
                }

                // check to see if newfid is in use
                match fs.fids.get(&msg.newfid) {
                    Some(_) => {
                        // The spec says to throw an error here, but in an
                        // effort to support clients who don't explicitly cluck
                        // fids, and considering the fact that we do not support
                        // multiple fs trees, just carry on
                        //return write_error(EEXIST as u32, chain, mem);
                    }
                    None => {}
                };

                // create newfid entry
                fs.fids
                    .insert(msg.newfid, Fid { pathbuf: newpath, file: None });

                let response = Rwalk::new(qids);
                let mut out = ispf::to_bytes_le(&response).unwrap();
                let buf = out.as_mut_slice();
                write_buf(buf, chain, mem);
            }
            Err(_) => {
                return write_error(ENOLCK as u32, chain, mem);
            }
        }
    }

    fn handle_open(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        let msg: Tlopen = ispf::from_bytes_le(&msg_buf).unwrap();

        match self.fileserver.lock() {
            Ok(mut fs) => {
                // check to see if fid exists
                let fid = match fs.fids.get_mut(&msg.fid) {
                    Some(p) => p,
                    None => {
                        warn!(self.log, "open: fid {} not found", msg.fid);
                        return write_error(ENOENT as u32, chain, mem);
                    }
                };

                // check that fid path is a thing
                let (ino, qt) = match fs::metadata(&fid.pathbuf) {
                    Err(e) => {
                        let ecode = e.raw_os_error().unwrap_or(0);
                        warn!(
                            self.log,
                            "open: no metadata: {:?}: {:?}", &fid.pathbuf, e
                        );
                        return write_error(ecode as u32, chain, mem);
                    }
                    Ok(m) => {
                        let qt = if m.is_dir() {
                            QidType::Dir
                        } else {
                            QidType::File
                        };
                        (m.ino(), qt)
                    }
                };

                // open the file
                fid.file = Some(
                    match fs::OpenOptions::new()
                        .read(true)
                        .open(fid.pathbuf.clone())
                    {
                        Ok(f) => f,
                        Err(e) => {
                            let ecode = e.raw_os_error().unwrap_or(0);
                            warn!(
                                self.log,
                                "open: {:?}: {:?}", &fid.pathbuf, e
                            );
                            return write_error(ecode as u32, chain, mem);
                        }
                    },
                );

                let response =
                    Rlopen::new(Qid { typ: qt, version: 0, path: ino }, 0);

                let mut out = ispf::to_bytes_le(&response).unwrap();
                let buf = out.as_mut_slice();
                write_buf(buf, chain, mem);
            }
            Err(_) => {
                return write_error(ENOLCK as u32, chain, mem);
            }
        }
    }

    fn handle_readdir(
        &self,
        msg_buf: &[u8],
        chain: &mut Chain,
        mem: &MemCtx,
        msize: u32,
    ) {
        let msg: Treaddir = ispf::from_bytes_le(&msg_buf).unwrap();

        // get the path for the requested fid
        let pathbuf = match self.fileserver.lock() {
            Ok(fs) => match fs.fids.get(&msg.fid) {
                Some(f) => f.pathbuf.clone(),
                None => {
                    warn!(self.log, "readdir: fid {} not found", msg.fid);
                    return write_error(ENOENT as u32, chain, mem);
                }
            },
            Err(_) => {
                return write_error(ENOLCK as u32, chain, mem);
            }
        };

        // read the directory at the provided path
        let mut dir = match fs::read_dir(&pathbuf) {
            Ok(r) => match r.collect::<Result<Vec<fs::DirEntry>, _>>() {
                Ok(d) => d,
                Err(e) => {
                    let ecode = e.raw_os_error().unwrap_or(0);
                    warn!(
                        self.log,
                        "readdir: collect: {:?}: {:?}", &pathbuf, e
                    );
                    return write_error(ecode as u32, chain, mem);
                }
            },
            Err(e) => {
                let ecode = e.raw_os_error().unwrap_or(0);
                warn!(self.log, "readdir: {:?}: {:?}", &pathbuf, e);
                return write_error(ecode as u32, chain, mem);
            }
        };

        // bail with out of range error if offset is greater than entries
        if (dir.len() as u64) < msg.offset {
            return write_error(ERANGE as u32, chain, mem);
        }

        // need to sort to ensure consistent offsets
        dir.sort_by_key(|a| a.path());

        let mut space_left = msize as usize
            - size_of::<u32>()          // Rreaddir.size
            - size_of::<MessageType>()  // Rreaddir.typ
            - size_of::<u16>()          // Rreaddir.tag
            - size_of::<u32>(); // Rreaddir.data.len

        let mut entries: Vec<proto::Dirent> = Vec::new();

        let mut offset = 1;
        for de in &dir[msg.offset as usize..] {
            let metadata = match de.metadata() {
                Ok(m) => m,
                Err(e) => {
                    let ecode = e.raw_os_error().unwrap_or(0);
                    warn!(
                        self.log,
                        "readdir: metadata: {:?}: {:?}",
                        &de.path(),
                        e
                    );
                    return write_error(ecode as u32, chain, mem);
                }
            };

            let (typ, ftyp) = if metadata.is_dir() {
                (QidType::Dir, DT_DIR)
            } else {
                (QidType::File, DT_REG)
            };

            let qid = Qid { typ, version: 0, path: metadata.ino() };

            let name = match de.file_name().into_string() {
                Ok(n) => n,
                Err(_) => {
                    // getting a bit esoteric with our error codes here...
                    return write_error(EILSEQ as u32, chain, mem);
                }
            };

            let dirent = Dirent { qid, offset, typ: ftyp, name };

            if space_left <= dirent.wire_size() {
                break;
            }

            space_left -= dirent.wire_size();
            entries.push(dirent);
            offset += 1;
        }

        let response = Rreaddir::new(entries);
        let mut out = ispf::to_bytes_le(&response).unwrap();
        let buf = out.as_mut_slice();
        write_buf(buf, chain, mem);
    }

    fn handle_read(
        &self,
        msg_buf: &[u8],
        chain: &mut Chain,
        mem: &MemCtx,
        msize: u32,
    ) {
        let msg: Tread = ispf::from_bytes_le(&msg_buf).unwrap();

        // get  the requested fid
        match self.fileserver.lock() {
            Ok(ref mut fs) => match fs.fids.get_mut(&msg.fid) {
                Some(ref mut fid) => self.do_read(&msg, fid, chain, mem, msize),
                None => {
                    warn!(self.log, "read: fid {} not found", msg.fid);
                    return write_error(ENOENT as u32, chain, mem);
                }
            },
            Err(_) => {
                return write_error(ENOLCK as u32, chain, mem);
            }
        };
    }

    /// Handles `Twrite`. Writing is not implemented: the request contents
    /// are ignored and `ENOTSUP` is always returned.
    fn handle_write(
        &self,
        _msg_buf: &[u8],
        chain: &mut Chain,
        mem: &MemCtx,
        _msize: u32,
    ) {
        let unsupported = ENOTSUP as u32;
        write_error(unsupported, chain, mem)
    }

    fn handle_clunk(&self, _msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        //TODO something
        let resp = Rclunk::new();
        let mut out = ispf::to_bytes_le(&resp).unwrap();
        let buf = out.as_mut_slice();
        write_buf(buf, chain, mem);
    }

    fn handle_getattr(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        let msg: Tgetattr = ispf::from_bytes_le(&msg_buf).unwrap();
        match self.fileserver.lock() {
            Ok(ref mut fs) => match fs.fids.get_mut(&msg.fid) {
                Some(ref mut fid) => {
                    self.do_getattr(fid, msg.tag, msg.request_mask, chain, mem)
                }
                None => {
                    warn!(self.log, "getattr: fid {} not found", msg.fid);
                    return write_error(ENOENT as u32, chain, mem);
                }
            },
            Err(_) => {
                return write_error(ENOLCK as u32, chain, mem);
            }
        }
    }

    fn handle_statfs(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        let msg: Tstatfs = ispf::from_bytes_le(&msg_buf).unwrap();
        match self.fileserver.lock() {
            Ok(ref mut fs) => match fs.fids.get_mut(&msg.fid) {
                Some(ref mut fid) => self.do_statfs(fid, chain, mem),
                None => {
                    warn!(self.log, "statfs: fid {} not found", msg.fid);
                    return write_error(ENOENT as u32, chain, mem);
                }
            },
            Err(_) => {
                return write_error(ENOLCK as u32, chain, mem);
            }
        }
    }
}

/// Serializes an `Rlerror` carrying `ecode` and writes it into `chain`.
pub(crate) fn write_error(ecode: u32, chain: &mut Chain, mem: &MemCtx) {
    let reply = Rlerror::new(ecode);
    let mut encoded = ispf::to_bytes_le(&reply).unwrap();
    write_buf(encoded.as_mut_slice(), chain, mem);
}

/// Writes an `Rread` reply for `tag` into `chain`: first a header announcing
/// `count` bytes, then up to `count` bytes of `file` starting at `offset`,
/// read directly into the guest buffers that remain in the chain.
///
/// The transfer stops early, without panicking, if a guest buffer cannot be
/// mapped for writing, if reading the file fails, or if the file ends before
/// `count` bytes were read.
fn p9_read_file(
    file: &File,
    chain: &mut Chain,
    tag: u16,
    mem: &MemCtx,
    count: usize,
    offset: i64,
) {
    let size = size_of::<RreadHeader>() + count;

    let h = RreadHeader {
        size: size as u32,
        typ: MessageType::Rread as u8,
        tag,
        count: count as u32,
    };

    // Send the header to the guest from the buffer constructed above. Then
    // send the actual file data
    chain.write(&h, mem);

    let mut done = 0;
    let _total = chain.for_remaining_type(false, |addr, len| {
        let sub_mapping = match mem.writable_region(&GuestRegion(addr, len)) {
            Some(mapping) => mapping,
            // The guest handed us a buffer we cannot write to.
            None => return (0, false),
        };
        // Only the bytes still owed may be read; capping by `count` alone
        // would write more than the header announced once a previous buffer
        // has been filled.
        let len = usize::min(len, count - done);
        let off = offset + done as i64;
        let mapped = match sub_mapping.pread(file, len, off) {
            Ok(n) => n,
            Err(_) => return (0, false),
        };
        done += mapped;

        // A zero-length read means end of file (or nothing left to read):
        // asking for more buffers could not make further progress.
        let need_more = mapped > 0 && done < count;
        (mapped, need_more)
    });
}


================================================
FILE: lib/propolis/src/hw/virtio/pci.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::ffi::c_void;
use std::num::NonZeroU16;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::{Arc, Condvar, Mutex, MutexGuard, Weak};

use super::probes;
use super::queue::{self, VirtQueues};
use super::{VirtioDevice, VirtioIntr, VqChange, VqIntr};
use crate::common::{RWOp, ReadOp, WriteOp, PAGE_SHIFT, PAGE_SIZE};
use crate::hw::pci::{self, BarN, CapId};
use crate::hw::virtio;
use crate::hw::virtio::queue::VqSize;
use crate::intr_pins::IntrPin;
use crate::migrate::{
    MigrateCtx, MigrateMulti, MigrateStateError, PayloadOffers, PayloadOutputs,
};
use crate::util::regmap::RegMap;

use bit_field::BitField;
use lazy_static::lazy_static;

// Sentinel MSI-X vector value. `VirtioState` uses it as the initial and
// post-reset value of its config and per-queue vector fields.
// NOTE(review): presumably the virtio "no vector" value -- confirm.
const VIRTIO_MSI_NO_VECTOR: u16 = 0xffff;

// ISR status bits.
// NOTE(review): going by the names, bit 0 presumably signals a queue
// interrupt and bit 1 a configuration change -- confirm against the users.
const VIRTIO_PCI_ISR_QUEUE: u8 = 1 << 0;
const VIRTIO_PCI_ISR_CFG: u8 = 1 << 1;

bitflags! {
    // Device status flags.
    // NOTE(review): names and bit positions appear to follow the virtio
    // device status field -- confirm against the spec.
    #[derive(Clone, Copy, Debug, Default, PartialEq)]
    pub struct Status: u8 {
        // No bits set; the status assigned on construction and on reset.
        const RESET = 0;
        const ACK = 1 << 0;
        const DRIVER = 1 << 1;
        const DRIVER_OK = 1 << 2;
        const FEATURES_OK = 1 << 3;
        const NEEDS_RESET = 1 << 6;
        const FAILED = 1 << 7;
    }
}

/// Interrupt delivery mode of the virtio device, derived from the PCI-level
/// interrupt mode.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum IntrMode {
    /// Used when PCI interrupts are disabled; also the initial mode.
    IsrOnly,
    /// Used when the PCI device is in INTx pin mode.
    IsrLintr,
    /// Used when the PCI device is in MSI-X mode.
    Msi,
}
impl From<pci::IntrMode> for IntrMode {
    /// Translates the PCI-level interrupt mode into the virtio one:
    /// MSI-X maps to `Msi`, INTx to `IsrLintr`, and disabled to `IsrOnly`.
    fn from(pci_mode: pci::IntrMode) -> Self {
        match pci_mode {
            pci::IntrMode::Msix => Self::Msi,
            pci::IntrMode::INTxPin => Self::IsrLintr,
            pci::IntrMode::Disabled => Self::IsrOnly,
        }
    }
}

/// Mutable virtio-level state of a PCI virtio device.
struct VirtioState {
    /// Device status; `Status::RESET` initially and after a reset.
    status: Status,
    /// Currently selected queue; 0 initially and after a reset.
    queue_select: u16,
    /// Negotiated feature bits; 0 initially and after a reset.
    negotiated_features: u64,
    /// Value of the device feature select register; 0 initially.
    /// NOTE(review): presumably 0 selects the low half of the feature bits
    /// and 1 the high half (legacy devices always use the low half) --
    /// confirm against the register handlers.
    device_feature_select: u32,
    /// Value of the driver feature select register; 0 initially.
    driver_feature_select: u32,
    /// Configuration generation counter; bumped by
    /// `_evolve_config_generation` once the current value has been witnessed.
    config_generation: u8,
    /// Whether the current `config_generation` has been witnessed since it
    /// last changed.
    config_generation_seen: bool,
    /// Size of the device-specific configuration space, as passed to `new`.
    device_config_size: usize,
    /// Virtio mode, as passed to `new`.
    mode: virtio::Mode,
    /// Current interrupt mode; `IntrMode::IsrOnly` initially.
    intr_mode: IntrMode,
    /// Initially false. NOTE(review): presumably set while an interrupt
    /// mode change is in progress -- confirm against the users.
    intr_mode_updating: bool,
    /// MSI-X vector for configuration changes; `VIRTIO_MSI_NO_VECTOR`
    /// initially and after a reset.
    msix_cfg_vec: u16,
    /// Per-queue MSI-X vectors, `nmsix` entries; each is
    /// `VIRTIO_MSI_NO_VECTOR` initially and after a reset.
    msix_queue_vec: Vec<u16>,
}

impl VirtioState {
    /// Creates the initial state for a device in the given `mode`.
    ///
    /// `device_config_size` is recorded as-is and `nmsix` is the number of
    /// per-queue vector slots to allocate. Every vector starts out as
    /// `VIRTIO_MSI_NO_VECTOR`, the status is `Status::RESET`, and interrupt
    /// delivery starts in `IntrMode::IsrOnly`.
    fn new(
        device_config_size: usize,
        nmsix: usize,
        mode: virtio::Mode,
    ) -> Self {
        Self {
            // Caller-provided, fixed properties.
            mode,
            device_config_size,
            // Driver-visible registers.
            status: Status::RESET,
            queue_select: 0,
            negotiated_features: 0,
            device_feature_select: 0,
            driver_feature_select: 0,
            config_generation: 0,
            config_generation_seen: false,
            // Interrupt routing.
            intr_mode: IntrMode::IsrOnly,
            intr_mode_updating: false,
            msix_cfg_vec: VIRTIO_MSI_NO_VECTOR,
            msix_queue_vec: vec![VIRTIO_MSI_NO_VECTOR; nmsix],
        }
    }

    /// Returns the driver-visible registers to their initial values.
    ///
    /// The feature-select registers, the config size, the transport mode and
    /// the interrupt-mode tracking fields are deliberately left untouched
    /// here.
    fn reset(&mut self) {
        // Forget every MSI-X vector assignment.
        self.msix_cfg_vec = VIRTIO_MSI_NO_VECTOR;
        for vector in self.msix_queue_vec.iter_mut() {
            *vector = VIRTIO_MSI_NO_VECTOR;
        }

        // Restart config generation tracking.
        self.config_generation_seen = false;
        self.config_generation = 0;

        // Clear negotiation progress.
        self.negotiated_features = 0;
        self.queue_select = 0;
        self.status = Status::RESET;
    }

    /// Records that the driver has observed the current config generation.
    fn witness_config_generation(&mut self) {
        self.config_generation_seen = true;
    }

    /// Advances the config generation (wrapping at `u8::MAX`), but only if
    /// the driver has seen the current value since the last advance.
    // NOTE(review): no caller is visible in this section, which is presumably
    // why the name carries a leading underscore.
    fn _evolve_config_generation(&mut self) {
        if !self.config_generation_seen {
            return;
        }
        self.config_generation_seen = false;
        self.config_generation = self.config_generation.wrapping_add(1);
    }
}

/// Glue between a virtio device model and the PCI emulation layer. Any type
/// implementing this trait receives a blanket `pci::Device` implementation.
pub trait PciVirtio: VirtioDevice + Send + Sync + 'static {
    /// Returns the virtio-over-PCI transport state owned by the device.
    fn virtio_state(&self) -> &PciVirtioState;
    /// Returns the generic PCI device state owned by the device.
    fn pci_state(&self) -> &pci::DeviceState;

    /// Called when the IO port backing the legacy queue notify register
    /// moves. `state` is the new port, or `None` when BAR0 decoding is
    /// disabled. The default implementation ignores the notification.
    fn notify_port_update(&self, state: Option<NonZeroU16>) {
        let _ = state;
    }

    /// Called when the MMIO address backing the modern queue notify register
    /// moves. `addr` is the new guest-physical address, or `None` when BAR2
    /// decoding is disabled. The default implementation ignores the
    /// notification.
    fn notify_mmio_addr_update(&self, addr: Option<u64>) {
        let _ = addr;
    }

    /// Called by the PCI emulation layer whenever a BAR is reconfigured.
    /// Changes to BAR0 and BAR2 are turned into queue-notify location
    /// updates; changes to any other BAR are ignored.
    fn bar_update(&self, bstate: pci::BarState) {
        match bstate.id {
            pci::BarN::BAR0 => {
                let port = if bstate.decode_en {
                    // BAR0 can only be placed at `bstate.value` if the whole
                    // legacy register region fits without wrapping, so this
                    // sum stays in range. Even a (rather unwise) base of zero
                    // yields a non-zero port once
                    // LEGACY_REG_QUEUE_NOTIFY_OFFSET has been added.
                    let raw = bstate.value as u16
                        + LEGACY_REG_QUEUE_NOTIFY_OFFSET as u16;
                    Some(NonZeroU16::new(raw).expect("addition does not wrap"))
                } else {
                    None
                };
                self.notify_port_update(port);
            }
            pci::BarN::BAR2 => {
                let addr = if bstate.decode_en {
                    Some(bstate.value + NOTIFY_REG_OFFSET as u64)
                } else {
                    None
                };
                self.notify_mmio_addr_update(addr);
            }
            _ => {}
        }
    }
}

impl<D: PciVirtio + Send + Sync + 'static> pci::Device for D {
    fn device_state(&self) -> &pci::DeviceState {
        self.pci_state()
    }

    /// Dispatches a guest access to one of the virtio BARs.
    ///
    /// BAR0 carries the legacy register block (with or without the MSI-X
    /// registers, depending on `legacy_map_use_msix`); BAR2 carries the
    /// modern layout. The access is first split by register block and then
    /// by individual register.
    ///
    /// # Panics
    ///
    /// Panics if `bar` is neither BAR0 nor BAR2.
    fn bar_rw(&self, bar: pci::BarN, mut rwo: RWOp) {
        let vs = self.virtio_state();
        let regmap = match bar {
            pci::BarN::BAR2 => &vs.common_config,
            pci::BarN::BAR0 => {
                let with_msix = vs.legacy_map_use_msix.load(Ordering::SeqCst);
                if with_msix {
                    &vs.legacy_config
                } else {
                    &vs.legacy_config_nomsix
                }
            }
            _ => panic!("Config IO to unsupported BAR {bar:?}"),
        };
        regmap.process(&mut rwo, |block, mut op| match block {
            VirtioConfigRegBlock::Legacy => {
                LEGACY_REGS.process(&mut op, |reg, op| match op {
                    RWOp::Write(wo) => {
                        vs.legacy_write(self.pci_state(), self, reg, wo)
                    }
                    RWOp::Read(ro) => vs.legacy_read(self, reg, ro),
                })
            }
            VirtioConfigRegBlock::Common => {
                COMMON_REGS.process(&mut op, |reg, op| match op {
                    RWOp::Write(wo) => {
                        vs.common_write(self.pci_state(), self, reg, wo)
                    }
                    RWOp::Read(ro) => vs.common_read(self, reg, ro),
                })
            }
            VirtioConfigRegBlock::Notify => {
                NOTIFY_REGS.process(&mut op, |reg, op| match op {
                    RWOp::Write(wo) => vs.notify_write(self, reg, wo),
                    RWOp::Read(ro) => vs.notify_read(reg, ro),
                })
            }
            VirtioConfigRegBlock::IsrStatus => {
                ISR_STATUS_REGS.process(&mut op, |reg, op| match op {
                    RWOp::Write(_wo) => {
                        // The driver may only read ISR status, so writes are
                        // dropped.
                    }
                    RWOp::Read(ro) => vs.isr_status_read(reg, ro),
                })
            }
            // Device-specific configuration is handled by the device model.
            VirtioConfigRegBlock::DeviceConfig => self.rw_dev_config(op),
            // Padding: reads as zero, writes are ignored.
            VirtioConfigRegBlock::RazWi => {}
        });
    }

    /// Dispatches a guest access to one of the vendor-specific PCI
    /// capabilities that describe the virtio configuration structures.
    ///
    /// All capabilities except the PCI configuration access capability are
    /// read-only for the driver; writes to them are dropped (the device
    /// capability additionally logs to stderr).
    ///
    /// # Panics
    ///
    /// Panics via `unimplemented!` if `id` is not a vendor capability, if its
    /// tag is not a known virtio capability type, or if it names the shared
    /// memory or vendor-specific capability types.
    fn cap_rw(&self, id: CapId<u32>, mut rwo: RWOp) {
        let vs = self.virtio_state();
        let cap = match id {
            CapId::Vendor(raw) => match VirtioCfgCapTag::try_from(raw) {
                Ok(cap) => cap,
                Err(_) => {
                    unimplemented!("Unknown vendor capability: {id:x?}")
                }
            },
            _ => unimplemented!("Unhandled capability type: {id:x?}"),
        };
        match cap {
            VirtioCfgCapTag::Common => {
                COMMON_CFG_CAP_REGS.process(&mut rwo, |reg, op| match op {
                    RWOp::Write(_) => {
                        // The driver may not modify this capability.
                    }
                    RWOp::Read(ro) => vs.common_cfg_cap_read(reg, ro),
                });
            }
            VirtioCfgCapTag::Notify => {
                NOTIFY_CFG_CAP_REGS.process(&mut rwo, |reg, op| match op {
                    RWOp::Write(_) => {
                        // The driver may not modify this capability.
                    }
                    RWOp::Read(ro) => vs.notify_cfg_cap_read(reg, ro),
                });
            }
            VirtioCfgCapTag::Isr => {
                COMMON_CFG_CAP_REGS.process(&mut rwo, |reg, op| match op {
                    RWOp::Write(_) => {
                        // The driver may not modify this capability.
                    }
                    RWOp::Read(ro) => vs.isr_cfg_cap_read(reg, ro),
                });
            }
            VirtioCfgCapTag::Device => {
                COMMON_CFG_CAP_REGS.process(&mut rwo, |reg, op| match op {
                    RWOp::Write(_) => {
                        // Unlike most other hypervisors, Propolis does not
                        // currently accept writes through the device config
                        // capability, so (for example) a MAC address cannot
                        // be set this way.
                        // TODO: Plumb a logging object through to this point
                        // and report the rejected write with `error!`
                        // instead of printing to stderr.
                        eprintln!("unsupported write to device cap reg");
                    }
                    RWOp::Read(ro) => vs.device_cfg_cap_read(reg, ro),
                });
            }
            VirtioCfgCapTag::Pci => {
                PCI_CFG_CAP_REGS.process(&mut rwo, |reg, op| match op {
                    RWOp::Write(wo) => vs.pci_cfg_cap_write(self, reg, wo),
                    RWOp::Read(ro) => vs.pci_cfg_cap_read(self, reg, ro),
                });
            }
            VirtioCfgCapTag::SharedMemory => {
                unimplemented!("VirtIO Shared Memory is unsupported");
            }
            VirtioCfgCapTag::Vendor => {
                unimplemented!("VirtIO Vendor capabilities are unsupported");
            }
        }
    }

    /// Called when the device is attached: if the PCI state provides a
    /// legacy interrupt pin, it is handed to the ISR state. Without a pin
    /// nothing happens.
    fn attach(&self) {
        let Some(pin) = self.pci_state().lintr_pin() else {
            return;
        };
        self.virtio_state().isr_state.set_pin(pin);
    }

    fn interrupt_mode_change(&self, intr_mode: pci::IntrMode) {
        let vs = self.virtio_state();
        vs.set_intr_mode(self.pci_state(), intr_mode.into(), false);
    }

    /// Called by the PCI layer when the MSI-X table or its mask state
    /// changes. Every queue whose vector is affected is told that its
    /// interrupt configuration changed.
    ///
    /// Nothing is done unless the device is currently in MSI mode. For
    /// `MaskAll`/`UnmaskAll`, every queue with an assigned vector is
    /// notified; for `Modify(idx)`, only queues using vector `idx` are. If
    /// a queue fails to handle the change, the device is marked as needing a
    /// reset.
    fn msi_update(&self, info: pci::MsiUpdate) {
        let vs = self.virtio_state();
        let mut state = vs.state.lock().unwrap();
        if state.intr_mode != IntrMode::Msi {
            return;
        }
        // Wait for any in-flight interrupt reconfiguration to finish, then
        // claim the "updating" flag so that the state lock can be released
        // below without another updater interleaving.
        state =
            vs.state_cv.wait_while(state, |s| s.intr_mode_updating).unwrap();
        state.intr_mode_updating = true;

        for vq in vs.queues.iter() {
            let val = *state.msix_queue_vec.get(vq.id as usize).unwrap();

            // The state lock must not be held while the device updates its
            // per-VQ interrupt config, to avoid deadlock.
            drop(state);

            let result = match info {
                pci::MsiUpdate::MaskAll | pci::MsiUpdate::UnmaskAll
                    if val != VIRTIO_MSI_NO_VECTOR =>
                {
                    self.queue_change(vq, VqChange::IntrCfg)
                }
                pci::MsiUpdate::Modify(idx) if val == idx => {
                    self.queue_change(vq, VqChange::IntrCfg)
                }
                // This queue is not affected by the update.
                _ => Ok(()),
            };

            // Re-take the lock before the next iteration reads the vector
            // table (and before the flag is cleared after the loop).
            state = vs.state.lock().unwrap();
            if result.is_err() {
                // An error updating the VQ interrupt config should set the
                // device in a failed state.
                vs.needs_reset_locked(self, &mut state);
            }
        }
        // Release the "updating" flag and wake anyone waiting on it.
        state.intr_mode_updating = false;
        vs.state_cv.notify_all();
    }

    /// Forwards BAR reconfiguration notifications to
    /// `PciVirtio::bar_update`. The fully-qualified call selects the
    /// `PciVirtio` method rather than this `pci::Device` method of the same
    /// name.
    fn bar_update(&self, bstate: pci::BarState) {
        PciVirtio::bar_update(self, bstate);
    }
}

/// Transport state shared by all virtio devices exposed over PCI: the
/// virtqueues, the lock-protected register state, and the register maps used
/// to decode BAR accesses.
pub struct PciVirtioState {
    /// The device's virtqueues.
    pub queues: VirtQueues,

    /// Driver-visible register state.
    state: Mutex<VirtioState>,
    /// Signalled whenever `VirtioState::intr_mode_updating` is cleared, so
    /// that interrupt reconfigurations are serialized.
    state_cv: Condvar,
    /// ISR status shared with the queues' ISR-based interrupt handlers; read
    /// (and cleared) by the ISR status registers.
    isr_state: Arc<IsrState>,

    /// Register map for the modern layout in BAR2.
    common_config: RegMap<VirtioConfigRegBlock>,

    /// Register map for the legacy layout in BAR0, including the MSI-X
    /// registers.
    legacy_config: RegMap<VirtioConfigRegBlock>,
    /// Register map for the legacy layout in BAR0 without the MSI-X
    /// registers.
    legacy_config_nomsix: RegMap<VirtioConfigRegBlock>,

    /// Quick access to register map for MSIX (true) or non-MSIX (false)
    legacy_map_use_msix: AtomicBool,
}

impl PciVirtioState {
    /// Builds the virtio transport state together with the matching PCI
    /// device state.
    ///
    /// * `mode` - which register layouts to expose: legacy and transitional
    ///   devices get the IO-port BAR0, while modern and transitional devices
    ///   get the MMIO BAR2 plus the virtio vendor capabilities.
    /// * `queues` - the device's virtqueues. Each one starts out with an
    ///   ISR-based interrupt handler and has its memory accessor adopted by
    ///   the PCI state's accessor.
    /// * `msix_count` - when `Some`, an MSI-X capability with that many
    ///   vectors is added, backed by BAR1.
    /// * `device_type` - used to look up the PCI identity for `mode`.
    /// * `cfg_size` - size in bytes of the device-specific config area.
    ///
    /// # Panics
    ///
    /// Panics if `cfg_size` is not smaller than `PAGE_SIZE`, if
    /// `cfg_size + LEGACY_REG_SIZE` is not smaller than `0x200` (the size of
    /// the legacy BAR), or if `device_type` has no PCI identity for `mode`.
    pub(super) fn new(
        mode: virtio::Mode,
        queues: VirtQueues,
        msix_count: Option<u16>,
        device_type: virtio::DeviceId,
        cfg_size: usize,
    ) -> (Self, pci::DeviceState) {
        // The device config must fit in its page of BAR2 and, together with
        // the legacy registers, in the 0x200-byte legacy BAR.
        assert!(cfg_size < PAGE_SIZE);
        assert!(cfg_size + LEGACY_REG_SIZE < 0x200);

        let ident = device_type.pci_ident(mode).expect("PCI Ident");
        let mut builder = pci::Builder::new(ident).add_lintr();

        if let Some(count) = msix_count {
            builder = builder.add_cap_msix(pci::BarN::BAR1, count);
        }

        if mode == virtio::Mode::Transitional || mode == virtio::Mode::Legacy {
            // XXX: properly size the legacy cfg BAR
            builder = builder.add_bar_io(pci::BarN::BAR0, 0x200);
        }
        if mode == virtio::Mode::Transitional || mode == virtio::Mode::Modern {
            // Four pages: common config, device config, notify and ISR
            // status (see `common_config` below).
            builder =
                builder.add_bar_mmio(pci::BarN::BAR2, 4 * PAGE_SIZE as u32);
            builder = builder.add_cap_vendor(
                VirtioCfgCapTag::Common.into(),
                COMMON_CFG_CAP_SIZE,
            );
            // Note: we don't presently support a non-zero multiplier for the
            // notification register, so we don't need to size this for the
            // number of queues; hence the fixed size.
            builder = builder.add_cap_vendor(
                VirtioCfgCapTag::Notify.into(),
                NOTIFY_CFG_CAP_SIZE,
            );
            builder = builder.add_cap_vendor(
                VirtioCfgCapTag::Isr.into(),
                COMMON_CFG_CAP_SIZE,
            );
            builder = builder.add_cap_vendor(
                VirtioCfgCapTag::Device.into(),
                COMMON_CFG_CAP_SIZE,
            );
            builder = builder
                .add_cap_vendor(VirtioCfgCapTag::Pci.into(), PCI_CFG_CAP_SIZE);
        }
        let pci_state = builder.finish();

        // With respect to layout, for the time being, we are unconditionally
        // transitional, meaning that we support both the legacy and common
        // configuration layouts.
        //
        // The modern layout gives each register block its own page, padded
        // with read-as-zero/write-ignored space.
        let common_config = RegMap::create_packed_passthru(
            4 * PAGE_SIZE,
            &[
                (VirtioConfigRegBlock::Common, COMMON_REG_SIZE),
                (VirtioConfigRegBlock::RazWi, PAGE_SIZE - COMMON_REG_SIZE),
                (VirtioConfigRegBlock::DeviceConfig, cfg_size),
                (VirtioConfigRegBlock::RazWi, PAGE_SIZE - cfg_size),
                (VirtioConfigRegBlock::Notify, NOTIFY_REG_SIZE),
                (VirtioConfigRegBlock::RazWi, PAGE_SIZE - NOTIFY_REG_SIZE),
                (VirtioConfigRegBlock::IsrStatus, ISR_STATUS_REG_SIZE),
                (VirtioConfigRegBlock::RazWi, PAGE_SIZE - ISR_STATUS_REG_SIZE),
            ],
        );
        // The legacy layouts place the device config directly after the
        // legacy registers, whose size depends on whether the MSI-X
        // registers are present.
        let legacy_config = RegMap::create_packed_passthru(
            cfg_size + LEGACY_REG_SIZE,
            &[
                (VirtioConfigRegBlock::Legacy, LEGACY_REG_SIZE),
                (VirtioConfigRegBlock::DeviceConfig, cfg_size),
            ],
        );
        let legacy_config_nomsix = RegMap::create_packed_passthru(
            cfg_size + LEGACY_REG_SIZE_NO_MSIX,
            &[
                (VirtioConfigRegBlock::Legacy, LEGACY_REG_SIZE_NO_MSIX),
                (VirtioConfigRegBlock::DeviceConfig, cfg_size),
            ],
        );
        let legacy_map_use_msix = AtomicBool::new(false);
        // One per-queue vector slot for every queue the device could expose.
        let nmsix = queues.max_capacity();
        let state = Mutex::new(VirtioState::new(cfg_size, nmsix, mode));
        let state_cv = Condvar::new();
        let isr_state = IsrState::new();
        let this = Self {
            queues,
            state,
            state_cv,
            isr_state,
            common_config,
            legacy_config,
            legacy_config_nomsix,
            legacy_map_use_msix,
        };

        for queue in this.queues.iter() {
            // Queues start out delivering interrupts through the ISR.
            queue.set_intr(IsrIntr::new(&this.isr_state));
            // Allow VQs to access memory through the PCI state
            pci_state
                .acc_mem
                .adopt(&queue.acc_mem, Some(format!("VQ {}", queue.id)));
        }

        (this, pci_state)
    }

    /// Returns the transport mode the device was created with.
    pub fn mode(&self) -> virtio::Mode {
        let state = self.state.lock().unwrap();
        state.mode
    }

    /// Looks up the queue numbered `queue_select` and returns whatever
    /// `thunk` extracts from its current ring mapping. Returns 0 when no such
    /// queue exists.
    fn qaddr<F>(&self, queue_select: u16, thunk: F) -> u64
    where
        F: FnOnce(&virtio::queue::MapInfo) -> u64,
    {
        let Some(queue) = self.queues.get(queue_select) else {
            return 0;
        };
        let queue_state = queue.get_state();
        thunk(&queue_state.mapping)
    }

    /// Services a driver read of one register in the modern common
    /// configuration structure, placing the value in `ro`.
    ///
    /// Queue-scoped registers refer to the queue chosen by the last write to
    /// `QueueSelect`. If that index names no queue they read as zero, or as
    /// `VIRTIO_MSI_NO_VECTOR` for the queue vector register.
    ///
    /// The two feature registers expose the 32-bit window chosen by their
    /// select register: 0 is the low half, 1 the high half, and any other
    /// select value reads as zero.
    fn common_read(
        &self,
        dev: &dyn VirtioDevice,
        id: &CommonConfigReg,
        ro: &mut ReadOp,
    ) {
        match id {
            CommonConfigReg::DeviceFeatureSelect => {
                let state = self.state.lock().unwrap();
                ro.write_u32(state.device_feature_select);
            }
            CommonConfigReg::DeviceFeature => {
                let state = self.state.lock().unwrap();
                // The select value is an arbitrary guest-written u32, so it
                // is range-checked before being scaled. Multiplying first
                // could overflow, which would either panic or wrap back to
                // a valid-looking shift.
                let features = match state.device_feature_select {
                    sel @ 0..=1 => self.features_supported(dev) >> (sel * 32),
                    _ => 0,
                };
                ro.write_u32(features as u32);
            }
            CommonConfigReg::DriverFeatureSelect => {
                let state = self.state.lock().unwrap();
                ro.write_u32(state.driver_feature_select);
            }
            CommonConfigReg::DriverFeature => {
                let state = self.state.lock().unwrap();
                // Same range check as for the device feature window.
                let features = match state.driver_feature_select {
                    sel @ 0..=1 => state.negotiated_features >> (sel * 32),
                    _ => 0,
                };
                ro.write_u32(features as u32);
            }
            CommonConfigReg::ConfigMsixVector => {
                let state = self.state.lock().unwrap();
                ro.write_u16(state.msix_cfg_vec);
            }
            CommonConfigReg::NumQueues => {
                ro.write_u16(self.queues.count().get());
            }
            CommonConfigReg::DeviceStatus => {
                let state = self.state.lock().unwrap();
                ro.write_u8(state.status.bits());
            }
            CommonConfigReg::ConfigGeneration => {
                let mut state = self.state.lock().unwrap();
                // Remember that the driver has seen this generation, so a
                // later config change knows to advance it.
                state.witness_config_generation();
                ro.write_u8(state.config_generation);
            }
            CommonConfigReg::QueueSelect => {
                let state = self.state.lock().unwrap();
                ro.write_u16(state.queue_select);
            }
            CommonConfigReg::QueueSize => {
                let state = self.state.lock().unwrap();
                let size = self
                    .queues
                    .get(state.queue_select)
                    .map(|vq| vq.size())
                    .unwrap_or(0);
                ro.write_u16(size);
            }
            CommonConfigReg::QueueMsixVector => {
                let state = self.state.lock().unwrap();
                let vector = state
                    .msix_queue_vec
                    .get(state.queue_select as usize)
                    .copied()
                    .unwrap_or(VIRTIO_MSI_NO_VECTOR);
                ro.write_u16(vector);
            }
            CommonConfigReg::QueueEnable => {
                let state = self.state.lock().unwrap();
                let enabled = self
                    .queues
                    .get(state.queue_select)
                    .map(|q| q.is_enabled())
                    .unwrap_or(false);
                ro.write_u16(enabled.into())
            }
            CommonConfigReg::QueueNotifyOffset => {
                // All queues share a single notify register at offset zero.
                ro.write_u16(0);
            }
            CommonConfigReg::QueueDescAddr => {
                let state = self.state.lock().unwrap();
                let addr = self.qaddr(state.queue_select, |m| m.desc_addr);
                ro.write_u64(addr);
            }
            CommonConfigReg::QueueDriverAddr => {
                let state = self.state.lock().unwrap();
                let addr = self.qaddr(state.queue_select, |m| m.avail_addr);
                ro.write_u64(addr);
            }
            CommonConfigReg::QueueDeviceAddr => {
                let state = self.state.lock().unwrap();
                let addr = self.qaddr(state.queue_select, |m| m.used_addr);
                ro.write_u64(addr);
            }
            // Note: currently unused.
            CommonConfigReg::QueueNotifyData => {
                let state = self.state.lock().unwrap();
                let data = self
                    .queues
                    .get(state.queue_select)
                    .map(|q| q.notify_data)
                    .unwrap_or(0);
                ro.write_u16(data);
            }
            // Note: currently unused.
            CommonConfigReg::QueueReset => {
                ro.write_u16(0);
            }
        }
    }

    /// Services a driver write to one register in the modern common
    /// configuration structure, taking the value from `wo`.
    ///
    /// Writes to read-only registers are dropped, as are writes to
    /// queue-scoped registers while `QueueSelect` names no queue. A write to
    /// the driver feature register only takes effect while the driver
    /// feature select register is 0 (low half) or 1 (high half). The other
    /// half of the negotiated feature word is preserved, and the result is
    /// masked by the features the device supports.
    ///
    /// The device is marked as needing a reset when the driver writes an
    /// invalid queue size, writes the (unadvertised) queue reset register,
    /// or when the device model rejects a queue change.
    fn common_write(
        &self,
        pci_state: &pci::DeviceState,
        dev: &dyn VirtioDevice,
        id: &CommonConfigReg,
        wo: &mut WriteOp,
    ) {
        match id {
            CommonConfigReg::DeviceFeatureSelect => {
                let mut state = self.state.lock().unwrap();
                state.device_feature_select = wo.read_u32();
            }
            CommonConfigReg::DeviceFeature => {
                // Read-only for driver
            }
            CommonConfigReg::DriverFeatureSelect => {
                let mut state = self.state.lock().unwrap();
                state.driver_feature_select = wo.read_u32();
            }
            CommonConfigReg::DriverFeature => {
                let mut state = self.state.lock().unwrap();
                // The select value is an arbitrary guest-written u32, so it
                // is range-checked before being scaled. Multiplying first
                // could overflow, which would either panic or wrap back to
                // a valid-looking shift.
                let select = state.driver_feature_select;
                if select < 2 {
                    let shift = select * 32;
                    // Bits of the half that is *not* being written, kept in
                    // place.
                    let current = {
                        let lo = 32 - shift as usize;
                        let hi = 64 - shift as usize;
                        state.negotiated_features.get_bits(lo..hi) << lo
                    };
                    let offered = (u64::from(wo.read_u32()) << shift) | current;
                    let negotiated = self.features_supported(dev) & offered;
                    state.negotiated_features = negotiated;
                }
            }
            CommonConfigReg::ConfigMsixVector => {
                let mut state = self.state.lock().unwrap();
                state.msix_cfg_vec = wo.read_u16();
            }
            CommonConfigReg::NumQueues => {
                // Read-only for driver
            }
            CommonConfigReg::DeviceStatus => {
                self.set_status(dev, wo.read_u8());
            }
            CommonConfigReg::ConfigGeneration => {
                // Read-only for driver
            }
            CommonConfigReg::QueueSelect => {
                let mut state = self.state.lock().unwrap();
                state.queue_select = wo.read_u16();
            }
            CommonConfigReg::QueueSize => {
                let mut state = self.state.lock().unwrap();
                match VqSize::try_from(wo.read_u16()) {
                    Err(_) => {
                        // Bad queue size.
                        self.needs_reset_locked(dev, &mut state);
                    }
                    Ok(offered) => {
                        let qs = state.queue_select;
                        let Some(queue) = self.queues.get(qs) else {
                            // Invalid queue; write dropped.
                            return;
                        };
                        let mut size = queue.size.lock().unwrap();
                        *size = offered;
                    }
                }
            }
            CommonConfigReg::QueueMsixVector => {
                // Looked up before taking the state lock. A device built
                // without an MSI-X capability has no handle; that must not
                // bring down the process just because the guest wrote this
                // register, so the handle is only required on the MSI path.
                let hdl = pci_state.msix_hdl();
                let mut state = self.state.lock().unwrap();
                let sel = state.queue_select as usize;
                if let Some(queue) = self.queues.get(state.queue_select) {
                    let val = wo.read_u16();

                    match hdl {
                        Some(hdl) if state.intr_mode == IntrMode::Msi => {
                            // Serialize with any other interrupt
                            // reconfiguration in flight.
                            state = self
                                .state_cv
                                .wait_while(state, |s| s.intr_mode_updating)
                                .unwrap();
                            state.intr_mode_updating = true;
                            state.msix_queue_vec[sel] = val;

                            // State lock cannot be held while updating queue
                            // interrupt handlers due to deadlock possibility.
                            drop(state);
                            queue.set_intr(MsiIntr::new(hdl, val));
                            state = self.state.lock().unwrap();

                            // With the MSI configuration updated for the
                            // virtqueue, notify the device of the change
                            if dev
                                .queue_change(queue, VqChange::IntrCfg)
                                .is_err()
                            {
                                self.needs_reset_locked(dev, &mut state);
                            }

                            state.intr_mode_updating = false;
                            self.state_cv.notify_all();
                        }
                        _ => {
                            // Not (yet) delivering via MSI-X: store the
                            // vector information for later
                            state.msix_queue_vec[sel] = val;
                        }
                    }
                }
            }
            CommonConfigReg::QueueEnable => {
                let mut state = self.state.lock().unwrap();
                let enabled = wo.read_u16() != 0;
                if let Some(queue) = self.queues.get(state.queue_select) {
                    // Only enabling has an effect; writing zero is ignored.
                    if enabled {
                        queue.enable();
                        if dev.queue_change(queue, VqChange::Address).is_err() {
                            self.needs_reset_locked(dev, &mut state);
                        }
                    }
                }
            }
            CommonConfigReg::QueueNotifyOffset => {
                // Read-only for driver
            }
            // The three ring address registers each replace one address and
            // carry the other two over from the queue's current mapping.
            CommonConfigReg::QueueDescAddr => {
                let state = self.state.lock().unwrap();
                let offered_desc_addr = wo.read_u64();
                if let Some(queue) = self.queues.get(state.queue_select) {
                    let current = &queue.get_state().mapping;
                    queue.map_virtqueue(
                        offered_desc_addr,
                        current.avail_addr,
                        current.used_addr,
                    );
                }
            }
            CommonConfigReg::QueueDriverAddr => {
                let state = self.state.lock().unwrap();
                let offered_avail_addr = wo.read_u64();
                if let Some(queue) = self.queues.get(state.queue_select) {
                    let current = &queue.get_state().mapping;
                    queue.map_virtqueue(
                        current.desc_addr,
                        offered_avail_addr,
                        current.used_addr,
                    );
                }
            }
            CommonConfigReg::QueueDeviceAddr => {
                let state = self.state.lock().unwrap();
                let offered_used_addr = wo.read_u64();
                if let Some(queue) = self.queues.get(state.queue_select) {
                    let current = &queue.get_state().mapping;
                    queue.map_virtqueue(
                        current.desc_addr,
                        current.avail_addr,
                        offered_used_addr,
                    );
                }
            }
            CommonConfigReg::QueueNotifyData => {
                // Read-only for driver
            }
            // Note that this is a per-queue register, but since we don't
            // advertise the `VIRTIO_F_RING_RESET` feature bit, if we see
            // it, resetting the device isn't unreasonable.
            CommonConfigReg::QueueReset => self.set_needs_reset(dev),
        }
    }

    /// Services a read of the queue notify register block; the notify
    /// register always reads as zero.
    fn notify_read(&self, id: &NotifyReg, ro: &mut ReadOp) {
        // `Notify` is the only register in this block.
        let NotifyReg::Notify = id;
        ro.write_u16(0);
    }

    /// Services a write to the queue notify register block: the 16-bit value
    /// written is passed to `queue_notify` as the queue to notify.
    fn notify_write(
        &self,
        dev: &dyn VirtioDevice,
        id: &NotifyReg,
        wo: &mut WriteOp,
    ) {
        // `Notify` is the only register in this block.
        let NotifyReg::Notify = id;
        let queue = wo.read_u16();
        self.queue_notify(dev, queue);
    }

    /// Services a read of the ISR status register block. The read is
    /// destructive: the pending ISR bits are returned and cleared in one
    /// step.
    fn isr_status_read(&self, id: &IsrStatusReg, ro: &mut ReadOp) {
        // `IsrStatus` is the only register in this block.
        let IsrStatusReg::IsrStatus = id;
        let pending = self.isr_state.read_clear();
        ro.write_u8(pending);
    }

    /// Services a read of the vendor capability that locates the common
    /// configuration structure. The other capability readers also fall back
    /// to this for the fields they do not override.
    ///
    /// The structure is reported to live in BAR2 at `COMMON_REG_OFFSET` with
    /// length `COMMON_REG_SIZE`. Nothing is written for the padding field.
    fn common_cfg_cap_read(&self, id: &CommonCfgCapReg, op: &mut ReadOp) {
        match id {
            // Where the structure lives.
            CommonCfgCapReg::Bar => op.write_u8(BarN::BAR2 as u8),
            CommonCfgCapReg::Offset => op.write_u32(COMMON_REG_OFFSET as u32),
            CommonCfgCapReg::Length => op.write_u32(COMMON_REG_SIZE as u32),
            // Capability header fields.
            CommonCfgCapReg::CfgType => {
                op.write_u8(VirtioCfgCapTag::Common as u8)
            }
            CommonCfgCapReg::CapLen => op.write_u8(COMMON_CFG_CAP_SIZE + 2),
            CommonCfgCapReg::Id => op.write_u8(0),
            CommonCfgCapReg::Padding => {}
        }
    }

    /// Services a read of the vendor capability that locates the queue
    /// notify region. The notify multiplier is always reported as zero;
    /// fields not specific to this capability are answered by
    /// `common_cfg_cap_read`.
    fn notify_cfg_cap_read(&self, id: &NotifyCfgCapReg, op: &mut ReadOp) {
        match id {
            NotifyCfgCapReg::Multiplier => op.write_u32(0),
            NotifyCfgCapReg::Common(CommonCfgCapReg::CapLen) => {
                op.write_u8(NOTIFY_CFG_CAP_SIZE + 2)
            }
            NotifyCfgCapReg::Common(CommonCfgCapReg::CfgType) => {
                op.write_u8(VirtioCfgCapTag::Notify as u8)
            }
            NotifyCfgCapReg::Common(CommonCfgCapReg::Offset) => {
                op.write_u32(NOTIFY_REG_OFFSET as u32)
            }
            NotifyCfgCapReg::Common(CommonCfgCapReg::Length) => {
                op.write_u32(NOTIFY_REG_SIZE as u32)
            }
            NotifyCfgCapReg::Common(shared) => {
                self.common_cfg_cap_read(shared, op)
            }
        }
    }

    /// Services a read of the vendor capability that locates the
    /// device-specific configuration area. The length reported is the
    /// device config size recorded at construction; fields not specific to
    /// this capability are answered by `common_cfg_cap_read`.
    fn device_cfg_cap_read(&self, id: &CommonCfgCapReg, op: &mut ReadOp) {
        match id {
            CommonCfgCapReg::Length => {
                let cfg_len = self.state.lock().unwrap().device_config_size;
                op.write_u32(cfg_len as u32);
            }
            CommonCfgCapReg::Offset => op.write_u32(DEVICE_REG_OFFSET as u32),
            CommonCfgCapReg::CfgType => {
                op.write_u8(VirtioCfgCapTag::Device as u8)
            }
            shared => self.common_cfg_cap_read(shared, op),
        }
    }

    /// Services a read of the vendor capability that locates the ISR status
    /// register. Fields not specific to this capability are answered by
    /// `common_cfg_cap_read`.
    fn isr_cfg_cap_read(&self, id: &CommonCfgCapReg, op: &mut ReadOp) {
        match id {
            CommonCfgCapReg::Length => op.write_u32(ISR_STATUS_REG_SIZE as u32),
            CommonCfgCapReg::Offset => {
                op.write_u32(ISR_STATUS_REG_OFFSET as u32)
            }
            CommonCfgCapReg::CfgType => op.write_u8(VirtioCfgCapTag::Isr as u8),
            shared => self.common_cfg_cap_read(shared, op),
        }
    }

    /// Services a read of the PCI configuration access capability.
    ///
    /// The access window is not implemented yet: the BAR, offset, length and
    /// data fields all read as zero. `dev` is accepted for when it is.
    fn pci_cfg_cap_read(
        &self,
        dev: &dyn VirtioDevice,
        id: &PciCfgCapReg,
        op: &mut ReadOp,
    ) {
        let _ = dev;
        match id {
            // TODO: We actually need to handle this.
            PciCfgCapReg::PciData => op.write_u32(0),
            PciCfgCapReg::Common(CommonCfgCapReg::CfgType) => {
                op.write_u8(VirtioCfgCapTag::Pci as u8)
            }
            // TODO: Handle
            PciCfgCapReg::Common(CommonCfgCapReg::Bar) => op.write_u8(0),
            // TODO: Handle
            PciCfgCapReg::Common(CommonCfgCapReg::Offset) => op.write_u32(0),
            // TODO: Handle
            PciCfgCapReg::Common(CommonCfgCapReg::Length) => op.write_u32(0),
            PciCfgCapReg::Common(shared) => {
                self.common_cfg_cap_read(shared, op)
            }
        }
    }

    /// Services a write to the PCI configuration access capability.
    ///
    /// The access window is not implemented yet, so every write is currently
    /// dropped. The arms are kept separate to mark what remains to be done.
    fn pci_cfg_cap_write(
        &self,
        dev: &dyn VirtioDevice,
        id: &PciCfgCapReg,
        op: &mut WriteOp,
    ) {
        let _ = (dev, op);
        match id {
            PciCfgCapReg::PciData => {
                // TODO: Handle the write.
            }
            PciCfgCapReg::Common(CommonCfgCapReg::Bar) => {
                // TODO: Store the bar
            }
            PciCfgCapReg::Common(CommonCfgCapReg::Offset) => {
                // TODO: Store the offset
            }
            PciCfgCapReg::Common(CommonCfgCapReg::Length) => {
                // TODO: Store the length
            }
            // The remaining fields are read-only for the driver.
            PciCfgCapReg::Common(_) => {}
        }
    }

    /// Service a guest read of a register in the legacy register block,
    /// emitting the current value through `ro`.
    ///
    /// Registers which are banked per-queue use the currently selected queue;
    /// if that selection does not name a valid queue, a default value is
    /// returned instead (zero, or `VIRTIO_MSI_NO_VECTOR` for the queue MSI-X
    /// vector).  Reading `IsrStatus` has the side effect of clearing it.
    fn legacy_read(
        &self,
        dev: &dyn VirtioDevice,
        id: &LegacyConfigReg,
        ro: &mut ReadOp,
    ) {
        match id {
            LegacyConfigReg::DeviceFeature => {
                // Only the low 32 feature bits are visible here.
                let offered = self.features_supported(dev);
                ro.write_u32(offered as u32);
            }
            LegacyConfigReg::DriverFeature => {
                let guard = self.state.lock().unwrap();
                ro.write_u32(guard.negotiated_features as u32);
            }
            LegacyConfigReg::QueueAddress4k => {
                let guard = self.state.lock().unwrap();
                // Report the page frame number of the descriptor table, or
                // zero for a bogus queue selection.
                let pfn = self.queues.get(guard.queue_select).map_or(
                    0,
                    |queue| {
                        let desc_addr = queue.get_state().mapping.desc_addr;
                        (desc_addr >> PAGE_SHIFT) as u32
                    },
                );
                ro.write_u32(pfn);
            }
            LegacyConfigReg::QueueSize => {
                let guard = self.state.lock().unwrap();
                let size = self
                    .queues
                    .get(guard.queue_select)
                    .map_or(0, |vq| vq.size());
                ro.write_u16(size);
            }
            LegacyConfigReg::QueueSelect => {
                let guard = self.state.lock().unwrap();
                ro.write_u16(guard.queue_select);
            }
            // Nothing is emitted for reads of the notify register.
            LegacyConfigReg::QueueNotify => {}
            LegacyConfigReg::DeviceStatus => {
                let guard = self.state.lock().unwrap();
                ro.write_u8(guard.status.bits());
            }
            LegacyConfigReg::IsrStatus => {
                // reading ISR Status clears it as well
                ro.write_u8(self.isr_state.read_clear());
            }
            LegacyConfigReg::ConfigMsixVector => {
                let guard = self.state.lock().unwrap();
                ro.write_u16(guard.msix_cfg_vec);
            }
            LegacyConfigReg::QueueMsixVector => {
                let guard = self.state.lock().unwrap();
                let vector = guard
                    .msix_queue_vec
                    .get(guard.queue_select as usize)
                    .copied()
                    .unwrap_or(VIRTIO_MSI_NO_VECTOR);
                ro.write_u16(vector);
            }
        }
    }

    /// Service a guest write to a register in the legacy register block.
    ///
    /// Writes to `QueueSelect` and `QueueMsixVector` are forwarded to the
    /// equivalent common configuration registers.  Failures reported by the
    /// device while applying features or a queue address change mark the
    /// device as needing a reset.  Writes to read-only registers are dropped.
    fn legacy_write(
        &self,
        pci_state: &pci::DeviceState,
        dev: &dyn VirtioDevice,
        id: &LegacyConfigReg,
        wo: &mut WriteOp,
    ) {
        match id {
            LegacyConfigReg::DriverFeature => {
                // Only features we actually offer can be negotiated.
                let requested = u64::from(wo.read_u32());
                let negotiated = self.features_supported(dev) & requested;
                let mut state = self.state.lock().unwrap();
                if dev.set_features(negotiated).is_ok() {
                    state.negotiated_features = negotiated;
                } else {
                    self.needs_reset_locked(dev, &mut state);
                }
            }
            LegacyConfigReg::QueueAddress4k => {
                let mut state = self.state.lock().unwrap();
                let pfn = wo.read_u32();
                // A zero page frame number is ignored outright.
                if pfn == 0 {
                    return;
                }
                let queue = match self.queues.get(state.queue_select) {
                    Some(queue) => queue,
                    None => return,
                };
                let previous = queue.get_state();
                let new_addr = u64::from(pfn) << PAGE_SHIFT;
                queue.map_legacy(new_addr);
                // The device only hears about addresses that really moved.
                let moved = previous.mapping.desc_addr != new_addr;
                if moved
                    && dev.queue_change(queue, VqChange::Address).is_err()
                {
                    self.needs_reset_locked(dev, &mut state);
                }
            }
            LegacyConfigReg::QueueSelect => self.common_write(
                pci_state,
                dev,
                &CommonConfigReg::QueueSelect,
                wo,
            ),
            LegacyConfigReg::QueueNotify => {
                let queue_id = wo.read_u16();
                self.queue_notify(dev, queue_id);
            }
            LegacyConfigReg::DeviceStatus => {
                let status = wo.read_u8();
                self.set_status(dev, status);
            }
            LegacyConfigReg::ConfigMsixVector => {
                let mut guard = self.state.lock().unwrap();
                guard.msix_cfg_vec = wo.read_u16();
            }
            LegacyConfigReg::QueueMsixVector => self.common_write(
                pci_state,
                dev,
                &CommonConfigReg::QueueMsixVector,
                wo,
            ),

            // Read-only regs
            LegacyConfigReg::DeviceFeature
            | LegacyConfigReg::QueueSize
            | LegacyConfigReg::IsrStatus => {}
        }
    }

    /// Feature bits offered to the guest: those reported by the device
    /// combined with the transitional virtqueue feature set.
    fn features_supported(&self, dev: &dyn VirtioDevice) -> u64 {
        let device_bits = dev.features();
        let transport_bits = queue::Features::transitional().bits();
        device_bits | transport_bits
    }

    /// Handle a guest write of `value` to the device status register.
    ///
    /// Writing the reset value to a device which is not already in the reset
    /// state performs a VirtIO reset; any other write is applied as a status
    /// update.  Bits unknown to `Status` are dropped.
    fn set_status(&self, dev: &dyn VirtioDevice, value: u8) {
        probes::virtio_set_status!(|| value);
        let mut state = self.state.lock().unwrap();
        let status = Status::from_bits_truncate(value);

        let wants_reset =
            status == Status::RESET && state.status != Status::RESET;
        if !wants_reset {
            self.apply_status(dev, &mut state, status);
            return;
        }
        // The reset path takes ownership of the held state lock.
        self.virtio_reset(dev, state);
    }

    /// Apply a non-reset status write from the guest.
    ///
    /// The new `status` is only stored if it is a superset of the current
    /// status and, when `FEATURES_OK` is newly set, the device accepts the
    /// negotiated features.  Otherwise the device is flagged as needing a
    /// reset and the requested status is not recorded.
    fn apply_status(
        &self,
        dev: &dyn VirtioDevice,
        state: &mut MutexGuard<VirtioState>,
        status: Status,
    ) {
        // Nothing to do when the value is unchanged.
        if state.status == status {
            return;
        }

        // VirtIO 1.2 section 2.1.2 states:
        //
        // > The driver MUST NOT clear a device status bit.
        //
        // Tolerating a cleared bit would let the guest toggle FEATURES_OK,
        // breaking the expectation that `set_features` is called only once
        // while setting up a device.  A driver doing so is in the wrong, and
        // gets NEEDS_RESET in response.
        let clears_a_bit = !status.contains(state.status);
        if clears_a_bit {
            self.needs_reset_locked(dev, state);
            return;
        }

        // Each of these bits is being set for the first time since the last
        // device reset.
        let newly_set = status.difference(state.status);

        // VirtIO 1.2 section 2.1:
        //
        // > FEATURES_OK (8) Indicates that the driver has acknowledged
        // > all the features it understands, and feature negotiation
        // > is complete.
        //
        // Features the guest sets after this point can be disregarded, since
        // renegotiation requires a device reset ("The only way to renegotiate
        // is to reset the device.").  What has been negotiated so far is what
        // gets enabled.
        let negotiation_complete = newly_set.contains(Status::FEATURES_OK);
        if negotiation_complete
            && dev.set_features(state.negotiated_features).is_err()
        {
            // The requested features were not tolerable, so FEATURES_OK
            // *must not* be reflected in status.  NEEDS_RESET is set as well,
            // in the hope that the guest notices the problem and retries in a
            // less-featureful mode.
            self.needs_reset_locked(dev, state);
            return;
        }

        state.status = status;
    }

    /// Flag the VirtIO device as needing a reset.
    ///
    /// The caller must already hold the state lock and pass its guard in as
    /// `state`.  The probe fires on every call, even when the flag was
    /// already set.
    fn needs_reset_locked(
        &self,
        _dev: &dyn VirtioDevice,
        state: &mut MutexGuard<VirtioState>,
    ) {
        // TODO: would be *great* to know which device needs a reset.. compare
        // with device_id in nvme and how we can give out per-device IDs when
        // setting things up.
        probes::virtio_device_needs_reset!(|| ());

        let already_flagged = state.status.contains(Status::NEEDS_RESET);
        if already_flagged {
            return;
        }
        state.status.insert(Status::NEEDS_RESET);
        // XXX: interrupt needed?
    }

    /// Report to the guest that the VirtIO device hit an error and must be
    /// reset before further use.
    ///
    /// Acquires the state lock internally.
    pub fn set_needs_reset(&self, dev: &dyn VirtioDevice) {
        let mut guard = self.state.lock().unwrap();
        let state = &mut guard;
        self.needs_reset_locked(dev, state);
    }

    /// Handle a guest notification for the virtqueue numbered `queue`.
    ///
    /// Unknown queue numbers are ignored.  In modern mode the device is only
    /// told about the notification if the queue has been enabled; in any
    /// other mode it is always told.
    fn queue_notify(&self, dev: &dyn VirtioDevice, queue: u16) {
        probes::virtio_vq_notify!(|| (
            dev as *const dyn VirtioDevice as *const c_void as u64,
            queue
        ));
        let vq = match self.queues.get(queue) {
            Some(vq) => vq,
            None => return,
        };
        vq.arise();
        if self.mode() == virtio::Mode::Modern && !vq.is_enabled() {
            // Modern queues stay quiet until the driver enables them.
            return;
        }
        dev.queue_notify(vq);
    }

    /// Reset all non-control queues as part of a device reset (or shutdown).
    ///
    /// Acquires the state lock and defers to `reset_queues_locked`.
    pub fn reset_queues(&self, dev: &dyn VirtioDevice) {
        let mut guard = self.state.lock().unwrap();
        let state = &mut guard;
        self.reset_queues_locked(dev, state);
    }

    /// Reset every queue yielded by `iter_all()` and tell the device about
    /// each one, with the state lock already held by the caller.
    ///
    /// If the device reports an error for a queue, the device is flagged as
    /// needing a reset and the remaining queues are still processed.
    fn reset_queues_locked(
        &self,
        dev: &dyn VirtioDevice,
        state: &mut MutexGuard<VirtioState>,
    ) {
        for vq in self.queues.iter_all() {
            vq.reset();
            let outcome = dev.queue_change(vq, VqChange::Reset);
            if outcome.is_err() {
                self.needs_reset_locked(dev, state);
            }
        }
    }

    /// Reset the VirtIO-level state of the device.
    ///
    /// PCI state (such as configured BARs) is left untouched.  The state lock
    /// guard is consumed, so the lock is released when this returns.
    fn virtio_reset(
        &self,
        dev: &dyn VirtioDevice,
        mut state: MutexGuard<VirtioState>,
    ) {
        probes::virtio_state_reset!(|| ());
        self.reset_queues_locked(dev, &mut state);
        state.reset();
        // Clear any pending ISR bits; the value read is of no interest.
        self.isr_state.read_clear();
    }

    /// Fully reset `dev`: first its VirtIO state, then its PCI state.
    ///
    /// Both pieces of state are looked up through `dev` itself rather than
    /// through `self`.
    pub fn reset<D>(&self, dev: &D)
    where
        D: pci::Device + PciVirtio,
    {
        let virtio_state = dev.virtio_state();
        let pci_state = dev.pci_state();

        // The guard is handed to (and released by) the VirtIO reset.
        let guard = virtio_state.state.lock().unwrap();
        virtio_state.virtio_reset(dev, guard);
        pci_state.reset(dev);
    }

    /// Switch the interrupt delivery mode used by the device and its queues.
    ///
    /// Does nothing if `new_mode` matches the current mode.  Otherwise the
    /// old mode is torn down, the new mode recorded, and the new mode's
    /// interrupt plumbing installed.  Concurrent updates are serialized with
    /// the `intr_mode_updating` flag and `state_cv`, since the state lock has
    /// to be released while queue interrupt handlers are replaced.
    ///
    /// `is_import` is forwarded to `IsrState::enable()` when entering
    /// `IsrLintr` mode.
    ///
    /// # Panics
    /// When entering MSI mode, panics if `pci_state` has no MSI-X handle or if
    /// a queue has no entry in `msix_queue_vec`.
    fn set_intr_mode(
        &self,
        pci_state: &pci::DeviceState,
        new_mode: IntrMode,
        is_import: bool,
    ) {
        let mut state = self.state.lock().unwrap();
        // NOTE(review): `old_mode` is sampled before waiting below, so it may
        // be stale if another update completes during the wait -- confirm
        // that callers cannot race here.
        let old_mode = state.intr_mode;
        if new_mode == old_mode {
            return;
        }

        // Wait for any in-progress mode update to finish before starting ours.
        state =
            self.state_cv.wait_while(state, |s| s.intr_mode_updating).unwrap();

        state.intr_mode_updating = true;
        // Tear down whatever the old mode had set up.
        match old_mode {
            IntrMode::IsrLintr => {
                // When leaving lintr-pin mode, deassert anything on said pin
                self.isr_state.disable();
            }
            IntrMode::Msi => {
                // When leaving MSI mode, re-wire the Isr interrupt handling
                //
                // To avoid deadlock, the state lock must be dropped while
                // updating the interrupts handlers on queues.
                drop(state);
                for queue in self.queues.iter() {
                    queue.set_intr(IsrIntr::new(&self.isr_state));
                }
                state = self.state.lock().unwrap();
            }
            _ => {}
        }

        state.intr_mode = new_mode;
        // Make sure the correct legacy register map is used
        self.legacy_map_use_msix
            .store(new_mode == IntrMode::Msi, Ordering::SeqCst);
        // Install the plumbing required by the new mode.
        match new_mode {
            IntrMode::IsrLintr => {
                self.isr_state.enable(is_import);
            }
            IntrMode::Msi => {
                let hdl = pci_state.msix_hdl().unwrap();
                for vq in self.queues.iter() {
                    // Copy the vector out while the lock is still held.
                    let vec = *state
                        .msix_queue_vec
                        .get(vq.id as usize)
                        .expect("msix for virtqueue is ok");

                    // State lock cannot be held while updating queue interrupt
                    // handlers due to deadlock possibility.
                    drop(state);
                    vq.set_intr(MsiIntr::new(hdl.clone(), vec));
                    state = self.state.lock().unwrap();
                }
            }
            _ => {}
        }
        // Let any waiting updaters proceed.
        state.intr_mode_updating = false;
        self.state_cv.notify_all();
    }

    /// Returns the feature bits currently recorded as negotiated with the
    /// guest driver.
    pub fn negotiated_features(&self) -> u64 {
        self.state.lock().unwrap().negotiated_features
    }
}
/// Migration of the VirtIO-level state: device-wide registers, ISR flags and
/// the virtqueues, carried in a single `PciVirtioStateV1` payload.
impl MigrateMulti for PciVirtioState {
    /// Snapshot the current state and push it onto `output`.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let guard = self.state.lock().unwrap();
        let (isr_queue, isr_cfg) = self.isr_state.read();

        let device = migrate::DeviceStateV1 {
            status: guard.status.bits(),
            queue_select: guard.queue_select,
            negotiated_features: guard.negotiated_features,
            device_feature_select: guard.device_feature_select,
            driver_feature_select: guard.driver_feature_select,
            config_generation: guard.config_generation,
            config_generation_seen: guard.config_generation_seen,
            device_config_size: guard.device_config_size as u64,
            mode: guard.mode as u32,
            msix_cfg_vec: guard.msix_cfg_vec,
            msix_queue_vec: guard.msix_queue_vec.clone(),
            isr_queue,
            isr_cfg,
        };
        // The queues are exported without the state lock held.
        drop(guard);

        let payload = migrate::PciVirtioStateV1 {
            device,
            queues: self.queues.export(),
        };
        output.push(payload.into())
    }

    /// Take a `PciVirtioStateV1` payload from `offer` and load it.
    ///
    /// Fields are applied in order, so a payload which fails validation part
    /// way through (unknown status bits or an unknown mode value) leaves the
    /// earlier fields already updated.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        _ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let input: migrate::PciVirtioStateV1 = offer.take()?;
        let migrate::PciVirtioStateV1 { device: saved, queues: saved_queues } =
            input;

        let mut state = self.state.lock().unwrap();
        state.status = match Status::from_bits(saved.status) {
            Some(status) => status,
            None => {
                return Err(MigrateStateError::ImportFailed(format!(
                    "virtio status: failed to import saved value {status:#x}",
                    status = saved.status
                )));
            }
        };
        state.queue_select = saved.queue_select;
        state.negotiated_features = saved.negotiated_features;
        state.device_feature_select = saved.device_feature_select;
        state.driver_feature_select = saved.driver_feature_select;
        state.config_generation = saved.config_generation;
        state.config_generation_seen = saved.config_generation_seen;
        state.device_config_size = saved.device_config_size as usize;
        state.mode = match virtio::Mode::from_repr(saved.mode) {
            Some(mode) => mode,
            None => {
                return Err(MigrateStateError::ImportFailed(format!(
                    "virtio mode: failed to import saved value {mode:#x}",
                    mode = saved.mode
                )));
            }
        };
        state.msix_cfg_vec = saved.msix_cfg_vec;
        state.msix_queue_vec = saved.msix_queue_vec;
        self.isr_state.import(saved.isr_queue, saved.isr_cfg);

        // Queues are imported last, using the mode that was just restored.
        self.queues.import(&saved_queues, state.mode)?;

        Ok(())
    }
}

/// Migration of a complete PCI VirtIO device: the VirtIO state payload
/// followed by the PCI state payload.
impl MigrateMulti for dyn PciVirtio {
    /// Export the VirtIO state, then the PCI state.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        MigrateMulti::export(self.virtio_state(), output, ctx)?;
        MigrateMulti::export(self.pci_state(), output, ctx)
    }

    /// Import the VirtIO state, then the PCI state, and finally re-apply the
    /// settings derived from the PCI state (interrupt mode, BAR placement).
    fn import(
        &self,
        offer: &mut PayloadOffers,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        let virtio_state = self.virtio_state();
        let pci_state = self.pci_state();

        MigrateMulti::import(virtio_state, offer, ctx)?;
        MigrateMulti::import(pci_state, offer, ctx)?;

        // With the PCI state populated, its calculated interrupt mode can be
        // applied to the VirtIO state.
        let intr_mode = pci_state.get_intr_mode().into();
        virtio_state.set_intr_mode(pci_state, intr_mode, true);

        // Issue a (potentially spurious) update notification for the BARs
        // holding the virtio registers, so that anything which cares about
        // where those BARs are placed (such as the notify logic) is
        // configured properly.
        for bar_n in [pci::BarN::BAR0, pci::BarN::BAR2] {
            if let Some(bar) = pci_state.bar(bar_n) {
                self.bar_update(bar);
            }
        }

        Ok(())
    }
}

/// Lock-protected contents of [`IsrState`].
#[derive(Default)]
struct IsrInner {
    /// When set, changes to the ISR value are not forwarded to the pin.
    disabled: bool,
    /// Is an ISR asserted for virtqueue(s) in this device?
    intr_queue: bool,
    /// Is an ISR asserted for a config change in this device?
    intr_cfg: bool,
    /// Interrupt pin driven by this state, once one has been attached.
    pin: Option<Arc<dyn IntrPin>>,
}
impl IsrInner {
    /// True when either the config-change or the virtqueue ISR is pending.
    fn raised(&self) -> bool {
        self.intr_cfg || self.intr_queue
    }
}
struct IsrState(Mutex<IsrInner>);
impl IsrState {
    fn new() -> Arc<Self> {
        Arc::new(Self(Mutex::new(IsrInner::default())))
    }
    /// Raise queue ISR condition
    fn raise_queue(&self) {
        self.sync_pin(|inner| {
            inner.intr_queue = true;
        });
    }
    /// Read ISR value, then clear it.
    fn read_clear(&self) -> u8 {
        let (mut queue, mut cfg) = (false, false);
        self.sync_pin(|inner| {
            queue = inner.intr_queue;
            cfg = inner.intr_cfg;
            inner.intr_queue = false;
            inner.intr_cfg = false;
        });
        let mut val = 0;
        if queue {
            val |= VIRTIO_PCI_ISR_QUEUE;
        }
        if cfg {
            val |= VIRTIO_PCI_ISR_CFG;
        }
        val
    }
    /// Read ISR value.  Returns (`intr_queue`, `intr_cfg`)
    fn read(&self) -> (bool, bool) {
        let inner = self.0.lock().unwrap();
        (inner.intr_queue, inner.intr_cfg)
    }
    /// Import ISR value
    ///
    /// Sets the internal ISR value without propagating the state to the
    /// underlying pin, as is necessary when doing a migration related import of
    /// various device states.
    fn import(&self, intr_queue: bool, intr_cfg: bool) {
        let mut inner = self.0.lock().unwrap();
        inner.intr_queue = intr_queue;
        inner.intr_cfg = intr_cfg;
        if let Some(pin) = inner.pin.as_ref() {
            pin.import_state(inner.raised());
        }
    }
    /// Sync ISR state with any associated interrupt pin
    fn sync_pin(&self, f: impl FnOnce(&mut IsrInner)) {
        let mut inner = self.0.lock().unwrap();

        let raised_before = inner.raised();
        f(&mut inner);
        let raised_after = inner.raised();

        // Sync pin state with ISR value
        if !inner.disabled {
            if !raised_before && raised_after {
                if let Some(pin) = inner.pin.as_ref() {
                    pin.assert()
                }
            }
            if raised_before && !raised_after {
                if let Some(pin) = inner.pin.as_ref() {
                    pin.deassert()
                }
            }
        }
    }
    /// Disable state emission via interrupt pin
    fn disable(&self) {
        let mut inner = self.0.lock().unwrap();
        if !inner.disabled {
            if let Some(pin) = inner.pin.as_ref() {
                pin.deassert();
            }
            inner.disabled = true;
        }
    }
    /// Enable state emission via interrupt pin
    fn enable(&self, is_import: bool) {
        let mut inner = self.0.lock().unwrap();
        if inner.disabled {
            if inner.raised() && !is_import {
                if let Some(pin) = inner.pin.as_ref() {
                    pin.assert();
                }
            }
            inner.disabled = false;
        }
    }
    /// Set underlying interrupt pin.
    ///
    /// # Panics
    /// If called more than once on a given [IsrState]
    fn set_pin(&self, pin: Arc<dyn IntrPin>) {
        let mut inner = self.0.lock().unwrap();
        let old = inner.pin.replace(pin);
        // Loosen this in the future if/when PCI device attachment logic becomes
        // more sophisticated.
        assert!(old.is_none(), "set_pin() should not be called more than once");
    }
}

/// Virtqueue interrupt handle which raises the queue condition on a shared
/// [`IsrState`].  Only a weak reference to that state is held.
struct IsrIntr(Weak<IsrState>);
impl IsrIntr {
    /// Build a boxed handle pointing at `state`.
    fn new(state: &Arc<IsrState>) -> Box<Self> {
        let weak = Arc::downgrade(state);
        Box::new(Self(weak))
    }
}
impl VirtioIntr for IsrIntr {
    /// Raise the queue ISR, provided the ISR state is still alive.
    fn notify(&self) {
        if let Some(isr) = self.0.upgrade() {
            isr.raise_queue()
        }
    }
    /// This handle always reports itself as pin-based.
    fn read(&self) -> VqIntr {
        VqIntr::Pin
    }
}

/// Virtqueue interrupt handle which fires a single MSI-X vector.
struct MsiIntr {
    /// Handle to the MSI-X table of the device.
    hdl: pci::MsixHdl,
    /// Vector to fire; may lie outside the table, in which case it is inert.
    index: u16,
}
impl MsiIntr {
    /// Build a boxed handle for vector `index` of `hdl`.
    fn new(hdl: pci::MsixHdl, index: u16) -> Box<Self> {
        Box::new(Self { hdl, index })
    }
}
impl VirtioIntr for MsiIntr {
    /// Fire the vector, unless its index is outside the MSI-X table.
    fn notify(&self) {
        let in_range = self.index < self.hdl.count();
        if in_range {
            self.hdl.fire(self.index);
        }
    }
    /// Report the address, data and mask state of the vector, or
    /// [`VqIntr::Pin`] if its index is outside the MSI-X table.
    fn read(&self) -> VqIntr {
        if self.index >= self.hdl.count() {
            return VqIntr::Pin;
        }
        let entry = self.hdl.read(self.index);
        VqIntr::Msi(entry.addr, entry.data, entry.masked)
    }
}

/// Names a block of VirtIO registers.
///
/// NOTE(review): `RazWi` presumably stands for "read-as-zero,
/// write-ignored" filler space -- confirm against the code which builds the
/// register maps using this type.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum VirtioConfigRegBlock {
    Legacy,
    Common,
    DeviceConfig,
    Notify,
    IsrStatus,
    RazWi,
}

// Some of these sizes are drawn from the VirtIO specification (e.g.,
// `COMMON_REG_SIZE` is the sum of the sizes of the fields that make up the
// common configuration registers as defined in VirtIO 1.2).
//
// Others are somewhat arbitrary; the page offsets we have chosen, for example,
// are of our own selection and are defined so that a guest driver can map
// different registers in their own pages (using 4KiB page mappings).  This is
// not strictly necessary, however.

// Size of the legacy register block, including both MSI-X vector registers.
const LEGACY_REG_SIZE: usize = 0x18;
// Size of the legacy register block without the two 2-byte MSI-X vector
// registers.
const LEGACY_REG_SIZE_NO_MSIX: usize = LEGACY_REG_SIZE - 2 * 2;
// Offset of `QueueNotify` within the packed legacy register layout.
const LEGACY_REG_QUEUE_NOTIFY_OFFSET: usize = 0x10;

const COMMON_REG_OFFSET: usize = 0;
// One term per register in `COMMON_REGS`, in layout order.
const COMMON_REG_SIZE: usize =
    4 + 4 + 4 + 4 + 2 + 2 + 1 + 1 + 2 + 2 + 2 + 2 + 2 + 8 + 8 + 8 + 2 + 2;
// Some tests want to poke at the common config registers, but before doing so use the total
// common_cfg struct size to spot-check that the layout is correct. Ideally the register map we
// build here could go both ways and either be public, or public for tests.
#[cfg(test)]
pub const COMMON_REG_SIZE_TEST: usize = COMMON_REG_SIZE;
// The device config, notify and ISR status registers each start on their
// own page.
const DEVICE_REG_OFFSET: usize = PAGE_SIZE;
const NOTIFY_REG_OFFSET: usize = 2 * PAGE_SIZE;
pub const NOTIFY_REG_SIZE: usize = 4;
const ISR_STATUS_REG_OFFSET: usize = 3 * PAGE_SIZE;
const ISR_STATUS_REG_SIZE: usize = 1;

/// Registers of the common configuration block.
///
/// Variants are listed in the same order as the packed layout in
/// `COMMON_REGS`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum CommonConfigReg {
    // Configuration data for the device as a whole.
    DeviceFeatureSelect,
    DeviceFeature,
    DriverFeatureSelect,
    DriverFeature,
    ConfigMsixVector,
    NumQueues,
    DeviceStatus,
    ConfigGeneration,

    // Configuration information for a specific queue.
    QueueSelect,
    QueueSize,
    QueueMsixVector,
    QueueEnable,
    QueueNotifyOffset,
    QueueDescAddr,
    QueueDriverAddr,
    QueueDeviceAddr,
    QueueNotifyData,
    QueueReset,
}

/// Registers of the notify block; see `NOTIFY_REGS` for the layout.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum NotifyReg {
    Notify,
}

/// Registers of the ISR status block; see `ISR_STATUS_REGS` for the layout.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum IsrStatusReg {
    IsrStatus,
}

/// Registers of the legacy register block.
///
/// Variants are listed in the same order as the packed layout in
/// `LEGACY_REGS`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum LegacyConfigReg {
    DeviceFeature,
    DriverFeature,
    QueueAddress4k,
    QueueSize,
    QueueSelect,
    QueueNotify,
    DeviceStatus,
    IsrStatus,
    ConfigMsixVector,
    QueueMsixVector,
}

lazy_static! {
    /// Packed layout of the common configuration registers.  Each entry is a
    /// register paired with its size in bytes; the sizes add up to
    /// `COMMON_REG_SIZE`.
    static ref COMMON_REGS: RegMap<CommonConfigReg> = {
        let layout = [
            // These refer to the device as a whole.
            (CommonConfigReg::DeviceFeatureSelect, 4),
            (CommonConfigReg::DeviceFeature, 4),
            (CommonConfigReg::DriverFeatureSelect, 4),
            (CommonConfigReg::DriverFeature, 4),
            (CommonConfigReg::ConfigMsixVector, 2),
            (CommonConfigReg::NumQueues, 2),
            (CommonConfigReg::DeviceStatus, 1),
            (CommonConfigReg::ConfigGeneration, 1),
            // These are banked for specific virtqueues, distinguished
            // via the "QueueSelect" register.
            (CommonConfigReg::QueueSelect, 2),
            (CommonConfigReg::QueueSize, 2),
            (CommonConfigReg::QueueMsixVector, 2),
            (CommonConfigReg::QueueEnable, 2),
            (CommonConfigReg::QueueNotifyOffset, 2),
            (CommonConfigReg::QueueDescAddr, 8),
            (CommonConfigReg::QueueDriverAddr, 8),
            (CommonConfigReg::QueueDeviceAddr, 8),
            (CommonConfigReg::QueueNotifyData, 2),
            (CommonConfigReg::QueueReset, 2),
        ];
        RegMap::create_packed(COMMON_REG_SIZE, &layout, None)
    };

    /// Packed layout of the notify block: a single 4-byte register.
    static ref NOTIFY_REGS: RegMap<NotifyReg> = {
        let layout = [
            (NotifyReg::Notify, 4),
        ];
        RegMap::create_packed(NOTIFY_REG_SIZE, &layout, None)
    };

    /// Packed layout of the ISR status block: a single 1-byte register.
    static ref ISR_STATUS_REGS: RegMap<IsrStatusReg> = {
        let layout = [
            (IsrStatusReg::IsrStatus, 1),
        ];
        RegMap::create_packed(ISR_STATUS_REG_SIZE, &layout, None)
    };

    /// Packed layout of the legacy register block, with the two MSI-X vector
    /// registers at the end.  The sizes add up to `LEGACY_REG_SIZE`.
    static ref LEGACY_REGS: RegMap<LegacyConfigReg> = {
        let layout = [
            (LegacyConfigReg::DeviceFeature, 4),
            (LegacyConfigReg::DriverFeature, 4),
            (LegacyConfigReg::QueueAddress4k, 4),
            (LegacyConfigReg::QueueSize, 2),
            (LegacyConfigReg::QueueSelect, 2),
            (LegacyConfigReg::QueueNotify, 2),
            (LegacyConfigReg::DeviceStatus, 1),
            (LegacyConfigReg::IsrStatus, 1),
            (LegacyConfigReg::ConfigMsixVector, 2),
            (LegacyConfigReg::QueueMsixVector, 2),
        ];
        RegMap::create_packed(LEGACY_REG_SIZE, &layout, None)
    };
}

/// Identifiers for the VirtIO PCI configuration capability structures.
///
/// The numeric values are those given for `cfg_type` in section 4.1.4 of
/// VirtIO 1.2.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[repr(u32)]
enum VirtioCfgCapTag {
    Common = 1,
    Notify = 2,
    Isr = 3,
    Device = 4,
    Pci = 5,
    SharedMemory = 8,
    Vendor = 9,
}

/// Yields the raw `cfg_type` value of a tag.
impl From<VirtioCfgCapTag> for u32 {
    fn from(tag: VirtioCfgCapTag) -> u32 {
        tag as u32
    }
}

/// Recognizes a raw `cfg_type` value, handing the value back as the error if
/// it does not correspond to a known tag.
impl TryFrom<u32> for VirtioCfgCapTag {
    type Error = u32;
    fn try_from(raw: u32) -> Result<Self, Self::Error> {
        const KNOWN: [VirtioCfgCapTag; 7] = [
            VirtioCfgCapTag::Common,
            VirtioCfgCapTag::Notify,
            VirtioCfgCapTag::Isr,
            VirtioCfgCapTag::Device,
            VirtioCfgCapTag::Pci,
            VirtioCfgCapTag::SharedMemory,
            VirtioCfgCapTag::Vendor,
        ];
        KNOWN
            .iter()
            .copied()
            .find(|tag| u32::from(*tag) == raw)
            .ok_or(raw)
    }
}

// Size of the registers shared by every VirtIO capability; one term per entry
// in `COMMON_CFG_CAP_REGS`.
const COMMON_CFG_CAP_SIZE: u8 = 1 + 1 + 1 + 1 + 2 + 4 + 4;
// The notify capability appends a 4-byte multiplier register.
const NOTIFY_CFG_CAP_SIZE: u8 = COMMON_CFG_CAP_SIZE + 4;
// The PCI configuration access capability appends a 4-byte data register.
const PCI_CFG_CAP_SIZE: u8 = COMMON_CFG_CAP_SIZE + 4;

/// Registers shared by every VirtIO PCI capability.
///
/// Variants are listed in the same order as the packed layout in
/// `COMMON_CFG_CAP_REGS`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum CommonCfgCapReg {
    CapLen,
    CfgType,
    Bar,
    Id,
    Padding,
    Offset,
    Length,
}

lazy_static! {
    /// The common configuration capability registers in config space.  Note
    /// that the capability type and next pointer are not included here, as
    /// these are defined and consumed by the framework.  So while the padding
    /// field appears to pad to a 6 byte offset, it actually pads to an 8 byte
    /// offset, as the entire register space is already offset by two bytes.
    ///
    /// This definition corresponds to `struct virtio_pci_cap` from sec 4.1.4
    /// of VirtIO 1.2.
    static ref COMMON_CFG_CAP_REGS: RegMap<CommonCfgCapReg> = {
        let layout = [
            (CommonCfgCapReg::CapLen, 1),
            (CommonCfgCapReg::CfgType, 1),
            (CommonCfgCapReg::Bar, 1),
            (CommonCfgCapReg::Id, 1),
            // Pads to an 8 byte offset once the framework-owned type and
            // next bytes are taken into account.
            (CommonCfgCapReg::Padding, 2),  // Note, includes type and next
            (CommonCfgCapReg::Offset, 4),
            (CommonCfgCapReg::Length, 4),
        ];
        RegMap::create_packed(COMMON_CFG_CAP_SIZE.into(), &layout, None)
    };
}

/// Registers of the notification capability: the registers shared by all
/// capabilities, followed by the multiplier.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum NotifyCfgCapReg {
    Common(CommonCfgCapReg),
    Multiplier,
}

lazy_static! {
    /// The notification capability registers in config space.
    ///
    /// See the note around `COMMON_CFG_CAP_REGS` for details about
    /// padding, offsets, and alignment.  This definition corresponds
    /// to `struct virtio_pci_notify_cap` from sec 4.1.4.4 of VirtIO 1.2.
    static ref NOTIFY_CFG_CAP_REGS: RegMap<NotifyCfgCapReg> = {
        let layout = [
            (NotifyCfgCapReg::Common(CommonCfgCapReg::CapLen), 1),
            (NotifyCfgCapReg::Common(CommonCfgCapReg::CfgType), 1),
            (NotifyCfgCapReg::Common(CommonCfgCapReg::Bar), 1),
            (NotifyCfgCapReg::Common(CommonCfgCapReg::Id), 1),
            (NotifyCfgCapReg::Common(CommonCfgCapReg::Padding), 2),
            (NotifyCfgCapReg::Common(CommonCfgCapReg::Offset), 4),
            (NotifyCfgCapReg::Common(CommonCfgCapReg::Length), 4),
            // Capability-specific register appended after the shared ones.
            (NotifyCfgCapReg::Multiplier, 4),
        ];
        RegMap::create_packed(NOTIFY_CFG_CAP_SIZE.into(), &layout, None)
    };
}

/// Registers of the PCI configuration access capability: the registers shared
/// by all capabilities, followed by the data register.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum PciCfgCapReg {
    Common(CommonCfgCapReg),
    PciData,
}

lazy_static! {
    /// The PCI configuration capability register in config space.
    ///
    /// See the note around `COMMON_CFG_CAP_REGS` for details about
    /// padding, offsets, and alignment.  This definition corresponds
    /// to `struct virtio_pci_cfg_cap` from sec 4.1.4.9 of VirtIO 1.2.
    static ref PCI_CFG_CAP_REGS: RegMap<PciCfgCapReg> = {
        let layout = [
            (PciCfgCapReg::Common(CommonCfgCapReg::CapLen), 1),
            (PciCfgCapReg::Common(CommonCfgCapReg::CfgType), 1),
            (PciCfgCapReg::Common(CommonCfgCapReg::Bar), 1),
            (PciCfgCapReg::Common(CommonCfgCapReg::Id), 1),
            (PciCfgCapReg::Common(CommonCfgCapReg::Padding), 2),
            (PciCfgCapReg::Common(CommonCfgCapReg::Offset), 4),
            (PciCfgCapReg::Common(CommonCfgCapReg::Length), 4),
            // Capability-specific register appended after the shared ones.
            (PciCfgCapReg::PciData, 4),
        ];
        RegMap::create_packed(PCI_CFG_CAP_SIZE.into(), &layout, None)
    };
}

/// Serialized forms of the PCI VirtIO state used for migration.
pub mod migrate {
    use crate::hw::virtio::queue;
    use crate::migrate::*;
    use serde::{Deserialize, Serialize};

    /// Device-wide VirtIO state.  Each field carries the like-named field of
    /// the live device state, except for `isr_queue` and `isr_cfg`, which
    /// carry the two ISR flags.
    #[derive(Deserialize, Serialize)]
    pub struct DeviceStateV1 {
        /// Raw bits of the device status register.
        pub status: u8,
        pub queue_select: u16,
        pub negotiated_features: u64,
        pub device_feature_select: u32,
        pub driver_feature_select: u32,
        pub config_generation: u8,
        pub config_generation_seen: bool,
        /// Widened from `usize` on export.
        pub device_config_size: u64,
        /// Numeric representation of the VirtIO mode.
        pub mode: u32,
        pub msix_cfg_vec: u16,
        pub msix_queue_vec: Vec<u16>,
        /// Was an ISR pending for virtqueue(s)?
        pub isr_queue: bool,
        /// Was an ISR pending for a config change?
        pub isr_cfg: bool,
    }

    /// Complete migration payload: device-wide state plus the virtqueues.
    #[derive(Deserialize, Serialize)]
    pub struct PciVirtioStateV1 {
        pub device: DeviceStateV1,
        pub queues: queue::migrate::VirtQueuesV1,
    }
    impl Schema<'_> for PciVirtioStateV1 {
        /// Schema name and version identifying this payload.
        fn id() -> SchemaId {
            ("pci-virtio", 1)
        }
    }
}


================================================
FILE: lib/propolis/src/hw/virtio/queue.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::mem;
use std::num::{NonZeroU16, Wrapping};
use std::sync::atomic::{fence, AtomicBool, AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};

use bitflags::bitflags;
use zerocopy::{FromBytes, IntoBytes};

use super::probes;
use super::{VirtioIntr, VqIntr};
use crate::accessors::MemAccessor;
use crate::common::*;
use crate::hw::virtio;
use crate::migrate::MigrateStateError;
use crate::vmm::MemCtx;

bitflags! {
    /// Features supported by our implementation of virtqueues.
    pub struct Features: u64 {
        const RING_INDIRECT_DESC = 1 << 28;
        const RING_EVENT_IDX = 1 << 29;
        const VERSION_1 = 1 << 32;
    }

    /// Flag bits carried in the `flags` field of a descriptor.
    /// NOTE(review): values match `VIRTQ_DESC_F_*` in the VirtIO
    /// specification -- confirm that is the intended source.
    struct QueueFlags: u16 {
        const DESC_NEXT = 1 << 0;
        const DESC_WRITE = 1 << 1;
        const DESC_INDIRECT = 1 << 2;
    }

    /// Flag bits of the available ring (presumably its `flags` field, as
    /// defined by the VirtIO specification -- TODO confirm).
    struct AvailFlags: u16 {
        const NO_INTERRUPT = 1 << 0;
    }

    /// Flag bits of the used ring (presumably its `flags` field, as defined
    /// by the VirtIO specification -- TODO confirm).
    struct UsedFlags: u16 {
        const NO_NOTIFY = 1 << 0;
    }
}

impl Features {
    /// Returns those features appropriate for a legacy queue.
    pub fn legacy() -> Self {
        Self::RING_INDIRECT_DESC
    }

    /// Returns those features appropriate for a transitional queue.
    pub fn transitional() -> Self {
        Self::legacy() | Self::VERSION_1
    }
}

/// One virtqueue descriptor, in the layout it is read from guest memory
/// (both from the descriptor table and from indirect descriptor tables).
#[repr(C)]
#[derive(Copy, Clone, FromBytes)]
struct VqdDesc {
    /// Guest-physical address of the buffer (or of an indirect table).
    addr: u64,
    /// Length of the buffer in bytes.
    len: u32,
    /// Flag bits; interpreted through `DescFlag`.
    flags: u16,
    /// Index of the next descriptor in the chain when `DescFlag::NEXT` is set.
    next: u16,
}
/// One element of the used ring, in the layout it is written to guest
/// memory by `VqUsed::write_used`.
#[repr(C)]
#[derive(Copy, Clone, Debug, IntoBytes)]
struct VqdUsed {
    /// Index of the head descriptor of the completed chain.
    id: u32,
    /// Length reported for the completed chain.
    len: u32,
}

/// A request popped off the available ring.
#[derive(Copy, Clone, Debug)]
pub struct VqReq {
    /// Head descriptor index read out of the available ring.
    desc_idx: u16,
    /// Slot in the available ring that the descriptor index was read from.
    avail_idx: u16,
}

/// Device-side bookkeeping for the available ring and the descriptor table
/// of a split virtqueue.
pub struct VqAvail {
    /// Is populated with a valid physical address(es) for its contents
    valid: bool,

    /// Address of the ring's 16-bit flags word (the start of the ring).
    gpa_flags: GuestAddr,
    /// Address of the ring's 16-bit index, written by the guest.
    gpa_idx: GuestAddr,
    /// Address of the first 16-bit ring entry.
    gpa_ring: GuestAddr,
    /// Next available index this device will consume (wraps at 16 bits).
    cur_avail_idx: Wrapping<u16>,

    /// Address of the descriptor table.
    gpa_desc: GuestAddr,
}

impl VqAvail {
    /// Pops the next pending request off the available ring, if any.
    ///
    /// On success, returns the head descriptor index published by the guest
    /// together with the available-ring slot it came from.  Returns `None`
    /// when the ring is not mapped, when guest memory cannot be read, or when
    /// the guest index indicates nothing (or more than `rsize` entries) is
    /// pending.
    fn read_next_avail(&mut self, rsize: u16, mem: &MemCtx) -> Option<VqReq> {
        if !self.valid {
            return None;
        }
        let guest_idx = mem.read::<u16>(self.gpa_idx)?;
        let pending = Wrapping(*guest_idx) - self.cur_avail_idx;
        if pending.0 == 0 || pending.0 > rsize {
            return None;
        }

        // Claim the slot before reading it; the slot stays claimed even if
        // the read of the ring entry below fails.
        let avail_idx = self.cur_avail_idx.0 & (rsize - 1);
        self.cur_avail_idx += Wrapping(1);

        // The ring entry must not be read ahead of the index observed above.
        fence(Ordering::Acquire);
        let slot = self.gpa_ring.offset::<u16>(avail_idx as usize);
        let desc_idx = mem.read::<u16>(slot)?;
        Some(VqReq { desc_idx: *desc_idx, avail_idx })
    }

    /// Reads descriptor `id` out of the descriptor table.
    ///
    /// Panics if `id` is not below `rsize`; returns `None` if the guest
    /// memory read fails.
    fn read_ring_descr(
        &self,
        id: u16,
        rsize: u16,
        mem: &MemCtx,
    ) -> Option<GuestData<VqdDesc>> {
        assert!(id < rsize);
        let entry = self.gpa_desc.offset::<VqdDesc>(id as usize);
        mem.read::<VqdDesc>(entry)
    }

    /// Forgets the current mapping and rewinds the consumed index to zero.
    fn reset(&mut self) {
        self.valid = false;
        self.cur_avail_idx = Wrapping(0);
        for gpa in [
            &mut self.gpa_flags,
            &mut self.gpa_idx,
            &mut self.gpa_ring,
            &mut self.gpa_desc,
        ] {
            *gpa = GuestAddr(0);
        }
    }

    /// Records the addresses of the descriptor table and the available ring.
    /// Does not touch `valid`; the caller decides when the mapping is usable.
    fn map_split(&mut self, desc_addr: u64, avail_addr: u64) {
        // Ring layout: 16-bit flags, then 16-bit idx, then the ring entries.
        const IDX_OFFSET: u64 = 2;
        const RING_OFFSET: u64 = 4;

        self.gpa_flags = GuestAddr(avail_addr);
        self.gpa_idx = GuestAddr(avail_addr + IDX_OFFSET);
        self.gpa_ring = GuestAddr(avail_addr + RING_OFFSET);
        self.gpa_desc = GuestAddr(desc_addr);
    }

    /// Returns the flags the guest has set on the available ring, or no
    /// flags at all when the ring is not mapped.
    fn flags(&self, mem: &MemCtx) -> AvailFlags {
        if !self.valid {
            return AvailFlags::from_bits_truncate(0);
        }
        let raw: u16 = *mem.read(self.gpa_flags).unwrap();
        AvailFlags::from_bits_truncate(raw)
    }

    /// Returns true iff the guest has asked for interrupts to be suppressed.
    #[allow(dead_code)]
    fn _intr_supressed(&self, mem: &MemCtx) -> bool {
        self.flags(mem).contains(AvailFlags::NO_INTERRUPT)
    }
}

/// Device-side bookkeeping for the used ring of a split virtqueue, plus the
/// interrupt resource used to signal the guest.
pub struct VqUsed {
    /// Is populated with a valid physical address(es) for its contents
    valid: bool,

    /// Address of the ring's 16-bit flags word (the start of the ring).
    gpa_flags: GuestAddr,
    /// Address of the ring's 16-bit index, written by this device.
    gpa_idx: GuestAddr,
    /// Address of the first ring element (`VqdUsed`).
    gpa_ring: GuestAddr,
    /// Next used index this device will publish (wraps at 16 bits).
    used_idx: Wrapping<u16>,
    /// Interrupt resource for this queue, if one has been set.
    interrupt: Option<Box<dyn VirtioIntr>>,
}

impl VqUsed {
    /// Publishes one completed chain (head descriptor `id`, `len` bytes) to
    /// the used ring and advances the used index.
    ///
    /// Panics if the ring has not been mapped.
    fn write_used(&mut self, id: u16, len: u32, rsize: u16, mem: &MemCtx) {
        // Used entries are never expected on a virtqueue which has not been
        // configured atop physical addresses yet.
        assert!(self.valid);

        let slot = (self.used_idx.0 & (rsize - 1)) as usize;
        let entry_addr = self.gpa_ring.offset::<VqdUsed>(slot);
        let entry = VqdUsed { id: u32::from(id), len };
        self.used_idx += Wrapping(1);
        mem.write(entry_addr, &entry);

        // The entry must be visible before the index that exposes it.
        fence(Ordering::Release);
        mem.write(self.gpa_idx, &self.used_idx.0);
    }

    /// Reads the flags word of the used ring from guest memory.
    fn flags(&self, mem: &MemCtx) -> UsedFlags {
        let raw: u16 = *mem.read(self.gpa_flags).unwrap();
        UsedFlags::from_bits_truncate(raw)
    }

    /// Writes the flags word of the used ring to guest memory.
    fn set_flags(&self, flags: UsedFlags, mem: &MemCtx) {
        let raw = flags.bits();
        mem.write(self.gpa_flags, &raw);
    }

    /// Sets `NO_NOTIFY` on this queue; returns whether it was already set.
    fn disable_notify(&self, mem: &MemCtx) -> bool {
        let before = self.flags(mem);
        let was_suppressed = before.contains(UsedFlags::NO_NOTIFY);
        self.set_flags(before | UsedFlags::NO_NOTIFY, mem);
        was_suppressed
    }

    /// Clears `NO_NOTIFY` on this queue, leaving other flag bits alone.
    fn enable_notify(&self, mem: &MemCtx) {
        let cleared = self.flags(mem) - UsedFlags::NO_NOTIFY;
        self.set_flags(cleared, mem);
    }

    /// Returns true iff notifications are suppressed for this queue.
    fn notify_supressed(&self, mem: &MemCtx) -> bool {
        self.flags(mem).contains(UsedFlags::NO_NOTIFY)
    }

    /// Forgets the current mapping and rewinds the used index to zero.  The
    /// interrupt resource is left untouched.
    fn reset(&mut self) {
        self.valid = false;
        self.used_idx = Wrapping(0);
        for gpa in [&mut self.gpa_flags, &mut self.gpa_idx, &mut self.gpa_ring]
        {
            *gpa = GuestAddr(0);
        }
    }

    /// Records the addresses of the used ring starting at `gpa`.  Does not
    /// touch `valid`; the caller decides when the mapping is usable.
    fn map_split(&mut self, gpa: u64) {
        // Ring layout: 16-bit flags, then 16-bit idx, then the ring elements.
        const IDX_OFFSET: u64 = 2;
        const RING_OFFSET: u64 = 4;

        self.gpa_flags = GuestAddr(gpa);
        self.gpa_idx = GuestAddr(gpa + IDX_OFFSET);
        self.gpa_ring = GuestAddr(gpa + RING_OFFSET);
    }
}

/// Number of entries in a virtqueue: always a non-zero power of two.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct VqSize(NonZeroU16);
impl VqSize {
    /// Builds a `VqSize` from a raw entry count.
    ///
    /// # Panics
    ///
    /// Panics if `size` is zero or is not a power of two.  Use the `TryFrom`
    /// conversions for a fallible alternative.
    pub const fn new(size: u16) -> VqSize {
        match NonZeroU16::new(size) {
            Some(nonzero) => {
                assert!(nonzero.is_power_of_two());
                VqSize(nonzero)
            }
            None => panic!("nonzero queue size"),
        }
    }
}

impl TryFrom<NonZeroU16> for VqSize {
    type Error = VqSizeError;

    fn try_from(value: NonZeroU16) -> Result<Self, Self::Error> {
        if !value.is_power_of_two() {
            Err(VqSizeError::NotPow2)
        } else {
            Ok(Self(value))
        }
    }
}
impl TryFrom<u16> for VqSize {
    type Error = VqSizeError;

    fn try_from(value: u16) -> Result<Self, Self::Error> {
        NonZeroU16::try_from(value).or(Err(VqSizeError::IsZero))?.try_into()
    }
}

impl Into<u16> for VqSize {
    fn into(self) -> u16 {
        self.0.get()
    }
}

/// Reasons a raw value cannot be used as a virtqueue size; produced by the
/// `TryFrom` conversions into `VqSize`.
#[derive(Copy, Clone, Debug, thiserror::Error)]
pub enum VqSizeError {
    /// The value is non-zero but not a power of two.
    #[error("virtqueue size must be power of 2")]
    NotPow2,
    /// The value is zero.
    #[error("virtqueue size must not be 0")]
    IsZero,
}

/// A single split virtqueue: its identity and size, lifecycle flags, and the
/// device-side state of its available and used rings.
pub struct VirtQueue {
    /// Index of this queue within its device.
    pub id: u16,
    /// Current number of entries; may be lowered when state is imported.
    pub size: Mutex<VqSize>,
    /// Set by `arise`, cleared by `reset`.
    pub live: AtomicBool,
    /// Set by `enable`, cleared by `reset`.
    pub enabled: AtomicBool,
    /// Set by `set_control`; not cleared by `reset`.
    pub is_control: AtomicBool,
    /// Initialized to the queue `id`; checked for equality on import.
    pub notify_data: u16,
    /// Available-ring (and descriptor table) state.
    avail: Mutex<VqAvail>,
    /// Used-ring state and interrupt resource.
    used: Mutex<VqUsed>,
    /// Memory accessor for this queue; created as an orphan.
    pub acc_mem: MemAccessor,
}

/// Rounds `addr` up to the next multiple of `align`, returning `addr` itself
/// when it is already aligned.
///
/// Panics if `align` is not a power of two.
const fn qalign(addr: u64, align: u64) -> u64 {
    assert!(align.is_power_of_two());
    let misalign = addr & (align - 1);
    if misalign == 0 {
        addr
    } else {
        addr + (align - misalign)
    }
}

impl VirtQueue {
    /// Creates an unmapped, disabled queue with the given `id` and `size`.
    ///
    /// Both rings start invalid with zeroed addresses and indices, no
    /// interrupt is attached, `notify_data` mirrors `id`, and the memory
    /// accessor starts out as an orphan.
    fn new(id: u16, size: VqSize) -> Self {
        let avail = VqAvail {
            valid: false,
            gpa_flags: GuestAddr(0),
            gpa_idx: GuestAddr(0),
            gpa_ring: GuestAddr(0),
            cur_avail_idx: Wrapping(0),
            gpa_desc: GuestAddr(0),
        };
        let used = VqUsed {
            valid: false,
            gpa_flags: GuestAddr(0),
            gpa_idx: GuestAddr(0),
            gpa_ring: GuestAddr(0),
            used_idx: Wrapping(0),
            interrupt: None,
        };

        Self {
            id,
            size: Mutex::new(size),
            live: AtomicBool::new(false),
            enabled: AtomicBool::new(false),
            is_control: AtomicBool::new(false),
            notify_data: id,
            avail: Mutex::new(avail),
            used: Mutex::new(used),
            acc_mem: MemAccessor::new_orphan(),
        }
    }

    /// Returns the queue to its unmapped, disabled state.
    ///
    /// Both rings are reset (marked invalid, addresses and indices zeroed)
    /// and the `live` and `enabled` flags are cleared.  The queue size, the
    /// `is_control` flag and the interrupt resource are left as they are.
    pub(super) fn reset(&self) {
        // Hold both ring locks (avail first, then used) across the reset.
        let mut avail = self.avail.lock().unwrap();
        let mut used = self.used.lock().unwrap();

        // XXX verify no outstanding chains
        avail.reset();
        used.reset();
        self.live.store(false, Ordering::Release);
        self.enabled.store(false, Ordering::Release);
    }

    /// Marks the queue as enabled.
    pub(super) fn enable(&self) {
        self.enabled.store(true, Ordering::Release);
    }

    /// Returns whether the queue has been enabled since its last reset.
    pub(super) fn is_enabled(&self) -> bool {
        self.enabled.load(Ordering::Acquire)
    }

    /// Marks the queue as live.
    pub(super) fn arise(&self) {
        self.live.store(true, Ordering::Release);
    }

    /// Returns whether the queue has been marked live since its last reset.
    pub(super) fn is_alive(&self) -> bool {
        self.live.load(Ordering::Acquire)
    }

    /// Returns whether the queue has been flagged as a control queue.
    pub(super) fn is_control(&self) -> bool {
        self.is_control.load(Ordering::Acquire)
    }

    /// Flags the queue as a control queue.  Nothing in this type clears the
    /// flag again other than importing state with it unset.
    pub(super) fn set_control(&self) {
        self.is_control.store(true, Ordering::Release);
    }

    /// Returns the current number of entries in the queue.
    ///
    /// Briefly takes the `size` lock; panics if that lock is poisoned.
    #[inline(always)]
    pub fn size(&self) -> u16 {
        self.size.lock().unwrap().0.get()
    }

    /// Establishes the area mappings for this virtqueue at the specified
    /// physical addresses and marks both rings valid.
    ///
    /// In VirtIO 1.2 terms the three addresses are the "Descriptor Area",
    /// "Driver Area", and "Device Area".  Those were previously known as the
    /// "Descriptor Table", "Available Ring", and "Used Ring"; section 2.7 of
    /// the 1.2 specification still uses the older names as well, so they are
    /// retained here.
    ///
    /// The addresses are recorded as given; no validation is performed.
    pub fn map_virtqueue(
        &self,
        desc_addr: u64,
        avail_addr: u64,
        used_addr: u64,
    ) {
        let mut avail = self.avail.lock().expect("avail is initialized");
        let mut used = self.used.lock().expect("used is initialized");

        avail.map_split(desc_addr, avail_addr);
        avail.valid = true;

        used.map_split(used_addr);
        used.valid = true;
    }

    /// Attempt to establish ring mappings at a specified physical address,
    /// using legacy-style split virtqueue layout.
    ///
    /// `addr` must be aligned to 4k per the legacy requirements
    pub fn map_legacy(&self, addr: u64) {
        const LEGACY_QALIGN: u64 = PAGE_SIZE as u64;
        assert_eq!(addr & (LEGACY_QALIGN - 1), 0);
        assert_ne!(addr, 0);

        let size = self.size() as usize;

        let desc_addr = addr;
        let desc_len = mem::size_of::<VqdDesc>() * size;

        let avail_addr = desc_addr + desc_len as u64;
        let avail_len = 2 * (size + 3);

        let used_addr = qalign(avail_addr + avail_len as u64, LEGACY_QALIGN);
        let _used_len = mem::size_of::<VqUsed>() * size + 2 * 3;

        self.map_virtqueue(desc_addr, avail_addr, used_addr);
    }

    /// Returns true iff there is a valid mapping for this queue in the
    /// guest physical address space.
    ///
    /// Only the available ring's `valid` flag is consulted; the mapping and
    /// reset paths in this type set it together with the used ring's flag.
    pub fn is_mapped(&self) -> bool {
        self.avail.lock().unwrap().valid
    }

    /// Returns true if this queue is not mapped, or is empty.
    pub fn avail_is_empty(&self, mem: &MemCtx) -> bool {
        let avail = self.avail.lock().expect("not poisoned");
        !avail.valid || {
            let guest_idx: u16 = *mem.read(avail.gpa_idx).unwrap();
            avail.cur_avail_idx == std::num::Wrapping(guest_idx)
        }
    }

    /// Takes a snapshot of the queue's ring addresses, mapping validity and
    /// ring indices.
    pub fn get_state(&self) -> Info {
        let avail = self.avail.lock().unwrap();
        let used = self.used.lock().unwrap();

        // The flags word is the first member of each ring, so its address
        // doubles as the address of the ring itself.
        let mapping = MapInfo {
            desc_addr: avail.gpa_desc.0,
            avail_addr: avail.gpa_flags.0,
            used_addr: used.gpa_flags.0,
            valid: avail.valid,
        };

        Info {
            mapping,
            avail_idx: avail.cur_avail_idx.0,
            used_idx: used.used_idx.0,
        }
    }

    /// Restores ring addresses, mapping validity and ring indices from a
    /// snapshot previously produced by `get_state`.  The values are applied
    /// as given, without validation.
    pub fn set_state(&self, info: &Info) {
        let mut avail = self.avail.lock().unwrap();
        let mut used = self.used.lock().unwrap();
        let map = &info.mapping;

        avail.map_split(map.desc_addr, map.avail_addr);
        avail.valid = map.valid;
        avail.cur_avail_idx = Wrapping(info.avail_idx);

        used.map_split(map.used_addr);
        used.valid = map.valid;
        used.used_idx = Wrapping(info.used_idx);
    }

    /// Accumulates a sequence of available descriptors into a `Chain`.
    ///
    /// VirtIO descriptors can be organized into a linked list.  This pops the
    /// next request off the available ring, walks the descriptor chain that
    /// starts at its head descriptor (following into an indirect descriptor
    /// table if one is referenced), and pushes every buffer onto `chain`.
    ///
    /// On success, returns the available-ring slot the request was read from
    /// and the total length in bytes of all buffers in the chain, and leaves
    /// `chain` holding the head descriptor index for a later `push_used`.
    ///
    /// Returns `None` when no request is available, when the head descriptor
    /// cannot be read, or when the chain is malformed.  Everything read here
    /// is controlled by the guest, so malformed input is rejected rather than
    /// allowed to panic or hang the caller.  A chain is malformed when:
    ///
    /// * a descriptor index lies outside the queue,
    /// * the chain is longer than the queue,
    /// * the indirect table has a bad length or cannot be read,
    /// * an indirect `next` index lies outside the table,
    /// * the indirect chain loops back on itself, or
    /// * the total length does not fit in a `u32`.
    ///
    /// On every such path `chain` is reset, so it carries no stale buffers.
    ///
    /// If reading a *subsequent* ring descriptor from guest memory fails, the
    /// buffers gathered so far are returned as a (truncated) chain.
    ///
    /// # Panics
    ///
    /// Panics if `chain` still holds a descriptor index from an earlier pop.
    pub fn pop_avail(
        &self,
        chain: &mut Chain,
        mem: &MemCtx,
    ) -> Option<(u16, u32)> {
        assert!(chain.idx.is_none());
        let mut avail = self.avail.lock().unwrap();
        let rsize = self.size();
        let req = avail.read_next_avail(rsize, mem)?;

        // The head index comes straight from guest memory; refuse anything
        // outside the descriptor table instead of tripping the bounds
        // assertion in `read_ring_descr`.
        if req.desc_idx >= rsize {
            // XXX: signal error condition?
            return None;
        }
        let mut desc = avail.read_ring_descr(req.desc_idx, rsize, mem)?;
        let mut flags = DescFlag::from_bits_truncate(desc.flags);
        let mut count: u16 = 0;
        let mut len: u32 = 0;
        chain.idx = Some(req.desc_idx);
        probes::virtio_vq_pop!(|| (
            self as *const VirtQueue as u64,
            req.desc_idx,
            req.avail_idx,
        ));

        // non-indirect descriptor(s)
        while !flags.contains(DescFlag::INDIRECT) {
            len = match len.checked_add(desc.len) {
                Some(total) => total,
                None => {
                    // XXX: signal error condition?
                    chain.reset();
                    return None;
                }
            };
            let buf = match flags.contains(DescFlag::WRITE) {
                true => ChainBuf::Writable(GuestAddr(desc.addr), desc.len),
                false => ChainBuf::Readable(GuestAddr(desc.addr), desc.len),
            };
            count += 1;
            chain.push_buf(buf);

            if !flags.contains(DescFlag::NEXT) {
                return Some((req.avail_idx, len));
            }
            // A chain may not be longer than the queue, nor point outside
            // of the descriptor table.
            if count == rsize || desc.next >= rsize {
                // XXX: signal error condition?
                chain.reset();
                return None;
            }
            match avail.read_ring_descr(desc.next, rsize, mem) {
                Some(next) => {
                    desc = next;
                    flags = DescFlag::from_bits_truncate(desc.flags);
                }
                None => return Some((req.avail_idx, len)),
            }
        }

        // The loop above only falls through once it has reached a descriptor
        // with the INDIRECT flag set; `desc` is that descriptor.
        // XXX: skip indirect if not negotiated
        let desc_sz = mem::size_of::<VqdDesc>();
        let table_len = desc.len as usize;
        if table_len < desc_sz || table_len % desc_sz != 0 {
            // XXX: signal error condition?
            chain.reset();
            return None;
        }
        let indirect_count = table_len / desc_sz;
        let idescs = match mem
            .read_many::<VqdDesc>(GuestAddr(desc.addr), indirect_count)
        {
            Some(table) => table,
            None => {
                // XXX: signal error condition?
                chain.reset();
                return None;
            }
        };

        let mut next_idx = 0usize;
        let mut visited = 0usize;
        loop {
            // A well-formed chain visits each table entry at most once, so
            // running past the table size means the guest built a cycle.
            if visited == indirect_count {
                // XXX: signal error condition?
                chain.reset();
                return None;
            }
            visited += 1;

            let idesc = match idescs.get(next_idx) {
                Some(d) => d,
                None => {
                    // `next` pointed outside of the indirect table.
                    chain.reset();
                    return None;
                }
            };
            let iflags = DescFlag::from_bits_truncate(idesc.flags);

            len = match len.checked_add(idesc.len) {
                Some(total) => total,
                None => {
                    chain.reset();
                    return None;
                }
            };
            let buf = match iflags.contains(DescFlag::WRITE) {
                true => ChainBuf::Writable(GuestAddr(idesc.addr), idesc.len),
                false => ChainBuf::Readable(GuestAddr(idesc.addr), idesc.len),
            };
            chain.push_buf(buf);

            if !iflags.contains(DescFlag::NEXT) {
                break;
            }
            next_idx = usize::from(idesc.next);
        }
        Some((req.avail_idx, len))
    }

    /// Completes `chain`: publishes it on the used ring, signals the queue's
    /// interrupt (if one is set), and resets the chain for reuse.
    ///
    /// The length reported to the guest is the number of bytes consumed from
    /// the chain's writable buffers.
    ///
    /// Panics if `chain` does not hold a descriptor index, i.e. it was not
    /// filled by a successful `pop_avail`.
    pub fn push_used(&self, chain: &mut Chain, mem: &MemCtx) {
        assert!(chain.idx.is_some());
        let mut used = self.used.lock().unwrap();
        let id = chain.idx.take().unwrap();
        // XXX: for now, just go off of the write stats
        let written = &chain.write_stat;
        let len = written.bytes - written.bytes_remain;
        probes::virtio_vq_push!(|| (self as *const VirtQueue as u64, id, len));
        used.write_used(id, len, self.size(), mem);
        // XXX: This is wrong.  Interrupt notification is on the avail ring,
        // not used.
        #[allow(clippy::overly_complex_bool_expr)]
        if true || !used.notify_supressed(mem) {
            if let Some(intr) = &used.interrupt {
                intr.notify();
            }
        }
        chain.reset();
    }

    /// Set the backing interrupt resource for VQ, replacing any resource
    /// that was set previously.
    pub(super) fn set_intr(&self, intr: Box<dyn VirtioIntr>) {
        let mut used = self.used.lock().unwrap();
        used.interrupt = Some(intr)
    }

    /// Read the interrupt configuration for the `Used` ring.
    ///
    /// Returns `None` if no interrupt resource has been set on this queue.
    pub(super) fn read_intr(&self) -> Option<VqIntr> {
        let used = self.used.lock().unwrap();
        used.interrupt.as_ref().map(|x| x.read())
    }

    /// Disables interrupts (notifications) on the `Used` ring by setting the
    /// `NO_NOTIFY` flag in the ring's flags word in guest memory.
    ///
    /// Returns true if the flag was already set beforehand.
    pub(super) fn disable_intr(&self, mem: &MemCtx) -> bool {
        let used = self.used.lock().unwrap();
        used.disable_notify(mem)
    }

    /// Enables interrupts (notifications) on the `Used` ring by clearing the
    /// `NO_NOTIFY` flag in the ring's flags word in guest memory.
    pub(super) fn enable_intr(&self, mem: &MemCtx) {
        let used = self.used.lock().unwrap();
        used.enable_notify(mem);
    }

    /// Send an interrupt for this virtual queue.
    ///
    /// Does nothing if no interrupt resource has been set.  The suppression
    /// check is currently short-circuited by the leading `true ||`, so the
    /// guest's flags are never read and the interrupt is always sent.
    pub(super) fn send_intr(&self, mem: &MemCtx) {
        let used = self.used.lock().unwrap();
        // XXX: This is wrong.  Interrupt notification is on the avail ring,
        // not used.
        #[allow(clippy::overly_complex_bool_expr)]
        if true || !used.notify_supressed(mem) {
            if let Some(intr) = used.interrupt.as_ref() {
                intr.notify();
            }
        }
    }

    /// Captures the complete state of this queue as a `VirtQueueV1` payload.
    pub fn export(&self) -> migrate::VirtQueueV1 {
        let avail = self.avail.lock().unwrap();
        let used = self.used.lock().unwrap();

        // The mapping only counts as valid when both rings agree that it is.
        let mapping_valid = avail.valid && used.valid;

        migrate::VirtQueueV1 {
            id: self.id,
            size: self.size(),
            notify_data: self.notify_data,

            live: self.live.load(Ordering::Acquire),
            enabled: self.enabled.load(Ordering::Acquire),
            is_control: self.is_control.load(Ordering::Acquire),

            mapping_valid,
            descr_gpa: avail.gpa_desc.0,
            // `flags` field is the first member for avail and used rings
            avail_gpa: avail.gpa_flags.0,
            used_gpa: used.gpa_flags.0,

            avail_cur_idx: avail.cur_avail_idx.0,
            used_idx: used.used_idx.0,
        }
    }

    /// Restores the state of this queue from a `VirtQueueV1` payload.
    ///
    /// # Errors
    ///
    /// Returns `MigrateStateError::ImportFailed` when the payload does not
    /// fit this queue:
    ///
    /// * the queue ID or the notify data differ,
    /// * in legacy mode, the queue size differs,
    /// * otherwise, the payload's size is larger than this queue's current
    ///   size, or is not a valid queue size (zero or not a power of two).
    ///
    /// All checks run before any state is modified, so a failed import
    /// leaves the queue exactly as it was.
    pub fn import(
        &self,
        state: &migrate::VirtQueueV1,
        mode: virtio::Mode,
    ) -> Result<(), MigrateStateError> {
        let mut avail = self.avail.lock().unwrap();
        let mut used = self.used.lock().unwrap();

        if self.id != state.id {
            return Err(MigrateStateError::ImportFailed(format!(
                "VirtQueue: mismatched IDs {} vs {}",
                self.id, state.id,
            )));
        }
        // Checked ahead of the size handling below, which is the first step
        // that modifies the queue.
        if self.notify_data != state.notify_data {
            return Err(MigrateStateError::ImportFailed(format!(
                "VirtQueue: mismatched notify data {} vs {}",
                self.notify_data, state.notify_data,
            )));
        }
        if mode == virtio::Mode::Legacy {
            // As VirtIO 1.0 notes, for a device operated as legacy,
            //
            // > There was no mechanism to negotiate the queue size.
            //
            // so if these sizes don't match, the payload is truly incompatible
            // with this device.
            if self.size() != state.size {
                return Err(MigrateStateError::ImportFailed(format!(
                    "VirtQueue: mismatched size {} vs {}",
                    self.size(),
                    state.size,
                )));
            }
        } else {
            // Otherwise, we expect to import into a freshly-created VirtIO PCI
            // device, with queues all set to their maximum sizes. The sizes to
            // import may be smaller if the guest OS's driver configured them
            // down.
            let mut queue_size = self.size.lock().unwrap();
            let supported = queue_size.0.get();
            if supported < state.size {
                // Report the offending (imported) size first so the message
                // reads as a true inequality.
                return Err(MigrateStateError::ImportFailed(format!(
                    "VirtQueue: larger than supported {} > {}",
                    state.size, supported,
                )));
            }
            let new_size = VqSize::try_from(state.size).map_err(|e| {
                MigrateStateError::ImportFailed(format!(
                    "VirtQueue: unacceptable queue size: {}",
                    e
                ))
            })?;
            *queue_size = new_size;
        }

        // Nothing below can fail.
        avail.map_split(state.descr_gpa, state.avail_gpa);
        avail.valid = state.mapping_valid;
        avail.cur_avail_idx = Wrapping(state.avail_cur_idx);

        used.map_split(state.used_gpa);
        used.valid = state.mapping_valid;
        used.used_idx = Wrapping(state.used_idx);

        self.live.store(state.live, Ordering::Release);
        self.enabled.store(state.enabled, Ordering::Release);
        self.is_control.store(state.is_control, Ordering::Release);

        Ok(())
    }
}

bitflags! {
    /// Flag bits of a virtqueue descriptor, as interpreted while walking a
    /// descriptor chain.
    #[derive(Default)]
    pub struct DescFlag: u16 {
        /// The chain continues at the descriptor named by the `next` field.
        const NEXT = 1 << 0;
        /// The buffer is writable; without this flag it is readable.
        const WRITE = 1 << 1;
        /// The buffer holds a table of indirect descriptors.
        const INDIRECT = 1 << 2;
    }
}

/// One guest buffer of a descriptor chain: its guest address, its length in
/// bytes, and the direction it may be accessed in.
#[derive(Copy, Clone, Debug)]
pub enum ChainBuf {
    /// Buffer whose descriptor did not carry the `WRITE` flag.
    Readable(GuestAddr, u32),
    /// Buffer whose descriptor carried the `WRITE` flag.
    Writable(GuestAddr, u32),
}
impl ChainBuf {
    /// Returns true for a `Readable` buffer.
    pub fn is_readable(&self) -> bool {
        matches!(self, ChainBuf::Readable(..))
    }
    /// Returns true for a `Writable` buffer.
    pub fn is_writable(&self) -> bool {
        matches!(self, ChainBuf::Writable(..))
    }
}

/// Per-direction (readable or writable) accounting for a `Chain`, including
/// the cursor marking how far the buffers of that direction were consumed.
#[derive(Default, Debug)]
struct ChainStat {
    /// Number of buffers of this direction pushed onto the chain.
    count: u32,
    /// Total bytes across those buffers.
    bytes: u32,
    /// Bytes not yet consumed.
    bytes_remain: u32,
    /// Index into the chain's buffer list at which consumption resumes.
    pos_idx: u32,
    /// Byte offset already consumed within the buffer being worked on.
    pos_off: u32,
}

/// A descriptor chain popped from a virtqueue: the head descriptor index
/// plus the list of guest buffers, with separate accounting for the readable
/// and writable buffers.
#[derive(Debug)]
pub struct Chain {
    /// Head descriptor index while the chain is outstanding; `None` when the
    /// chain is empty or has been pushed back.
    idx: Option<u16>,
    /// Accounting and cursor for the readable buffers.
    read_stat: ChainStat,
    /// Accounting and cursor for the writable buffers.
    write_stat: ChainStat,
    /// All buffers of the chain, in the order they were pushed.
    bufs: Vec<ChainBuf>,
}
impl Chain {
    pub fn with_capacity(size: usize) -> Self {
        assert!(size <= u16::MAX as usize);
        Self {
            idx: None,
            read_stat: Default::default(),
            write_stat: Default::default(),
            bufs: Vec::with_capacity(size),
        }
    }
    /// Appends `buf` to the chain and adds its length to the accounting of
    /// the matching direction.
    fn push_buf(&mut self, buf: ChainBuf) {
        let len = match buf {
            ChainBuf::Readable(_, len) | ChainBuf::Writable(_, len) => len,
        };
        let stat = if buf.is_readable() {
            &mut self.read_stat
        } else {
            &mut self.write_stat
        };
        stat.count += 1;
        stat.bytes += len;
        stat.bytes_remain += len;
        self.bufs.push(buf);
    }
    /// Empties the chain: drops the descriptor index, zeroes both sets of
    /// accounting and removes all buffers (the allocation is kept).
    fn reset(&mut self) {
        self.idx = None;
        self.read_stat = Default::default();
        self.write_stat = Default::default();
        self.bufs.clear();
    }

    /// Fills `item` with the next `size_of::<T>()` bytes from the readable
    /// buffers of the chain, advancing the read cursor.
    ///
    /// Returns false without consuming anything if fewer readable bytes than
    /// that remain.  Also returns false if a guest memory copy fails part
    /// way; bytes copied up to that point stay consumed and `item` may be
    /// partially overwritten.
    pub fn read<T: Copy + FromBytes>(
        &mut self,
        item: &mut T,
        mem: &MemCtx,
    ) -> bool {
        let item_sz = mem::size_of::<T>();
        if (self.read_stat.bytes_remain as usize) < item_sz {
            return false;
        }
        // Safety: We assume the mutable item reference we have received is
        // valid (aligned, etc) to begin with.  It is cast into a u8 slice to
        // handle cases where it cannot be filled by a single buffer copy.
        let raw = unsafe {
            std::slice::from_raw_parts_mut(item as *mut T as *mut u8, item_sz)
        };
        // Number of bytes of `raw` filled so far.
        let mut done = 0;
        let total = self.for_remaining_type(true, |addr, len| {
            let mut remain = GuestData::from(&mut raw[done..]);
            if let Some(copied) = mem.read_into(addr, &mut remain, len) {
                // Keep going only while part of the item is still unfilled.
                let need_more = copied != remain.len();

                done += copied;
                (copied, need_more)
            } else {
                // Copy failed, so do not attempt anything else
                (0, false)
            }
        });
        total == item_sz
    }
    /// Fetch a string of readable guest regions from the chain, provided there
    /// are enough to cover a specified length.
    ///
    /// The returned regions cover exactly `len` bytes and are consumed from
    /// the chain.  Returns `None` (consuming nothing) when `len` is zero or
    /// exceeds the readable bytes remaining.
    pub fn readable_bufs(&mut self, len: usize) -> Option<Vec<GuestRegion>> {
        let available = self.read_stat.bytes_remain as usize;
        if len == 0 || available < len {
            return None;
        }

        let mut regions = Vec::new();
        let mut outstanding = len;
        self.for_remaining_type(true, |addr, blen| {
            // Never take more than is still outstanding, so the subtraction
            // below cannot underflow.
            let take = blen.min(outstanding);
            regions.push(GuestRegion(addr, take));
            outstanding -= take;
            (take, outstanding != 0)
        });
        assert_eq!(outstanding, 0);
        Some(regions)
    }
    /// Copies the bytes of `item` into the next `size_of::<T>()` bytes of the
    /// writable buffers of the chain, advancing the write cursor.
    ///
    /// Returns false without consuming anything if fewer writable bytes than
    /// that remain.  Also returns false if a guest memory copy fails part
    /// way; bytes written up to that point stay consumed.
    pub fn write<T: Copy + IntoBytes>(
        &mut self,
        item: &T,
        mem: &MemCtx,
    ) -> bool {
        let item_sz = mem::size_of::<T>();
        if (self.write_stat.bytes_remain as usize) < item_sz {
            return false;
        }
        // Safety: We assume the item reference we have received is valid
        // (aligned, etc) to begin with.  It is cast into a u8 slice to handle
        // cases where it cannot be filled by a single buffer copy.
        let raw = unsafe {
            std::slice::from_raw_parts(item as *const T as *const u8, item_sz)
        };
        // Number of bytes of `raw` written out so far.
        let mut done = 0;
        let total = self.for_remaining_type(false, |addr, len| {
            let remain = &raw[done..];
            if let Some(copied) = mem.write_from(addr, remain, len) {
                // Keep going only while part of the item is still unwritten.
                let need_more = copied != remain.len();

                done += copied;
                (copied, need_more)
            } else {
                // Copy failed, so do not attempt anything else
                (0, false)
            }
        });
        total == item_sz
    }

    /// Advances the write cursor by exactly `len` bytes without writing
    /// anything to the skipped space.
    ///
    /// Returns true once `len` bytes have been skipped (trivially so for a
    /// `len` of zero).  Returns false, consuming nothing, if fewer than `len`
    /// writable bytes remain in the chain.
    pub fn write_skip(&mut self, len: usize) -> bool {
        if len == 0 {
            return true;
        }
        if (self.write_stat.bytes_remain as usize) < len {
            return false;
        }
        // Bytes still to skip.  This must shrink as buffers are consumed;
        // otherwise a skip spanning several buffers swallows every one of
        // them whole and overshoots `len`.
        let mut remain = len;
        self.for_remaining_type(false, |_addr, blen| {
            // Consume (skip) the whole buffer, or only what is still needed.
            let to_consume = usize::min(blen, remain);
            remain -= to_consume;
            (to_consume, remain != 0)
        });
        remain == 0
    }
    /// Fetch a string of writable guest regions from the chain, provided there
    /// are enough to cover a specified length.
    ///
    /// The returned regions cover exactly `len` bytes and are consumed from
    /// the chain.  Returns `None` (consuming nothing) when `len` is zero or
    /// exceeds the writable bytes remaining.
    pub fn writable_bufs(&mut self, len: usize) -> Option<Vec<GuestRegion>> {
        let available = self.write_stat.bytes_remain as usize;
        if len == 0 || available < len {
            return None;
        }

        let mut regions = Vec::new();
        let mut outstanding = len;
        self.for_remaining_type(false, |addr, blen| {
            // Never take more than is still outstanding, so the subtraction
            // below cannot underflow.
            let take = blen.min(outstanding);
            regions.push(GuestRegion(addr, take));
            outstanding -= take;
            (take, outstanding != 0)
        });
        assert_eq!(outstanding, 0);
        Some(regions)
    }

    /// Returns the number of writable bytes not yet consumed from the chain.
    pub fn remain_write_bytes(&self) -> usize {
        self.write_stat.bytes_remain as usize
    }
    /// Returns the number of readable bytes not yet consumed from the chain.
    pub fn remain_read_bytes(&self) -> usize {
        self.read_stat.bytes_remain as usize
    }

    /// Walks the not-yet-consumed buffers of one direction (readable when
    /// `is_read` is true, writable otherwise), handing each to `f`.
    ///
    /// `f` receives the guest address and length of the unconsumed part of a
    /// buffer and returns `(consumed, do_more)`: how many of those bytes it
    /// used, and whether the walk should continue with the next buffer.  The
    /// cursor and remaining-byte count of that direction are advanced by
    /// what `f` consumed.
    ///
    /// Returns the total number of bytes consumed across all calls to `f`.
    ///
    /// Panics if `f` claims to have consumed more than it was offered, or if
    /// the total consumed exceeds the bytes remaining for that direction.
    pub(crate) fn for_remaining_type<F>(
        &mut self,
        is_read: bool,
        mut f: F,
    ) -> usize
    where
        F: FnMut(GuestAddr, usize) -> (usize, bool),
    {
        let stat = match is_read {
            true => &mut self.read_stat,
            false => &mut self.write_stat,
        };
        // Resume at the cursor, skipping leading buffers of the other
        // direction.
        let iter = self
            .bufs
            .iter()
            .enumerate()
            .skip(stat.pos_idx as usize)
            .skip_while(|(_i, buf)| {
                if is_read {
                    buf.is_writable()
                } else {
                    buf.is_readable()
                }
            });
        let mut consumed_total = 0;
        for (idx, buf) in iter {
            // Buffers of the other direction further down are passed over.
            let (addr, len) = match buf {
                ChainBuf::Readable(a, l) => {
                    if !is_read {
                        continue;
                    }
                    (*a, *l)
                }
                ChainBuf::Writable(a, l) => {
                    if is_read {
                        continue;
                    }
                    (*a, *l)
                }
            };
            if len == 0 {
                // skip 0-len buffers, even though they should not exist
                continue;
            }
            // Offer only the part of the buffer beyond the cursor offset.
            assert!(stat.pos_off < len);
            let off_addr = GuestAddr(addr.0 + u64::from(stat.pos_off));
            let off_len = (len - stat.pos_off) as usize;
            let (consumed, do_more) = f(off_addr, off_len);
            assert!(consumed <= off_len);
            if consumed != 0 {
                consumed_total += consumed;
                if consumed == off_len {
                    // Buffer exhausted: move the cursor to the next one.
                    stat.pos_idx = idx as u32 + 1;
                    stat.pos_off = 0;
                } else {
                    // Partially consumed: remember the offset within it.
                    stat.pos_off += consumed as u32;
                }
            }
            if !do_more {
                break;
            }
        }
        assert!(consumed_total as u32 <= stat.bytes_remain);
        stat.bytes_remain -= consumed_total as u32;
        consumed_total
    }
}

/// Guest-physical placement of a virtqueue, as reported by
/// `VirtQueue::get_state` and accepted by `VirtQueue::set_state`.
#[derive(Debug)]
pub struct MapInfo {
    /// Address of the descriptor table.
    pub desc_addr: u64,
    /// Address of the available ring.
    pub avail_addr: u64,
    /// Address of the used ring.
    pub used_addr: u64,
    /// Whether the addresses describe a valid mapping.
    pub valid: bool,
}

/// Snapshot of a virtqueue's placement and ring positions, as reported by
/// `VirtQueue::get_state` and accepted by `VirtQueue::set_state`.
#[derive(Debug)]
pub struct Info {
    /// Where the queue lives in guest-physical memory.
    pub mapping: MapInfo,
    /// Next available-ring index the device will consume.
    pub avail_idx: u16,
    /// Next used-ring index the device will publish.
    pub used_idx: u16,
}

/// The set of virtqueues belonging to a device.
///
/// All queues are created up front; `len` tracks how many of them are
/// currently counted as in use and can be changed with `set_len`.
pub struct VirtQueues {
    /// Current queue count, as last set by the constructor or `set_len`.
    len: AtomicUsize,
    /// Highest value `len` has been set to (see `set_len`).
    peak: AtomicUsize,
    /// Every queue, indexed by queue ID.
    queues: Vec<Arc<VirtQueue>>,
}

/// Largest number of queues a `VirtQueues` may be created with; enforced by
/// assertions in its constructors.
const MAX_QUEUES: usize = 65535;

impl VirtQueues {
    /// Build a queue set with one [`VirtQueue`] per entry of `sizes`, all of
    /// which are initially considered in use.
    ///
    /// # Panics
    ///
    /// Panics if `sizes` is empty or holds more than `MAX_QUEUES` entries.
    pub fn new(sizes: &[VqSize]) -> Self {
        let count = sizes.len();
        assert!(
            count != 0 && count <= MAX_QUEUES,
            "virtqueue size must be positive u16"
        );
        Self::new_with_len(count, sizes)
    }

    /// Build a queue set with one [`VirtQueue`] per entry of `sizes`, of which
    /// only `initial_len` are initially considered in use. The peak starts
    /// out equal to `initial_len`.
    ///
    /// # Panics
    ///
    /// Panics if `initial_len` is zero or exceeds `sizes.len()`, or if
    /// `sizes` holds more than `MAX_QUEUES` entries.
    pub fn new_with_len(initial_len: usize, sizes: &[VqSize]) -> Self {
        let capacity = sizes.len();
        assert!(
            initial_len > 0
                && initial_len <= capacity
                && capacity <= MAX_QUEUES,
            "virtqueue size must be positive u16 and len must be smaller pos"
        );

        // Queue ids are simply positions in `sizes`; the assertion above
        // guarantees they fit in a u16.
        let mut queues = Vec::with_capacity(capacity);
        for (id, size) in sizes.iter().enumerate() {
            queues.push(Arc::new(VirtQueue::new(id as u16, *size)));
        }

        Self {
            len: AtomicUsize::new(initial_len),
            peak: AtomicUsize::new(initial_len),
            queues,
        }
    }

    /// Set the number of queues currently in use to `len`, raising the
    /// recorded peak if `len` exceeds it.
    ///
    /// # Errors
    ///
    /// Returns `Err(len)`, leaving all state untouched, if `len` is zero or
    /// greater than [`Self::max_capacity`].
    pub fn set_len(&self, len: usize) -> Result<(), usize> {
        if len == 0 || len > self.max_capacity() {
            return Err(len);
        }
        self.len.store(len, Ordering::Release);
        // Atomically raise the peak to `len` if (and only if) `len` is
        // larger than the current peak. This is the same read-modify-write
        // a compare-exchange retry loop would perform, expressed as one
        // standard-library call.
        self.peak.fetch_max(len, Ordering::Acquire);
        Ok(())
    }

    /// The number of queues currently in use, as a `NonZeroU16`.
    ///
    /// # Panics
    ///
    /// Panics if the current length, truncated to `u16`, is zero.
    pub fn count(&self) -> NonZeroU16 {
        let current = self.len() as u16;
        let nonzero = NonZeroU16::try_from(current);
        nonzero.expect("queue count already validated")
    }

    /// The number of queues currently in use (read with relaxed ordering).
    pub fn len(&self) -> usize {
        self.len.load(Ordering::Relaxed)
    }

    /// The largest in-use queue count recorded since construction or the last
    /// `reset_peak` (read with relaxed ordering).
    pub fn peak(&self) -> usize {
        self.peak.load(Ordering::Relaxed)
    }

    /// Forget the recorded high watermark by setting the peak to the current
    /// in-use queue count.
    pub fn reset_peak(&self) {
        self.peak.store(self.len.load(Ordering::Relaxed), Ordering::Relaxed);
    }

    /// The total number of queues allocated at construction, which is the
    /// largest value `set_len` will accept.
    pub const fn max_capacity(&self) -> usize {
        self.queues.len()
    }

    /// Look up the queue the guest addresses as `qid`.
    ///
    /// The last in-use id (`len - 1`) always resolves to the final allocated
    /// queue; lower ids resolve to the queue at that position. Ids at or
    /// beyond the in-use count yield `None`.
    pub fn get(&self, qid: u16) -> Option<&Arc<VirtQueue>> {
        let active = self.len();
        let index = usize::from(qid);
        // XXX: This special case is for the virtio network device, which always
        // puts the control queue at the end of queue vector (see VirtIO 1.2
        // section 5.1.2).  None of the other devices currently handle queues
        // specially in this way, but we should come up with some better
        // mechanism here.
        if index + 1 != active {
            return self.queues[..active].get(index);
        }
        Some(self.get_control())
    }

    /// The last allocated queue, which `get`, `iter` and `iter_all` treat as
    /// the control queue regardless of the current in-use count.
    fn get_control(&self) -> &Arc<VirtQueue> {
        &self.queues[self.max_capacity() - 1]
    }

    /// Iterate the queues currently in use: the first `len - 1` queues
    /// followed by the control queue.
    pub fn iter(&self) -> impl std::iter::Iterator<Item = &Arc<VirtQueue>> {
        let in_use = &self.queues[..self.len() - 1];
        in_use.iter().chain(std::iter::once(self.get_control()))
    }

    /// Iterate every queue the device may have used. The in-use count can be
    /// lower than an earlier high watermark, yet for operations such as device
    /// reset and teardown all viona rings corresponding to ever-active
    /// VirtQueues must be managed, so this walks up to the recorded peak
    /// (followed by the control queue) rather than the current length.
    pub fn iter_all(&self) -> impl std::iter::Iterator<Item = &Arc<VirtQueue>> {
        let ever_active = &self.queues[..self.peak() - 1];
        ever_active.iter().chain(std::iter::once(self.get_control()))
    }

    /// Capture the current length, peak, and the exported state of every
    /// allocated queue (not just those in use) for migration.
    pub fn export(&self) -> migrate::VirtQueuesV1 {
        migrate::VirtQueuesV1 {
            len: self.len() as u64,
            peak: self.peak() as u64,
            queues: self.queues.iter().map(|q| q.export()).collect(),
        }
    }

    /// Restore queue state from a migration payload.
    ///
    /// Queues are imported pairwise in order; if the payload and this
    /// collection hold different numbers of queues, only the common prefix
    /// is imported. The length is then set first to the payload's peak and
    /// afterwards to its length.
    ///
    /// # Errors
    ///
    /// Propagates the first per-queue import failure, or returns
    /// `MigrateStateError::ImportFailed` if either length is rejected by
    /// `set_len`.
    pub fn import(
        &self,
        state: &migrate::VirtQueuesV1,
        mode: virtio::Mode,
    ) -> Result<(), MigrateStateError> {
        self.queues
            .iter()
            .zip(state.queues.iter())
            .try_for_each(|(vq, vq_input)| vq.import(vq_input, mode))?;

        // Avoid mucking with `peak` directly, since peak implies at some point
        // the device had been `set_len()` for that many queues and later
        // `set_len()` down to the actual exported count.
        if let Err(len) = self.set_len(state.peak as usize) {
            return Err(MigrateStateError::ImportFailed(format!(
                "VirtQueues: could not set len to peak: {len}"
            )));
        }
        if let Err(len) = self.set_len(state.len as usize) {
            return Err(MigrateStateError::ImportFailed(format!(
                "VirtQueues: could not set len to {len}"
            )));
        }
        Ok(())
    }
}

/// Serialized forms of virtqueue state used for migration.
pub mod migrate {
    use serde::{Deserialize, Serialize};

    /// Migration payload for a whole `VirtQueues` collection, as produced by
    /// `VirtQueues::export` and consumed by `VirtQueues::import`.
    #[derive(Deserialize, Serialize)]
    pub struct VirtQueuesV1 {
        /// Number of queues in use at export time.
        pub len: u64,
        /// Peak in-use queue count at export time.
        pub peak: u64,
        /// Exported state of every allocated queue, in queue order.
        pub queues: Vec<VirtQueueV1>,
    }

    /// Migration payload for a single virtqueue.
    ///
    /// NOTE(review): the code which fills in and consumes these fields
    /// (`VirtQueue::export`/`import`) is not visible here; consult it for the
    /// precise meaning of each field.
    #[derive(Deserialize, Serialize)]
    pub struct VirtQueueV1 {
        pub id: u16,
        pub size: u16,
        pub descr_gpa: u64,
        pub mapping_valid: bool,
        pub live: bool,
        pub enabled: bool,
        pub is_control: bool,
        pub notify_data: u16,

        pub avail_gpa: u64,
        pub avail_cur_idx: u16,

        pub used_gpa: u64,
        pub used_idx: u16,
    }
}

/// Copy the bytes of `buf` into the writable buffers remaining in `chain`,
/// stopping once all of `buf` has been written, the chain is exhausted, or a
/// guest memory write fails. Nothing is reported back to the caller.
#[cfg(feature = "falcon")]
pub(crate) fn write_buf(buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
    // more copy pasta from Chain::write b/c like Chain:read a
    // statically sized type is expected.
    let mut written = 0;
    let _total = chain.for_remaining_type(false, |addr, len| {
        let pending = &buf[written..];
        match mem.write_from(addr, pending, len) {
            Some(copied) => {
                written += copied;
                // Keep going only while part of `buf` is still unwritten.
                (copied, copied != pending.len())
            }
            // Copy failed, so do not attempt anything else
            None => (0, false),
        }
    });
}


================================================
FILE: lib/propolis/src/hw/virtio/softnpu.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::{
    collections::BTreeMap,
    fs::{self, File, OpenOptions},
    io::{Result, Write},
    sync::{Arc, Mutex},
    thread::{sleep, spawn},
    time::Duration,
};

use crate::{
    chardev::{Sink, Source},
    common::*,
    hw::{pci, uart::LpcUart, virtio},
    migrate::Migrator,
    util::regmap::RegMap,
    vmm::MemCtx,
};

use super::{
    bits::*,
    pci::{PciVirtio, PciVirtioState},
    queue::{write_buf, Chain, VirtQueue, VirtQueues, VqSize},
    viona::bits::VIRTIO_NET_S_LINK_UP,
    VirtioDevice,
};

use softnpu::p4rs::{self, packet_in, packet_out, Pipeline};
use softnpu::ManagementRequest;

use crate::hw::virtio::p9fs::{write_error, P9Handler, PciVirtio9pfs};
use dlpi::sys::dlpi_recvinfo_t;
use lazy_static::lazy_static;
use libc::ENOTSUP;
use libloading::os::unix::{Library, Symbol, RTLD_NOW};
use p9ds::proto::{Rclunk, Rwrite, Twrite};
use rand::Rng;
use serde::{Deserialize, Serialize};
use slog::{error, info, warn, Logger};

/// Size in bytes of the buffers used to hold a single frame; large enough to
/// transit jumbo frames.
const MTU: usize = 9216;
/// Egress port number that `process_guest_packet` refuses to forward to; the
/// port type it denotes is not currently emulated.
const SOFTNPU_CPU_AUX_PORT: u16 = 1000;

/// Byte marking the start of a management message on the uart; the message
/// reader discards everything up to and including it.
pub const MANAGEMENT_MESSAGE_PREAMBLE: u8 = 0b11100101;
/// NOTE(review): presumably the guest-side device path of the management uart
/// (the `SoftNpu` docs say it appears in the guest as `tty03`); it is not
/// referenced in the visible code — confirm against its users.
pub const SOFTNPU_TTY: &str = "/dev/tty03";

/// A software network processing unit (SoftNpu) is an ASIC emulator. It's meant
/// to represent a P4 programmable ASIC such as those found in programmable
/// switches and NICs.
///
/// A SoftNpu instance can support a variable number of ports. These ports are
/// specified by the user as data link names through propolis configuration.
/// SoftNpu establishes a dlpi handle on each configured data link to perform
/// packet i/o.
///
/// When a SoftNpu device is instantiated there is no P4 program that runs by
/// default. A program must be loaded onto the emulated ASIC just like a real
/// ASIC. This is accomplished through the P9 file system device exposed by
/// SoftNpu. This P9 implementation exports a specific version string 9P2000.P4
/// and only implements file writes to allow a consumer to upload a P4 program.
///
/// SoftNpu takes pre-compiled P4 programs in the form of shared libraries.
/// These shared libraries must export a [pipeline constructor](
/// https://oxidecomputer.github.io/p4/p4rs/trait.Pipeline.html) under the
/// symbol `_main_pipeline_create`. Programs compiled with the `x4c` compiler
/// export this symbol automatically.
///
/// Once a pre-compiled P4 program is loaded, the Pipeline object from that
/// program is used to process packets. SoftNpu uses the illumos dlpi interface
/// to send and receive raw Ethernet frames from the data link devices it has
/// been configured with. Each frame received is processed with the loaded
/// pipeline. If the pipeline invocation returns an egress port, then the egress
/// packet returned by the pipeline will be sent out that port. If no egress
/// port is returned, the packet is dropped.
///
/// In addition to forwarding packets between ports, SoftNpu also supports
/// forwarding packets to and from the guest. This is accomplished through a
/// special `pci_port` device. This is a viona device that shows up in the guest
/// as a virtio network device. When a pipeline invocation returns an egress
/// port of `0`, packets are sent to this port.
///
/// Most P4 programs require a corresponding control plane program to manage
/// table state. For example, a program to add routing entries onto the ASIC. P4
/// programs themselves only handle packets, they are not capable of managing
/// table state. SoftNpu provides a uart-based management interface so that
/// programs running in the guest can modify the tables of the P4 program loaded
/// onto the ASIC. This uart is plumbed into the guest as `tty03`. What tables
/// exist and how they can be modified is up to the particular program that is
/// loaded. SoftNpu just provides a generic interface for table management and a
/// few other ASIC housekeeping items like determining the number of ports.
pub struct SoftNpu {
    /// Data links SoftNpu will hook into.
    pub data_links: Vec<String>,

    /// The PCI port.
    pub pci_port: Arc<PciVirtioSoftNpuPort>,

    /// UART for management from guest
    uart: Arc<LpcUart>,

    /// P9 file system endpoint for pre-compiled program transfer
    pub p9fs: Arc<PciVirtio9pfs>,

    /// Set once `start` has spawned the management and ingress handler
    /// threads, so that later `start` calls do not spawn them again.
    booted: Mutex<bool>,

    /// Logging instance
    log: Logger,
}

// SAFETY: NOTE(review): no soundness argument is recorded for these impls.
// Presumably they are needed because a field reachable from `SoftNpu` (likely
// the dlpi handles held by the PCI port) is not automatically `Send`/`Sync` —
// TODO confirm which field requires them and why sharing it across threads is
// sound.
unsafe impl Send for SoftNpu {}
unsafe impl Sync for SoftNpu {}

/// A loaded P4 program: the dynamic library handle together with the pipeline
/// object used to process packets.
///
/// NOTE(review): presumably the `Library` is kept alongside the pipeline so
/// the code backing the pipeline stays loaded — confirm where the pair is
/// constructed.
type LoadedP4Program = (Library, Box<dyn Pipeline>);

/// PciVirtioSoftNpuPort is a PCI device exposed to the guest as a virtio-net
/// device. This device represents a sidecar CPU port.
pub struct PciVirtioSoftNpuPort {
    /// Logging instance
    log: Logger,

    /// Virtio state to guest
    virtio_state: Arc<PortVirtioState>,

    /// dlpi handles for external i/o, one per configured data link
    data_handles: Vec<dlpi::DlpiHandle>,

    /// MAC address reported to the guest through the device config space
    mac: [u8; 6],

    /// The currently loaded P4 program, or `None` if none has been loaded.
    //TODO should be able to do this as a RwLock
    pipeline: Arc<Mutex<Option<LoadedP4Program>>>,
}

/// The virtio and PCI state backing the guest-facing SoftNpu port, bundled so
/// it can be shared between the device and the packet handler threads.
pub struct PortVirtioState {
    /// Underlying virtio state
    pci_virtio_state: PciVirtioState,

    /// Underlying PCI device state
    pci_state: pci::DeviceState,
}

impl PortVirtioState {
    /// Build the state for a legacy-mode virtio network device with two
    /// queues of `queue_size` entries each and an MSI-X count of two.
    fn new(queue_size: u16) -> Self {
        let queues = VirtQueues::new(&[
            // rx queue
            VqSize::new(queue_size),
            // tx queue
            VqSize::new(queue_size),
        ]);
        let (pci_virtio_state, pci_state) = PciVirtioState::new(
            virtio::Mode::Legacy,
            queues,
            Some(2),
            virtio::DeviceId::Network,
            VIRTIO_NET_CFG_SIZE,
        );
        Self { pci_virtio_state, pci_state }
    }
}

impl SoftNpu {
    /// Construct a SoftNpu device attached to the given data links.
    ///
    /// * `queue_size` sizes the queues of the viona device that underpins the
    ///   PCI port going to the guest.
    /// * `uart` provides the P4 management interface to the guest.
    /// * `pipeline` holds the object used to process packets. In most cases
    ///   the value in the mutex should be initialized to `None`, as users
    ///   will dynamically load a P4 program from inside the guest.
    ///
    /// # Errors
    ///
    /// Fails if a dlpi handle cannot be set up for any of the data links.
    pub fn new(
        data_links: Vec<String>,
        queue_size: u16,
        uart: Arc<LpcUart>,
        p9fs: Arc<PciVirtio9pfs>,
        pipeline: Arc<Mutex<Option<LoadedP4Program>>>,
        log: Logger,
    ) -> Result<Arc<Self>> {
        info!(log, "softnpu: data links {:#?}", data_links);

        let handles = Self::data_handles(&data_links)?;
        let port_state = Arc::new(PortVirtioState::new(queue_size));
        let mac = Self::generate_mac();
        let pci_port = PciVirtioSoftNpuPort::new(
            mac,
            handles,
            port_state,
            pipeline.clone(),
            log.clone(),
        );

        let npu = SoftNpu {
            data_links,
            pci_port,
            uart,
            p9fs,
            booted: Mutex::new(false),
            log,
        };
        Ok(Arc::new(npu))
    }

    /// Generate a mac address with the Oxide OUI for the leading bits and then
    /// something random in the range of 0xf00000 - 0xffffff (upper bound
    /// exclusive) per RFD 174.
    fn generate_mac() -> [u8; 6] {
        let mut rng = rand::rng();
        let suffix = rng.random_range::<u32, _>(0xf00000..0xffffff);
        // MAC octets are written most-significant first, so take the low
        // three bytes of the big-endian encoding. Using the little-endian
        // encoding here would place the 0xf_ octet last, producing addresses
        // whose 24-bit suffix falls outside the intended range.
        let [_, hi, mid, lo] = suffix.to_be_bytes();
        [0xa8, 0x40, 0x25, hi, mid, lo]
    }

    /// Open, bind and put into promiscuous mode one raw dlpi handle per data
    /// link, returning the handles in the same order as `data_links`.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first dlpi failure.
    fn data_handles(data_links: &Vec<String>) -> Result<Vec<dlpi::DlpiHandle>> {
        data_links
            .iter()
            .map(|link| -> Result<dlpi::DlpiHandle> {
                let handle = dlpi::open(link, dlpi::sys::DLPI_RAW)?;

                // Although we bind to the IPv6 SAP (Ethertype), the
                // DL_PROMISC_SAP allows us to pick up everything. Binding to
                // *something* to start with appears to be required to get
                // packets.
                dlpi::bind(handle, 0x86dd)?;
                dlpi::promisc_on(handle, dlpi::sys::DL_PROMISC_MULTI)?;
                dlpi::promisc_on(handle, dlpi::sys::DL_PROMISC_SAP)?;
                dlpi::promisc_on(handle, dlpi::sys::DL_PROMISC_PHYS)?;
                dlpi::promisc_on(handle, dlpi::sys::DL_PROMISC_RX_ONLY)?;
                Ok(handle)
            })
            .collect()
    }

    /// Spawn the thread which services management requests arriving from the
    /// guest over the uart. Autodiscard is switched off on the uart first.
    fn run_management_handler_thread(&self) {
        info!(self.log, "softnpu: running management handler");
        self.uart.set_autodiscard(false);

        // The thread outlives this call, so hand it its own handles.
        let radix = self.data_links.len();
        let (uart, log) = (self.uart.clone(), self.log.clone());
        let pipeline = self.pci_port.pipeline.clone();

        spawn(move || Self::management_handler(uart, pipeline, radix, log));
    }

    /// Body of the management thread: forever read one management message
    /// from the uart and dispatch it. Never returns.
    fn management_handler(
        uart: Arc<LpcUart>,
        pipeline: Arc<Mutex<Option<LoadedP4Program>>>,
        radix: usize,
        log: Logger,
    ) {
        info!(log, "management handler thread started");
        loop {
            let reader =
                ManagementMessageReader::new(uart.clone(), log.clone());
            let msg = reader.read();
            info!(log, "received management message: {:#?}", msg);

            handle_management_message(
                msg,
                pipeline.clone(),
                uart.clone(),
                radix,
                log.clone(),
            );
            info!(log, "handled management message");
        }
    }
}

impl Lifecycle for SoftNpu {
    /// Name under which this device identifies itself.
    fn type_name(&self) -> &'static str {
        "softnpu"
    }

    /// Spawn the management handler thread and one ingress packet handler
    /// thread per data link handle. Only the first call does any work; later
    /// calls find `booted` set and return immediately.
    fn start(&self) -> anyhow::Result<()> {
        let mut booted = self.booted.lock().unwrap();
        if !*booted {
            self.run_management_handler_thread();

            let port = &self.pci_port;
            for i in 0..port.data_handles.len() {
                info!(
                    self.log,
                    "starting ingress packet handler for port {}", i
                );

                PacketHandler::run_ingress_packet_handler_thread(
                    i,
                    port.data_handles.clone(),
                    port.virtio_state.clone(),
                    port.pipeline.clone(),
                    self.log.clone(),
                );
            }
            *booted = true;
        }
        Ok(())
    }

    /// SoftNpu does not support migration.
    fn migrate(&'_ self) -> Migrator<'_> {
        Migrator::NonMigratable
    }
}

impl PciVirtioSoftNpuPort {
    /// Assemble a port from its MAC address, the dlpi handles used for
    /// external i/o, the guest-facing virtio state, the shared pipeline slot
    /// and a logger.
    pub fn new(
        mac: [u8; 6],
        data_handles: Vec<dlpi::DlpiHandle>,
        virtio: Arc<PortVirtioState>,
        pipeline: Arc<Mutex<Option<LoadedP4Program>>>,
        log: Logger,
    ) -> Arc<Self> {
        let port = PciVirtioSoftNpuPort {
            log,
            virtio_state: virtio,
            data_handles,
            mac,
            pipeline,
        };
        Arc::new(port)
    }

    /// Service a notification on `vq` from the guest.
    ///
    /// Notifications for queue 0 are handed to `handle_q0_req`. For any other
    /// queue a single descriptor chain is popped and frames are read from it
    /// in a loop: a 10-byte prefix followed by up to `MTU` bytes of frame,
    /// which is run through the loaded pipeline as a guest packet. The chain
    /// is pushed back as used only if something was read from it.
    fn handle_guest_virtio_request(&self, vq: &VirtQueue) {
        if vq.id == 0 {
            return self.handle_q0_req(vq);
        }

        // Without access to guest memory there is nothing to do.
        let mem = match self.virtio_state.pci_state.acc_mem.access() {
            Some(mem) => mem,
            None => return,
        };
        let mut chain = Chain::with_capacity(1);
        let Some((_idx, _clen)) = vq.pop_avail(&mut chain, &mem) else {
            return;
        };

        // only vq.push_used if we actually read something
        let mut push_used = false;

        // read as many Ethernet frames from the guest as we can
        loop {
            let mut virtio_bytes = [0u8; 10];
            // read in virtio mystery bytes
            // NOTE(review): presumably the legacy virtio-net header — confirm
            let n = read_buf(&mem, &mut chain, &mut virtio_bytes);
            if n != 10 {
                if n > 0 {
                    push_used = true;
                } else {
                    // nothing left in the chain
                    break;
                }
                warn!(self.log, "failed to read virtio mystery bytes ({})", n);
                // (no `break` here: after a short, non-empty read the frame
                // read below still runs)
            }

            let mut frame = [0u8; MTU];
            // read in the Ethernet frame (at most MTU bytes)
            let n = read_buf(&mem, &mut chain, &mut frame);
            if n == 0 {
                break;
            }
            push_used = true;

            let pkt = packet_in::new(&frame[..n]);

            // The pipeline lock is taken per frame and released at the end of
            // each loop iteration.
            let mut pipeline = match self.pipeline.lock() {
                Ok(pipe) => pipe,
                Err(e) => {
                    error!(self.log, "failed to lock pipeline: {}", e);
                    break;
                }
            };
            let pl: &mut Box<dyn Pipeline> = match &mut *pipeline {
                Some(ref mut x) => &mut x.1,
                None => {
                    // This just means no P4 program has been set by the guest.
                    break;
                }
            };

            PacketHandler::process_guest_packet(
                pkt,
                &self.data_handles,
                pl,
                &self.log,
            );
        }

        if push_used {
            vq.push_used(&mut chain, &mem);
        }
    }

    /// Handle a notification on queue 0. This is intentionally a no-op.
    fn handle_q0_req(&self, _vq: &VirtQueue) {
        // ignore notifications from the queue that we use for writing to the
        // guest.
    }

    /// Satisfy a guest read of the network device config register `id` by
    /// writing the register's value into `ro`.
    fn net_cfg_read(&self, id: &NetReg, ro: &mut ReadOp) {
        match *id {
            NetReg::Mac => {
                ro.write_bytes(&self.mac);
            }
            // Always report link up
            NetReg::Status => {
                ro.write_u16(VIRTIO_NET_S_LINK_UP);
            }
            // hard-wired to single vq pair for now
            NetReg::MaxVqPairs => {
                ro.write_u16(1);
            }
        }
    }
}

impl Lifecycle for PciVirtioSoftNpuPort {
    /// Name under which this device identifies itself.
    fn type_name(&self) -> &'static str {
        "pci-virtio-softnpu-port"
    }

    /// Reset the underlying virtio state on behalf of this device.
    fn reset(&self) {
        let state = &self.virtio_state.pci_virtio_state;
        state.reset(self);
    }
}

impl PciVirtio for PciVirtioSoftNpuPort {
    /// The virtio state held inside the shared `PortVirtioState`.
    fn virtio_state(&self) -> &PciVirtioState {
        &self.virtio_state.pci_virtio_state
    }

    /// The PCI device state held inside the shared `PortVirtioState`.
    fn pci_state(&self) -> &pci::DeviceState {
        &self.virtio_state.pci_state
    }
}

impl VirtioDevice for PciVirtioSoftNpuPort {
    /// Process an access to the device-specific config space. Reads are
    /// answered by `net_cfg_read`; writes are ignored.
    fn rw_dev_config(&self, mut rwo: RWOp) {
        NET_DEV_REGS.process(&mut rwo, |id, op| {
            //ignore writes
            if let RWOp::Read(ro) = op {
                self.net_cfg_read(id, ro);
            }
        });
    }

    /// This device always operates in legacy mode.
    fn mode(&self) -> virtio::Mode {
        virtio::Mode::Legacy
    }

    /// The only feature offered is `VIRTIO_NET_F_MAC`.
    fn features(&self) -> u64 {
        VIRTIO_NET_F_MAC
    }

    /// Any feature set chosen by the guest is accepted without inspection.
    fn set_features(&self, _feat: u64) -> std::result::Result<(), ()> {
        Ok(())
    }

    /// Service a guest notification on `vq`.
    fn queue_notify(&self, vq: &VirtQueue) {
        self.handle_guest_virtio_request(vq);
    }
}

/// Stateless namespace for the packet-processing routines; every function in
/// its impl is an associated function taking the state it needs as arguments.
struct PacketHandler {}

impl PacketHandler {
    /// Start a detached thread running the ingress packet handler for the
    /// data link at position `index` of `data_handles`.
    fn run_ingress_packet_handler_thread(
        index: usize,
        data_handles: Vec<dlpi::DlpiHandle>,
        virtio: Arc<PortVirtioState>,
        pipeline: Arc<Mutex<Option<LoadedP4Program>>>,
        log: Logger,
    ) {
        // Everything is moved into the thread, which owns it from here on.
        let worker = move || {
            info!(log, "ingress packet handler is running for port {}", index,);
            Self::run_ingress_packet_handler(
                index,
                data_handles,
                virtio,
                pipeline,
                log,
            )
        };
        spawn(worker);
    }

    /// Receive loop for the data link at position `index`: block on dlpi for
    /// a frame, then run it through the loaded pipeline as having arrived on
    /// port `index + 1`. Frames received while no program is loaded are
    /// dropped. Never returns.
    fn run_ingress_packet_handler(
        index: usize,
        data_handles: Vec<dlpi::DlpiHandle>,
        virtio: Arc<PortVirtioState>,
        pipeline: Arc<Mutex<Option<LoadedP4Program>>>,
        log: Logger,
    ) {
        let dh = data_handles[index];
        loop {
            //
            // wait for a packet from dlpi
            //
            let mut src = [0u8; dlpi::sys::DLPI_PHYSADDR_MAX];
            let mut msg = [0u8; MTU];
            let mut recvinfo = dlpi_recvinfo_t::default();
            let received = dlpi::recv(
                dh,
                &mut src,
                &mut msg,
                -1, // block until we get something
                Some(&mut recvinfo),
            );
            let n = match received {
                Ok((_, n)) => n,
                Err(e) => {
                    error!(log, "rx error at index {}: {}", index, e);
                    continue;
                }
            };

            //
            // process packet with loaded P4 program
            //

            // TODO pipeline should not need to be mutable for packet handling?
            let pkt = packet_in::new(&msg[..n]);
            let mut guard = pipeline.lock().unwrap();
            let pl = match guard.as_mut() {
                Some((_library, pl)) => pl,
                // no program is loaded
                None => continue,
            };

            // External ports are numbered from 1; port 0 is the guest.
            Self::process_external_packet(
                index + 1,
                pkt,
                &data_handles,
                &virtio,
                pl,
                &log,
            );
        }
    }

    /// Feed a packet that arrived on external port `index` through the loaded
    /// pipeline and deliver each resulting packet: egress port 0 goes to the
    /// guest's CPU port, any other egress port `p` goes out the data link at
    /// position `p - 1`.
    fn process_external_packet(
        index: usize,
        mut pkt: packet_in<'_>,
        data_handles: &Vec<dlpi::DlpiHandle>,
        virtio: &Arc<PortVirtioState>,
        pipeline: &mut Box<dyn Pipeline>,
        log: &Logger,
    ) {
        let ingress = index as u16;
        for (mut out_pkt, port) in pipeline.process_packet(ingress, &mut pkt) {
            match port {
                // packet is going to CPU port
                0 => Self::send_packet_to_cpu_port(&mut out_pkt, virtio, log),
                // packet is passing through
                egress => Self::send_packet_to_ext_port(
                    &mut out_pkt,
                    data_handles,
                    egress - 1,
                    log,
                ),
            }
        }
    }

    /// Run a packet coming into the ASIC from the guest pci port through the
    /// loaded pipeline and forward it on to its destination.
    ///
    /// The pipeline may produce several `(packet, egress port)` outputs for
    /// one input. Outputs addressed to port 0 (the guest itself) or to
    /// `SOFTNPU_CPU_AUX_PORT` are skipped individually; every other output is
    /// sent out the external port `port - 1`, independent of where the
    /// skipped outputs appear in the pipeline's result.
    fn process_guest_packet(
        mut pkt: packet_in<'_>,
        data_handles: &Vec<dlpi::DlpiHandle>,
        pipeline: &mut Box<dyn Pipeline>,
        log: &Logger,
    ) {
        for (mut out_pkt, port) in pipeline.process_packet(0, &mut pkt) {
            if port == 0 {
                // no looping packets back to the guest
                continue;
            }
            if port == SOFTNPU_CPU_AUX_PORT {
                // we are not currently emulating this port type
                continue;
            }
            Self::send_packet_to_ext_port(
                &mut out_pkt,
                data_handles,
                port - 1,
                log,
            );
        }
    }

    /// Transmit `pkt` (header followed by payload) on the data link at
    /// position `port` using dlpi. An out-of-range `port` or a send failure
    /// is logged and otherwise ignored.
    fn send_packet_to_ext_port(
        pkt: &mut packet_out<'_>,
        data_handles: &Vec<dlpi::DlpiHandle>,
        port: u16,
        log: &Logger,
    ) {
        // get the dlpi handle for this port
        let Some(&dh) = data_handles.get(usize::from(port)) else {
            error!(log, "port out of range {} >= {}", port, data_handles.len());
            return;
        };

        //TODO avoid copying the whole packet
        let mut out = pkt.header_data.clone();
        out.extend_from_slice(pkt.payload_data);

        if let Err(e) = dlpi::send(dh, &[], out.as_slice(), None) {
            error!(log, "tx (ext,0): {}", e);
        }
    }

    /// Deliver `pkt` to the guest through queue 0 of the virtio port. The
    /// packet is silently dropped if guest memory is inaccessible (after a
    /// warning) or if the guest has no available descriptor chain.
    fn send_packet_to_cpu_port(
        pkt: &mut packet_out<'_>,
        virtio: &Arc<PortVirtioState>,
        log: &Logger,
    ) {
        let Some(mem) = virtio.pci_state.acc_mem.access() else {
            warn!(log, "send packet to guest: no guest virtio memory");
            return;
        };
        let mut chain = Chain::with_capacity(1);
        let vq = virtio.pci_virtio_state.queues.get(0).expect("a queue");
        if vq.pop_avail(&mut chain, &mem).is_none() {
            return;
        }

        // write the virtio mystery bytes, then the packet itself
        write_buf(&[0u8; 10], &mut chain, &mem);
        write_buf(pkt.header_data.as_mut_slice(), &mut chain, &mem);
        write_buf(pkt.payload_data, &mut chain, &mem);

        vq.push_used(&mut chain, &mem);
    }
}

/// Registers of the network device config space exposed to the guest.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum NetReg {
    /// The port's MAC address.
    Mac,
    /// Link status; reads always report link up.
    Status,
    /// Maximum number of virtqueue pairs; reads always report 1.
    MaxVqPairs,
}
lazy_static! {
    /// Packed layout of the network device config space: a 6-byte MAC, a
    /// 2-byte status and a 2-byte max-vq-pairs register, which together fill
    /// the `VIRTIO_NET_CFG_SIZE` bytes.
    static ref NET_DEV_REGS: RegMap<NetReg> = {
        let layout =
            [(NetReg::Mac, 6), (NetReg::Status, 2), (NetReg::MaxVqPairs, 2)];
        RegMap::create_packed(VIRTIO_NET_CFG_SIZE, &layout, None)
    };
}

/// Constants specific to this device.
mod bits {
    /// Size in bytes of the device-specific config space (MAC + status +
    /// max vq pairs).
    pub const VIRTIO_NET_CFG_SIZE: usize = 0xa;
}
use bits::*;

// helper functions to read/write a buffer from/to a guest

/// Fill `buf` from the readable buffers remaining in `chain`, stopping once
/// `buf` is full, the chain is exhausted, or a guest memory read fails.
/// Returns the total reported by `Chain::for_remaining_type`.
fn read_buf(mem: &MemCtx, chain: &mut Chain, buf: &mut [u8]) -> usize {
    let mut filled = 0;
    chain.for_remaining_type(true, |addr, len| {
        let mut dest = GuestData::from(&mut buf[filled..]);
        match mem.read_into(addr, &mut dest, len) {
            Some(copied) => {
                // Keep going only while part of `buf` is still unfilled.
                let need_more = copied != dest.len();
                filled += copied;
                (copied, need_more)
            }
            None => (0, false),
        }
    })
}

/// Handle ASIC management messages from the guest using the loaded program.
///
/// * `TableAdd` / `TableRemove` modify a table of the loaded program and are
///   silently ignored when no program is loaded.
/// * `RadixRequest` replies over `uart` with `radix` as ASCII text followed
///   by a newline.
/// * `DumpRequest` replies over `uart` with a JSON object mapping each table
///   id to its entries, followed by a newline; nothing is sent when no
///   program is loaded.
///
/// The pipeline lock is only taken by the requests that need it, and is
/// always released before any bytes are written to the uart. Writing to the
/// uart can spin for as long as the guest does not drain it, and the packet
/// handlers need that same lock to forward traffic.
///
/// # Panics
///
/// Panics if the pipeline mutex is poisoned.
fn handle_management_message(
    msg: ManagementRequest,
    pipeline: Arc<Mutex<Option<LoadedP4Program>>>,
    uart: Arc<LpcUart>,
    radix: usize,
    log: Logger,
) {
    match msg {
        ManagementRequest::TableAdd(tm) => {
            let mut pl_opt = pipeline.lock().unwrap();
            let pl = match &mut *pl_opt {
                Some(pl) => pl,
                None => return,
            };
            pl.1.add_table_entry(
                &tm.table,
                &tm.action,
                &tm.keyset_data,
                &tm.parameter_data,
                0,
            );
        }
        ManagementRequest::TableRemove(tm) => {
            let mut pl_opt = pipeline.lock().unwrap();
            let pl = match &mut *pl_opt {
                Some(pl) => pl,
                None => return,
            };
            pl.1.remove_table_entry(&tm.table, &tm.keyset_data);
        }
        ManagementRequest::RadixRequest => {
            // the data is being sent back as ascii text because this is the
            // simplest way for the guest tty device to handle the data. control
            // characters coming through the pipe are acted on differently and
            // illumos does not currently have a raw mode for termio.
            //
            // - https://code.illumos.org/c/illumos-gate/+/1808
            let mut buf = radix.to_string().into_bytes();
            buf.push(b'\n');
            uart_write_all(&uart, &buf);
            info!(log, "wrote: {:?}", buf.len());
        }
        ManagementRequest::DumpRequest => {
            info!(log, "dumping state");
            // Gather and serialize the table state while holding the lock
            // (the collected data may borrow from the pipeline), producing an
            // owned buffer so the lock is gone before the uart is touched.
            let buf = {
                let pl_opt = pipeline.lock().unwrap();
                let pl = match &*pl_opt {
                    Some(pl) => &pl.1,
                    None => return,
                };

                // Create a response where a vector of table entries for each
                // table is indexed by the table id.
                let mut result = BTreeMap::new();
                for id in pl.get_table_ids() {
                    let entries = pl.get_table_entries(id);
                    result.insert(id, entries);
                }

                let serialized = serde_json::to_string(&result);
                match serialized {
                    Ok(j) => {
                        info!(log, "writing: {}", j);
                        let mut buf = j.into_bytes();
                        // Add trailing newline for proper tty handling.
                        buf.push(b'\n');
                        buf
                    }
                    Err(e) => {
                        warn!(log, "failed to serialize table state: {}", e);
                        b"{}\n".to_vec()
                    }
                }
            };

            uart_write_all(&uart, &buf);

            info!(log, "management wrote: {}", buf.len());
        }
    }
}

/// Write every byte of `buf` to `uart`, in order. Whenever the uart refuses a
/// byte, yield the thread and retry that byte once scheduled again.
fn uart_write_all(uart: &LpcUart, buf: &[u8]) {
    for b in buf {
        while !uart.write(*b) {
            std::thread::yield_now();
        }
    }
}

/// Serializable collection of table entries keyed by table name.
///
/// NOTE(review): this type is not used by the visible code (the dump request
/// handler serializes a plain map); presumably it is what consumers of a dump
/// response deserialize into — confirm.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct TableDump {
    /// Entries of each table, keyed by table name.
    pub tables: BTreeMap<String, Vec<p4rs::TableEntry>>,
}

/// Reads newline-terminated management messages from the uart.
struct ManagementMessageReader {
    /// Uart the messages are read from.
    uart: Arc<LpcUart>,
    /// Logging instance
    log: Logger,
}

impl ManagementMessageReader {
    fn new(uart: Arc<LpcUart>, log: Logger) -> Self {
        Self { uart, log }
    }

    fn read(&self) -> ManagementRequest {
        loop {
            let mut buf = vec![0; 10240];
            let mut i = 0;
            let mut in_message = false;
            loop {
                let x = match self.uart.read() {
                    Some(b) => b,
                    None => {
                        // If we are in the middle of reading a message come
                        // back in a tight loop. Otherwise check back less
                        // regularly.
                        if in_message {
                            std::thread::yield_now();
                        } else {
                            sleep(Duration::from_millis(100));
                        }
                        continue;
                    }
                };
                if x == b'\n' {
                    break;
                }
                in_message = true;
                buf[i] = x;
                i += 1;
            }
            buf.resize(i, 0);
            // Ttys do cruel and unusual things to our messages.
            buf.retain(|x| *x != b'\r' && *x != b'\0');
            // Find the premable and push the buffer beyond that point.
            let msgbuf = match buf
                .iter()
                .position(|b| *b == MANAGEMENT_MESSAGE_PREAMBLE)
            {
                Some(p) => {
                    if p + 1 < buf.len() {
                        &buf[p + 1..]
                    } else {
                        continue;
                    }
                }
                None => continue,
            };
            match serde_json::from_slice(&msgbuf) {
                Ok(msg) => return msg,
                Err(e) => {
                    error!(self.log, "mgmt message deser: {}", e);
                    error!(self.log, "{:x?}", msgbuf);
                    error!(self.log, "{}", String::from_utf8_lossy(msgbuf));
                    continue;
                }
            }
        }
    }
}

/// State for the P9 handler through which the guest uploads a P4 program.
///
/// NOTE(review): the code using `source`, `target` and `radix` is not visible
/// here; the field notes below are limited to what the types show — confirm
/// their meaning against the `P9Handler` implementation.
pub struct SoftNpuP9Handler {
    source: String,
    target: String,
    radix: u16,
    /// Logging instance
    log: Logger,
    /// Shared slot holding the currently loaded P4 program, if any.
    pipeline: Arc<Mutex<Option<LoadedP4Program>>>,
}

/// Path of the scratch file that receives a P4 program while the guest is
/// still streaming it in. The file name embeds this process's id.
fn p4_temp_file() -> String {
    let pid = std::process::id();
    format!("/tmp/p4_tmp_{}.so", pid)
}

/// Path of the program file that is dynamically loaded onto the ASIC. The
/// file name embeds this process's id.
fn p4_active_file() -> String {
    let pid = std::process::id();
    format!("/tmp/p4_active_{}.p4", pid)
}

impl SoftNpuP9Handler {
    /// Create a handler.
    ///
    /// `source` and `target` are the strings reported through the
    /// `P9Handler` accessors, `radix` is the port count passed to the P4
    /// pipeline constructor (before the CPU port is added), `pipeline` is the
    /// shared slot that receives the loaded program, and `log` is used for
    /// diagnostics.
    pub fn new(
        source: String,
        target: String,
        radix: u16,
        pipeline: Arc<Mutex<Option<LoadedP4Program>>>,
        log: Logger,
    ) -> Self {
        Self { source, target, radix, pipeline, log }
    }

    /// This function is called while the program is being streamed in from the
    /// guest. The program is incrementally written to a temporary file while
    /// the program is being loaded. A temporary file is used to prevent the
    /// active program's file from being written to while it is being run.
    /// Writing to a file that is mapped by `dlopen` causes explosions.
    ///
    /// `buf` is the chunk to write and `offset` is the guest-supplied file
    /// offset: an offset of zero (re)creates the temporary file, any other
    /// offset appends to it. Failures are logged and otherwise ignored.
    fn write_program(buf: &[u8], offset: u64, log: &Logger) {
        info!(log, "loading {} byte program", buf.len());
        let path = p4_temp_file();
        let mut file = match offset {
            // This is the first write, so open the file in create mode,
            // truncating anything left over from a previous upload.
            0 => match File::create(&path) {
                Ok(f) => f,
                Err(e) => {
                    error!(log, "failed to create p4 file {}: {}", &path, e);
                    return;
                }
            },
            // This is a subsequent write, so open the file in append mode.
            _ => {
                match OpenOptions::new().create(true).append(true).open(&path) {
                    Ok(f) => f,
                    Err(e) => {
                        error!(
                            log,
                            "failed to open p4 file {} for append: {}",
                            &path,
                            e
                        );
                        return;
                    }
                }
            }
        };

        if let Err(e) = file.write_all(buf) {
            error!(log, "writing p4 program to file failed: {}", e);
        }
    }

    /// This function is called after a program has been completely copied from
    /// the guest. The current pipeline is dropped. Then the temporary program
    /// file is copied to the active program file. Then the pipeline is loaded
    /// from the active program file.
    ///
    /// The pipeline lock is held for the whole operation. If any step fails a
    /// warning is logged and the pipeline slot is left empty.
    fn load_program(
        pipeline: Arc<Mutex<Option<LoadedP4Program>>>,
        radix: u16,
        log: Logger,
    ) {
        let mut pl = pipeline.lock().unwrap();
        // drop anything that may already be loaded before attempting a dlopen
        if let Some((lib, pipe)) = pl.take() {
            // This order is very important, if the lib gets dropped before the
            // pipe the world explodes.
            drop(pipe);
            drop(lib);
        }

        let temp_path = p4_temp_file();
        let active_path = p4_active_file();

        if let Err(e) = fs::copy(&temp_path, &active_path) {
            warn!(log, "copying p4 program file failed: {}", e);
            return;
        }

        // SAFETY: loading a shared object runs its initializers.
        // NOTE(review): the file content is supplied by the guest, so its
        // well-formedness cannot be established here -- confirm this is an
        // accepted risk for this device.
        let lib = match unsafe { Library::open(Some(&active_path), RTLD_NOW) } {
            Ok(l) => l,
            Err(e) => {
                warn!(log, "failed to load p4 program: {}", e);
                return;
            }
        };
        // SAFETY: relies on the program exporting `_main_pipeline_create`
        // with exactly the signature named in the `Symbol` type below.
        let func: Symbol<unsafe extern "C" fn(u16) -> *mut dyn p4rs::Pipeline> =
            match unsafe { lib.get(b"_main_pipeline_create") } {
                Ok(f) => f,
                Err(e) => {
                    warn!(
                        log,
                        "failed to load _main_pipeline_create func: {}", e
                    );
                    return;
                }
            };

        // account for CPU port
        let radix = radix + 1;

        // SAFETY: see the note on the symbol lookup above.
        let raw = unsafe { func(radix) };
        // `Box::from_raw` on a null pointer is undefined behaviour, so refuse
        // to adopt a pipeline the constructor failed to produce.
        if raw.is_null() {
            warn!(log, "_main_pipeline_create returned a null pipeline");
            return;
        }
        // SAFETY: `raw` is non-null; this relies on the constructor returning
        // a pointer obtained from `Box::into_raw` that nothing else owns.
        let boxpipe = unsafe { Box::from_raw(raw) };
        let _ = pl.insert((lib, boxpipe));
    }
}

/// Implement a very specific P9 handler that only implements file writes in
/// order to load P4 programs.
impl P9Handler for SoftNpuP9Handler {
    fn source(&self) -> &str {
        &self.source
    }

    fn target(&self) -> &str {
        &self.target
    }

    fn msize(&self) -> u32 {
        65536
    }

    fn handle_version(&self, msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        let mut msg: p9ds::proto::Version = match ispf::from_bytes_le(&msg_buf)
        {
            Err(e) => {
                error!(self.log, "could not parse p9fs version message: {}", e);
                return;
            }
            Ok(m) => m,
        };
        msg.typ = p9ds::proto::MessageType::Rversion;

        // This is a version of our own making. It's meant to deter clients that
        // may discover us from trying to use us as some sort of normal P9
        // file system. It also helps clients that are actually looking for the
        // SoftNpu P9 device to identify us as such.
        "9P2000.P4".clone_into(&mut msg.version);

        let mut out = ispf::to_bytes_le(&msg).unwrap();
        let buf = out.as_mut_slice();
        write_buf(buf, chain, mem);
    }

    fn handle_attach(&self, _msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        write_error(ENOTSUP as u32, chain, &mem)
    }

    fn handle_walk(&self, _msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        write_error(ENOTSUP as u32, chain, &mem)
    }

    fn handle_open(&self, _msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        write_error(ENOTSUP as u32, chain, &mem)
    }

    fn handle_readdir(
        &self,
        _msg_buf: &[u8],
        chain: &mut Chain,
        mem: &MemCtx,
        _msize: u32,
    ) {
        write_error(ENOTSUP as u32, chain, &mem)
    }

    fn handle_read(
        &self,
        _msg_buf: &[u8],
        chain: &mut Chain,
        mem: &MemCtx,
        _msize: u32,
    ) {
        write_error(ENOTSUP as u32, chain, &mem)
    }

    fn handle_write(
        &self,
        msg_buf: &[u8],
        chain: &mut Chain,
        mem: &MemCtx,
        _msize: u32,
    ) {
        let msg: Twrite = ispf::from_bytes_le(&msg_buf).unwrap();
        let len = msg.data.len();

        Self::write_program(&msg.data, msg.offset, &self.log);

        let response = Rwrite::new(len as u32);
        let mut out = ispf::to_bytes_le(&response).unwrap();
        let buf = out.as_mut_slice();
        return write_buf(buf, chain, mem);
    }

    fn handle_clunk(&self, _msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        let pipe = self.pipeline.clone();
        let log = self.log.clone();
        let radix = self.radix;

        spawn(move || Self::load_program(pipe, radix, log));

        let response = Rclunk::new();
        let mut out = ispf::to_bytes_le(&response).unwrap();
        let buf = out.as_mut_slice();
        return write_buf(buf, chain, mem);
    }

    fn handle_getattr(&self, _msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        write_error(ENOTSUP as u32, chain, &mem)
    }

    fn handle_statfs(&self, _msg_buf: &[u8], chain: &mut Chain, mem: &MemCtx) {
        write_error(ENOTSUP as u32, chain, &mem)
    }
}


================================================
FILE: lib/propolis/src/hw/virtio/testutil.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Test utilities for constructing fake virtqueues backed by real guest memory.
//!
//! This module provides [`TestVirtQueue`] for single-queue tests and
//! [`TestVirtQueues`] for multi-queue devices. Both allocate guest memory via
//! a tempfile-backed [`PhysMap`], lay out virtio ring structures, and provide
//! helpers to enqueue descriptor chains — simulating a guest driver writing
//! to the available ring.

use std::sync::Arc;

use zerocopy::{FromBytes, IntoBytes};

use crate::accessors::MemAccessor;
use crate::common::GuestAddr;
use crate::vmm::mem::PhysMap;
use crate::vmm::MemCtx;

// Re-export queue types so tests outside this module can access them
// without requiring `queue` to be pub(crate).
pub use super::queue::{Chain, DescFlag, VirtQueue, VirtQueues, VqSize};

/// Page size used for aligning ring structures and sizing the data area
/// (4 KiB).
const PAGE_SIZE: u64 = 0x1000;

/// Size in bytes of a virtio descriptor
/// (addr: u64, len: u32, flags: u16, next: u16).
const DESC_SIZE: u64 = 16;

/// Size in bytes of a used ring element (id: u32, len: u32).
const USED_ELEM_SIZE: u64 = 8;

/// Size in bytes of an available ring entry (descriptor index: u16).
const AVAIL_ELEM_SIZE: u64 = 2;

/// Size in bytes of the header that precedes the entries of both the
/// available and the used ring (flags: u16, idx: u16).
const RING_HEADER_SIZE: u64 = 4;

/// Number of pages to allocate for the data area in tests. The data area is
/// placed after all ring structures.
const DATA_AREA_PAGES: u64 = 64;

/// Round `val` up to the nearest multiple of `align`.
///
/// `align` must be a power of two; the result is `val` itself when it is
/// already aligned.
pub const fn align_up(val: u64, align: u64) -> u64 {
    let bumped = val + align - 1;
    let mask = align - 1;
    bumped & !mask
}

/// 16-byte virtio descriptor, matching the on-wire/in-memory layout.
///
/// The test helpers write values of this type directly into the descriptor
/// table in guest memory and read them back from it.
#[repr(C)]
#[derive(Copy, Clone, Default, FromBytes, IntoBytes)]
pub struct RawDesc {
    /// Guest physical address of the buffer.
    pub addr: u64,
    /// Length of the buffer in bytes.
    pub len: u32,
    /// Descriptor flag bits (raw `DescFlag` bits such as `NEXT` and `WRITE`).
    pub flags: u16,
    /// Index of the next descriptor in the chain; set together with the
    /// `NEXT` flag when descriptors are chained.
    pub next: u16,
}

/// 8-byte used ring element, as read back from the used ring in guest memory.
#[repr(C)]
#[derive(Copy, Clone, Default, FromBytes)]
pub struct RawUsedElem {
    /// Descriptor id recorded for the completed chain (the tests expect the
    /// index of the chain's head descriptor).
    pub id: u32,
    /// Length recorded for the completed chain (the tests expect the number
    /// of bytes written by the device side).
    pub len: u32,
}

/// Guest physical address layout for a single virtqueue's ring structures.
#[derive(Copy, Clone, Debug)]
pub struct QueueLayout {
    /// GPA of the descriptor table.
    pub desc_base: u64,
    /// GPA of the available ring (header followed by entries).
    pub avail_base: u64,
    /// GPA of the used ring (header followed by entries); page-aligned.
    pub used_base: u64,
    /// First GPA after this queue's structures.
    pub end: u64,
}

impl QueueLayout {
    /// Work out where the rings of a `size`-entry queue live when the queue's
    /// structures begin at guest physical address `base`.
    ///
    /// The structures are placed in virtio 1.0 split virtqueue order:
    /// - the descriptor table (`size * DESC_SIZE` bytes) at `base`,
    /// - the available ring (4-byte header plus 2 bytes per entry) directly
    ///   after the descriptor table,
    /// - the used ring (4-byte header plus 8 bytes per entry) at the next
    ///   page boundary.
    ///
    /// `end` is the first page boundary at or after the end of the used ring.
    pub fn new(base: u64, size: u16) -> Self {
        let entries = u64::from(size);

        let desc_base = base;
        let desc_table_len = DESC_SIZE * entries;

        let avail_base = desc_base + desc_table_len;
        let avail_ring_len = RING_HEADER_SIZE + AVAIL_ELEM_SIZE * entries;

        let used_base = align_up(avail_base + avail_ring_len, PAGE_SIZE);
        let used_ring_len = RING_HEADER_SIZE + USED_ELEM_SIZE * entries;

        let end = align_up(used_base + used_ring_len, PAGE_SIZE);

        Self { desc_base, avail_base, used_base, end }
    }
}

/// Per-queue writer for injecting descriptors into a virtqueue's rings.
///
/// It plays the role of the guest driver in tests: it fills in descriptors,
/// hands out buffer space from a data area, and publishes chain heads on the
/// available ring.
pub struct QueueWriter {
    /// Guest physical addresses of this queue's ring structures.
    layout: QueueLayout,
    /// Number of entries in the queue.
    size: u16,
    /// Next free descriptor index.
    next_desc: u16,
    /// Start of data area for this queue.
    data_start: u64,
    /// Next free data area offset (GPA).
    data_cursor: u64,
    /// Avail ring index we've published up to.
    avail_idx: u16,
}

impl QueueWriter {
    /// Create a new QueueWriter for a queue with the given layout.
    pub fn new(layout: QueueLayout, size: u16, data_start: u64) -> Self {
        Self {
            layout,
            size,
            next_desc: 0,
            data_start,
            data_cursor: data_start,
            avail_idx: 0,
        }
    }

    /// Reset descriptor and data cursors to allow reusing slots.
    pub fn reset_cursors(&mut self) {
        self.next_desc = 0;
        self.data_cursor = self.data_start;
    }

    /// Write a descriptor and return its index.
    pub fn write_desc(
        &mut self,
        mem_acc: &MemAccessor,
        addr: u64,
        len: u32,
        flags: u16,
        next: u16,
    ) -> u16 {
        let idx = self.next_desc;
        assert!(idx < self.size, "descriptor table exhausted");
        self.next_desc += 1;

        let desc = RawDesc { addr, len, flags, next };
        let gpa = self.layout.desc_base + u64::from(idx) * DESC_SIZE;
        let mem = mem_acc.access().unwrap();
        mem.write(GuestAddr(gpa), &desc);
        idx
    }

    /// Allocate data space and write bytes into it. Returns the GPA.
    pub fn write_data(&mut self, mem_acc: &MemAccessor, data: &[u8]) -> u64 {
        let gpa = self.data_cursor;
        self.data_cursor += data.len() as u64;
        let mem = mem_acc.access().unwrap();
        mem.write_from(GuestAddr(gpa), data, data.len());
        gpa
    }

    /// Allocate data space without writing. Returns the GPA.
    pub fn alloc_data(&mut self, len: u32) -> u64 {
        let gpa = self.data_cursor;
        self.data_cursor += u64::from(len);
        gpa
    }

    /// Add a readable descriptor with the given data.
    pub fn add_readable(&mut self, mem_acc: &MemAccessor, data: &[u8]) -> u16 {
        let gpa = self.write_data(mem_acc, data);
        self.write_desc(mem_acc, gpa, data.len() as u32, 0, 0)
    }

    /// Add a writable descriptor of the given size.
    pub fn add_writable(&mut self, mem_acc: &MemAccessor, len: u32) -> u16 {
        let gpa = self.alloc_data(len);
        self.write_desc(mem_acc, gpa, len, DescFlag::WRITE.bits(), 0)
    }

    /// Chain two descriptors together via NEXT flag.
    pub fn chain(&self, mem_acc: &MemAccessor, from: u16, to: u16) {
        let gpa = self.layout.desc_base + u64::from(from) * DESC_SIZE;
        let mem = mem_acc.access().unwrap();
        let mut raw: RawDesc = *mem.read(GuestAddr(gpa)).unwrap();
        raw.flags |= DescFlag::NEXT.bits();
        raw.next = to;
        mem.write(GuestAddr(gpa), &raw);
    }

    /// Publish a descriptor chain head on the available ring.
    pub fn publish_avail(&mut self, mem_acc: &MemAccessor, head: u16) {
        // Available ring layout:
        // flags (u16) | idx (u16) | ring[size] (u16 each)
        let slot = self.layout.avail_base
            + RING_HEADER_SIZE
            + u64::from(self.avail_idx % self.size) * AVAIL_ELEM_SIZE;
        self.avail_idx += 1;
        let new_idx = self.avail_idx;
        let mem = mem_acc.access().unwrap();
        mem.write(GuestAddr(slot), &head);
        // Write new index at offset 2 (after flags u16)
        mem.write(GuestAddr(self.layout.avail_base + 2), &new_idx);
    }

    /// Read the used ring index.
    pub fn used_idx(&self, mem_acc: &MemAccessor) -> u16 {
        let mem = mem_acc.access().unwrap();
        // Used ring idx is at offset 2 (after flags u16)
        *mem.read(GuestAddr(self.layout.used_base + 2)).unwrap()
    }

    /// Read a used ring entry by index, returning (desc_id, len).
    pub fn read_used_elem(
        &self,
        mem_acc: &MemAccessor,
        used_index: u16,
    ) -> RawUsedElem {
        let mem = mem_acc.access().unwrap();
        // Used ring layout:
        // flags (u16) | idx (u16) | ring[size] (RawUsedElem each)
        let entry_gpa = self.layout.used_base
            + RING_HEADER_SIZE
            + u64::from(used_index % self.size) * USED_ELEM_SIZE;
        *mem.read(GuestAddr(entry_gpa)).unwrap()
    }

    /// Read raw bytes from the buffer of a descriptor.
    pub fn read_desc_data(
        &self,
        mem_acc: &MemAccessor,
        desc_id: u16,
        len: usize,
    ) -> Vec<u8> {
        let mem = mem_acc.access().unwrap();
        let desc_gpa = self.layout.desc_base + u64::from(desc_id) * DESC_SIZE;
        let raw_desc: RawDesc = *mem.read(GuestAddr(desc_gpa)).unwrap();

        let mut data = vec![0u8; len];
        mem.read_into(
            GuestAddr(raw_desc.addr),
            &mut crate::common::GuestData::from(data.as_mut_slice()),
            len,
        );
        data
    }
}

/// Multi-queue test harness for virtio devices that use multiple queues.
///
/// The ring structures of all queues are laid out one after another starting
/// at GPA 0, followed by a single data area shared by every queue.
pub struct TestVirtQueues {
    /// Must stay alive to keep memory mappings valid.
    _phys: PhysMap,
    /// Accessor for the test guest memory.
    mem_acc: MemAccessor,
    /// The virtqueues under test.
    queues: VirtQueues,
    /// Ring layout of each queue, indexed by queue number.
    layouts: Vec<QueueLayout>,
    /// Number of entries in each queue, indexed by queue number.
    sizes: Vec<u16>,
    /// Start of data area (after all queue structures).
    data_start: u64,
}

impl TestVirtQueues {
    /// Build a harness with one virtqueue per entry of `sizes`.
    ///
    /// Each entry gives the size of the corresponding queue (must be a power
    /// of 2). Guest memory is allocated to hold every queue's rings plus the
    /// shared data area, and each queue is mapped onto its rings and marked
    /// live and enabled.
    pub fn new(sizes: &[VqSize]) -> Self {
        // Plain `u16` entry counts, in queue order.
        let size_vals: Vec<u16> = sizes
            .iter()
            .map(|&sz| {
                let entries: u16 = sz.into();
                entries
            })
            .collect();

        // Lay the rings out back to back: every queue starts where the
        // previous one ended.
        let mut next_base = 0u64;
        let layouts: Vec<QueueLayout> = size_vals
            .iter()
            .map(|&entries| {
                let layout = QueueLayout::new(next_base, entries);
                next_base = layout.end;
                layout
            })
            .collect();

        // The shared data area follows the last queue's rings.
        let data_start = next_base;
        let data_area_size = PAGE_SIZE * DATA_AREA_PAGES;
        let total_size =
            align_up(data_start + data_area_size, PAGE_SIZE) as usize;

        let mut phys = PhysMap::new_test(total_size);
        phys.add_test_mem("test-vqs".to_string(), 0, total_size)
            .expect("add test mem");
        let mem_acc = phys.finalize();

        let queues = VirtQueues::new(sizes);

        // Wire every queue up to guest memory and its ring addresses.
        for (i, layout) in layouts.iter().enumerate() {
            let queue_id = i as u16;
            let vq = queues.get(queue_id).unwrap();
            mem_acc.adopt(&vq.acc_mem, Some(format!("test-vq-{i}")));
            vq.map_virtqueue(
                layout.desc_base,
                layout.avail_base,
                layout.used_base,
            );
            vq.live.store(true, std::sync::atomic::Ordering::Release);
            vq.enabled.store(true, std::sync::atomic::Ordering::Release);

            // Clear the flags and idx fields of both ring headers.
            let mem = mem_acc.access().unwrap();
            let avail_hdr = layout.avail_base;
            let used_hdr = layout.used_base;
            mem.write(GuestAddr(avail_hdr), &0u16);
            mem.write(GuestAddr(avail_hdr + 2), &0u16);
            mem.write(GuestAddr(used_hdr), &0u16);
            mem.write(GuestAddr(used_hdr + 2), &0u16);
        }

        Self {
            _phys: phys,
            mem_acc,
            queues,
            layouts,
            sizes: size_vals,
            data_start,
        }
    }

    /// Accessor for the harness's guest memory.
    pub fn mem_acc(&self) -> &MemAccessor {
        &self.mem_acc
    }

    /// The `VirtQueues` collection under test.
    pub fn queues(&self) -> &VirtQueues {
        &self.queues
    }

    /// The `VirtQueue` at index `idx`.
    ///
    /// Panics if there is no queue with that index.
    pub fn vq(&self, idx: u16) -> &Arc<VirtQueue> {
        self.queues.get(idx).unwrap()
    }

    /// Create a `QueueWriter` for queue `queue_idx`.
    ///
    /// `data_offset` is added to the start of the shared data area, so that
    /// different queues can be given different regions of it.
    pub fn writer(&self, queue_idx: usize, data_offset: u64) -> QueueWriter {
        let data_base = self.data_start + data_offset;
        QueueWriter::new(
            self.layouts[queue_idx],
            self.sizes[queue_idx],
            data_base,
        )
    }

    /// Ring layout of queue `queue_idx`.
    pub fn layout(&self, queue_idx: usize) -> QueueLayout {
        self.layouts[queue_idx]
    }
}

/// A test harness wrapping guest memory and a single virtqueue.
///
/// For multi-queue tests, use [`TestVirtQueues`] instead.
pub struct TestVirtQueue {
    /// Underlying harness, holding exactly one queue.
    inner: TestVirtQueues,
    /// Writer for that queue, allocating from the start of the data area.
    writer: QueueWriter,
}

impl TestVirtQueue {
    /// Create a new test virtqueue.
    ///
    /// `queue_size` must be a power of 2.
    pub fn new(queue_size: u16) -> Self {
        let inner = TestVirtQueues::new(&[VqSize::new(queue_size)]);
        let writer = inner.writer(0, 0);
        Self { inner, writer }
    }

    /// Get the underlying `VirtQueue`.
    pub fn vq(&self) -> &Arc<VirtQueue> {
        self.inner.vq(0)
    }

    /// Get a `MemCtx` guard for directly reading/writing guest memory.
    pub fn mem(&self) -> impl std::ops::Deref<Target = MemCtx> + '_ {
        self.inner.mem_acc().access().expect("test mem accessible")
    }

    /// Add a readable descriptor containing `data`.
    ///
    /// Returns the descriptor index.
    pub fn add_readable(&mut self, data: &[u8]) -> u16 {
        self.writer.add_readable(self.inner.mem_acc(), data)
    }

    /// Add a writable descriptor of `len` bytes.
    ///
    /// Returns the descriptor index.
    pub fn add_writable(&mut self, len: u32) -> u16 {
        self.writer.add_writable(self.inner.mem_acc(), len)
    }

    /// Link descriptors into a chain by setting NEXT flags.
    ///
    /// `descs` should be in order: `[head, ..., tail]`.
    pub fn chain_descriptors(&mut self, descs: &[u16]) {
        for i in 0..descs.len().saturating_sub(1) {
            self.writer.chain(self.inner.mem_acc(), descs[i], descs[i + 1]);
        }
    }

    /// Publish a descriptor chain head on the available ring.
    pub fn publish_avail(&mut self, head: u16) {
        self.writer.publish_avail(self.inner.mem_acc(), head);
    }

    /// Read all entries from the used ring.
    ///
    /// Returns `(descriptor_id, bytes_written)` pairs.
    pub fn read_used(&self) -> Vec<(u32, u32)> {
        let used_idx = self.writer.used_idx(self.inner.mem_acc());
        (0..used_idx)
            .map(|i| {
                let elem = self.writer.read_used_elem(self.inner.mem_acc(), i);
                (elem.id, elem.len)
            })
            .collect()
    }

    /// Pop a chain from the available ring and return it.
    pub fn pop_chain(&self) -> Option<(Chain, u16, u32)> {
        let mem = self.inner.mem_acc().access()?;
        let mut chain = Chain::with_capacity(64);
        let (avail_idx, len) = self.vq().pop_avail(&mut chain, &mem)?;
        Some((chain, avail_idx, len))
    }

    /// Push a chain back to the used ring.
    pub fn push_used(&self, chain: &mut Chain) {
        let mem = self.inner.mem_acc().access().unwrap();
        self.vq().push_used(chain, &mem);
    }

    /// Get the GPA of a descriptor's buffer.
    pub fn desc_addr(&self, idx: u16) -> u64 {
        let mem = self.inner.mem_acc().access().unwrap();
        let desc_gpa =
            self.inner.layout(0).desc_base + u64::from(idx) * DESC_SIZE;
        let raw: RawDesc = *mem.read(GuestAddr(desc_gpa)).unwrap();
        raw.addr
    }

    /// Read raw bytes from guest memory at a given GPA.
    pub fn read_guest_mem(&self, addr: u64, len: usize) -> Vec<u8> {
        let mem = self.inner.mem_acc().access().unwrap();
        let mut buf = vec![0u8; len];
        let mut guest_buf = crate::common::GuestData::from(buf.as_mut_slice());
        mem.read_into(GuestAddr(addr), &mut guest_buf, len);
        buf
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// A single readable descriptor can be published, popped and read back.
    #[test]
    fn smoke_pop_avail_readable() {
        let mut tvq = TestVirtQueue::new(16);

        let data = b"hello virtqueue";
        let d0 = tvq.add_readable(data);
        tvq.publish_avail(d0);

        let (mut chain, _avail_idx, total_len) = tvq.pop_chain().unwrap();
        assert_eq!(total_len, data.len() as u32);

        let mem = tvq.mem();
        let mut buf = [0u8; 15];
        assert!(chain.read(&mut buf, &mem));
        assert_eq!(&buf, data);
    }

    /// A writable descriptor can be filled by the device side and shows up on
    /// the used ring with the number of bytes written.
    #[test]
    fn smoke_pop_avail_writable() {
        let mut tvq = TestVirtQueue::new(16);

        let d0 = tvq.add_writable(64);
        tvq.publish_avail(d0);

        let (mut chain, _avail_idx, total_len) = tvq.pop_chain().unwrap();
        assert_eq!(total_len, 64);

        let mem = tvq.mem();
        let payload = b"written by device";
        assert!(chain.write(payload, &mem));
        drop(mem);

        tvq.push_used(&mut chain);

        let used = tvq.read_used();
        assert_eq!(used.len(), 1);
        assert_eq!(used[0].0, d0 as u32);
        assert_eq!(used[0].1, payload.len() as u32);

        let addr = tvq.desc_addr(d0);
        let read_back = tvq.read_guest_mem(addr, payload.len());
        assert_eq!(read_back, payload);
    }

    /// Two readable descriptors chained together are read in order.
    #[test]
    fn smoke_chained_descriptors() {
        let mut tvq = TestVirtQueue::new(16);

        let header_data = [0xAA; 8];
        let body_data = [0xBB; 32];
        let d0 = tvq.add_readable(&header_data);
        let d1 = tvq.add_readable(&body_data);
        tvq.chain_descriptors(&[d0, d1]);
        tvq.publish_avail(d0);

        let (mut chain, _avail_idx, total_len) = tvq.pop_chain().unwrap();
        assert_eq!(total_len, 40);

        let mem = tvq.mem();
        let mut hdr = [0u8; 8];
        assert!(chain.read(&mut hdr, &mem));
        assert_eq!(hdr, header_data);

        let mut body = [0u8; 32];
        assert!(chain.read(&mut body, &mem));
        assert_eq!(body, body_data);
    }

    /// A chain with a readable request followed by a writable response.
    #[test]
    fn smoke_mixed_chain() {
        let mut tvq = TestVirtQueue::new(16);

        let req_data = [0x01, 0x02, 0x03, 0x04];
        let d0 = tvq.add_readable(&req_data);
        let d1 = tvq.add_writable(128);
        tvq.chain_descriptors(&[d0, d1]);
        tvq.publish_avail(d0);

        let (mut chain, _, total_len) = tvq.pop_chain().unwrap();
        assert_eq!(total_len, 4 + 128);

        let mem = tvq.mem();

        let mut req = [0u8; 4];
        assert!(chain.read(&mut req, &mem));
        assert_eq!(req, req_data);

        let resp = [0xFF; 16];
        assert!(chain.write(&resp, &mem));
        drop(mem);

        tvq.push_used(&mut chain);

        let addr = tvq.desc_addr(d1);
        let read_back = tvq.read_guest_mem(addr, 16);
        assert_eq!(read_back, &resp);
    }

    /// Popping from a queue with nothing published yields nothing.
    #[test]
    fn empty_avail_ring_returns_none() {
        let tvq = TestVirtQueue::new(16);
        assert!(tvq.pop_chain().is_none());
    }

    /// Two independently published chains are popped in publish order.
    #[test]
    fn multiple_chains() {
        let mut tvq = TestVirtQueue::new(16);

        let d0 = tvq.add_readable(b"first");
        tvq.publish_avail(d0);

        let d1 = tvq.add_readable(b"second");
        tvq.publish_avail(d1);

        let (chain0, _, _) = tvq.pop_chain().unwrap();
        let (chain1, _, _) = tvq.pop_chain().unwrap();
        assert!(tvq.pop_chain().is_none());

        // Check the exact lengths rather than mere inequality, so that a
        // swapped or corrupted chain is caught as well.
        assert_eq!(chain0.remain_read_bytes(), 5);
        assert_eq!(chain1.remain_read_bytes(), 6);
    }

    /// Writers for different queues of one harness work independently.
    #[test]
    fn multi_queue_smoke() {
        let tvqs = TestVirtQueues::new(&[
            VqSize::new(64),
            VqSize::new(64),
            VqSize::new(1),
        ]);

        let mut writer0 = tvqs.writer(0, 0);
        let mut writer1 = tvqs.writer(1, PAGE_SIZE);

        let d0 = writer0.add_readable(tvqs.mem_acc(), b"queue0");
        writer0.publish_avail(tvqs.mem_acc(), d0);

        let d1 = writer1.add_readable(tvqs.mem_acc(), b"queue1");
        writer1.publish_avail(tvqs.mem_acc(), d1);

        // Pop from each queue
        let mem = tvqs.mem_acc().access().unwrap();
        let mut chain0 = Chain::with_capacity(64);
        let mut chain1 = Chain::with_capacity(64);

        assert!(tvqs.vq(0).pop_avail(&mut chain0, &mem).is_some());
        assert!(tvqs.vq(1).pop_avail(&mut chain1, &mem).is_some());

        assert_eq!(chain0.remain_read_bytes(), 6);
        assert_eq!(chain1.remain_read_bytes(), 6);
    }

    /// Resetting the cursors reuses descriptor slots without rewinding the
    /// available ring index.
    #[test]
    fn queue_writer_reset_cursors() {
        let tvqs = TestVirtQueues::new(&[VqSize::new(16)]);
        let mut writer = tvqs.writer(0, 0);

        // Add some descriptors
        let d0 = writer.add_readable(tvqs.mem_acc(), b"first");
        writer.publish_avail(tvqs.mem_acc(), d0);

        // Reset and reuse
        writer.reset_cursors();

        let d1 = writer.add_readable(tvqs.mem_acc(), b"second");
        assert_eq!(d1, 0, "descriptor index should reset to 0");
        writer.publish_avail(tvqs.mem_acc(), d1);

        // Nothing has been consumed by the device side yet.
        assert_eq!(writer.used_idx(tvqs.mem_acc()), 0);

        // Both publishes must be reflected in the available ring index,
        // which lives at offset 2 of the ring (after the flags u16).
        let mem = tvqs.mem_acc().access().unwrap();
        let avail_idx: u16 =
            *mem.read(GuestAddr(tvqs.layout(0).avail_base + 2)).unwrap();
        assert_eq!(avail_idx, 2, "both publishes should be visible");
    }
}


================================================
FILE: lib/propolis/src/hw/virtio/viona.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![cfg_attr(not(target_os = "illumos"), allow(dead_code, unused_imports))]

use std::io::{self, Error, ErrorKind};
use std::num::NonZeroU16;
use std::os::unix::io::{AsRawFd, RawFd};
use std::sync::{Arc, Condvar, Mutex, Weak};

use crate::common::{RWOp, ReadOp};
use crate::hw::pci;
use crate::hw::virtio;
use crate::hw::virtio::queue::Chain;
use crate::lifecycle::{self, IndicatedState, Lifecycle};
use crate::migrate::{
    MigrateCtx, MigrateMulti, MigrateStateError, Migrator, PayloadOffers,
    PayloadOutputs,
};
use crate::util::regmap::RegMap;
use crate::vmm::{MemCtx, VmmHdl};

use super::bits::*;
use super::pci::{PciVirtio, PciVirtioState};
use super::queue::{self, VirtQueue, VirtQueues, VqSize};
use super::{VirtioDevice, VqChange, VqIntr};

use bit_field::BitField;
use lazy_static::lazy_static;
use tokio::io::unix::AsyncFd;
use tokio::io::Interest;
use tokio::sync::watch;
use tokio::task::JoinHandle;

// Re-export API versioning interface for convenience of propolis consumers
pub use viona_api::{api_version, ApiVersion};

/// Size of the RX virtqueue (0x800 = 2048 entries).
pub const RX_QUEUE_SIZE: VqSize = VqSize::new(0x800);
/// Size of the TX virtqueue (0x100 = 256 entries).
pub const TX_QUEUE_SIZE: VqSize = VqSize::new(0x100);
/// Size of the control virtqueue (32 entries).
pub const CTL_QUEUE_SIZE: VqSize = VqSize::new(32);

/// Smallest number of multiqueue queue pairs.
/// NOTE(review): presumably the lower bound the VirtIO spec permits for the
/// "set pairs" control command -- confirm against the spec.
pub const VIRTIO_MQ_MIN_QPAIRS: u16 = 1;
/// Largest number of multiqueue queue pairs.
/// NOTE(review): presumably the upper bound the VirtIO spec permits for the
/// "set pairs" control command -- confirm against the spec.
pub const VIRTIO_MQ_MAX_QPAIRS: u16 = 0x8000;

/// Number of queue pairs used by `max_num_queues` to compute the maximum
/// number of queues.
pub const PROPOLIS_MAX_MQ_PAIRS: u16 = 11;

/// Maximum number of queues: two for each of the `PROPOLIS_MAX_MQ_PAIRS`
/// queue pairs.
pub const fn max_num_queues() -> usize {
    let pairs = PROPOLIS_MAX_MQ_PAIRS as usize;
    pairs * 2
}

/// Length in bytes of the MAC address array used by the control-queue types.
const ETHERADDRL: usize = 6;

/// Reason for a change to the number of in-use queue pairs.
///
/// The caller of `set_use_pairs` will probably be inlined into a larger
/// function that is difficult to spot in a ustack(). This gives us a hint
/// about why `set_use_pairs` was being called.
///
/// NOTE(review): the `u8` representation matches the `cause: u8` argument of
/// the `virtio_viona_mq_set_use_pairs` probe, which is presumably where the
/// value is reported -- confirm at the call sites.
#[repr(u8)]
enum MqSetPairsCause {
    Reset = 0,
    MqEnabled = 1,
    Commanded = 2,
    Import = 3,
}

/// USDT probes for this module, registered under the `propolis` provider.
#[usdt::provider(provider = "propolis")]
mod probes {
    // Probe carrying a `cause` code and a queue pair count (`npairs`).
    // NOTE(review): presumably `cause` is a `MqSetPairsCause` discriminant --
    // confirm at the call sites.
    fn virtio_viona_mq_set_use_pairs(cause: u8, npairs: u16) {}
}

/// Types for supporting the control queue.
///
/// These come from the VirtIO spec, section 5.1.6.2 in VirtIO 1.2.
pub mod control {
    use super::ETHERADDRL;
    use std::convert::TryFrom;
    use zerocopy::FromBytes;

    /// Header found at the start of every control message.
    ///
    /// It carries two bytes: the "class", which says what the message applies
    /// to, and the "command", which says what action to take. For example,
    /// class Mq with command SetPairs means to set the number of multiqueue
    /// queue pairs.
    #[derive(Clone, Copy, Debug, Default, FromBytes)]
    #[repr(C)]
    pub struct Header {
        class: u8,
        command: u8,
    }

    /// A decoded control message header: the class together with its
    /// class-specific command.
    #[derive(Clone, Copy, Debug)]
    pub enum Command {
        Rx(RxCmd),
        Mac(MacCmd),
        Vlan(VlanCmd),
        Announce(AnnounceCmd),
        Mq(MqCmd),
    }

    impl TryFrom<Header> for Command {
        /// The header is handed back unchanged when it cannot be decoded.
        type Error = Header;

        /// Decode `header`, failing if either the class or the command within
        /// that class is not recognized.
        fn try_from(header: Header) -> Result<Self, Self::Error> {
            let Header { class, command } = header;
            let decoded = match class {
                0 => RxCmd::from_repr(command).map(Self::Rx),
                1 => MacCmd::from_repr(command).map(Self::Mac),
                2 => VlanCmd::from_repr(command).map(Self::Vlan),
                3 => AnnounceCmd::from_repr(command).map(Self::Announce),
                4 => MqCmd::from_repr(command).map(Self::Mq),
                _ => None,
            };
            decoded.ok_or(header)
        }
    }

    /// Status codes for acknowledging a control message.
    #[derive(Clone, Copy, Debug)]
    pub enum Ack {
        Ok = 0,
        Err = 1,
    }

    /// Commands of the Rx class (class 0).
    #[derive(Clone, Copy, Debug, strum::FromRepr)]
    #[repr(u8)]
    pub enum RxCmd {
        Promisc = 0,
        AllMulticast = 1,
        AllUnicast = 2,
        NoMulticast = 3,
        NoUnicast = 4,
        NoBroadcast = 5,
    }

    /// Commands of the Mac class (class 1).
    #[derive(Clone, Copy, Debug, strum::FromRepr)]
    #[repr(u8)]
    pub enum MacCmd {
        TableSet = 0,
        AddrSet = 1,
    }

    /// An entry count together with a single MAC address.
    #[derive(Clone, Copy, Debug, Default)]
    #[repr(C)]
    pub struct Mac {
        entries: u32,
        mac: [u8; ETHERADDRL],
    }

    /// Payload carrying a number of queue pairs.
    #[derive(Clone, Copy, Debug, Default, FromBytes)]
    #[repr(C)]
    pub struct Mq {
        pub npairs: u16,
    }

    /// Commands of the Mq class (class 4).
    #[derive(Clone, Copy, Debug, strum::FromRepr)]
    #[repr(u8)]
    pub enum MqCmd {
        SetPairs = 0,
        RssConfig = 1,
        HashConfig = 2,
    }

    impl TryFrom<u8> for MqCmd {
        /// The rejected byte is handed back unchanged.
        type Error = u8;

        /// Accepts only the `SetPairs` command byte (0); every other value,
        /// including those of the remaining variants, is rejected.
        fn try_from(value: u8) -> Result<MqCmd, Self::Error> {
            if value == 0 {
                Ok(Self::SetPairs)
            } else {
                Err(value)
            }
        }
    }

    /// Commands of the Vlan class (class 2).
    #[derive(Clone, Copy, Debug, strum::FromRepr)]
    #[repr(u8)]
    pub enum VlanCmd {
        FilterAdd = 0,
        FilterDelete = 1,
    }

    /// Commands of the Announce class (class 3).
    #[derive(Clone, Copy, Debug, strum::FromRepr)]
    #[repr(u8)]
    pub enum AnnounceCmd {
        Ack = 0,
    }
}

/// Viona's in-kernel emulation of the device VirtQueues is performed in what
/// are called "vrings". Since the userspace portion of the Viona emulation is
/// tasked with keeping the vring state in sync with the VirtQueue it
/// represents, we must track its perceived state.
#[derive(Copy, Clone, Default, Eq, PartialEq)]
enum VRingState {
    /// Initial state of the vring as it comes out of reset
    ///
    /// No guest-physical addresses, interrupt configuration, or avail/used
    /// indices are set on the vring.
    #[default]
    Init,

    /// Address(es) to valid VirtQueue data has been loaded into the vring but
    /// it has not been "kicked" to begin any processing.
    Ready,

    /// The vring has been "kicked" and it is proceeding to process TX/RX work
    /// as possible.
    Run,

    /// The vring has been issued a pause command to temporarily cease
    /// processing any work.  This is to allow the userspace emulation to gather
    /// a consistent snapshot of vring state.
    Paused,

    /// An error occurred while attempting to manipulate the vring.  This could
    /// be due to invalid configuration from the guest, or programmer error
    /// leading to unexpected device conditions.  If guest actions reset the
    /// vring state (by resetting the device, or reprogramming the VirtQueue),
    /// the vring can transition out of this error state.
    Error,

    /// An error occurred while attempting to reset the vring state.  This is
    /// unrecoverable and will assert a "failed" state on the VirtIO device as a
    /// whole.
    Fatal,
}

/// Mutable device state; `PciVirtioViona` keeps it behind a `Mutex`.
struct Inner {
    /// Handle to the interrupt poller; `None` until it is spawned and again
    /// after it has been told to exit.
    poller: Option<PollerHdl>,
    /// Last notification I/O port reported for the device, if any.
    iop_state: Option<NonZeroU16>,
    /// Last notification MMIO address reported for the device, if any.
    notify_mmio_addr: Option<u64>,
    /// Tracked state of each vring, indexed by virtqueue id.
    vring_state: Vec<VRingState>,
}
impl Inner {
    fn new(max_queues: usize) -> Self {
        let vring_state = vec![Default::default(); max_queues];
        let poller = None;
        let iop_state = None;
        let notify_mmio_addr = None;
        Self { poller, iop_state, notify_mmio_addr, vring_state }
    }

    /// Get the `VRingState` for a given VirtQueue
    fn for_vq(&mut self, vq: &VirtQueue) -> &mut VRingState {
        let id = vq.id as usize;
        assert!(id < self.vring_state.len());
        &mut self.vring_state[id]
    }
}

/// Configuration parameters for the underlying viona device
#[derive(Copy, Clone)]
pub struct DeviceParams {
    /// When transmitting packets, should viona (allocate and) copy the entire
    /// contents of the packet, rather than "loaning" the guest memory beyond
    /// the packet headers?
    ///
    /// There is a performance cost to copying the full packet, but it avoids
    /// certain issues pertaining to looped-back viona packets being delivered
    /// to native zones on the machine.
    ///
    /// This parameter requires [viona_api::ApiVersion::V3] or greater. This is
    /// before Propolis' minimum viona API version and can always be set.
    pub copy_data: bool,

    /// Byte count for padding added to the head of transmitted packets.  This
    /// padding can be used by subsequent operations in the transmission chain,
    /// such as encapsulation, which would otherwise need to re-allocate for the
    /// larger header.
    ///
    /// This parameter requires [viona_api::ApiVersion::V3] or greater. This is
    /// before Propolis' minimum viona API version and can always be set.
    pub header_pad: u16,
}
impl DeviceParams {
    /// Apply these parameters to the viona device behind `hdl`.
    ///
    /// An I/O failure from the kernel is returned as-is.  A detailed
    /// parameter error is collapsed into an `InvalidInput` error.
    #[cfg(target_os = "illumos")]
    fn set(&self, hdl: &VionaHdl) -> io::Result<()> {
        // The parameter names below assume an ApiVersion::V3 device
        let mut nvl = viona_api::NvList::new();
        nvl.add(c"tx_copy_data", self.copy_data);
        nvl.add(c"tx_header_pad", self.header_pad);

        match hdl.0.set_parameters(&mut nvl) {
            Ok(_) => Ok(()),
            Err(viona_api::ParamError::Io(err)) => Err(err),
            Err(viona_api::ParamError::Detailed(_)) => Err(Error::new(
                ErrorKind::InvalidInput,
                "unsupported viona parameters",
            )),
        }
    }

    /// Stand-in for platforms without viona: always panics.
    #[cfg(not(target_os = "illumos"))]
    fn set(&self, _hdl: &VionaHdl) -> io::Result<()> {
        panic!("viona and libnvpair not present on non-illumos")
    }
}
impl Default for DeviceParams {
    /// Parameters matching viona's own defaults.
    fn default() -> Self {
        // As of V3, viona allocates/copies the whole packet unless told
        // otherwise, and adds no padding ahead of the header.
        let copy_data = true;
        let header_pad = 0;
        Self { copy_data, header_pad }
    }
}

/// Represents a connection to the kernel's Viona (VirtIO Network Adapter)
/// driver.
pub struct PciVirtioViona {
    /// Generic VirtIO-over-PCI state, including the device's virtqueues.
    virtio_state: PciVirtioState,
    /// PCI device state; also the source of guest memory access.
    pci_state: pci::DeviceState,
    /// Tracks the lifecycle state (start/pause/resume/halt) of the emulation.
    indicator: lifecycle::Indicator,

    /// Feature bits reported as available by the viona device at creation.
    dev_features: u64,
    /// MAC address of the backing link, exposed through device config space.
    mac_addr: [u8; ETHERADDRL],
    /// MTU of the backing link, or `None` if it could not be queried.
    mtu: Option<u16>,
    /// Handle to the in-kernel viona instance.
    hdl: VionaHdl,
    /// Mutable state shared between the device and its poller.
    inner: Mutex<Inner>,
}

impl PciVirtioViona {
    /// Create a viona device on the link named `vnic_name`, using the default
    /// queue sizes (`RX_QUEUE_SIZE`, `TX_QUEUE_SIZE`, `CTL_QUEUE_SIZE`).
    ///
    /// This simply forwards to `new_with_queue_sizes`.
    pub fn new(
        vnic_name: &str,
        vm: &VmmHdl,
        viona_params: Option<DeviceParams>,
    ) -> io::Result<Arc<PciVirtioViona>> {
        Self::new_with_queue_sizes(
            vnic_name,
            RX_QUEUE_SIZE,
            TX_QUEUE_SIZE,
            CTL_QUEUE_SIZE,
            vm,
            viona_params,
        )
    }

    /// Create a viona device on the link named `vnic_name` with explicit
    /// sizes for the receive, transmit and control virtqueues.
    ///
    /// The backing link is looked up through `dladm`, a viona handle is
    /// opened against it and the VM, the optional `viona_params` are applied,
    /// the link MTU is pushed into the kernel where supported, and the
    /// interrupt poller is spawned before the device is returned.
    ///
    /// # Errors
    ///
    /// Any I/O failure from the steps above is returned, including a failure
    /// to set the initial queue-pair count.  A link MTU other than 1500 on a
    /// viona device older than `ApiVersion::V4` yields
    /// `ErrorKind::Unsupported`.
    ///
    /// # Panics
    ///
    /// Panics if the viona API version cannot be queried.
    pub fn new_with_queue_sizes(
        vnic_name: &str,
        rx_queue_size: VqSize,
        tx_queue_size: VqSize,
        ctl_queue_size: VqSize,
        vm: &VmmHdl,
        viona_params: Option<DeviceParams>,
    ) -> io::Result<Arc<PciVirtioViona>> {
        let dlhdl = dladm::Handle::new()?;
        let info = dlhdl.query_link(vnic_name)?;
        let hdl = VionaHdl::new(info.link_id, vm.fd())?;

        #[cfg(feature = "falcon")]
        if let Err(e) = hdl.set_promisc(viona_api::VIONA_PROMISC_ALL_VLAN) {
            // Until/unless this support is integrated into stlouis/illumos,
            // this is an expected failure.   This is needed to use vlans,
            // but shouldn't affect any other use case.
            eprintln!("failed to enable promisc mode on {vnic_name}: {e:?}");
        }

        if let Some(vp) = viona_params {
            vp.set(&hdl)?;
        }

        // Do in-kernel configuration of device MTU
        if let Some(mtu) = info.mtu {
            // NOTE(review): this still panics if the version query fails;
            // consider propagating once the error type is confirmed.
            if hdl.api_version().unwrap() >= viona_api::ApiVersion::V4 {
                hdl.set_mtu(mtu)?;
            } else if mtu != 1500 {
                // Squawk about MTUs not matching the default of 1500
                return Err(io::Error::new(
                    ErrorKind::Unsupported,
                    "viona device version is inadequate to set MTU",
                ));
            }
        }

        let queue_sizes = [rx_queue_size, tx_queue_size]
            .into_iter()
            .cycle()
            .take(max_num_queues())
            .chain([ctl_queue_size])
            .collect::<Vec<VqSize>>();
        // The vector is sized with the maximum number of rings/queues, but
        // until the driver negotiates multiqueue, we only use the first RX/TX
        // pair plus the control queue.
        let queues = VirtQueues::new_with_len(3, &queue_sizes);
        if let Some(ctlq) = queues.get(2) {
            ctlq.set_control();
        }
        let nqueues = queues.max_capacity();
        // A kernel refusal here is an ordinary construction failure, so
        // report it to the caller instead of panicking.
        hdl.set_pairs(1)?;
        // Add one for config space.
        let msix_count = Some(1 + nqueues as u16);
        let (virtio_state, pci_state) = PciVirtioState::new(
            virtio::Mode::Transitional,
            queues,
            msix_count,
            virtio::DeviceId::Network,
            VIRTIO_NET_CFG_SIZE,
        );

        let dev_features = hdl.get_avail_features()?;
        let mut this = PciVirtioViona {
            virtio_state,
            pci_state,
            indicator: Default::default(),
            dev_features,
            mac_addr: [0; ETHERADDRL],
            mtu: info.mtu,
            hdl,
            inner: Mutex::new(Inner::new(nqueues)),
        };
        this.mac_addr.copy_from_slice(&info.mac_addr);
        let this = Arc::new(this);

        // Spawn the interrupt poller
        let mut inner = this.inner.lock().unwrap();
        inner.poller =
            Some(Poller::spawn(this.hdl.as_raw_fd(), Arc::downgrade(&this))?);
        drop(inner);

        Ok(this)
    }

    /// Get the minor instance number of the viona device.
    ///
    /// The query is forwarded to the viona handle; its error is returned
    /// unchanged.
    pub fn instance_id(&self) -> io::Result<u32> {
        self.hdl.instance_id()
    }

    /// Poll viona for rings with a pending interrupt and deliver each one to
    /// the guest.
    ///
    /// Nothing happens when guest memory is not accessible.  Panics if the
    /// poll or the per-ring interrupt clear fails, or if a reported ring index
    /// has no matching virtqueue.
    fn process_interrupts(&self) {
        let Some(mem) = self.pci_state.acc_mem.access() else {
            return;
        };
        let queues = &self.virtio_state.queues;
        // The last queue is the control queue, so it is left out of the poll.
        let polled = self.hdl.intr_poll(queues.len() - 1, |vq_idx| {
            self.hdl.ring_intr_clear(vq_idx).unwrap();
            let vq = queues.get(vq_idx).unwrap();
            vq.send_intr(&mem);
        });
        polled.unwrap();
    }

    /// Report whether `vq` is the control queue, i.e. the last of the
    /// currently active queues.
    fn is_ctl_queue(&self, vq: &VirtQueue) -> bool {
        let queue_count = self.virtio_state.queues.len();
        queue_count == usize::from(vq.id) + 1
    }

    /// Service the control queue: handle every available command and write
    /// an acknowledgement byte back for each.
    ///
    /// Does nothing when guest memory is not accessible.
    fn ctl_queue_notify(&self, vq: &VirtQueue) {
        let Some(mem) = self.pci_state.acc_mem.access() else {
            return;
        };
        loop {
            if vq.avail_is_empty(&mem) {
                break;
            }
            let mut chain = Chain::with_capacity(4);
            // Interrupts are only switched back on afterwards if
            // `disable_intr` reported `true` here.
            let restore_intr = vq.disable_intr(&mem);
            while vq.pop_avail(&mut chain, &mem).is_some() {
                let res = match self.ctl_msg(vq, &mut chain, &mem) {
                    Ok(()) => control::Ack::Ok as u8,
                    Err(()) => control::Ack::Err as u8,
                };
                chain.write(&res, &mem);
                vq.push_used(&mut chain, &mem);
            }
            if restore_intr {
                vq.enable_intr(&mem);
            }
        }
    }

    /// Read one control message header from `chain` and dispatch it to the
    /// handler for its command class.
    ///
    /// Fails if the header cannot be read or does not decode to a known
    /// command.  VLAN and announce commands are accepted without doing
    /// anything.
    fn ctl_msg(
        &self,
        vq: &VirtQueue,
        chain: &mut Chain,
        mem: &MemCtx,
    ) -> Result<(), ()> {
        use control::Command;

        let mut header = control::Header::default();
        let have_header = chain.read(&mut header, &mem);
        if !have_header {
            return Err(());
        }
        let Ok(command) = Command::try_from(header) else {
            return Err(());
        };
        match command {
            Command::Vlan(_) | Command::Announce(_) => Ok(()),
            Command::Rx(cmd) => self.ctl_rx(cmd, vq, chain, mem),
            Command::Mac(cmd) => self.ctl_mac(cmd, vq, chain, mem),
            Command::Mq(cmd) => self.ctl_mq(cmd, vq, chain, mem),
        }
    }

    /// Handle a receive-mode control command.
    ///
    /// Not implemented yet: the arguments are ignored and the command is
    /// always reported as failed.
    fn ctl_rx(
        &self,
        cmd: control::RxCmd,
        vq: &VirtQueue,
        chain: &mut Chain,
        mem: &MemCtx,
    ) -> Result<(), ()> {
        let _ = (cmd, vq, chain, mem);
        Err(())
    }

    /// Handle a MAC control command.
    ///
    /// Not implemented yet: the arguments are ignored and the command is
    /// always reported as failed.
    fn ctl_mac(
        &self,
        cmd: control::MacCmd,
        vq: &VirtQueue,
        chain: &mut Chain,
        mem: &MemCtx,
    ) -> Result<(), ()> {
        let _ = (cmd, vq, chain, mem);
        Err(())
    }

    /// Switch the device to `requested` active queue pairs, in the kernel
    /// and in the userspace queue set.
    ///
    /// The active queue count becomes `requested * 2 + 1`: one RX and one TX
    /// queue per pair, plus the control queue.
    ///
    /// # Errors
    ///
    /// Fails if `requested` is outside `1..=PROPOLIS_MAX_MQ_PAIRS` or if
    /// viona rejects the new pair count.  In both cases the userspace queue
    /// count is left untouched.
    ///
    /// # Panics
    ///
    /// Panics if the queue set refuses the new length.
    fn set_use_pairs(&self, requested: u16) -> Result<(), ()> {
        if !(1..=PROPOLIS_MAX_MQ_PAIRS).contains(&requested) {
            return Err(());
        }

        // No "already configured" shortcut is taken here.  The previous one
        // compared the pair count against the queue count (2 * pairs + 1), so
        // it skipped real changes whenever the two numbers happened to match
        // (e.g. 3 pairs requested while 3 queues were active).
        //
        // `requested` may come straight from the guest, so a kernel refusal
        // is reported as a failed command rather than a panic.
        self.hdl.set_usepairs(requested).map_err(|_| ())?;

        let nqueues = usize::from(requested) * 2 + 1;
        self.virtio_state.queues.set_len(nqueues).expect("num queue pairs");
        Ok(())
    }

    /// Handle a multiqueue control command.
    ///
    /// `SetPairs` reads the requested pair count from `chain` and applies it;
    /// the RSS and hash configuration commands are not supported and always
    /// fail.
    fn ctl_mq(
        &self,
        cmd: control::MqCmd,
        vq: &VirtQueue,
        chain: &mut Chain,
        mem: &MemCtx,
    ) -> Result<(), ()> {
        let _ = vq;
        match cmd {
            control::MqCmd::RssConfig | control::MqCmd::HashConfig => Err(()),
            control::MqCmd::SetPairs => {
                let mut msg = control::Mq::default();
                let have_payload = chain.read(&mut msg, &mem);
                if !have_payload {
                    return Err(());
                }
                let requested = msg.npairs;
                probes::virtio_viona_mq_set_use_pairs!(|| (
                    MqSetPairsCause::Commanded as u8,
                    requested
                ));
                self.set_use_pairs(requested)
            }
        }
    }

    /// Answer a read of the device-specific configuration register `id`.
    ///
    /// Registers with no backing value here are left unwritten.
    fn net_cfg_read(&self, id: &NetReg, ro: &mut ReadOp) {
        match *id {
            NetReg::Speed
            | NetReg::Duplex
            | NetReg::RssMaxKeySize
            | NetReg::RssMaxIndirectionTableLen
            | NetReg::SupportedHashTypes => {}
            NetReg::Mac => ro.write_bytes(&self.mac_addr),
            NetReg::Status => {
                // The link is always reported as up
                ro.write_u16(VIRTIO_NET_S_LINK_UP);
            }
            NetReg::MaxVqPairs => {
                ro.write_u16(PROPOLIS_MAX_MQ_PAIRS);
            }
            NetReg::Mtu => {
                // A guest should only read this once VIRTIO_NET_F_MTU has
                // been offered.  If it asks anyway, answering with zero is
                // preferable to panicking on a missing value.
                let mtu = self.mtu.unwrap_or(0);
                ro.write_u16(mtu);
            }
        }
    }

    /// Pause the associated virtqueues and sync any in-kernel state for them
    /// into the userspace representation.
    ///
    /// A ring that fails to pause or to report its state is marked
    /// `VRingState::Error`.  Panics if the kernel's descriptor address for a
    /// ring disagrees with the userspace one.
    fn queues_sync(&self) {
        let mut inner = self.inner.lock().unwrap();
        for vq in self.virtio_state.queues.iter() {
            // Queues that are not alive have nothing to synchronize.
            if !vq.is_alive() {
                continue;
            }

            let rs = inner.for_vq(vq);
            // In any other state syncing is either redundant (Init) or
            // impossible (Error, Fatal).
            if !matches!(
                *rs,
                VRingState::Ready | VRingState::Run | VRingState::Paused
            ) {
                continue;
            }

            // A control queue has no in-kernel state to pull back, so it is
            // simply recorded as paused.
            if vq.is_control() {
                *rs = VRingState::Paused;
                continue;
            }

            // The ring must be paused to get a consistent snapshot
            if *rs != VRingState::Paused {
                match self.hdl.ring_pause(vq) {
                    Ok(_) => *rs = VRingState::Paused,
                    Err(_) => {
                        *rs = VRingState::Error;
                        continue;
                    }
                }
            }

            let Ok(live) = self.hdl.ring_get_state(vq) else {
                *rs = VRingState::Error;
                continue;
            };
            let base = vq.get_state();
            assert_eq!(live.mapping.desc_addr, base.mapping.desc_addr);
            // Only the indices are taken from the kernel; everything else
            // keeps its userspace value.
            vq.set_state(&queue::Info {
                used_idx: live.used_idx,
                avail_idx: live.avail_idx,
                ..base
            });
        }
    }

    /// Reset every vring and reload it from the userspace virtqueue state,
    /// kicking those that were already alive.
    ///
    /// Returns `Err(())` if any vring failed to reset (that vring is marked
    /// `Fatal`).  Failures while reloading state, configuring interrupts or
    /// kicking only mark the affected vring `Error` and do not change the
    /// return value.
    fn queues_restart(&self) -> Result<(), ()> {
        let mut inner = self.inner.lock().unwrap();
        let mut res = Ok(());
        for vq in self.virtio_state.queues.iter() {
            let rs = inner.for_vq(vq);

            // The existing state machine for vrings in Viona does not allow for
            // a Paused -> Running transition, requiring instead that the vring
            // be reset and reloaded with state in order to proceed again.
            if self.hdl.ring_reset(vq).is_err() {
                *rs = VRingState::Fatal;
                res = Err(());
                // Although this fatal vring state means the device itself will
                // require a reset (which itself is unlikely to work), we
                // continue attempting to reset/restart the other VQs.
                continue;
            }

            *rs = VRingState::Init;
            // Unmapped queues stay in `Init`; there is nothing to load.
            if vq.is_mapped() {
                if self.hdl.ring_set_state(vq.as_ref()).is_err() {
                    *rs = VRingState::Error;
                    continue;
                }

                // Restore the interrupt configuration, if the queue has one.
                if let Some(intr_cfg) = vq.read_intr() {
                    if self.hdl.ring_cfg_msi(vq, Some(intr_cfg)).is_err() {
                        *rs = VRingState::Error;
                        continue;
                    }
                }
                *rs = VRingState::Ready;

                if vq.is_alive() {
                    // If the ring was already running, kick it.
                    if self.hdl.ring_kick(vq).is_err() {
                        *rs = VRingState::Error;
                        continue;
                    }
                    *rs = VRingState::Run;
                }
            }
        }
        res
    }

    /// Make sure all in-kernel virtqueue processing is stopped
    ///
    /// This is done by asking the VirtIO state to reset all of this device's
    /// queues.
    fn queues_kill(&self) {
        self.virtio_state.reset_queues(self);
    }

    /// Tell the interrupt poller to run and wait until it reports that it is
    /// running.
    ///
    /// Panics if the poller has not been spawned.
    fn poller_start(&self) {
        // The lock is only held long enough to signal the poller.
        let wait_state = {
            let mut guard = self.inner.lock().unwrap();
            let poller =
                guard.poller.as_mut().expect("poller should be spawned");
            let state = poller.state.clone();
            let _ = poller.sender.send(TargetState::Run);
            state
        };
        // wait_running() blocks on a condition variable which is signalled by
        // the Poller task, and that task lives on this same runtime.  Using
        // `block_in_place` keeps this runtime thread from being blocked and
        // helps make sure the Poller we just asked to start is able to.
        tokio::task::block_in_place(|| wait_state.wait_running());
    }
    /// Tell the interrupt poller to pause, or to exit when `should_exit` is
    /// set, and wait until it reports that it has stopped.
    ///
    /// On exit the poller handle is removed from the device state.  Panics if
    /// the poller has not been spawned.
    fn poller_stop(&self, should_exit: bool) {
        // The lock is only held long enough to signal the poller.
        let wait_state = {
            let mut guard = self.inner.lock().unwrap();
            if should_exit {
                let poller =
                    guard.poller.take().expect("poller should be spawned");
                let _ = poller.sender.send(TargetState::Exit);
                poller.state
            } else {
                let poller =
                    guard.poller.as_mut().expect("poller should be spawned");
                let _ = poller.sender.send(TargetState::Pause);
                poller.state.clone()
            }
        };
        // The waiter is signalled by the Poller task on this same runtime, so
        // `block_in_place` is used to avoid blocking this runtime thread.
        tokio::task::block_in_place(|| wait_state.wait_stopped());
    }

    /// Transition the emulation to a "running" state, either at initial
    /// start-up or on resumption from a "paused" state.
    ///
    /// If the vrings cannot be restarted, the device is flagged as needing a
    /// reset and both notification hooks are cleared.
    fn run(&self) {
        self.poller_start();
        match self.queues_restart() {
            Ok(()) => {
                // The queues came back fine, so try to re-establish the
                // notification hooks recorded earlier.
                let inner = self.inner.lock().unwrap();
                let _ = self.hdl.set_notify_io_port(inner.iop_state);
                let _ = self.hdl.set_notify_mmio_addr(inner.notify_mmio_addr);
            }
            Err(()) => {
                self.virtio_state.set_needs_reset(self);
                self.notify_port_update(None);
                self.notify_mmio_addr_update(None);
            }
        }
    }
}
impl VirtioDevice for PciVirtioViona {
    /// Service an access to the device-specific configuration space.
    ///
    /// Reads are answered per register; writes are ignored.
    fn rw_dev_config(&self, mut rwo: RWOp) {
        NET_DEV_REGS.process(&mut rwo, |id, rwo| {
            // Only reads have any effect; writes are dropped.
            if let RWOp::Read(ro) = rwo {
                self.net_cfg_read(id, ro);
            }
        });
    }
    /// Report the VirtIO mode held by the device's VirtIO state.
    fn mode(&self) -> virtio::Mode {
        self.virtio_state.mode()
    }

    /// Feature bits offered to the guest: the fixed set provided by this
    /// emulation, the MTU feature when the link MTU is known, and whatever
    /// viona reported as available.
    fn features(&self) -> u64 {
        // VIRTIO_NET_F_MTU is only offered when the MTU could be queried,
        // which is not always possible inside a non-global Zone.
        //
        // Context: https://www.illumos.org/issues/13992
        let mtu_feat = if self.mtu.is_some() { VIRTIO_NET_F_MTU } else { 0 };

        VIRTIO_NET_F_MAC
            | VIRTIO_NET_F_STATUS
            | VIRTIO_NET_F_CTRL_VQ
            | VIRTIO_NET_F_MQ
            | mtu_feat
            | self.dev_features
    }

    /// Apply the negotiated feature bits to viona.
    ///
    /// When multiqueue has been negotiated, the kernel is additionally sized
    /// for `PROPOLIS_MAX_MQ_PAIRS` queue pairs and that many pairs are put
    /// into use.
    fn set_features(&self, feat: u64) -> Result<(), ()> {
        self.hdl.set_features(feat).map_err(|_| ())?;

        let mq_negotiated = (feat & VIRTIO_NET_F_MQ) != 0;
        if !mq_negotiated {
            return Ok(());
        }

        self.hdl.set_pairs(PROPOLIS_MAX_MQ_PAIRS).map_err(|_| ())?;
        probes::virtio_viona_mq_set_use_pairs!(|| (
            MqSetPairsCause::MqEnabled as u8,
            PROPOLIS_MAX_MQ_PAIRS
        ));
        self.set_use_pairs(PROPOLIS_MAX_MQ_PAIRS)
    }

    fn queue_notify(&self, vq: &VirtQueue) {
        if self.is_ctl_queue(vq) {
            self.ctl_queue_notify(vq);
            return;
        }
        let mut inner = self.inner.lock().unwrap();
        let ring_state = inner.for_vq(vq);
        match ring_state {
            VRingState::Ready | VRingState::Run => {
                if self.hdl.ring_kick(vq).is_err() {
                    *ring_state = VRingState::Error;
                } else {
                    *ring_state = VRingState::Run;
                }
            }
            _ => {}
        }
    }
    /// React to a guest-driven change on `vq`, keeping the in-kernel vring
    /// and its tracked `VRingState` in step.
    ///
    /// * `Reset` resets the vring and returns it to `Init`.
    /// * `Address` resets a vring that is not already in `Init`, then (if the
    ///   queue is mapped) initializes it and configures its interrupt,
    ///   leaving it `Ready`.
    /// * `IntrCfg` reprograms the vring's interrupt configuration.
    ///
    /// # Errors
    ///
    /// Only a failed vring reset, or an address change on a vring that is
    /// already `Fatal`, is an error.  Failures to initialize the ring or
    /// configure its interrupt leave the vring in `Error` and still return
    /// `Ok(())`.
    fn queue_change(&self, vq: &VirtQueue, change: VqChange) -> Result<(), ()> {
        let mut inner = self.inner.lock().unwrap();
        let rs = inner.for_vq(vq);

        match change {
            VqChange::Reset => {
                if self.hdl.ring_reset(vq).is_err() {
                    *rs = VRingState::Fatal;
                    return Err(());
                }
                *rs = VRingState::Init;
            }
            VqChange::Address => {
                match *rs {
                    VRingState::Init => {}
                    VRingState::Ready
                    | VRingState::Run
                    | VRingState::Paused
                    | VRingState::Error => {
                        // Reset any vring not already in such a state
                        if self.hdl.ring_reset(vq).is_err() {
                            *rs = VRingState::Fatal;
                            return Err(());
                        }
                        *rs = VRingState::Init;
                    }
                    VRingState::Fatal => {
                        // No sense in trying anything further on a doomed vring
                        return Err(());
                    }
                }
                if !vq.is_mapped() {
                    return Ok(());
                }

                if !vq.is_control() && self.hdl.ring_init(vq).is_err() {
                    // Bad virtqueue configuration is not fatal.  While the
                    // vring will not transition to running, we will be content
                    // to wait for the guest to later provide a valid config.
                    *rs = VRingState::Error;
                    return Ok(());
                }

                if let Some(intr_cfg) = vq.read_intr() {
                    if self.hdl.ring_cfg_msi(vq, Some(intr_cfg)).is_err() {
                        // Stop here so the error is not overwritten by the
                        // `Ready` assignment below.  As with a bad ring
                        // configuration, this is not fatal.
                        *rs = VRingState::Error;
                        return Ok(());
                    }
                }
                *rs = VRingState::Ready;
            }
            VqChange::IntrCfg => {
                if *rs != VRingState::Fatal {
                    let intr = vq.read_intr();
                    if self.hdl.ring_cfg_msi(vq, intr).is_err() {
                        *rs = VRingState::Error;
                    }
                }
            }
        }
        Ok(())
    }
}
impl Lifecycle for PciVirtioViona {
    /// Name under which this device type is reported.
    fn type_name(&self) -> &'static str {
        "pci-virtio-viona"
    }
    /// Reset the VirtIO state and drop back to a single queue pair, both in
    /// use and in the kernel's allocation.
    ///
    /// Panics if the pair count cannot be restored to one.
    fn reset(&self) {
        self.virtio_state.reset(self);
        probes::virtio_viona_mq_set_use_pairs!(|| (
            MqSetPairsCause::Reset as u8,
            1
        ));
        self.set_use_pairs(1).expect("can set viona back to one queue pair");
        self.hdl.set_pairs(1).expect("can set viona back to one queue pair");
        self.virtio_state.queues.reset_peak();
    }
    /// Bring the emulation up for the first time.
    fn start(&self) -> anyhow::Result<()> {
        self.run();
        self.indicator.start();
        Ok(())
    }
    /// Stop the poller, pull vring state back into userspace and detach the
    /// notification hooks.
    fn pause(&self) {
        self.poller_stop(false);
        self.queues_sync();

        // In case the device is being paused because of a pending instance
        // reinitialization (as part of a reboot/reset), the notification ioport
        // binding must be torn down.  Bhyve will emit failure of an attempted
        // reinitialization operation if any ioport hooks persist at that time.
        let _ = self.hdl.set_notify_io_port(None);
        let _ = self.hdl.set_notify_mmio_addr(None);

        self.indicator.pause();
    }
    /// Bring the emulation back up after a pause.
    fn resume(&self) {
        self.run();
        self.indicator.resume();
    }
    /// Shut the device down for good: the poller is told to exit and the
    /// in-kernel state is torn down.
    fn halt(&self) {
        self.poller_stop(true);
        // Destroy any in-kernel state to prevent it from impeding instance
        // destruction.
        self.queues_kill();
        // Failure to delete the viona instance is ignored.
        let _ = self.hdl.delete();
        self.indicator.halt();
    }
    /// This device migrates through its `MigrateMulti` implementation.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }
}

impl PciVirtio for PciVirtioViona {
    /// Borrow the generic VirtIO-over-PCI state.
    fn virtio_state(&self) -> &PciVirtioState {
        &self.virtio_state
    }
    /// Borrow the PCI device state.
    fn pci_state(&self) -> &pci::DeviceState {
        &self.pci_state
    }

    // Both notification addresses (I/O port and MMIO) can move, either
    // because the guest reprogrammed the device or because of administrative
    // work inside propolis.  The latest value is always recorded; the
    // in-kernel hook is only touched while the emulation is running, which
    // is when a change is due to guest action.

    /// Record a new notification I/O port and, while the device is running,
    /// install it in the kernel.
    fn notify_port_update(&self, port: Option<NonZeroU16>) {
        let mut inner = self.inner.lock().unwrap();
        inner.iop_state = port;
        let running = self.indicator.state() == IndicatedState::Run;
        if running {
            let _ = self.hdl.set_notify_io_port(port);
        }
    }
    /// Record a new notification MMIO address and, while the device is
    /// running, install it in the kernel.
    fn notify_mmio_addr_update(&self, addr: Option<u64>) {
        let mut inner = self.inner.lock().unwrap();
        inner.notify_mmio_addr = addr;
        let running = self.indicator.state() == IndicatedState::Run;
        if running {
            let _ = self.hdl.set_notify_mmio_addr(addr);
        }
    }
}

impl MigrateMulti for PciVirtioViona {
    /// Export the device state through the generic `PciVirtio` exporter.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        <dyn PciVirtio>::export(self, output, ctx)
    }

    /// Import the generic `PciVirtio` state, then bring the in-kernel viona
    /// device in line with it: negotiated features, allocated queue pairs
    /// (when multiqueue was negotiated) and the number of pairs in use.
    ///
    /// # Errors
    ///
    /// Every failure to configure viona, and an odd number of imported I/O
    /// queues, is reported as `MigrateStateError::ImportFailed`.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError> {
        <dyn PciVirtio>::import(self, offer, ctx)?;

        let feat = self.virtio_state.negotiated_features();
        self.hdl.set_features(feat).map_err(|e| {
            MigrateStateError::ImportFailed(format!(
                "error while setting viona features ({feat:x}): {e:?}"
            ))
        })?;

        if (feat & VIRTIO_NET_F_MQ) != 0 {
            // A kernel refusal is an import failure like the others here,
            // not a reason to panic in the middle of a migration.
            self.hdl.set_pairs(PROPOLIS_MAX_MQ_PAIRS).map_err(|e| {
                MigrateStateError::ImportFailed(format!(
                    "error while setting viona queue pairs \
                     ({PROPOLIS_MAX_MQ_PAIRS}): {e:?}"
                ))
            })?;
        }
        // Queue count is a NonZeroU16; hence `get` and -1 will not underflow.
        // The one queue subtracted is the control queue.
        let io_queues = self.virtio_state.queues.count().get() - 1;
        if !io_queues.is_multiple_of(2) {
            return Err(MigrateStateError::ImportFailed(format!(
                "source IO queue count was not even: {io_queues}"
            )));
        }
        let pairs = io_queues / 2;
        probes::virtio_viona_mq_set_use_pairs!(|| (
            MqSetPairsCause::Import as u8,
            pairs
        ));
        self.hdl.set_usepairs(pairs).map_err(|e| {
            MigrateStateError::ImportFailed(format!(
                "error while restoring use pairs ({pairs}): {e:?}"
            ))
        })?;

        Ok(())
    }
}

/// Registers in the device-specific configuration space.
///
/// Only `Mac`, `Status`, `MaxVqPairs` and `Mtu` are given a value on read;
/// the remaining registers are left unwritten.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum NetReg {
    Mac,
    Status,
    MaxVqPairs,
    Mtu,
    Speed,
    Duplex,
    RssMaxKeySize,
    RssMaxIndirectionTableLen,
    SupportedHashTypes,
}
lazy_static! {
    // Register map used to service accesses to the device-specific
    // configuration space.
    static ref NET_DEV_REGS: RegMap<NetReg> = {
        // (register, width) pairs in layout order.
        // NOTE(review): widths are presumably in bytes and packed back to
        // back — they sum to 24; confirm this matches VIRTIO_NET_CFG_SIZE.
        let layout = [
            (NetReg::Mac, 6),
            (NetReg::Status, 2),
            (NetReg::MaxVqPairs, 2),
            (NetReg::Mtu, 2),
            (NetReg::Speed, 4),
            (NetReg::Duplex, 1),
            (NetReg::RssMaxKeySize, 1),
            (NetReg::RssMaxIndirectionTableLen, 2),
            (NetReg::SupportedHashTypes, 4),
        ];
        RegMap::create_packed(VIRTIO_NET_CFG_SIZE, &layout, None)
    };
}

use viona_api::VionaFd;

impl From<&VirtQueue> for viona_api::vioc_ring_init_modern {
    /// Build a ring-init request from a virtqueue's id, size and the
    /// guest-physical addresses of its descriptor, available and used rings.
    /// All other fields take their default values.
    fn from(vq: &VirtQueue) -> viona_api::vioc_ring_init_modern {
        let rim_index = vq.id;
        let rim_qsize = vq.size();
        let info = vq.get_state();
        Self {
            rim_index,
            rim_qsize,
            rim_qaddr_desc: info.mapping.desc_addr,
            rim_qaddr_avail: info.mapping.avail_addr,
            rim_qaddr_used: info.mapping.used_addr,
            ..Default::default()
        }
    }
}

impl From<&VirtQueue> for viona_api::vioc_ring_state {
    /// Build a ring-state record from a virtqueue: its id, size, the
    /// guest-physical addresses of its three rings, and its current used and
    /// available indices.
    fn from(vq: &VirtQueue) -> viona_api::vioc_ring_state {
        let vrs_index = vq.id;
        let vrs_qsize = vq.size();
        let info = vq.get_state();
        Self {
            vrs_index,
            vrs_qsize,
            vrs_qaddr_desc: info.mapping.desc_addr,
            vrs_qaddr_avail: info.mapping.avail_addr,
            vrs_qaddr_used: info.mapping.used_addr,
            vrs_used_idx: info.used_idx,
            vrs_avail_idx: info.avail_idx,
        }
    }
}

/// Thin wrapper around a `VionaFd`, providing typed helpers for the viona
/// ioctls used by this device.
struct VionaHdl(VionaFd);
impl VionaHdl {
    fn new(link_id: u32, vm_fd: RawFd) -> io::Result<Self> {
        let vfd = VionaFd::new(link_id, vm_fd)?;

        Ok(Self(vfd))
    }
    fn delete(&self) -> io::Result<()> {
        self.0.ioctl_usize(viona_api::VNA_IOC_DELETE, 0)?;
        Ok(())
    }
    fn get_avail_features(&self) -> io::Result<u64> {
        let mut features = 0;
        unsafe {
            self.0.ioctl(viona_api::VNA_IOC_GET_FEATURES, &mut features)?;
        }
        Ok(features)
    }
    fn set_features(&self, mut features: u64) -> io::Result<()> {
        unsafe {
            self.0.ioctl(viona_api::VNA_IOC_SET_FEATURES, &mut features)?;
        }
        Ok(())
    }
    fn set_pairs(&self, npairs: u16) -> io::Result<()> {
        self.0.ioctl_usize(viona_api::VNA_IOC_SET_PAIRS, npairs as usize)?;
        Ok(())
    }
    fn set_usepairs(&self, npairs: u16) -> io::Result<()> {
        self.0.ioctl_usize(viona_api::VNA_IOC_SET_USEPAIRS, npairs as usize)?;
        Ok(())
    }
    fn ring_init(&self, vq: &VirtQueue) -> io::Result<()> {
        if !vq.is_control() {
            let mut vna_ring_init = viona_api::vioc_ring_init_modern::from(vq);
            unsafe {
                self.0.ioctl(
                    viona_api::VNA_IOC_RING_INIT_MODERN,
                    &mut vna_ring_init,
                )?;
            }
        }
        Ok(())
    }
    fn ring_reset(&self, vq: &VirtQueue) -> io::Result<()> {
        if !vq.is_control() {
            let idx = vq.id as usize;
            self.0.ioctl_usize(viona_api::VNA_IOC_RING_RESET, idx)?;
        }
        Ok(())
    }
    fn ring_kick(&self, vq: &VirtQueue) -> io::Result<()> {
        if !vq.is_control() {
            let idx = vq.id as usize;
            self.0.ioctl_usize(viona_api::VNA_IOC_RING_KICK, idx)?;
        }
        Ok(())
    }
    fn ring_pause(&self, vq: &VirtQueue) -> io::Result<()> {
        if !vq.is_control() {
            let idx = vq.id as usize;
            self.0.ioctl_usize(viona_api::VNA_IOC_RING_PAUSE, idx)?;
        }
        Ok(())
    }
    fn ring_set_state(&self, vq: &VirtQueue) -> io::Result<()> {
        if !vq.is_control() {
            let mut cfg = viona_api::vioc_ring_state::from(vq);
            unsafe {
                self.0.ioctl(viona_api::VNA_IOC_RING_SET_STATE, &mut cfg)?;
            }
        }
        Ok(())
    }
    fn ring_get_state(&self, vq: &VirtQueue) -> io::Result<queue::Info> {
        let mut cfg = viona_api::vioc_ring_state {
            vrs_index: vq.id,
            ..Default::default()
        };
        if !vq.is_control() {
            unsafe {
                self.0.ioctl(viona_api::VNA_IOC_RING_GET_STATE, &mut cfg)?;
            }
        }
        Ok(queue::Info {
            mapping: queue::MapInfo {
                desc_addr: cfg.vrs_qaddr_desc,
                avail_addr: cfg.vrs_qaddr_avail,
                used_addr: cfg.vrs_qaddr_used,
                valid: true,
            },
            avail_idx: cfg.vrs_avail_idx,
            used_idx: cfg.vrs_used_idx,
        })
    }
    fn ring_cfg_msi(
        &self,
        vq: &VirtQueue,
        cfg: Option<VqIntr>,
    ) -> io::Result<()> {
        if !vq.is_control() {
            let (addr, msg) = match cfg {
                Some(VqIntr::Msi(a, m, masked)) if !masked => (a, m),
                // If MSI is disabled, or the entry is masked (individually,
                // or at the function level), then disable in-kernel
                // acceleration of MSI delivery.
                _ => (0, 0),
            };

            let mut vna_ring_msi = viona_api::vioc_ring_msi {
                rm_index: vq.id,
                _pad: [0; 3],
                rm_addr: addr,
                rm_msg: u64::from(msg),
            };
            unsafe {
                self.0.ioctl(
                    viona_api::VNA_IOC_RING_SET_MSI,
                    &mut vna_ring_msi,
                )?;
            }
        }
        Ok(())
    }
    fn intr_poll(
        &self,
        max_intrs: usize,
        mut f: impl FnMut(u16),
    ) -> io::Result<()> {
        let mut vna_ip = viona_api::vioc_intr_poll_mq::default();
        vna_ip.vipm_nrings = max_intrs as u16;
        let mut nintrs = unsafe {
            self.0.ioctl(viona_api::VNA_IOC_INTR_POLL_MQ, &mut vna_ip)?
        };
        let nrings = vna_ip.vipm_nrings as usize;
        for i in 0..nrings {
            let k = i / 32;
            let b = i % 32;
            if vna_ip.vipm_status[k].get_bit(b) {
                f(i as u16);
                nintrs -= 1;
                if nintrs == 0 {
                    break;
                }
            }
        }
        Ok(())
    }
    fn ring_intr_clear(&self, idx: u16) -> io::Result<()> {
        self.0.ioctl_usize(viona_api::VNA_IOC_RING_INTR_CLR, idx as usize)?;
        Ok(())
    }

    /// Get the minor instance number of the viona device.
    /// This is used for matching kernal statistic entries to the viona device.
    fn instance_id(&self) -> io::Result<u32> {
        self.0.instance_id()
    }

    /// Set MTU for viona device
    fn set_mtu(&self, mtu: u16) -> io::Result<()> {
        self.0.ioctl_usize(viona_api::VNA_IOC_SET_MTU, mtu.into())?;
        Ok(())
    }

    fn api_version(&self) -> io::Result<u32> {
        self.0.api_version()
    }

    /// Sets the address that viona recognizes for virtqueue notifications
    ///
    /// Viona can install a hook in the associated VM at a specified address (in
    /// either the guest port or physical address spaces) to recognize guest
    /// writes that notify in-kernel emulated virtqueues of available buffers.
    ///
    /// With a non-zero argument, viona will attempt to attach such a hook,
    /// replacing any currently in place.  When the argument is None, any
    /// existing hook is torn down.
    fn set_notify_io_port(&self, port: Option<NonZeroU16>) -> io::Result<()> {
        self.0.ioctl_usize(
            viona_api::VNA_IOC_SET_NOTIFY_IOP,
            port.map(|p| p.get()).unwrap_or(0) as usize,
        )?;
        Ok(())
    }
    fn set_notify_mmio_addr(&self, addr: Option<u64>) -> io::Result<()> {
        let mut vim = viona_api::vioc_notify_mmio::default();
        let ptr = addr
            .map(|vim_address| {
                vim.vim_address = vim_address;
                vim.vim_size = super::pci::NOTIFY_REG_SIZE as u32;
                &raw mut vim
            })
            .unwrap_or(std::ptr::null_mut());
        unsafe {
            self.0.ioctl(viona_api::VNA_IOC_SET_NOTIFY_MMIO, ptr)?;
        }
        Ok(())
    }

    ///
    /// Set the desired promiscuity level on this interface.
    #[cfg(feature = "falcon")]
    fn set_promisc(&self, p: i32) -> io::Result<()> {
        self.0.ioctl_usize(viona_api::VNA_IOC_SET_PROMISC, p as usize)?;
        Ok(())
    }
}

impl AsRawFd for VionaHdl {
    fn as_raw_fd(&self) -> RawFd {
        self.0.as_raw_fd()
    }
}

// This is an ugly hack to work around tokio's inability to poll for event
// readiness on states other than POLLIN/POLLOUT, since viona communicates
// changes to in-kernel ring interrupt state with POLLRDBAND.  In the short
// term, we can translate that to POLLIN using nested epoll.  The viona fd is
// added to an epoll handle, subscribing to EPOLLRDBAND.  When that condition is
// met for the device, epoll will generate an event, making the epoll fd itself
// readable.  We can subscribe to that using the normal tokio event system.
//
// In the long term, viona should probably move to something like eventfd to
// make polling on those ring interrupt events more accessible.
/// State owned by the async task which watches viona for ring interrupts.
struct Poller {
    /// epoll handle wrapping the viona fd; closed when the `Poller` is dropped.
    epfd: RawFd,
    /// Receives the state (pause/run/exit) the poller task should move to.
    receiver: watch::Receiver<TargetState>,
    /// Device whose `process_interrupts()` is called when an event is
    /// present.  Held weakly: the task exits once the device has been dropped.
    dev: Weak<PciVirtioViona>,
    /// Running/stopped flag shared with the `PollerHdl`.
    state: Arc<PollerState>,
}

/// State requested of the poller task through its watch channel.
enum TargetState {
    /// Stop polling (the task marks itself stopped) and wait for the next
    /// state change.
    Pause,
    /// Mark the task running and poll for interrupts.
    Run,
    /// Leave the polling loop for good.
    Exit,
}
/// Running/stopped flag for the poller task, with a condition variable so
/// that other threads can block until the flag reaches a desired value.
struct PollerState {
    cv: Condvar,
    running: Mutex<bool>,
}
impl PollerState {
    /// Block the calling thread until the flag reads "not running".
    fn wait_stopped(&self) {
        let running = self.running.lock().unwrap();
        drop(self.cv.wait_while(running, |is_running| *is_running).unwrap());
    }
    /// Block the calling thread until the flag reads "running".
    fn wait_running(&self) {
        let running = self.running.lock().unwrap();
        drop(self.cv.wait_while(running, |is_running| !*is_running).unwrap());
    }
    /// Clear the flag, waking all waiters if it was previously set.
    fn set_stopped(&self) {
        let mut running = self.running.lock().unwrap();
        let was_running = std::mem::replace(&mut *running, false);
        if was_running {
            self.cv.notify_all();
        }
    }
    /// Set the flag, waking all waiters if it was previously clear.
    fn set_running(&self) {
        let mut running = self.running.lock().unwrap();
        let was_running = std::mem::replace(&mut *running, true);
        if !was_running {
            self.cv.notify_all();
        }
    }
}

/// Handle returned by `Poller::spawn` for controlling the poller task.
struct PollerHdl {
    /// Join handle of the spawned tokio task (kept, but not otherwise used
    /// here).
    _join: JoinHandle<()>,
    /// Sends the desired `TargetState` to the poller task.
    sender: watch::Sender<TargetState>,
    /// Running/stopped flag shared with the poller task.
    state: Arc<PollerState>,
}

#[cfg(target_os = "illumos")]
impl Poller {
    /// Create an epoll handle subscribed to `EPOLLRDBAND` on `viona_fd` and
    /// spawn a tokio task which polls it on behalf of `dev`.
    ///
    /// The task starts in the `Pause` state.  The returned handle carries the
    /// watch sender used to change that state and the shared running flag.
    ///
    /// # Errors
    ///
    /// Returns the OS error if the epoll handle cannot be created or the
    /// viona fd cannot be added to it.
    fn spawn(
        viona_fd: RawFd,
        dev: Weak<PciVirtioViona>,
    ) -> io::Result<PollerHdl> {
        let epfd = unsafe { libc::epoll_create1(libc::EPOLL_CLOEXEC) } as RawFd;
        if epfd == -1 {
            return Err(Error::last_os_error());
        }
        let mut event =
            libc::epoll_event { events: libc::EPOLLRDBAND as u32, u64: 0 };
        let res = unsafe {
            libc::epoll_ctl(epfd, libc::EPOLL_CTL_ADD, viona_fd, &mut event)
        };
        if res == -1 {
            // No `Poller` owns `epfd` yet, so nothing would close it on this
            // path.  Capture errno first, since close() may overwrite it.
            let err = Error::last_os_error();
            unsafe {
                libc::close(epfd);
            }
            return Err(err);
        }

        let state = Arc::new(PollerState {
            cv: Condvar::new(),
            running: Mutex::new(false),
        });
        let (sender, receiver) = watch::channel(TargetState::Pause);
        let mut poller = Poller { epfd, receiver, dev, state: state.clone() };

        let _join = tokio::spawn(async move {
            poller.poll_interrupts().await;
            // Whatever the reason for leaving the loop, release any thread
            // waiting for the poller to stop.
            poller.state.set_stopped();
        });

        Ok(PollerHdl { _join, sender, state })
    }

    /// Check (without blocking) whether the epoll handle has an event pending.
    ///
    /// An interrupted wait is reported as "no event"; any other failure is
    /// returned to the caller.
    fn event_present(&self) -> io::Result<bool> {
        let max_events = 1;
        let mut event = libc::epoll_event { events: 0, u64: 0 };
        // A timeout of 0 makes this a non-blocking poll.
        let res =
            unsafe { libc::epoll_wait(self.epfd, &mut event, max_events, 0) };
        match res {
            -1 => {
                let err = Error::last_os_error();
                if matches!(err.kind(), ErrorKind::Interrupted) {
                    Ok(false)
                } else {
                    Err(err)
                }
            }
            0 => Ok(false),
            x if x == max_events => Ok(true),
            x => {
                panic!("unexpected {} events", x);
            }
        }
    }

    /// Body of the poller task.
    ///
    /// Alternates between honoring the requested `TargetState` and, while
    /// running, waiting for the epoll handle to become readable so that the
    /// device can process its interrupts.  Returns when `Exit` is requested,
    /// the sender or the device is dropped, or polling fails.
    async fn poll_interrupts(&mut self) {
        let afd =
            AsyncFd::with_interest(self.epfd, Interest::READABLE).unwrap();
        loop {
            // State management: stay here until we are asked to run.
            loop {
                match *self.receiver.borrow_and_update() {
                    TargetState::Exit => return,
                    TargetState::Run => {
                        self.state.set_running();
                        break;
                    }
                    TargetState::Pause => {
                        self.state.set_stopped();
                        // Fall through to wait for next state change
                    }
                }
                if self.receiver.changed().await.is_err() {
                    // Sender side is gone
                    return;
                }
            }

            tokio::select! {
                readable = afd.readable() => {
                    if readable.is_err() {
                        return;
                    }
                    let mut readable = readable.unwrap();
                    match self.event_present() {
                        Ok(false) => {
                            // Spurious readiness: re-arm and wait again.
                            readable.clear_ready();
                        }
                        Ok(true) => {
                            if let Some(dev) = Weak::upgrade(&self.dev) {
                                dev.process_interrupts();
                            } else {
                                // Underlying device has been dropped
                                return;
                            }
                        }
                        Err(_) => {
                            return;
                        }
                    };
                }
                _state_change = self.receiver.changed() => {
                    // Fall through to the state management above
                }
            }
        }
    }
}

// macOS doesn't expose the epoll_create1 function as well as some other
// constants used above. Given viona isn't available on non-illumos systems
// anyways, we stub with just enough that it builds and can run unit tests.
#[cfg(not(target_os = "illumos"))]
impl Poller {
    /// Stub for platforms without viona: always fails.
    fn spawn(
        _viona_fd: RawFd,
        _dev: Weak<PciVirtioViona>,
    ) -> io::Result<PollerHdl> {
        let unsupported = Error::new(
            ErrorKind::Other,
            "viona not available on non-illumos systems",
        );
        Err(unsupported)
    }
}

impl Drop for Poller {
    /// Close the epoll handle held by this poller.
    fn drop(&mut self) {
        // The result of close() is deliberately ignored: there is nothing
        // useful to do with a failure here.
        let _ = unsafe { libc::close(self.epfd) };
    }
}

/// Constants for the virtio-net device.
pub(crate) mod bits {
    #![allow(unused)]

    // Link-state flags.  NOTE(review): the `_S_` names suggest these are bits
    // of the device configuration `status` field -- confirm against the
    // VirtIO spec.
    pub const VIRTIO_NET_S_LINK_UP: u16 = 1 << 0;
    pub const VIRTIO_NET_S_ANNOUNCE: u16 = 1 << 1;

    // Total size in bytes of the device-specific configuration registers.
    // The terms match, in order, the per-register sizes in the `NET_DEV_REGS`
    // layout.
    pub const VIRTIO_NET_CFG_SIZE: usize = 6 + 2 + 2 + 2 + 4 + 1 + 1 + 2 + 4;
}
use bits::*;

/// Check that available viona API matches expectations of propolis crate
pub(crate) fn check_api_version() -> Result<(), crate::api_version::Error> {
    let vers = viona_api::api_version()?;

    // when setting up a vNIC, Propolis will unconditionally do the SET_PAIRS
    // ioctl, which requires V6.
    let want = viona_api::ApiVersion::V6 as u32;

    if vers < want {
        Err(crate::api_version::Error::TooLow { have: vers, want })
    } else {
        Ok(())
    }
}

/// Test functionality of the virtio-nic device as much as seems reasonable
/// without having a full guest and driver running the device. Unless stated
/// otherwise, the test expectations here are not grounded in any kind of
/// observed behavior, just "the VirtIO spec says ... so ..."
///
/// If guests require changes that cause these tests to fail, please note the
/// circumstances carefully, and consider if these test expectations were correct
/// in the first place; in some sense these tests function as a bespoke
/// "virtio-nic driver" that lives only in Propolis' tests.
#[cfg(test)]
mod test {
    use crate::common::{GuestAddr, RWOp, ReadOp, WriteOp, MB, PAGE_SIZE};
    use crate::hw::chipset::i440fx::{self, I440FxHostBridge};
    use crate::hw::chipset::Chipset;
    use crate::hw::pci;
    use crate::hw::pci::device::Device;
    use crate::hw::pci::Bdf;
    use crate::hw::virtio::pci::Status;
    use crate::hw::virtio::viona::{
        VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_MAC, VIRTIO_NET_F_MQ,
        VIRTIO_NET_F_STATUS,
    };
    use crate::hw::virtio::PciVirtioViona;
    use crate::lifecycle::Lifecycle;
    use crate::migrate::{
        MigrateCtx, MigrateMulti, PayloadOffer, PayloadOffers, PayloadOutputs,
    };
    use crate::Machine;
    use std::env::VarError;
    use std::process::Command;
    use std::sync::Arc;

    /// Everything a test needs to exercise one viona device: the test VM, the
    /// device itself, and the names needed to rebuild both for a "migration".
    struct TestCtx {
        /// Name of the test; used to derive the test VM's name.
        test_name: &'static str,
        /// Passed to `create_vnic` when the vnic is re-created in `migrate`.
        underlying_nic: String,
        /// Name of the vnic backing the viona device.
        vnic_name: String,
        machine: Machine,
        dev: Arc<PciVirtioViona>,
    }

    impl Drop for TestCtx {
        fn drop(&mut self) {
            Lifecycle::pause(self.dev.as_ref());
            Lifecycle::halt(self.dev.as_ref());
        }
    }

    impl TestCtx {
        /// Create a fresh test "driver" for this context's device.
        fn create_driver(&self) -> VirtioNetDriver<'_, '_> {
            VirtioNetDriver::for_hardware(&self.machine, &self.dev)
        }

        /// Simulate a migration of the device: export its state, tear down
        /// this context, re-create the vnic and a new context with the same
        /// names, import the exported state into the new device and start it.
        ///
        /// Panics if any step fails.
        fn migrate(self) -> TestCtx {
            // Export the device state and serialize each payload to JSON,
            // keeping its kind and version alongside.
            let mut dev_payloads = PayloadOutputs::new();
            let acc_mem =
                self.machine.acc_mem.access().expect("machine has memory");
            let ctx = MigrateCtx { mem: &acc_mem };
            <PciVirtioViona>::export(&self.dev, &mut dev_payloads, &ctx)
                .expect("can export PciVirtioViona");
            let mut payloads = Vec::new();
            for output in dev_payloads.into_iter() {
                let bytes = serde_json::to_string(&output.payload)
                    .expect("serializing payload output");
                let serialized = (output.kind, output.version, bytes);
                payloads.push(serialized);
            }

            // Loosely follow the structure of `import_device` as `propolis-server` would; the
            // combination of type erasure and borrows make this somewhat more complicated than it
            // would ideally be..
            //
            // The deserializers borrow from `payloads` and the offers borrow
            // from the deserializers, so both vectors must outlive `offers`.
            let mut desers = Vec::new();
            for (_, _, bytes) in payloads.iter() {
                desers.push(serde_json::Deserializer::from_str(&bytes));
            }
            let mut offers = Vec::new();
            for ((kind, version, _bytes), deser) in
                payloads.iter().zip(desers.iter_mut())
            {
                let deserialized =
                    Box::new(<dyn erased_serde::Deserializer>::erase(deser));
                offers.push(PayloadOffer {
                    kind,
                    version: *version,
                    payload: deserialized,
                });
            }
            let mut offers = PayloadOffers::new(offers);

            // Save what is needed to rebuild the context before `self` is
            // dropped.
            let vnic_name = self.vnic_name.clone();
            let underlying_nic = self.underlying_nic.clone();
            let test_name = self.test_name;

            // Dropping `self` pauses and halts the old device (see the `Drop`
            // impl for `TestCtx`).
            std::mem::drop(acc_mem);
            std::mem::drop(self);

            delete_vnic(&vnic_name);
            create_vnic(&underlying_nic, &vnic_name);

            // Build the "destination" and import the exported state into it.
            let new_ctx =
                create_test_ctx(test_name, &underlying_nic, &vnic_name);
            let acc_mem = new_ctx
                .machine
                .acc_mem
                .access()
                .expect("new machine has memory");
            let new_migrate = MigrateCtx { mem: &acc_mem };
            <PciVirtioViona>::import(&new_ctx.dev, &mut offers, &new_migrate)
                .expect("can import PciVirtioViona");
            Lifecycle::start(new_ctx.dev.as_ref())
                .expect("can start viona device");
            new_ctx
        }
    }

    /// Build a tiny test VM (64 MiB of memory, one CPU, i440fx host bridge)
    /// with a viona device for `vnic_name` attached at PCI 0.8.0.
    ///
    /// Panics if any part of the setup fails.
    fn create_test_ctx(
        test_name: &'static str,
        underlying_nic: &str,
        vnic_name: &str,
    ) -> TestCtx {
        let vm_name = format!("virtio-viona-test-{test_name}");

        // Create the VM with `force: true`: if we're running tests concurrently
        // this will trample an existing test (which should then fail!). We do
        // this so that if a test misconfiguration left a stray old VM hanging
        // around we'll get it out of the way for this test re-run.
        //
        // No reservoir because the test VM is tiny and we don't want to require
        // even more specific host configuration for tests. There's no reason
        // the reservoir should be affecting virtio-nic-related tests anyway.
        let create_opts = crate::vmm::CreateOpts {
            force: true,
            use_reservoir: false,
            track_dirty: false,
        };
        let machine = crate::vmm::Builder::new(&vm_name, create_opts)
            .expect("can set up vmm builder")
            .add_mem_region(0, 64 * MB, "test mem")
            .expect("can add dummy mem region")
            .max_cpus(1)
            .expect("can add cpus")
            .finalize()
            .expect("can create test VMM");

        let topology = pci::topology::Builder::new()
            .finish(&machine)
            .expect("can build empty topology")
            .topology;
        let hb_opts = i440fx::Opts {
            power_pin: None,
            reset_pin: None,
            enable_pcie: false,
        };
        let host_bridge = I440FxHostBridge::create(topology, hb_opts);
        let dev = PciVirtioViona::new(vnic_name, &machine.hdl, None)
            .expect("can create test vnic");

        // Host bridge first, then the NIC behind it.
        host_bridge.pci_attach(
            i440fx::DEFAULT_HB_BDF,
            host_bridge.clone(),
            None,
        );
        host_bridge.attach(&machine);
        let nic_bdf = Bdf::new_unchecked(0, 8, 0);
        host_bridge.pci_attach(nic_bdf, dev.clone(), None);

        TestCtx {
            test_name,
            underlying_nic: underlying_nic.to_owned(),
            vnic_name: vnic_name.to_owned(),
            machine,
            dev,
        }
    }

    /// Test helper for reading and writing one structure inside a PCI BAR:
    /// all accesses are relative to a fixed `offset` into `bar`.
    struct BarAccessor<'dev> {
        dev: &'dev PciVirtioViona,
        bar: pci::BarN,
        offset: usize,
    }

    impl<'dev> BarAccessor<'dev> {
        /// Accessor for the structure starting `offset` bytes into `bar`.
        fn at(
            dev: &'dev PciVirtioViona,
            bar: pci::BarN,
            offset: usize,
        ) -> Self {
            BarAccessor { dev, bar, offset }
        }

        /// Read `buf.len()` bytes starting at `addr` within the structure.
        fn read(&self, addr: usize, buf: &mut [u8]) {
            let bar_offset = self.offset + addr;
            let mut read_op = ReadOp::from_buf(bar_offset, buf);
            self.dev.bar_rw(self.bar, RWOp::Read(&mut read_op));
        }

        /// Write `buf` starting at `addr` within the structure.
        fn write(&self, addr: usize, buf: &[u8]) {
            let bar_offset = self.offset + addr;
            let mut write_op = WriteOp::from_buf(bar_offset, buf);
            self.dev.bar_rw(self.bar, RWOp::Write(&mut write_op));
        }

        /// Read a single byte at `addr`.
        fn read_u8(&self, addr: usize) -> u8 {
            let mut bytes = [0u8; 1];
            self.read(addr, &mut bytes);
            let [value] = bytes;
            value
        }

        /// Read a little-endian 16-bit value at `addr`.
        fn read_le16(&self, addr: usize) -> u16 {
            let mut bytes = [0u8; 2];
            self.read(addr, &mut bytes);
            u16::from_le_bytes(bytes)
        }

        /// Read a little-endian 32-bit value at `addr`.
        fn read_le32(&self, addr: usize) -> u32 {
            let mut bytes = [0u8; 4];
            self.read(addr, &mut bytes);
            u32::from_le_bytes(bytes)
        }

        /// Write a single byte at `addr`.
        fn write_u8(&self, addr: usize, v: u8) {
            let bytes = [v];
            self.write(addr, &bytes);
        }

        /// Write `v` at `addr` as a little-endian 16-bit value.
        fn write_le16(&self, addr: usize, v: u16) {
            let bytes = v.to_le_bytes();
            self.write(addr, &bytes);
        }

        /// Write `v` at `addr` as a little-endian 32-bit value.
        fn write_le32(&self, addr: usize, v: u32) {
            let bytes = v.to_le_bytes();
            self.write(addr, &bytes);
        }

        /// Write `v` at `addr` as a little-endian 64-bit value.
        fn write_le64(&self, addr: usize, v: u64) {
            let bytes = v.to_le_bytes();
            self.write(addr, &bytes);
        }
    }

    /// `COMMON_REGS` describes the common configuration structure for VirtIO
    /// devices, but that machinery is oriented around translating access
    /// offsets into a structured enum variant. In these tests though, we'll go
    /// from desired field access to offsets in an RWOp.
    ///
    /// This namespace gives names for various field offsets, matching
    /// `COMMON_REGS` and its source, `struct virtio_pci_common_cfg` from the
    /// VirtIO spec.
    ///
    /// Each constant is the byte offset of the named field from the start of
    /// the structure. `test_common_cfg_size_is_right` checks that the last
    /// field here (`queue_reset`, two bytes) ends exactly at
    /// `COMMON_REG_SIZE_TEST`.
    // Items here are named to match struct fields from the VirtIO spec.
    #[allow(non_upper_case_globals, dead_code)]
    mod common_cfg {
        // > /* About the whole device. */
        pub const device_feature_select: usize = 0;
        pub const device_feature: usize = 4;
        pub const driver_feature_select: usize = 8;
        pub const driver_feature: usize = 12;
        pub const config_msix_vector: usize = 16;
        pub const num_queues: usize = 18;
        pub const device_status: usize = 20;
        pub const config_generation: usize = 21;

        // > /* About a specific virtqueue. */
        // These apply to whichever queue was last written to `queue_select`.
        pub const queue_select: usize = 22;
        pub const queue_size: usize = 24;
        pub const queue_msix_vector: usize = 26;
        pub const queue_enable: usize = 28;
        pub const queue_notify_off: usize = 30;
        pub const queue_desc: usize = 32;
        pub const queue_driver: usize = 40;
        pub const queue_device: usize = 48;
        pub const queue_notify_data: usize = 56;
        pub const queue_reset: usize = 58;
    }

    /// Byte offsets of the fields in the virtio-net device-specific
    /// configuration structure. The offsets follow from laying out the
    /// register sizes in `NET_DEV_REGS` (6, 2, 2, 2, 4, 1, 1, 2, 4) back to
    /// back.
    // Items here are named like struct fields rather than constants, hence
    // the lint allowance.
    #[allow(non_upper_case_globals, dead_code)]
    mod net_config {
        pub const mac: usize = 0;
        pub const status: usize = 6;
        // This field is only valid if VIRTIO_NET_F_MQ is negotiated.
        pub const max_virtqueue_pairs: usize = 8;
        // This field is only valid if VIRTIO_NET_F_MTU is negotiated.
        pub const mtu: usize = 10;
        // This and `duplex` are only valid if VIRTIO_NET_F_SPEED_DUPLEX is
        // negotiated.
        pub const speed: usize = 12;
        pub const duplex: usize = 16;
        // These fields all depend on VIRTIO_NET_F_RSS or related features,
        // which we won't set for these tests..
        pub const rss_max_key_size: usize = 17;
        pub const rss_max_indirection_table_length: usize = 18;
        pub const supported_hash_types: usize = 20;
    }

    #[test]
    fn test_common_cfg_size_is_right() {
        // `queue_reset` is the last field of the structure and is two bytes
        // wide, so its end is the size of the whole structure.
        //
        // TODO: in a more recent rust this could be a `const { assert_eq!() }`
        let end_of_last_field = common_cfg::queue_reset + 2;
        assert_eq!(
            end_of_last_field,
            crate::hw::virtio::pci::COMMON_REG_SIZE_TEST
        );
    }

    /// A very simple "driver" to drive test operations on a VirtIO device based
    /// on our understanding of the VirtIO spec.
    ///
    /// This serves as a stand-in for some kind of guest software initializing
    /// (and potentially one day?) operating a virtio-nic device. Tests using
    /// this "driver" will often instantiate it multiple times as an
    /// approximation of various guest operating systems initializing their
    /// distinct drivers.
    struct VirtioNetDriver<'mach, 'nic> {
        /// Test VM; used to reach guest memory when setting up queues.
        machine: &'mach Machine,
        /// The device being driven.
        dev: &'nic PciVirtioViona,
        /// Accessor for the common configuration structure (BAR 2, offset 0).
        common_config: BarAccessor<'nic>,
        /// Accessor for the device-specific configuration (BAR 2, offset
        /// `PAGE_SIZE`).
        device_config: BarAccessor<'nic>,
        /// "Guest-side" state which survives across device migration.
        state: DriverState,
    }

    /// The "volatile" part of `VirtioNetDriver`: whatever "guest-side" part
    /// should remain constant when tests migrate the corresponding
    /// `PciVirtioViona`.
    //
    // Theoretically this state could live in the test VM, but that would
    // require "migrating" guest memory, which is more work than is strictly
    // necessary to test the device. On top of that it's a bit annoying to
    // fulfill "driver memory" as reads/writes into the test VM, so we don't.
    struct DriverState {
        max_pairs: Option<u16>,
        next_queue_gpa: u64,
    }

    impl DriverState {
        /// State for a driver which has not yet set up any queues.
        fn new() -> Self {
            // Start virtio-nic queues somewhere other than address 0.
            let first_queue_gpa = 2 * MB as u64;
            DriverState { max_pairs: None, next_queue_gpa: first_queue_gpa }
        }
    }

    impl<'mach, 'nic> VirtioNetDriver<'mach, 'nic> {
        /// Create a driver with brand-new (never migrated) state for `dev`.
        fn for_hardware(
            machine: &'mach Machine,
            dev: &'nic PciVirtioViona,
        ) -> Self {
            let fresh_state = DriverState::new();
            Self::import(machine, dev, fresh_state)
        }

        fn set_max_pairs(&mut self, pairs: Option<u16>) {
            self.state.max_pairs = pairs;
        }

        /// Create a driver for `dev` which resumes from previously exported
        /// `state`.
        fn import(
            machine: &'mach Machine,
            dev: &'nic PciVirtioViona,
            state: DriverState,
        ) -> Self {
            Self {
                machine,
                dev,
                // We place virtio_pci_common_cfg at BAR 2, offset 0, so
                // hardcode this in the test for now.
                //
                // TODO: it would be more appropriate to walk through the
                // device's PCI capabilities until we find
                // VIRTIO_PCI_CAP_COMMON_CFG but that's a little annoying..
                common_config: BarAccessor::at(dev, pci::BarN::BAR2, 0),
                // Device-specific configuration offsets above are declared on
                // their own, so even though this is in the same BAR we'll set
                // the base offset to match.
                device_config: BarAccessor::at(
                    dev,
                    pci::BarN::BAR2,
                    PAGE_SIZE,
                ),
                state,
            }
        }

        fn export(self) -> DriverState {
            self.state
        }

        /// Read the `device_status` register.
        ///
        /// Panics if the device reports bits which `Status` does not know.
        fn read_status(&self) -> Status {
            let raw = self.common_config.read_u8(common_cfg::device_status);
            Status::from_bits(raw).unwrap()
        }

        /// Overwrite the `device_status` register with exactly `bits`.
        fn write_status(&self, bits: Status) {
            let raw = bits.bits();
            self.common_config.write_u8(common_cfg::device_status, raw);
        }

        /// Set `bits` in `device_status`, keeping whatever is already set.
        fn set_status_bits(&self, bits: Status) {
            let current = self.read_status();
            self.write_status(current | bits);
        }

        /// True when the device reports neither `NEEDS_RESET` nor `FAILED`.
        fn status_ok(&self) -> bool {
            let error_bits = Status::NEEDS_RESET | Status::FAILED;
            !self.read_status().intersects(error_bits)
        }

        // Modern and legacy queue layout requirements differ a bit, but this
        // sets up queues in the legacy format to be usable in both contexts:
        // the descriptor table starts page-aligned and is immediately followed
        // by the available ring, and the used ring starts at the next page
        // boundary after that.
        //
        // This does not actually initialize the descriptor tables in any
        // meaningful way! These queues are not actually usable!
        //
        // Panics if the queue is already enabled, if the device reports a
        // queue size of zero or above the spec maximum, or if the MSI-X vector
        // written for the queue does not read back.
        fn init_queue(&mut self, queue: u16) {
            self.common_config.write_le16(common_cfg::queue_select, queue);

            // We don't strictly *need* to check if the queue was already
            // active, but Linux does (setup_vq()->vp_modern_get_queue_enable())
            // and it is true that we should not be initializing already-enabled
            // queues. So we check here too.
            let already_enabled =
                self.common_config.read_le16(common_cfg::queue_enable) == 1;
            assert!(!already_enabled);

            let queue_size =
                self.common_config.read_le16(common_cfg::queue_size);
            assert_ne!(queue_size, 0);
            // In "2.7 Split Virtqueues",
            //
            // > The maximum Queue Size value is 32768.
            assert!(queue_size <= 32 * 1024);

            let page_u16: u16 = PAGE_SIZE.try_into().unwrap();
            let page_u64: u64 = PAGE_SIZE.try_into().unwrap();

            // For simplicity, shrink `queue_size` small enough that it fits in
            // one page. There are a few additional items for the various parts
            // of virtqueues in addition to just an array of 16-byte elements,
            // so we use the next smaller power of two so we round up to one
            // page in the end.
            //
            // TODO: with support for VIRTIO_F_RING_PACKED we will be freed from
            // having to write power of 2 sizes
            let chosen_size = (page_u16 / 16) >> 1;
            let effective_size = if chosen_size < queue_size {
                self.common_config
                    .write_le16(common_cfg::queue_size, chosen_size);
                chosen_size
            } else {
                queue_size
            };
            let entries = u64::from(effective_size);

            let acc_mem =
                self.machine.acc_mem.access().expect("can access memory");

            let descriptor_table_gpa = self.state.next_queue_gpa;
            self.common_config
                .write_le64(common_cfg::queue_desc, descriptor_table_gpa);

            // The available ring directly follows the descriptor table, which
            // holds one 16-byte descriptor per queue entry.
            //
            // (Rounding `descriptor_table_gpa` up to a page would be a no-op:
            // it is already page-aligned, so every area of every queue would
            // end up at the very same address.)
            let avail_gpa = descriptor_table_gpa + 16 * entries;
            // First, flags (a 16-bit field).
            // > If the VIRTIO_F_EVENT_IDX feature bit is not negotiated:
            // > * The driver MUST set flags to 0 or 1.
            // > * The driver MAY set flags to 1 to advise the device that
            //     notifications are not needed.
            acc_mem.write::<u16>(GuestAddr(avail_gpa), &0);
            // Index (also 16 bits). "This starts at 0, and increases."
            acc_mem.write::<u16>(GuestAddr(avail_gpa + 2), &0);
            // Leave all the `ring` entries uninitialized, and we've not
            // negotiated VIRTIO_F_EVENT_IDX so no `used_event` for now.
            self.common_config.write_le64(common_cfg::queue_driver, avail_gpa);

            // The available ring is `flags`, `idx`, one 16-bit entry per queue
            // entry, and `used_event`. The used ring starts at the next page
            // boundary after it.
            let avail_end = avail_gpa + 2 * (3 + entries);
            let used_gpa = avail_end.next_multiple_of(page_u64);

            self.common_config.write_le64(common_cfg::queue_device, used_gpa);

            self.common_config.write_le16(common_cfg::queue_enable, 1);

            // Finally, round up past the used ring (`flags`, `idx`, one 8-byte
            // element per queue entry, and `avail_event`) so the next queue (if
            // there is one) starts page-aligned like it should.
            let used_end = used_gpa + 2 * 3 + 8 * entries;
            self.state.next_queue_gpa = used_end.next_multiple_of(page_u64);

            let msi_vector = 0x100 + queue;
            self.common_config
                .write_le16(common_cfg::queue_msix_vector, msi_vector);
            let configured_vector =
                self.common_config.read_le16(common_cfg::queue_msix_vector);
            assert_eq!(configured_vector, msi_vector);
        }

        /// Initialize a VirtIO device according to "Driver Requirements: Device
        /// Initialization". This includes the initial RESET.
        ///
        /// `features` is the set of feature bits the test (playing the role of
        /// the guest driver) wants to negotiate; only the low 32 bits may be
        /// set. After negotiation every data queue is set up through
        /// `init_queue` (one pair, or `max_virtqueue_pairs` pairs when
        /// `VIRTIO_NET_F_MQ` is requested) and DRIVER_OK is set.
        ///
        /// This will panic if device initialization concludes with the device
        /// in NEEDS_RESET. It also panics if `features` does not fit in a
        /// `u32`, if the device does not offer a requested feature, or if the
        /// device does not report FEATURES_OK after the features are written.
        fn modern_device_init(&mut self, features: u64) {
            // > The driver MUST follow this sequence to initialize a device:
            // > 1. Reset the device.
            self.write_status(Status::RESET);

            // > 2. Set the ACKNOWLEDGE status bit: the guest OS has noticed the
            // > device.
            self.set_status_bits(Status::ACK);

            // > 3. Set the DRIVER status bit: the guest OS knows how to drive the
            // > device.
            self.set_status_bits(Status::DRIVER);

            // > 4. Read device feature bits, and write the subset of feature bits
            // > understood by the OS and driver to the device. During this step the
            // > driver MAY read (but MUST NOT write) the device-specific
            // > configuration fields to check that it can support the device before
            // > accepting it.
            let device_feats =
                self.common_config.read_le32(common_cfg::device_feature);

            // VirtIO defines features as up to 64 bits, but the register is an le32
            // with a separate register to select which part of feature space is to
            // be written. Ignore all this given that no features are defined in the
            // upper space yet (and if they were, we're not using them .. yet..?)
            let features_u32: u32 = features
                .try_into()
                .expect("we don't (yet?) care about features above u32");

            // Any requested bit that the device did not offer is a test bug;
            // bail out loudly with all three masks for comparison.
            let unsupported = features_u32 & !device_feats;
            if unsupported != 0 {
                panic!(
                    "Test wants more features than the device offers? \n\
                    Device offers: {:#08x}\n\
                    Test wants:    {:#08x}\n\
                    Device lacks:  {:#08x}\n",
                    device_feats, features_u32, unsupported
                );
            }

            // TODO:
            // if `features` includes multi-queue, for example, a guest might check
            // the number of supported queues to decide if it actually can operate
            // the device in multi-queue mode...
            //
            // We know that `features` is a subset of `device_feats` by
            // `unsupported` being zero, above.
            eprintln!("writing features: {:#08x}", features_u32);
            self.common_config
                .write_le32(common_cfg::driver_feature, features_u32);

            // > 5. Set the FEATURES_OK status bit. The driver MUST NOT accept new
            // > feature bits after this step.
            self.set_status_bits(Status::FEATURES_OK);

            // > 6. Re-read device status to ensure the FEATURES_OK bit is still
            // > set: otherwise, the device does not support our subset of features
            // > and the device is unusable.
            let device_status = self.read_status();
            if !device_status.contains(Status::FEATURES_OK) {
                // Now, this *really* shouldn't happen, because we've checked that
                // the device just offered up all the features we've requested. But
                // it's possible that some features are mutually-exclusive and we've
                // made a bad choice, in theory..
                panic!(
                    "Device does not support requested features: {:#08x}",
                    features
                );
            }

            // Extra pedantically, the device should not be NEEDS_RESET or FAILED.
            assert!(self.status_ok());

            // > 7. Perform device-specific setup, including discovery of virtqueues
            // > for the device, optional per-bus setup, reading and possibly
            // > writing the device's virtio configuration space, and population of
            // > virtqueues.

            let n_qpairs = if features & VIRTIO_NET_F_MQ == 0 {
                1
            } else {
                // We'll configure all of the device's queues here. This is what
                // we've seen both Linux and Windows do with virtio devices (in
                // Linux, virtnet_probe()->init_vqs()). The number of
                // actually-used queues is only configured later.
                let max_pairs = self
                    .device_config
                    .read_le16(net_config::max_virtqueue_pairs);
                // TODO: the *right* thing to do here would be to set up the control
                // queue, send an MQ command with VQ_PAIRS_SET, then wait for a
                // (should be immediate) response. Or .. just call into the device
                // directly.
                self.dev
                    .set_use_pairs(max_pairs)
                    .expect("can set_use_pairs(max_pairs)");
                max_pairs
            };
            // Each pair is two queues, so this is the number of data queues to
            // initialize.
            let n_queues = n_qpairs * 2;
            eprintln!("n_qpairs: {}", n_qpairs);

            for queue in 0..n_queues {
                eprintln!("initializing queue {}", queue);
                self.init_queue(queue);
                assert!(self.status_ok());
            }

            if n_qpairs > 1 {
                // Again following in the footsteps of observed Windows/Linux
                // virtio drivers: now that queues are all initialized, set the
                // number of queue pairs we'll actually use. The test (playing
                // the role of the guest OS) may have selected less than the
                // maximum queue pairs.
                let wanted_pairs = self.state.max_pairs.unwrap_or(n_qpairs);
                assert!(wanted_pairs <= n_qpairs);
                self.dev
                    .set_use_pairs(wanted_pairs)
                    .expect("can set_use_pairs(wanted_pairs)");
            }

            if features & VIRTIO_NET_F_CTRL_VQ != 0 {
                // configure the control queue too?
                //
                // For now this only points `queue_select` at the index just
                // past the data queues; nothing else about that queue is
                // configured here.
                self.common_config
                    .write_le16(common_cfg::queue_select, n_queues);
            }

            // From 5.1.4.2 "Driver Requirements: Device configuration layout",
            // > If the driver negotiates VIRTIO_NET_F_MTU, it MUST supply enough
            // > receive buffers to receive at least one receive packet of size mtu
            // > (plus low level ethernet header length) with gso_type NONE or ECN.
            //
            // TODO: Does this mean that if we do not provide buffers, but set
            // DRIVER_OK, that the device should fail initialization? huh!

            // > 8. Set the DRIVER_OK status bit. At this point the device is
            // > "live".
            //
            // Is the implication (given 7.) that at this point the driver
            // should not? must not? write to the device's virtio configuration
            // space?
            self.set_status_bits(Status::DRIVER_OK);

            // Now that the device is initialized we can check once again that it
            // thinks everything is OK...
            assert!(self.status_ok());
        }
    }

    fn test_device_status_writes(test_ctx: TestCtx) -> TestCtx {
        // `device_status` is how the driver and device cooperate to bring
        // the device up. VirtIO 1.2 section 2.1.2 adds a wrinkle:
        //
        // > The driver MUST NOT clear a device status bit.
        //
        // So a driver that writes back a status with any previously-set bit
        // missing (NEEDS_RESET included) is misbehaving. The device answers
        // such a write with a warning and by setting NEEDS_RESET.

        let driver = test_ctx.create_driver();

        // Scenario one: set a couple of bits, then try to take one back.
        driver.write_status(Status::RESET);
        driver.set_status_bits(Status::ACK | Status::DRIVER);

        let mut acked = driver.read_status();
        assert_eq!(acked, Status::ACK | Status::DRIVER);

        acked.remove(Status::DRIVER);
        driver.write_status(acked);

        // A guest that claimed it can drive the device does not get to
        // withdraw that claim; the device must now want a reset.
        assert!(driver.read_status().contains(Status::NEEDS_RESET));

        // Scenario two: reset, then reach NEEDS_RESET "naturally" through a
        // bad configuration write rather than a bad status write.
        driver.write_status(Status::RESET);
        driver.set_status_bits(Status::ACK | Status::DRIVER);

        let offered =
            driver.common_config.read_le32(common_cfg::device_feature);
        let ctrl_vq: u32 = VIRTIO_NET_F_CTRL_VQ.try_into().unwrap();
        assert!(
            offered & ctrl_vq != 0,
            "device does not support VIRTIO_NET_F_CTRL_VQ??"
        );

        driver.common_config.write_le32(common_cfg::driver_feature, ctrl_vq);

        driver.set_status_bits(Status::FEATURES_OK);
        assert!(driver.read_status().contains(Status::FEATURES_OK));

        // Write a bogus queue size, which earns NEEDS_RESET. VirtIO 1.2
        // caps the queue size, and since VIRTIO_F_RING_PACKED was not
        // negotiated the size must also be a power of two. 65533 breaks
        // both rules.
        driver.common_config.write_le16(common_cfg::queue_size, 65533);

        let mut wedged = driver.read_status();
        assert!(wedged.contains(Status::NEEDS_RESET));

        // Try to write NEEDS_RESET away without actually resetting...
        wedged.remove(Status::NEEDS_RESET);
        driver.write_status(wedged);

        // ...which must not work: only a reset clears NEEDS_RESET.
        assert!(driver.read_status().contains(Status::NEEDS_RESET));

        test_ctx
    }

    fn basic_operation_modern(test_ctx: TestCtx) -> TestCtx {
        // Bring the virtio NIC up repeatedly under a few scenarios. Nothing
        // here actually uses the device or negotiates interesting features.
        let expected_feats =
            VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | VIRTIO_NET_F_CTRL_VQ;

        // A fresh device on a fresh VM: the test stands in for whoever
        // touches the device first, be that OVMF, an initial bootloader, or
        // the guest OS itself.
        let mut first_stage = test_ctx.create_driver();
        first_stage.modern_device_init(expected_feats);

        // The device went unused, and now the next boot stage brings its own
        // driver, which walks through section 3.1.1 all over again.
        let mut next_stage = test_ctx.create_driver();
        next_stage.modern_device_init(expected_feats);

        // `Lifecycle::reset` is what happens to the device when the VM is
        // restarted: a guest reboot, a triple fault, and so on.
        Lifecycle::reset(test_ctx.dev.as_ref());

        // Post-reset, initialize three more times, as if handing the device
        // from OVMF to a bootloader to the OS.
        let mut ovmf = test_ctx.create_driver();
        ovmf.modern_device_init(expected_feats);
        let mut bootloader = test_ctx.create_driver();
        bootloader.modern_device_init(expected_feats);
        let mut guest_os = test_ctx.create_driver();
        guest_os.modern_device_init(expected_feats);

        test_ctx
    }

    /// The same init/re-init/reset sequence as `basic_operation_modern`, but
    /// negotiating `VIRTIO_NET_F_MQ` so every init configures all of the
    /// device's queue pairs. Some of the inits additionally ask for fewer
    /// queue pairs than the device maximum via `set_max_pairs`.
    fn basic_operation_multiqueue(test_ctx: TestCtx) -> TestCtx {
        let expected_feats = VIRTIO_NET_F_MAC
            | VIRTIO_NET_F_STATUS
            | VIRTIO_NET_F_CTRL_VQ
            | VIRTIO_NET_F_MQ;

        let mut driver = test_ctx.create_driver();
        // OVMF just initializes all queues. Linux (at least 6.6.49/Alpine
        // 3.20.3) initializes all queues, then turns down the number of used
        // queues based on available CPUs, if this would be a limiter. Do
        // similar here to keep up the act.
        driver.set_max_pairs(Some(4));
        driver.modern_device_init(expected_feats);
        let mut driver = test_ctx.create_driver();
        driver.modern_device_init(expected_feats);

        // Restart the VM and go through the boot stages again.
        Lifecycle::reset(test_ctx.dev.as_ref());
        let mut driver = test_ctx.create_driver();
        driver.modern_device_init(expected_feats);
        let mut driver = test_ctx.create_driver();
        driver.modern_device_init(expected_feats);

        // Pretending to be Linux, like above set_max_pairs(). The limit has
        // to be set on this driver *before* it initializes the device:
        // `modern_device_init` is what reads it, so setting it afterwards (or
        // on a driver that is about to be replaced) would do nothing.
        let mut driver = test_ctx.create_driver();
        driver.set_max_pairs(Some(4));
        driver.modern_device_init(expected_feats);

        test_ctx
    }

    /// A rough approximation of an MQ-capable OS restarting: the device is
    /// first driven multi-queue, the VM resets and boots through a simple
    /// single-queue driver, and then an MQ-capable OS takes over again.
    fn multiqueue_to_singlequeue_to_multiqueue(test_ctx: TestCtx) -> TestCtx {
        // The single-queue stage negotiates the same features as
        // `basic_operation_modern`; the multi-queue stages add
        // `VIRTIO_NET_F_MQ` on top.
        let single_queue_feats =
            VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | VIRTIO_NET_F_CTRL_VQ;
        let multi_queue_feats = single_queue_feats | VIRTIO_NET_F_MQ;

        let mut mq_os = test_ctx.create_driver();
        mq_os.modern_device_init(multi_queue_feats);

        Lifecycle::reset(test_ctx.dev.as_ref());

        let mut simple_stage = test_ctx.create_driver();
        simple_stage.modern_device_init(single_queue_feats);

        let mut mq_os_again = test_ctx.create_driver();
        mq_os_again.modern_device_init(multi_queue_feats);

        test_ctx
    }

    /// Like `multiqueue_to_singlequeue_to_multiqueue`, but with the device
    /// (and the test driver's state) migrated between each operation.
    fn multiqueue_migration(test_ctx: TestCtx) -> TestCtx {
        // Single-queue stages use the `basic_operation_modern` feature set;
        // multi-queue stages add `VIRTIO_NET_F_MQ`.
        let base_feats =
            VIRTIO_NET_F_MAC | VIRTIO_NET_F_STATUS | VIRTIO_NET_F_CTRL_VQ;
        let mq_feats = base_feats | VIRTIO_NET_F_MQ;

        // Multi-queue init, then migrate.
        let mut first_boot = test_ctx.create_driver();
        first_boot.modern_device_init(mq_feats);

        let saved = first_boot.export();
        let test_ctx = test_ctx.migrate();
        let arrived = VirtioNetDriver::import(
            &test_ctx.machine,
            &test_ctx.dev,
            saved,
        );
        assert!(arrived.status_ok());

        // A fresh driver re-initializes multi-queue with fewer pairs than
        // the maximum; `basic_operation_multiqueue()` talks about why it's
        // an interesting test to shrink max pairs. Then migrate.
        let mut shrunk = test_ctx.create_driver();
        shrunk.set_max_pairs(Some(4));
        shrunk.modern_device_init(mq_feats);

        let saved = shrunk.export();
        let test_ctx = test_ctx.migrate();
        let arrived = VirtioNetDriver::import(
            &test_ctx.machine,
            &test_ctx.dev,
            saved,
        );
        assert!(arrived.status_ok());

        // Reset the device on the destination, then migrate once more.
        Lifecycle::reset(test_ctx.dev.as_ref());
        assert!(arrived.status_ok());

        let saved = arrived.export();
        let test_ctx = test_ctx.migrate();
        let mut single_queue = VirtioNetDriver::import(
            &test_ctx.machine,
            &test_ctx.dev,
            saved,
        );
        assert!(single_queue.status_ok());

        // Single-queue init with the imported driver, then migrate.
        single_queue.modern_device_init(base_feats);

        let saved = single_queue.export();
        let test_ctx = test_ctx.migrate();
        let mut multi_queue = VirtioNetDriver::import(
            &test_ctx.machine,
            &test_ctx.dev,
            saved,
        );
        assert!(multi_queue.status_ok());

        // And finally back to multi-queue.
        multi_queue.modern_device_init(mq_feats);

        test_ctx
    }

    /// Walk through an OVMF->Linux style boot as in the tests above, but only
    /// migrate after the NIC has been reinitialized with more queues enabled
    /// than are actually in use at migration time.
    ///
    /// This guards against a subtle bug we once had: the excess queues were
    /// exported as enabled, but sat outside the `queues.len()` count of
    /// currently-enabled queues. They imported (correctly!) as enabled on the
    /// other end, but then stayed "enabled" because reset did not cover them,
    /// and guests would conclude the device was simply broken. They were
    /// right!
    fn multiqueue_migration_after_boot(test_ctx: TestCtx) -> TestCtx {
        let mq_feats = VIRTIO_NET_F_MAC
            | VIRTIO_NET_F_STATUS
            | VIRTIO_NET_F_CTRL_VQ
            | VIRTIO_NET_F_MQ;

        // First stage: every queue pair the device has.
        let mut all_pairs = test_ctx.create_driver();
        all_pairs.modern_device_init(mq_feats);

        // Second stage: fewer pairs than the maximum.
        // `basic_operation_multiqueue()` talks about why it's an interesting
        // test to shrink max pairs.
        let mut fewer_pairs = test_ctx.create_driver();
        fewer_pairs.set_max_pairs(Some(4));
        fewer_pairs.modern_device_init(mq_feats);

        // Migrate in that state, then reset on the destination.
        let saved = fewer_pairs.export();
        let test_ctx = test_ctx.migrate();
        let arrived = VirtioNetDriver::import(
            &test_ctx.machine,
            &test_ctx.dev,
            saved,
        );
        assert!(arrived.status_ok());
        Lifecycle::reset(test_ctx.dev.as_ref());
        assert!(arrived.status_ok());

        // Boot through the same two stages again on the destination.
        let mut all_pairs = test_ctx.create_driver();
        all_pairs.modern_device_init(mq_feats);
        let mut fewer_pairs = test_ctx.create_driver();
        // Same reasoning as the `set_max_pairs()` before the migration.
        fewer_pairs.set_max_pairs(Some(4));
        fewer_pairs.modern_device_init(mq_feats);

        test_ctx
    }

    // Bears an uncanny resemblance to `phd-test`...
    //
    // One entry in the table of tests driven by `run_viona_tests`.
    struct TestCase {
        // Human-readable test name; the `testcase!` macro fills this in with
        // the stringified name of the test function.
        name: &'static str,
        // The test body. It takes ownership of the test context and hands a
        // context back (not necessarily the same one) when it is done.
        test_fn: fn(TestCtx) -> TestCtx,
    }

    /// Create the temporary (`-t`) vnic `vnic_name` over the link `phys_nic`
    /// with a fixed MAC address, by running `dladm create-vnic` under
    /// `pfexec`.
    ///
    /// Panics if the command cannot be spawned or exits unsuccessfully.
    fn create_vnic(phys_nic: &str, vnic_name: &str) {
        let dladm_args = [
            "dladm",
            "create-vnic",
            "-t",
            "-l",
            phys_nic,
            "-m",
            "2:8:20:ac:70:0",
            vnic_name,
        ];
        let exit = Command::new("pfexec")
            .args(dladm_args)
            .status()
            .expect("can create vnic");
        assert!(exit.success());
    }

    /// Remove the vnic `vnic_name` by running `dladm delete-vnic` under
    /// `pfexec`.
    ///
    /// Panics if the command cannot be spawned or exits unsuccessfully.
    fn delete_vnic(vnic_name: &str) {
        let exit = Command::new("pfexec")
            .args(["dladm", "delete-vnic", vnic_name])
            .status()
            .expect("can delete vnic");
        assert!(exit.success());
    }

    // We'll actually create and destroy some vnics, so not only do we need
    // `dladm`, we need a recent enough viona and everything. This test is
    // only meaningful on an illumos host.
    //
    // Runs every viona test case in turn, each against a freshly created
    // test vnic on the link named by the `VIONA_TEST_NIC` environment
    // variable.
    #[test]
    #[cfg_attr(not(target_os = "illumos"), ignore)]
    fn run_viona_tests() {
        // Deletes the named vnic when dropped, so that a panicking test case
        // does not leave the vnic behind (which would make the next run's
        // `create_vnic` fail).
        struct VnicGuard(&'static str);

        impl Drop for VnicGuard {
            fn drop(&mut self) {
                if std::thread::panicking() {
                    // Already unwinding from a test failure: clean up on a
                    // best-effort basis. Panicking again here would abort the
                    // process and hide the original failure.
                    let _ = Command::new("pfexec")
                        .args(["dladm", "delete-vnic", self.0])
                        .status();
                } else {
                    delete_vnic(self.0);
                }
            }
        }

        let rt = tokio::runtime::Builder::new_multi_thread()
            .enable_all()
            .build()
            .unwrap();

        macro_rules! testcase {
            ($test_fn:ident) => {
                TestCase { name: stringify!($test_fn), test_fn: $test_fn }
            };
        }

        let tests = &[
            testcase!(test_device_status_writes),
            testcase!(basic_operation_modern),
            testcase!(basic_operation_multiqueue),
            testcase!(multiqueue_to_singlequeue_to_multiqueue),
            testcase!(multiqueue_migration),
            testcase!(multiqueue_migration_after_boot),
        ];

        let underlying_nic = match std::env::var("VIONA_TEST_NIC") {
            Ok(val) => val,
            Err(VarError::NotPresent) => {
                eprintln!(
                    "Skipping viona tests as env does not have VIONA_TEST_NIC. \
                    Set this environment variable to an existing link that \
                    Propolis viona tests should create test vnics on.");
                let uname = nix::sys::utsname::uname().unwrap();
                if uname.machine() != std::ffi::OsStr::new("i86pc") {
                    // An i86pc host might be a dev host that does not
                    // actually want us messing with devices for tests, so
                    // skipping is fine there.
                    //
                    // If the tests are running on a different architecture
                    // (say, "oxide"), assume that the missing variable is a
                    // misconfiguration instead and fail rather than "skip".
                    panic!(
                        "host ({}) is not i86pc, refusing to skip viona tests",
                        uname.machine().display()
                    );
                }
                return;
            }
            Err(VarError::NotUnicode(e)) => {
                panic!("non-unicode virtio host nic: {:?}", e.display());
            }
        };

        const TEST_VNIC: &str = "vnic_prop_test0";
        for test in tests {
            let underlying_nic = underlying_nic.clone();
            rt.block_on(async move {
                create_vnic(&underlying_nic, TEST_VNIC);
                // The vnic exists from here on. The guard is declared before
                // the test context so that, on every exit path (including
                // unwinding), the context is dropped first and the vnic is
                // deleted afterwards.
                let _vnic = VnicGuard(TEST_VNIC);

                let test_ctx =
                    create_test_ctx(test.name, &underlying_nic, TEST_VNIC);
                Lifecycle::start(test_ctx.dev.as_ref())
                    .expect("can start viona device");
                let test_ctx = (test.test_fn)(test_ctx);
                drop(test_ctx);
            });
        }
    }
}


================================================
FILE: lib/propolis/src/hw/virtio/vsock.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use lazy_static::lazy_static;
use slog::Logger;
use std::sync::Arc;

use crate::accessors::MemAccessor;
use crate::common::*;
use crate::hw::pci;
use crate::hw::virtio;
use crate::hw::virtio::queue::Chain;
use crate::hw::virtio::queue::VirtQueue;
use crate::hw::virtio::queue::VqSize;
use crate::migrate::*;
use crate::util::regmap::RegMap;
use crate::vmm::MemCtx;
use crate::vsock::packet::VsockPacket;
use crate::vsock::packet::VsockPacketError;
use crate::vsock::packet::VsockPacketHeader;
use crate::vsock::probes;
use crate::vsock::proxy::VsockPortMapping;
use crate::vsock::GuestCid;
use crate::vsock::VsockBackend;
use crate::vsock::VsockProxy;

use super::pci::PciVirtio;
use super::pci::PciVirtioState;
use super::queue::VirtQueues;
use super::VirtioDevice;

// virtio queue index numbers for virtio socket devices

/// Index of the rx queue: the queue `RxPermit::write` pushes packets onto.
pub const VSOCK_RX_QUEUE: u16 = 0x0;
/// Index of the tx queue: the queue `VsockVq::recv_packet` pops packets from.
pub const VSOCK_TX_QUEUE: u16 = 0x1;
/// Index of the event queue.
pub const VSOCK_EVENT_QUEUE: u16 = 0x2;

/// A reservation of one rx queue descriptor chain.
///
/// Holding a permit means there is room to deliver a packet to the guest, so
/// a caller can obtain one *before* reading from a host socket and never be
/// left holding data it cannot deliver because the queue is full.
///
/// A permit mutably borrows its `VsockVq`, so the compiler guarantees at most
/// one permit exists at a time. Dropping a permit without calling `write`
/// leaves the chain cached in the `VsockVq`, ready for the next permit.
pub struct RxPermit<'a> {
    vq: &'a mut VsockVq,
}

impl RxPermit<'_> {
    /// Returns how many bytes of packet payload the reserved descriptor chain
    /// can hold once room for the packet header is set aside.
    pub fn available_data_space(&self) -> usize {
        let chain = self.vq.rx_chain.as_ref().expect("has chain");
        chain
            .remain_write_bytes()
            .saturating_sub(std::mem::size_of::<VsockPacketHeader>())
    }

    /// Consume the permit: write `header` followed by `data` into the
    /// reserved chain and push the chain onto the rx queue's used ring.
    pub fn write(self, header: &VsockPacketHeader, data: &[u8]) {
        // TODO: cannot access memory?
        let mem = self.vq.acc_mem.access().expect("mem access for write");
        let queue = self
            .vq
            .queues
            .get(usize::from(VSOCK_RX_QUEUE))
            .expect("rx queue");

        // NB: a `RxPermit` is only supposed to be handed out while the owning
        // `VsockVq` holds a `Some(Chain)`. Unfortunately that does not seem
        // to be enforceable at compile time.
        let mut chain = self.vq.rx_chain.take().expect("has chain");
        chain.write(header, &mem);

        if !data.is_empty() {
            // Number of payload bytes copied into the chain so far.
            let mut written = 0;
            chain.for_remaining_type(false, |addr, len| {
                let pending = &data[written..];
                match mem.write_from(addr, pending, len) {
                    Some(copied) => {
                        written += copied;
                        // Keep going only while payload remains unwritten.
                        (copied, copied != pending.len())
                    }
                    None => (0, false),
                }
            });
        }

        probes::vsock_pkt_rx!(|| header);
        queue.push_used(&mut chain, &mem);
    }
}

/// The vsock device's virtqueues, bundled with the memory accessor needed to
/// read and write the descriptor chains popped from them.
pub struct VsockVq {
    /// The device's queues, indexed by `VSOCK_RX_QUEUE`, `VSOCK_TX_QUEUE` and
    /// `VSOCK_EVENT_QUEUE`.
    queues: Vec<Arc<VirtQueue>>,
    acc_mem: MemAccessor,
    /// Cached rx chain for permit reuse when dropped without write
    rx_chain: Option<Chain>,
}

impl VsockVq {
    /// Wrap `queues` and `acc_mem`; no rx chain is cached initially.
    pub(crate) fn new(
        queues: Vec<Arc<VirtQueue>>,
        acc_mem: MemAccessor,
    ) -> Self {
        Self { queues, acc_mem, rx_chain: None }
    }

    /// Try to acquire a permit for sending a packet to the guest.
    ///
    /// Returns `Some(RxPermit)` if a descriptor chain is cached or could be
    /// popped from the rx queue. Returns `None` if there is no rx queue, the
    /// rx queue is not alive yet, or no descriptor chain is available.
    pub fn try_rx_permit(&mut self) -> Option<RxPermit<'_>> {
        let vq = self.queues.get(VSOCK_RX_QUEUE as usize)?;
        // See propolis#1110 & propolis#1115
        // If propolis-server has started the vsock device but a different
        // device has encountered an error at startup there's a good chance
        // we attempt to access guest memory and panic. A way of preventing
        // us from doing that is to first check if the virtqueue is alive. A
        // virtqueue only becomes alive once a guest vCPU has run.
        if !vq.is_alive() {
            return None;
        }

        // Reuse cached chain or pop a new one
        if self.rx_chain.is_none() {
            // TODO: cannot access memory?
            let mem = self.acc_mem.access().expect("mem access for write");
            let mut chain = Chain::with_capacity(10);
            if vq.pop_avail(&mut chain, &mem).is_some() {
                self.rx_chain = Some(chain);
            }
        }

        // We only return a permit iff we know that we are holding onto a valid
        // descriptor chain that can be used by the borrowing `RxPermit`
        if self.rx_chain.is_some() {
            Some(RxPermit { vq: self })
        } else {
            None
        }
    }

    /// Receive the next available packet from the TX queue.
    ///
    /// Returns `None` when no descriptor chain is available. Otherwise pops
    /// one chain, parses it, pushes the chain onto the used ring, and returns
    /// the parse result: `Some(Ok(packet))`, or `Some(Err(_))` if the chain
    /// did not contain a valid packet. At most one packet is returned per
    /// call; callers wanting everything that is queued call this repeatedly
    /// until it returns `None`.
    ///
    /// In the future this may be refactored to hand back `GuestRegion`s
    /// rather than copying the packet data.
    pub fn recv_packet(&self) -> Option<Result<VsockPacket, VsockPacketError>> {
        // TODO: cannot access memory?
        let mem = self.acc_mem.access().expect("mem access for read");
        let vq = self
            .queues
            .get(VSOCK_TX_QUEUE as usize)
            .expect("vsock has tx queue");

        let mut chain = Chain::with_capacity(10);
        vq.pop_avail(&mut chain, &mem)?;

        // The chain goes back to the guest whether or not it parsed.
        let packet = VsockPacket::parse(&mut chain, &mem);
        vq.push_used(&mut chain, &mem);

        Some(packet)
    }

    /// Drop any cached descriptor chain.
    ///
    /// This MUST be called when resetting the virtio-socket device so
    /// that we don't use stale `GuestAddr`s across device resets.
    #[cfg(target_os = "illumos")]
    pub(crate) fn clear_rx_chain(&mut self) {
        self.rx_chain = None;
    }
}

/// A virtio socket (vsock) device attached over PCI.
pub struct PciVirtioSock {
    /// The CID reported to the guest through the device config space.
    cid: GuestCid,
    /// The proxy that this device's lifecycle events and queue
    /// notifications are forwarded to.
    backend: VsockProxy,
    virtio_state: PciVirtioState,
    pci_state: pci::DeviceState,
}

impl PciVirtioSock {
    /// Create a vsock device whose rx and tx queues each have `queue_size`
    /// entries, identified to the guest as `cid`, and backed by a
    /// `VsockProxy` constructed from `log`, `cid` and `port_mappings`.
    pub fn new(
        queue_size: u16,
        cid: GuestCid,
        log: Logger,
        port_mappings: Vec<VsockPortMapping>,
    ) -> Arc<Self> {
        // In queue index order: VSOCK_RX_QUEUE, VSOCK_TX_QUEUE, then
        // VSOCK_EVENT_QUEUE.
        let queue_sizes = [
            VqSize::new(queue_size),
            VqSize::new(queue_size),
            VqSize::new(1),
        ];

        let (virtio_state, pci_state) = PciVirtioState::new(
            virtio::Mode::Transitional,
            VirtQueues::new(&queue_sizes),
            // One MSI-X vector each for rx, tx, event
            Some(3),
            virtio::DeviceId::Socket,
            VIRTIO_VSOCK_CFG_SIZE,
        );

        // The proxy gets its own handles to the queues and its own child
        // memory accessor.
        let shared_queues = virtio_state.queues.iter().map(Arc::clone).collect();
        let proxy_mem =
            pci_state.acc_mem.child(Some(String::from("vsock rx queue")));
        let vvq = VsockVq::new(shared_queues, proxy_mem);

        let port_mappings = port_mappings.into_iter().collect();
        let backend = VsockProxy::new(log, cid, vvq, port_mappings);

        Arc::new(Self { cid, backend, virtio_state, pci_state })
    }
}

impl VirtioDevice for PciVirtioSock {
    fn rw_dev_config(&self, mut rwo: crate::common::RWOp) {
        VSOCK_DEV_REGS.process(&mut rwo, |id, rwo| match rwo {
            RWOp::Read(ro) => match id {
                VsockReg::GuestCid => {
                    ro.write_u64(self.cid.get());
                }
            },
            RWOp::Write(_) => {}
        })
    }

    fn features(&self) -> u64 {
        // We support VIRTIO_VSOCK_F_STREAM
        //
        // virtio spec 1.3:
        // The device SHOULD offer the VIRTIO_VSOCK_F_NO_IMPLIED_STREAM feature.
        (VsockFeatures::NO_IMPLIED_STREAM | VsockFeatures::STREAM).bits()
    }

    fn set_features(&self, feat: u64) -> Result<(), ()> {
        // We only care about the vsock specific bits so grab just those
        match VsockFeatures::from_bits_truncate(feat) {
            // If no feature bit has been negotiated, the device SHOULD act as
            // if VIRTIO_VSOCK_F_STREAM has been negotiated.
            f if f.is_empty() => Ok(()),
            f if f == VsockFeatures::STREAM => Ok(()),
            // We have not advertised SEQPACKET so we don't expect it to show up
            // here.
            _ => Err(()),
        }
    }

    fn mode(&self) -> virtio::Mode {
        virtio::Mode::Transitional
    }

    fn queue_notify(&self, vq: &VirtQueue) {
        let _ = self.backend.queue_notify(vq.id);
    }
}

// Accessors handing out this device's virtio and PCI state.
impl PciVirtio for PciVirtioSock {
    /// Borrow the device's virtio state.
    fn virtio_state(&self) -> &PciVirtioState {
        &self.virtio_state
    }
    /// Borrow the device's PCI state.
    fn pci_state(&self) -> &pci::DeviceState {
        &self.pci_state
    }
}

// Lifecycle events are mostly forwarded straight to the vsock backend.
impl Lifecycle for PciVirtioSock {
    fn type_name(&self) -> &'static str {
        "pci-virtio-socket"
    }
    /// Start the backend. Always returns `Ok`.
    fn start(&self) -> Result<(), anyhow::Error> {
        self.backend.start();
        Ok(())
    }
    /// Ask the backend to pause (discarding what `pause` returns), then wait
    /// for it to report stopped.
    fn pause(&self) {
        let _ = self.backend.pause();
        self.backend.wait_stopped();
    }
    /// Reset the virtio state first, then the backend.
    fn reset(&self) {
        self.virtio_state.reset(self);
        self.backend.reset();
    }
    fn resume(&self) {
        self.backend.resume();
    }
    fn halt(&self) {
        self.backend.halt();
    }
    /// This device does not support migration yet.
    fn migrate(&'_ self) -> Migrator<'_> {
        // TODO (MTZ):
        // We need to support migration propolis#1065
        Migrator::NonMigratable
    }
}

/// Registers in the vsock device-specific config space.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum VsockReg {
    /// The guest's CID; read as a u64 by `rw_dev_config`.
    GuestCid,
}

// Register map for the device-specific config space: a single 8-byte
// `GuestCid` register, which fills all `VIRTIO_VSOCK_CFG_SIZE` bytes.
lazy_static! {
    static ref VSOCK_DEV_REGS: RegMap<VsockReg> = {
        let layout = [(VsockReg::GuestCid, 8)];
        RegMap::create_packed(VIRTIO_VSOCK_CFG_SIZE, &layout, None)
    };
}

/// Constants and feature flags specific to the virtio socket device.
mod bits {
    /// Size in bytes of the device-specific config space (one 8-byte
    /// register, the guest CID).
    pub const VIRTIO_VSOCK_CFG_SIZE: usize = 0x8;

    bitflags! {
        /// Device-specific feature bits for virtio socket devices.
        #[derive(Debug, Clone, Copy, PartialEq, Eq)]
        pub struct VsockFeatures: u64 {
            /// VIRTIO_VSOCK_F_STREAM
            const STREAM    = 1 << 0;
            /// VIRTIO_VSOCK_F_SEQPACKET; this device does not offer it.
            const SEQPACKET = 1 << 1;
            /// VIRTIO_VSOCK_F_NO_IMPLIED_STREAM
            const NO_IMPLIED_STREAM = 1 << 2;
        }
    }

    // NOTE(review): not referenced in the visible part of this file, hence
    // the `allow`; presumably reserved for event queue support. TODO confirm.
    #[allow(unused)]
    pub const VIRTIO_VSOCK_EVENT_TRANSPORT_RESET: u32 = 0;
}
use bits::*;

impl VsockPacket {
    /// Builds a [`VsockPacket`] from a descriptor chain.
    ///
    /// The header is read first. When it reports a zero-length payload the
    /// packet is returned right away; otherwise exactly `header.len()` bytes
    /// of payload are copied out of the rest of the chain.
    ///
    /// # Errors
    ///
    /// - [`VsockPacketError::ChainHeaderRead`] if the header cannot be read
    ///   from the chain.
    /// - [`VsockPacketError::InvalidPacketLen`] if the header claims more
    ///   payload than the chain has left to read.
    /// - [`VsockPacketError::InvalidSrcCid`] / [`VsockPacketError::InvalidDstCid`]
    ///   if a CID has any of its upper 32 bits set. Note that these checks are
    ///   only reached for packets with a non-empty payload.
    /// - [`VsockPacketError::InsufficientBytes`] if fewer bytes than expected
    ///   were copied from the chain.
    // TODO: We may want to consider operating on `Vec<GuestRegion>` to avoid
    // double copying the packet contents. For now we are reading all of the
    // packet data at once because it's convenient.
    fn parse(
        chain: &mut Chain,
        mem: &MemCtx,
    ) -> Result<Self, VsockPacketError> {
        let mut packet = VsockPacket::default();

        // Attempt to read the vsock packet header from the descriptor chain
        // before we can process the full packet.
        if !chain.read(&mut packet.header, mem) {
            return Err(VsockPacketError::ChainHeaderRead);
        }

        // If the packet header indicates there is no data in this packet, then
        // there's no point in attempting to continue reading from the chain.
        if packet.header.len() == 0 {
            return Ok(packet);
        }

        let hdr_len = usize::try_from(packet.header.len())
            .expect("running on a 64bit platform");
        let chain_len = chain.remain_read_bytes();

        // Ensure that the payload length claimed by the vsock packet header
        // does not exceed what is actually left in the desc chain.
        if hdr_len > chain_len {
            return Err(VsockPacketError::InvalidPacketLen {
                hdr_len,
                chain_len,
            });
        }
        // Payload buffer, sized from the (now bounded) header length.
        let mut data = vec![0; hdr_len];

        // While we are here we should validate that the packet's cid fields
        // do not contain reserved bits (anything above the low 32 bits).
        if packet.header.src_cid() >> 32 != 0 {
            return Err(VsockPacketError::InvalidSrcCid {
                src_cid: packet.header.src_cid(),
            });
        }
        if packet.header.dst_cid() >> 32 != 0 {
            return Err(VsockPacketError::InvalidDstCid {
                dst_cid: packet.header.dst_cid(),
            });
        }

        // `done` tracks how much of `data` has been filled so far; each
        // callback invocation copies into the unfilled tail of the buffer.
        let mut done = 0;
        let copied = chain.for_remaining_type(true, |addr, len| {
            let mut remain = GuestData::from(&mut data[done..]);
            if let Some(copied) = mem.read_into(addr, &mut remain, len) {
                // Keep going while the tail of the buffer is not yet full.
                let need_more = copied != remain.len();
                done += copied;
                (copied, need_more)
            } else {
                // The read failed: report nothing copied and stop.
                (0, false)
            }
        });

        // If we fail to copy the correct amount of bytes from the desc chain
        // something is clearly wrong.
        if copied != hdr_len {
            return Err(VsockPacketError::InsufficientBytes {
                expected: hdr_len,
                remaining: copied,
            });
        }

        packet.data = data.into();

        Ok(packet)
    }
}


================================================
FILE: lib/propolis/src/intr_pins.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![allow(clippy::mutex_atomic)]

use std::sync::{Arc, Mutex, Weak};

use crate::vmm::VmmHdl;

const PIN_COUNT: u8 = 16;

/// Interface shared by all interrupt pin implementations.
pub trait IntrPin: Send + Sync + 'static {
    /// Drive the pin into the asserted state.
    fn assert(&self);
    /// Drive the pin into the deasserted state.
    fn deassert(&self);
    /// Report whether the pin is currently asserted.
    fn is_asserted(&self) -> bool;
    /// Assert and then immediately deassert the pin.
    ///
    /// The default implementation does nothing when the pin is already
    /// asserted.
    fn pulse(&self) {
        if self.is_asserted() {
            return;
        }
        self.assert();
        self.deassert();
    }
    /// Assert (`true`) or deassert (`false`) the pin according to
    /// `is_asserted`.
    fn set_state(&self, is_asserted: bool) {
        match is_asserted {
            true => self.assert(),
            false => self.deassert(),
        }
    }

    /// Set the state of this interrupt pin *without* treating the state change
    /// as an edge in the interrupt line. Used when importing a guest.
    ///
    /// Unlike [`IntrPin::set_state`], this only updates the pin's internal
    /// accounting; consumers of the pin are not notified of a rising/falling
    /// edge. It is called while importing a guest from a different VMM, which
    /// has *already* observed the edge event, so this VMM merely brings its
    /// own bookkeeping in line with the imported state.
    ///
    /// For example, when importing a guest which currently has a [`LegacyPIC`]
    /// pin asserted, `import_state` will not call the `ioctl` that asserts that
    /// pin in the kernel VMM, while [`IntrPin::set_state`] would. This matters
    /// because the kernel-emulated interrupt state is imported separately from
    /// the userspace state, so calling [`IntrPin::set_state`] would leave the
    /// kernel and userspace state inconsistent.
    fn import_state(&self, is_asserted: bool);
}

/// Describes the operation to take with an interrupt pin.
///
/// Consumed by the pin level accounting (`Entry::process_op`) and by
/// `LegacyPIC` when deciding which VMM call to make.
pub enum PinOp {
    /// Asserts the interrupt.
    Assert,
    /// Deasserts the interrupt.
    Deassert,
    /// Asserts and then deasserts the interrupt.
    Pulse,
}

/// Userspace bookkeeping for a set of `PIN_COUNT` legacy ISA interrupt pins,
/// forwarding level changes to the VMM through `hdl`.
pub struct LegacyPIC {
    /// Per-pin level accounting, guarded by a mutex.
    inner: Mutex<Inner>,
    /// Handle used to issue the ISA IRQ assert/deassert/pulse calls.
    hdl: Arc<VmmHdl>,
}

/// Mutex-protected state of a [`LegacyPIC`].
struct Inner {
    /// One accounting entry per IRQ, indexed by IRQ number.
    pins: [Entry; PIN_COUNT as usize],
}

/// Level accounting for a single IRQ line.
#[derive(Default, Copy, Clone)]
struct Entry {
    /// Number of outstanding assertions on this line.
    level: usize,
}
impl Entry {
    /// Applies `op` to the tracked level and reports whether the operation
    /// produced a transition that the interrupt consumer must be told about.
    ///
    /// Panics when asked to deassert a line whose level is already zero.
    fn process_op(&mut self, op: &PinOp) -> bool {
        match op {
            // Only the first assertion (0 -> 1) is a visible edge.
            PinOp::Assert => {
                self.level += 1;
                let rose = self.level == 1;
                rose
            }
            // Only the last release (1 -> 0) is a visible edge.
            PinOp::Deassert => {
                assert!(self.level != 0);
                self.level -= 1;
                let fell = self.level == 0;
                fell
            }
            // A pulse (0 -> 1 -> 0) is visible only on an idle line; the
            // level itself is left untouched.
            PinOp::Pulse => self.level == 0,
        }
    }
}

impl LegacyPIC {
    /// Creates a new virtual PIC.
    ///
    /// All pins start with a level of zero.
    pub fn new(hdl: Arc<VmmHdl>) -> Arc<Self> {
        Arc::new(Self {
            inner: Mutex::new(Inner {
                pins: [Entry::default(); PIN_COUNT as usize],
            }),
            hdl,
        })
    }

    /// Returns a [`LegacyPin`] bound to `irq`, or `None` if no pin may be
    /// handed out for that IRQ.
    ///
    /// A handle is refused when `irq` is out of range (`>= PIN_COUNT`) or
    /// when `irq` is 2.
    // NOTE(review): IRQ 2 is presumably excluded because it is the cascade
    // line between the two legacy PICs -- confirm.
    pub fn pin_handle(self: &Arc<Self>, irq: u8) -> Option<LegacyPin> {
        // Either condition on its own disqualifies the IRQ. An out-of-range
        // pin must never be handed out: `do_irq`/`import_irq` assert that the
        // IRQ is below `PIN_COUNT` and would panic on first use.
        if irq >= PIN_COUNT || irq == 2 {
            return None;
        }
        Some(LegacyPin::new(irq, Arc::downgrade(self)))
    }

    /// Applies `op` to the level accounting of `irq` without notifying the
    /// VMM.
    ///
    /// Panics if `irq` is not below `PIN_COUNT`.
    fn import_irq(&self, op: PinOp, irq: u8) {
        assert!(irq < PIN_COUNT);

        let mut inner = self.inner.lock().unwrap();

        // Update our tracked pin level count, but *don't* actually perform the
        // ioctl to assert the bhyve interrupt, since the kernelspace pin states
        // are imported separately.
        inner.pins[irq as usize].process_op(&op);
    }

    /// Applies `op` to the level accounting of `irq` and, when that results
    /// in a visible transition, issues the matching ISA IRQ call on the VMM
    /// handle.
    ///
    /// Panics if `irq` is not below `PIN_COUNT`, or if the VMM call fails.
    fn do_irq(&self, op: PinOp, irq: u8) {
        assert!(irq < PIN_COUNT);

        // The lock is held across the VMM call so that accounting and
        // notification stay in step.
        let mut inner = self.inner.lock().unwrap();
        if inner.pins[irq as usize].process_op(&op) {
            match op {
                PinOp::Assert => {
                    self.hdl.isa_assert_irq(irq, Some(irq)).unwrap();
                }
                PinOp::Deassert => {
                    self.hdl.isa_deassert_irq(irq, Some(irq)).unwrap();
                }
                PinOp::Pulse => {
                    self.hdl.isa_pulse_irq(irq, Some(irq)).unwrap();
                }
            }
        }
    }
}

/// Handle to a single IRQ line of a [`LegacyPIC`].
pub struct LegacyPin {
    /// IRQ number this pin drives.
    irq: u8,
    /// Whether this particular handle currently asserts the line.
    asserted: Mutex<bool>,
    /// Weak reference to the owning PIC; operations that cannot upgrade it
    /// skip the PIC call.
    pic: Weak<LegacyPIC>,
}
impl LegacyPin {
    /// Creates a deasserted pin for `irq` attached to `pic`.
    fn new(irq: u8, pic: Weak<LegacyPIC>) -> Self {
        Self { irq, asserted: Mutex::new(false), pic }
    }
}
// Every method holds the `asserted` lock for its whole body, so the local
// flag and the call into the PIC cannot interleave with another operation on
// the same pin.
impl IntrPin for LegacyPin {
    /// Asserts the line unless this handle already asserts it.
    fn assert(&self) {
        let mut state = self.asserted.lock().unwrap();
        if *state {
            return;
        }
        *state = true;
        if let Some(pic) = self.pic.upgrade() {
            pic.do_irq(PinOp::Assert, self.irq);
        }
    }
    /// Deasserts the line unless this handle is already deasserted.
    fn deassert(&self) {
        let mut state = self.asserted.lock().unwrap();
        if !*state {
            return;
        }
        *state = false;
        if let Some(pic) = self.pic.upgrade() {
            pic.do_irq(PinOp::Deassert, self.irq);
        }
    }
    /// Pulses the line; a no-op while this handle asserts it.
    fn pulse(&self) {
        let state = self.asserted.lock().unwrap();
        if *state {
            return;
        }
        if let Some(pic) = self.pic.upgrade() {
            pic.do_irq(PinOp::Pulse, self.irq);
        }
    }
    /// Reports whether this handle currently asserts the line.
    fn is_asserted(&self) -> bool {
        *self.asserted.lock().unwrap()
    }
    /// Records an imported state, updating the PIC's level accounting via
    /// `import_irq` rather than `do_irq`.
    fn import_state(&self, is_asserted: bool) {
        let mut state = self.asserted.lock().unwrap();
        if *state == is_asserted {
            return;
        }
        *state = is_asserted;
        if let Some(pic) = self.pic.upgrade() {
            let op = match is_asserted {
                true => PinOp::Assert,
                false => PinOp::Deassert,
            };
            pic.import_irq(op, self.irq);
        }
    }
}

/// Interrupt pin which calls a provided function on rising and falling edges.
///
/// The consumer-provided function is called when the pin undergoes a state
/// transition (`low->high` or `high->low`) with its argument corresponding to the
/// pin level after the transition (true = high).
///
/// That function call is made under the protection of a mutex, excluding all
/// other operations on the pin until it returns.
pub struct FuncPin(Mutex<FPInner>);
impl FuncPin {
    /// Creates a pin that starts out low and invokes `func` on each
    /// subsequent level transition.
    pub fn new(func: Box<dyn Fn(bool) + Send + 'static>) -> Self {
        Self(Mutex::new(FPInner { level: false, func }))
    }
}
impl IntrPin for FuncPin {
    /// Raises the pin and invokes the callback with `true`, unless the pin is
    /// already high.
    fn assert(&self) {
        let mut guard = self.0.lock().unwrap();
        if guard.level {
            return;
        }
        guard.level = true;
        (guard.func)(true);
    }
    /// Lowers the pin and invokes the callback with `false`, unless the pin
    /// is already low.
    fn deassert(&self) {
        let mut guard = self.0.lock().unwrap();
        if !guard.level {
            return;
        }
        guard.level = false;
        (guard.func)(false);
    }
    /// Reports the current pin level.
    fn is_asserted(&self) -> bool {
        self.0.lock().unwrap().level
    }
    /// Overwrites the pin level with the imported one.
    fn import_state(&self, is_asserted: bool) {
        // The callback is intentionally not invoked: whatever it does is
        // presumed to have happened already, before the import.
        self.0.lock().unwrap().level = is_asserted;
    }
}
/// Mutex-protected state of a [`FuncPin`].
struct FPInner {
    /// Current pin level (`true` = asserted).
    level: bool,
    /// Callback invoked with the new level on each transition.
    func: Box<dyn Fn(bool) + Send + 'static>,
}

/// Interrupt pin that ignores every operation and always reads as deasserted.
pub struct NoOpPin {}

impl IntrPin for NoOpPin {
    fn assert(&self) {}
    fn deassert(&self) {}
    fn pulse(&self) {}
    /// Always `false`: this pin never holds any state.
    fn is_asserted(&self) -> bool {
        false
    }
    fn import_state(&self, _: bool) {}
}


================================================
FILE: lib/propolis/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

#![allow(
    clippy::style,

    // Propolis will only ever be built as 64-bit, so wider enums are acceptable
    clippy::enum_clike_unportable_variant
)]

pub extern crate bhyve_api;
pub extern crate usdt;
#[macro_use]
extern crate bitflags;

pub mod accessors;
pub mod api_version;
pub mod attestation;
pub mod block;
pub mod chardev;
pub mod common;
pub mod cpuid;
pub mod enlightenment;
pub mod exits;
pub mod firmware;
pub mod hw;
pub mod intr_pins;
pub mod lifecycle;
pub mod migrate;
pub mod mmio;
pub mod msr;
pub mod pio;
pub mod tasks;
pub mod util;
pub mod vcpu;
pub mod vmm;
pub mod vsock;

pub use exits::{VmEntry, VmExit};
pub use vmm::Machine;


================================================
FILE: lib/propolis/src/lifecycle.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::atomic::{AtomicUsize, Ordering};

use futures::future::{self, BoxFuture};

use crate::migrate::Migrator;

/// General trait for emulated devices in the system.
///
/// As the VM goes through its lifecycle, the emulated devices which it contains
/// are also driven through those phases via various event functions (`start`,
/// `pause`, `resume`, etc).
///
/// NOTE: Calling any of these methods on a [Lifecycle] instance should be done
/// from a context with access to a [Tokio runtime](tokio::runtime::Runtime):
///
/// - [Lifecycle::start]
/// - [Lifecycle::pause]
/// - [Lifecycle::resume]
/// - [Lifecycle::paused]
/// - [Lifecycle::reset]
/// - [Lifecycle::halt]
/// - [Lifecycle::migrate]
pub trait Lifecycle: Send + Sync + 'static {
    /// Unique name for devices for a given type
    ///
    /// Intended to be `const` once stabilized for trait functions.
    fn type_name(&self) -> &'static str;

    /// Return a future indicating when the device has finished pausing.
    ///
    /// The default implementation returns a future that is already complete.
    fn paused(&self) -> BoxFuture<'static, ()> {
        Box::pin(future::ready(()))
    }

    /// Called when an instance is about to begin running, just before any
    /// vCPU threads are started. After this returns, the callee device should
    /// be ready to do work on the guest's behalf.
    ///
    /// Note that this is only called the first time an instance begins running.
    /// If it reboots, the device will observe a paused -> reset -> resumed
    /// transition instead.
    ///
    /// The default implementation does nothing and returns `Ok(())`.
    fn start(&self) -> anyhow::Result<()> {
        Ok(())
    }

    /// Directs this device to pause. A paused device must stop producing work
    /// for other devices (and any associated backend(s)), but must accept (and
    /// hold onto) new work from other devices while in the paused state.
    ///
    /// The device is not required to finish pausing inline. Instead, its
    /// implementation of [`Lifecycle::paused`] should return a future that
    /// completes only when the device is paused.
    ///
    /// WARNING: This function may only be called after pausing all of a VM's
    /// vCPUs. This allows components to mutate state (such as VM memory) in
    /// ways that should otherwise not be visible to a running vCPU.
    fn pause(&self) {}

    /// Directs this device to resume servicing the guest after pausing.
    ///
    /// WARNING: This function must be called before resuming any of a VM's
    /// vCPUs. This allows components to restore any world-state they preserved
    /// during their `pause` callouts before vCPUs get a chance to perceive it.
    ///
    /// NOTE: It is legal to call `reset` between pausing and resuming. If this
    /// occurs, the caller must ensure that all VM devices and CPUs will be
    /// reset and reinitialized before resuming any devices.
    fn resume(&self) {}

    /// Directs this device to reset itself to the state it would have on a cold
    /// start.
    ///
    /// N.B. The state driver ensures this is called only on paused devices.
    ///      It also ensures that the entire VM will be reset and reinitialized
    ///      before resuming any devices.
    fn reset(&self) {}

    /// Indicates that the device's instance is stopping and will soon be
    /// discarded.
    ///
    /// N.B. The state driver ensures this is called only on paused devices.
    fn halt(&self) {}

    /// Return the Migrator object that will be used to export/import
    /// this device's state.
    ///
    /// By default, we return a simple impl that assumes the device
    /// has no state that needs to be exported/imported but still wants
    /// to opt into being migratable. For more complex cases, a device
    /// may implement the `Migrate` trait along with its export/import
    /// methods. A device which shouldn't be migrated should instead
    /// override this method and explicitly return [`Migrator::NonMigratable`].
    fn migrate(&'_ self) -> Migrator<'_> {
        Migrator::Empty
    }
}

/// Tracker for the [Lifecycle] state of its holder.
///
/// Implementors of the [Lifecycle] trait are driven through a series of state
/// changes by calls to its functions, and the underlying resource may want to
/// remember which state it is currently in.  Device emulation may, for
/// example, wish to postpone certain setup tasks while state is being imported
/// during a migration.
///
/// The [Indicator] is a convenience for such implementors: they record the
/// current state by calling the method here which corresponds to the
/// [Lifecycle] function being handled.
#[derive(Default)]
pub struct Indicator(AtomicUsize);
impl Indicator {
    /// Create an indicator in the [`IndicatedState::Init`] state.
    pub const fn new() -> Self {
        let init = IndicatedState::Init as usize;
        Self(AtomicUsize::new(init))
    }
    /// Indicate that the holder has started
    ///
    /// To be called as part of the implementor's [Lifecycle::start()] method
    pub fn start(&self) {
        self.state_change(IndicatedState::Run);
    }
    /// Indicate that the holder has paused
    ///
    /// To be called once [Lifecycle::pause()] has been called on the
    /// implementor _and_ that pause has completed (if there is async work
    /// required for such a state).
    pub fn pause(&self) {
        self.state_change(IndicatedState::Pause);
    }
    /// Indicate that the holder has resumed
    ///
    /// To be called as part of the implementor's [Lifecycle::resume()] method
    pub fn resume(&self) {
        self.state_change(IndicatedState::Run);
    }
    /// Indicate that the holder has halted
    ///
    /// To be called as part of the implementor's [Lifecycle::halt()] method
    pub fn halt(&self) {
        self.state_change(IndicatedState::Halt);
    }
    /// Get the currently indicated state for the holder
    pub fn state(&self) -> IndicatedState {
        let raw = self.0.load(Ordering::SeqCst);
        IndicatedState::from_usize(raw)
    }

    /// Store `new` as the current state, panicking if the move from the
    /// previously stored state is not a valid transition.
    fn state_change(&self, new: IndicatedState) {
        let prev_raw = self.0.swap(new as usize, Ordering::SeqCst);
        let prev = IndicatedState::from_usize(prev_raw);
        IndicatedState::assert_valid_transition(prev, new);
    }
}

/// Represents the current state held in [Indicator]
///
/// The enum is `repr(usize)` so that it can be stored in the indicator's
/// atomic and recovered with `from_repr`.
#[derive(Copy, Clone, Eq, PartialEq, strum::FromRepr, Debug)]
#[repr(usize)]
pub enum IndicatedState {
    /// Lifecycle entity has been created, but has not been
    /// [started](Lifecycle::start()) yet.
    Init,
    /// Lifecycle entity is running
    Run,
    /// Lifecycle entity is paused
    Pause,
    /// Lifecycle entity has halted
    Halt,
}
impl IndicatedState {
    /// Returns `true` if moving from `old` to `new` is an allowed lifecycle
    /// transition.
    const fn valid_transition(old: Self, new: Self) -> bool {
        use IndicatedState::*;
        match (old, new) {
            (Init, Run) | (Init, Pause) => true,
            (Run, Pause) => true,
            (Pause, Run) | (Pause, Halt) => true,
            _ => false,
        }
    }
    /// Converts a raw discriminant back into an [`IndicatedState`].
    ///
    /// Panics if `raw` does not correspond to any variant.
    fn from_usize(raw: usize) -> Self {
        // `Option::expect` takes its message verbatim, so a `{raw}`
        // placeholder would never be filled in. Go through `panic!` to get
        // the offending value into the message.
        IndicatedState::from_repr(raw).unwrap_or_else(|| {
            panic!("raw value {raw} should be valid IndicatedState")
        })
    }
    /// Panics unless moving from `old` to `new` is a valid transition.
    fn assert_valid_transition(old: Self, new: Self) {
        assert!(
            Self::valid_transition(old, new),
            "transition from {old:?} to {new:?} is not valid"
        );
    }
}


================================================
FILE: lib/propolis/src/migrate.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use crate::vmm::MemCtx;

use serde::{Deserialize, Serialize};
use thiserror::Error;

/// Errors encountered while trying to export/import device state.
#[derive(Debug, Error)]
pub enum MigrateStateError {
    /// The device doesn't support live migration.
    #[error("device not migratable")]
    NonMigratable,

    /// The device isn't in a state where its state can be exported. Because
    /// fully-initialized devices should be able to pause and export their state
    /// at any time, this generally means that the device was asked to export
    /// its state before it was fully initialized.
    #[error("device's state is not ready to be exported")]
    NotReadyForExport,

    /// I/O error encountered while performing import/export
    #[error("IO Error")]
    Io(#[from] std::io::Error),

    /// Encountered an error trying to deserialize the device state during import.
    #[error("could not deserialize device state: {0}")]
    DeserializationFailed(String),

    /// The device failed to import the deserialized device state.
    #[error("failed to apply deserialized device state: {0}")]
    ImportFailed(String),

    /// State of kind/version suitable for import not found
    #[error("failed to find suitable import payload")]
    DataMissing,

    /// The kind and/or version of payload was not expected. Carries the
    /// offending kind and version.
    #[error("kind/version of payload not expected: {0} v{1}")]
    UnexpectedPayload(String, u32),
}

// Lets `?` turn an `erased_serde` error into a `MigrateStateError`.
impl From<erased_serde::Error> for MigrateStateError {
    /// Wraps the error's string form in
    /// [`MigrateStateError::DeserializationFailed`].
    fn from(err: erased_serde::Error) -> Self {
        MigrateStateError::DeserializationFailed(err.to_string())
    }
}

/// Type representing the migration support (if any) for a given device.
pub enum Migrator<'a> {
    /// The device is not migratable
    NonMigratable,

    /// No device specific logic is needed
    Empty,

    /// Migration state for the device consists of a single payload.
    ///
    /// The device may be capable of handling differing formats and/or versions
    /// of said payload, but only one at a time is expected for a given device
    /// during migration.
    Single(&'a dyn MigrateSingle),

    /// Migration state for the device consists of multiple payloads.  This is
    /// the case for more complex devices where emulation is composed from
    /// several abstractions.
    ///
    /// One example would be pci-virtio-block, where one payload contains the
    /// emulated PCI state, while another contains the virtio state (virtqueues,
    /// etc).
    Multi(&'a dyn MigrateMulti),
}

/// A device which can be migrated using a single typed payload to represent its
/// internal state.
pub trait MigrateSingle: Send + Sync + 'static {
    /// Produce the payload describing this device's state.
    fn export(
        &self,
        ctx: &MigrateCtx,
    ) -> Result<PayloadOutput, MigrateStateError>;
    /// Apply the state carried by `offer` to this device.
    fn import(
        &self,
        offer: PayloadOffer,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError>;
}

/// A device which can be migrated using multiple differently-typed payloads to
/// represent its internal state.
pub trait MigrateMulti: Send + Sync {
    /// Add the payloads describing this device's state to `output`.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError>;
    /// Apply state to this device from the payloads available in `offer`.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        ctx: &MigrateCtx,
    ) -> Result<(), MigrateStateError>;
}

/// Additional (borrowed) context data used during device import and export.
pub struct MigrateCtx<'a> {
    /// Borrowed memory context made available to the import/export logic.
    pub mem: &'a MemCtx,
}

/// A single device state payload, tagged with its kind and version, presented
/// to a device's import logic during a migration.
pub struct PayloadOffer<'a> {
    pub kind: &'a str,
    pub version: u32,
    pub payload: Box<dyn erased_serde::Deserializer<'a> + 'a>,
}
impl<'a> PayloadOffer<'a> {
    /// Deserialize the payload as `T`, provided that the kind/version of this
    /// offer are the ones defined by `T`'s Schema.
    ///
    /// A mismatch yields [`MigrateStateError::UnexpectedPayload`] carrying the
    /// kind and version of this offer.
    pub fn parse<T: Schema<'a>>(&mut self) -> Result<T, MigrateStateError> {
        if self.matches::<T>() {
            Ok(erased_serde::deserialize(&mut self.payload)?)
        } else {
            Err(MigrateStateError::UnexpectedPayload(
                self.kind.into(),
                self.version,
            ))
        }
    }

    /// Whether the `kind` and `version` of this `PayloadOffer` equal those
    /// defined for the provided Schema.
    fn matches<'x, T: Schema<'x>>(&self) -> bool {
        let (kind, version) = T::id();
        kind == self.kind && version == self.version
    }
}

/// The set of [`PayloadOffer`] instances handed to a device's state import
/// logic during a migration.
pub struct PayloadOffers<'a>(Vec<PayloadOffer<'a>>);
impl<'a> PayloadOffers<'a> {
    /// Build a [`PayloadOffers`] out of any iterator of [`PayloadOffer`]
    /// instances.
    pub fn new(items: impl IntoIterator<Item = PayloadOffer<'a>>) -> Self {
        Self(items.into_iter().collect())
    }

    /// Remove the offer matching the specified [`Schema`](trait@Schema) from
    /// the collection and parse it.
    ///
    /// Fails with [`MigrateStateError::DataMissing`] unless exactly one of the
    /// contained offers matches.
    pub fn take<T: Schema<'a>>(&mut self) -> Result<T, MigrateStateError> {
        let mut offer = self
            .take_schema(T::id())
            .ok_or(MigrateStateError::DataMissing)?;
        offer.parse()
    }

    /// Whether every payload offer has been consumed via [`Self::take()`].
    pub fn is_consumed(&self) -> bool {
        self.0.is_empty()
    }

    /// Hand back the PayloadOffer instances which were never consumed.
    ///
    /// Intended for use by the logic driving the migration itself to determine
    /// if a given device importation failed to consume all of its payload(s).
    pub fn remaining(self) -> Remaining<'a> {
        Remaining(self.0.into_iter())
    }

    /// Remove and return the offer identified by `id`, but only if it is the
    /// sole offer with that kind and version.
    fn take_schema(&mut self, id: SchemaId) -> Option<PayloadOffer<'a>> {
        let (kind, version) = id;
        let mut hits = self
            .0
            .iter()
            .enumerate()
            .filter(|(_, offer)| offer.kind == kind && offer.version == version)
            .map(|(idx, _)| idx);

        // Zero matches or more than one match both count as "not found".
        match (hits.next(), hits.next()) {
            (Some(idx), None) => Some(self.0.remove(idx)),
            _ => None,
        }
    }
}

/// Iterator over the [`PayloadOffer`] instances left unconsumed in a
/// [`PayloadOffers`], as returned by [`PayloadOffers::remaining`].
pub struct Remaining<'a>(std::vec::IntoIter<PayloadOffer<'a>>);
impl<'a> Iterator for Remaining<'a> {
    type Item = PayloadOffer<'a>;

    /// Yields the next unconsumed offer, if any.
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }
}

/// Device state payload (of a given kind/version) output as part of the export
/// logic for that device during a migration.
///
/// Easiest to safely instantiate via the [`From`] trait implemented for
/// [`Schema`] to ensure that the kind/version matches the serialized payload.
pub struct PayloadOutput {
    /// Kind portion of the payload's schema id.
    pub kind: &'static str,
    /// Version portion of the payload's schema id.
    pub version: u32,
    /// The (type-erased) serializable state itself.
    pub payload: Box<dyn erased_serde::Serialize>,
}

/// Collection of [`PayloadOutput`] instances produced while exporting a
/// device's state.
pub struct PayloadOutputs(Vec<PayloadOutput>);
impl PayloadOutputs {
    /// Create an empty collection.
    pub fn new() -> Self {
        Self(Vec::new())
    }

    /// Add a provided [`PayloadOutput`] to be included in the payload(s) for a
    /// given exported device
    ///
    /// The current implementation always returns `Ok(())`.
    pub fn push(
        &mut self,
        output: PayloadOutput,
    ) -> Result<(), MigrateStateError> {
        self.0.push(output);
        Ok(())
    }
}
// Consuming iteration over the collected outputs, in the order they were
// pushed.
impl IntoIterator for PayloadOutputs {
    type Item = PayloadOutput;

    type IntoIter = OutputIter;

    /// Converts the collection into an [`OutputIter`].
    fn into_iter(self) -> Self::IntoIter {
        OutputIter(self.0.into_iter())
    }
}
// Hide the internal implementation details of PayloadOutputs by providing a
// wrapper type for its output iterator.
/// Owning iterator over the contents of a [`PayloadOutputs`].
pub struct OutputIter(std::vec::IntoIter<PayloadOutput>);
impl Iterator for OutputIter {
    type Item = PayloadOutput;

    /// Yields the next output, if any.
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }
}

/// Combination of kind (`&str`) and version (`u32`) which identifies a specific
/// data schema for device migration state.
///
/// Element `.0` is the kind and `.1` is the version.
pub type SchemaId = (&'static str, u32);

/// Define the type (kind) and version for a migration payload data structure.
///
/// Implementors must be serializable and deserializable, since payloads are
/// both exported from and imported into devices.
pub trait Schema<'de>: Serialize + Deserialize<'de> + Sized + 'static {
    /// The [`SchemaId`] associated with a given device state data type.
    ///
    /// This would be `const` if such functions were allowed in traits without
    /// an unstable rust feature.
    fn id() -> SchemaId;
}

// Any `Schema` value can be turned into a `PayloadOutput`; the kind and
// version are taken from the type itself, so they cannot disagree with the
// payload.
impl<'a, T: Schema<'a>> From<T> for PayloadOutput {
    /// Boxes `value` as the payload and tags it with `T::id()`.
    fn from(value: T) -> Self {
        let id = T::id();
        PayloadOutput { kind: id.0, version: id.1, payload: Box::new(value) }
    }
}


================================================
FILE: lib/propolis/src/mmio.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::{Arc, Mutex};

use crate::common::*;
use crate::util::aspace::ASpace;
pub use crate::util::aspace::{Error, Result};

use byteorder::{ByteOrder, LE};

// USDT probes fired by `MmioBus::handle_read` and `MmioBus::handle_write`.
// `was_handled` is 1 when the access was dispatched successfully, else 0.
#[usdt::provider(provider = "propolis")]
mod probes {
    fn mmio_read(addr: u64, bytes: u8, value: u64, was_handled: u8) {}
    fn mmio_write(addr: u64, bytes: u8, value: u64, was_handled: u8) {}
}

/// Handler for accesses to a registered MMIO region.
///
/// `MmioBus` invokes it with the start address of the region that was hit and
/// the read or write operation to service.
pub type MmioFn = dyn Fn(usize, RWOp) + Send + Sync + 'static;

/// Dispatches MMIO reads and writes to the handlers registered for address
/// regions.
pub struct MmioBus {
    /// Registered regions and their handlers, guarded by a mutex.
    map: Mutex<ASpace<Arc<MmioFn>>>,
}
impl MmioBus {
    /// Creates a bus whose address space is built as `ASpace::new(0, max)`.
    ///
    /// Panics if `max` is zero.
    pub fn new(max: usize) -> Self {
        assert!(max != 0);
        let space = ASpace::new(0, max);
        Self { map: Mutex::new(space) }
    }

    /// Registers `func` as the handler for `len` bytes starting at `start`.
    pub fn register(
        &self,
        start: usize,
        len: usize,
        func: Arc<MmioFn>,
    ) -> Result<()> {
        let mut space = self.map.lock().unwrap();
        space.register(start, len, func)
    }
    /// Removes the registration at `addr`, discarding whatever was stored
    /// for it.
    pub fn unregister(&self, addr: usize) -> Result<()> {
        let mut space = self.map.lock().unwrap();
        space.unregister(addr).map(|_| ())
    }

    /// Dispatches a write of the low `bytes` bytes of `val` (little-endian)
    /// to the handler covering `addr`.
    ///
    /// Panics if `bytes` is not 1, 2, 4 or 8.
    pub fn handle_write(&self, addr: usize, bytes: u8, val: u64) -> Result<()> {
        let raw = val.to_le_bytes();
        let payload = match bytes {
            1 => &raw[..1],
            2 => &raw[..2],
            4 => &raw[..4],
            8 => &raw[..],
            _ => panic!(),
        };
        let outcome = self.do_mmio(addr, |base, offset, handler| {
            let mut op = WriteOp::from_buf(offset, payload);
            handler(base, RWOp::Write(&mut op))
        });

        probes::mmio_write!(|| (
            addr as u64,
            bytes,
            val,
            u8::from(outcome.is_ok())
        ));
        outcome
    }
    /// Dispatches a read of `bytes` bytes to the handler covering `addr` and
    /// returns the result as a little-endian `u64`.
    ///
    /// The scratch buffer starts out as all `0xff`, so any byte the handler
    /// does not fill reads back as `0xff`.
    ///
    /// Panics if `bytes` is not 1, 2, 4 or 8.
    pub fn handle_read(&self, addr: usize, bytes: u8) -> Result<u64> {
        let mut raw = [0xffu8; 8];
        let mut window = match bytes {
            1 => &mut raw[..1],
            2 => &mut raw[..2],
            4 => &mut raw[..4],
            8 => &mut raw[..],
            _ => panic!(),
        };
        let outcome = self.do_mmio(addr, |base, offset, handler| {
            let mut op = ReadOp::from_buf(offset, &mut window);
            handler(base, RWOp::Read(&mut op))
        });

        let val = LE::read_u64(&raw);
        probes::mmio_read!(|| (
            addr as u64,
            bytes,
            val,
            u8::from(outcome.is_ok())
        ));
        outcome.map(|_| val)
    }

    /// Looks up the region containing `addr` and calls `f` with the region's
    /// start, the offset of `addr` within it, and the region's handler.
    fn do_mmio<F>(&self, addr: usize, f: F) -> Result<()>
    where
        F: FnOnce(usize, usize, &Arc<MmioFn>),
    {
        // Clone the handler out while the map is locked; the lock is
        // released at the end of this block, before the handler runs.
        let (start, func) = {
            let space = self.map.lock().unwrap();
            let (start, _len, func) = space.region_at(addr)?;
            (start, Arc::clone(func))
        };
        f(start, addr - start, &func);
        Ok(())
    }

    /// Drops every registration from the bus.
    pub(crate) fn clear(&self) {
        self.map.lock().unwrap().clear();
    }
}


================================================
FILE: lib/propolis/src/msr.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Types used to assist with handling reads and writes to model-specific
//! registers (MSRs).

/// Identifies a model-specific register (MSR) by its number.
#[derive(Debug, Clone, Copy)]
pub struct MsrId(pub u32);

/// The result of a request to emulate the RDMSR instruction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RdmsrOutcome {
    /// Nothing handled this RDMSR; what happens next is up to the caller.
    NotHandled,

    /// The read was handled. The contained value is what the guest should
    /// receive.
    Handled(u64),

    /// The read is illegal; the caller should inject #GP into the CPU that
    /// attempted it.
    GpException,
}

/// An outcome resulting from a request to emulate the WRMSR instruction.
///
/// Mirrors [`RdmsrOutcome`], except that a handled write carries no value.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum WrmsrOutcome {
    /// This WRMSR was not handled. The caller must decide how to dispose of it.
    NotHandled,

    /// This write was handled and no further action is needed from the caller.
    Handled,

    /// This write is illegal. The caller should inject #GP into the CPU that
    /// attempted it.
    GpException,
}


================================================
FILE: lib/propolis/src/pio.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::{Arc, Mutex};

use crate::common::*;
use crate::util::aspace::ASpace;
pub use crate::util::aspace::{Error, Result};

use byteorder::{ByteOrder, LE};

// USDT probes describing port IO accesses dispatched through `PioBus`.
//
// For both probes, `bytes` is the access width, `value` is the data written
// (or the data read back), and `was_handled` is 1 when a handler was found
// for the port and 0 otherwise.
#[usdt::provider(provider = "propolis")]
mod probes {
    fn pio_in(port: u16, bytes: u8, value: u32, was_handled: u8) {}
    fn pio_out(port: u16, bytes: u8, value: u32, was_handled: u8) {}
}

/// Handler for accesses to a region of IO ports registered with a [`PioBus`].
///
/// It is called with the first port of the registered region and the
/// read/write operation, which is constructed with the offset of the accessed
/// port within that region.
pub type PioFn = dyn Fn(u16, RWOp<'_, '_>) + Send + Sync + 'static;

/// Port IO bus.
///
/// Maps ranges of IO ports to the handler functions which emulate them.
pub struct PioBus {
    /// Registered port regions and their handlers, guarded by a mutex which
    /// is released before any handler is called.
    map: Mutex<ASpace<Arc<PioFn>>>,
}

impl PioBus {
    /// Create an empty bus covering ports 0 through `u16::MAX`.
    pub fn new() -> Self {
        let ports = ASpace::new(0, usize::from(u16::MAX));
        Self { map: Mutex::new(ports) }
    }

    /// Attach `func` as the handler for the `len` ports beginning at `start`.
    ///
    /// Fails with whatever error the underlying address space reports for the
    /// registration.
    pub fn register(
        &self,
        start: u16,
        len: u16,
        func: Arc<PioFn>,
    ) -> Result<()> {
        let mut ports = self.map.lock().unwrap();
        ports.register(usize::from(start), usize::from(len), func)
    }

    /// Detach the handler of the region which begins at port `start`.
    pub fn unregister(&self, start: u16) -> Result<()> {
        let mut ports = self.map.lock().unwrap();
        ports.unregister(usize::from(start))?;
        Ok(())
    }

    /// Dispatch a write of the low `bytes` bytes of `val` to `port`.
    ///
    /// Returns an error if no handler covers `port`.
    ///
    /// # Panics
    ///
    /// Panics if `bytes` is not 1, 2, or 4.
    pub fn handle_out(&self, port: u16, bytes: u8, val: u32) -> Result<()> {
        let raw = val.to_le_bytes();
        let payload = match bytes {
            1 | 2 | 4 => &raw[..usize::from(bytes)],
            _ => panic!(),
        };
        let outcome = self.do_pio(port, |base, offset, handler| {
            let mut op = WriteOp::from_buf(usize::from(offset), payload);
            handler(base, RWOp::Write(&mut op))
        });
        probes::pio_out!(|| (port, bytes, val, u8::from(outcome.is_ok())));
        outcome
    }

    /// Dispatch a read of `bytes` bytes from `port`.
    ///
    /// The result buffer starts out as all-ones, so any bytes which the
    /// handler does not fill in read back as `0xff`.  Returns an error if no
    /// handler covers `port`.
    ///
    /// # Panics
    ///
    /// Panics if `bytes` is not 1, 2, or 4.
    pub fn handle_in(&self, port: u16, bytes: u8) -> Result<u32> {
        let mut raw = [0xffu8; 4];
        let mut window = match bytes {
            1 | 2 | 4 => &mut raw[..usize::from(bytes)],
            _ => panic!(),
        };
        let outcome = self.do_pio(port, |base, offset, handler| {
            let mut op = ReadOp::from_buf(usize::from(offset), &mut window);
            handler(base, RWOp::Read(&mut op))
        });

        let val = LE::read_u32(&raw);
        probes::pio_in!(|| (port, bytes, val, u8::from(outcome.is_ok())));
        outcome.map(|_| val)
    }

    /// Find the handler covering `port` and run `f` with the region's first
    /// port, the offset of `port` within the region, and the handler.
    fn do_pio<F>(&self, port: u16, f: F) -> Result<()>
    where
        F: FnOnce(u16, u16, &Arc<PioFn>),
    {
        // The map is locked only for the lookup; the handler runs unlocked.
        let (base, handler) = {
            let ports = self.map.lock().unwrap();
            let (start, _len, func) = ports.region_at(usize::from(port))?;
            (start as u16, Arc::clone(func))
        };
        f(base, port - base, &handler);
        Ok(())
    }

    /// Remove every registered region from the bus.
    pub(crate) fn clear(&self) {
        self.map.lock().unwrap().clear();
    }
}


================================================
FILE: lib/propolis/src/tasks.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::any::Any;
use std::future::Future;
use std::pin::Pin;
use std::sync::{Arc, Condvar, Mutex, MutexGuard, Weak};
use std::task::{Context, Poll, Waker};
use std::thread;

use futures::stream::{FuturesUnordered, StreamExt};
use thiserror::Error;
use tokio::task;

/// Callback used to prod a task when a hold or exit is requested of it.
///
/// It is called without the task's control mutex held, and only when the task
/// is not already in the `held` state.
pub type NotifyFn = dyn Fn() + Send + Sync + 'static;

/// State shared between a task and its controller, protected by the mutex in
/// `Inner`.
#[derive(Default)]
struct Control {
    /// Waker for a future polling on events via [TaskHdl]
    waker_task: Option<Waker>,
    /// Waker for a future polling on events via [TaskCtrl]
    waker_ctrl: Option<Waker>,
    /// An exit has been requested of the task
    should_exit: bool,
    /// A hold has been requested of the task
    should_hold: bool,
    /// The task is currently in the `held` state
    is_held: bool,
}
impl Control {
    /// Wake the waker stored in `slot`, if there is one, leaving it in place.
    fn wake_slot(slot: &Option<Waker>) {
        match slot {
            Some(waker) => waker.wake_by_ref(),
            None => {}
        }
    }
    /// Wake the task-side future, if one is waiting.
    fn wake_task(&self) {
        Self::wake_slot(&self.waker_task);
    }
    /// Wake the controller-side future, if one is waiting.
    fn wake_ctrl(&self) {
        Self::wake_slot(&self.waker_ctrl);
    }
}
/// State shared (via `Arc`/`Weak`) between a [TaskHdl] and its [TaskCtrl].
struct Inner {
    /// The `Control` of a task can be manipulated from both sync and async
    /// contexts.  Care should be taken not to otherwise block while holding the
    /// mutex guard, as it could cause an undue stall for any async pollers.
    ctrl: Mutex<Control>,
    /// Signalled on every state change, for the benefit of blocking waiters
    cv: Condvar,
    /// Optional callback used to prod the task about a new request
    notify_fn: Option<Box<NotifyFn>>,
}
impl Inner {
    /// Record a hold request, optionally registering the controller's waker,
    /// and notify the task of it.
    ///
    /// The guard may be released and re-acquired while notifying the task, so
    /// the (possibly new) guard is handed back to the caller.
    fn request_hold<'a>(
        &'a self,
        mut guard: MutexGuard<'a, Control>,
        ctrl_waker: Option<Waker>,
    ) -> MutexGuard<'a, Control> {
        guard.should_hold = true;
        // A missing waker must not displace one which is already registered
        if ctrl_waker.is_some() {
            guard.waker_ctrl = ctrl_waker;
        }
        guard.wake_task();
        self.cv.notify_all();
        self.notify_task(guard)
    }

    /// Withdraw any hold request and forget the controller's waker.
    fn unrequest_hold(&self, guard: &mut MutexGuard<Control>) {
        guard.should_hold = false;
        guard.waker_ctrl = None;
        // Withdrawing the request is only an event of interest to the task if
        // it is sitting in the held state or has an exit pending.  Otherwise
        // it simply returns to having nothing pending.
        if guard.is_held || guard.should_exit {
            guard.wake_task();
            self.cv.notify_all();
        }
    }

    /// Record an exit request and notify the task of it.
    fn request_exit(&self, mut guard: MutexGuard<Control>) {
        guard.should_exit = true;
        guard.wake_task();
        self.cv.notify_all();
        drop(self.notify_task(guard));
    }

    /// Call notifier function (if one exists) for task.
    ///
    /// This notification may synchronously access resources which would
    /// otherwise be exclusive of the `MutexGuard` held on the [`Control`], so
    /// the said guard _must_ be dropped while calling the notifier.
    fn notify_task<'a>(
        &'a self,
        guard: MutexGuard<'a, Control>,
    ) -> MutexGuard<'a, Control> {
        let notify = match self.notify_fn.as_ref() {
            // Task notification only required if it is not already held
            Some(notify) if !guard.is_held => notify,
            _ => return guard,
        };
        drop(guard);
        notify();
        self.ctrl.lock().unwrap()
    }

    /// Mark the task as held, optionally registering the task's waker, and
    /// tell the controller about it.
    fn set_held(
        &self,
        guard: &mut MutexGuard<Control>,
        task_waker: Option<Waker>,
    ) {
        guard.is_held = true;
        // A missing waker must not displace one which is already registered
        if task_waker.is_some() {
            guard.waker_task = task_waker;
        }
        // Notify ctrl of changed state
        guard.wake_ctrl();
        self.cv.notify_all();
    }

    /// Mark the task as no longer held, forget the task's waker, and tell the
    /// controller about it.
    fn set_unheld(&self, guard: &mut MutexGuard<Control>) {
        guard.waker_task = None;
        guard.is_held = false;
        // Notify ctrl of changed state
        guard.wake_ctrl();
        self.cv.notify_all();
    }
}

/// Control request pending against a task, as reported through its [TaskHdl].
///
/// When both are pending, an exit request is reported in preference to a hold.
#[derive(Copy, Clone, Debug)]
pub enum Event {
    /// The task has been asked to enter the `held` state.
    Hold,
    /// The task has been asked to exit.
    Exit,
}

/// Handle held by a task, used to heed requests (via its corresponding
/// [TaskCtrl]) to hold or exit execution.
pub struct TaskHdl(Arc<Inner>);
impl TaskHdl {
    pub fn new(notify_fn: Option<Box<NotifyFn>>) -> (Self, TaskCtrl) {
        Self::new_inner(false, notify_fn)
    }
    pub fn new_held(notify_fn: Option<Box<NotifyFn>>) -> (Self, TaskCtrl) {
        Self::new_inner(true, notify_fn)
    }
    fn new_inner(
        initial_hold: bool,
        notify_fn: Option<Box<NotifyFn>>,
    ) -> (Self, TaskCtrl) {
        let inner = Arc::new(Inner {
            ctrl: Mutex::new(Control {
                should_hold: initial_hold,
                ..Default::default()
            }),
            cv: Condvar::new(),
            notify_fn,
        });
        let ctrl = TaskCtrl(Arc::downgrade(&inner));
        (Self(inner), ctrl)
    }

    /// Check for a pending control event for this task.
    pub fn pending_event(&self) -> Option<Event> {
        let ctrl = self.0.ctrl.lock().unwrap();
        if ctrl.should_exit {
            Some(Event::Exit)
        } else if ctrl.should_hold {
            Some(Event::Hold)
        } else {
            None
        }
    }

    /// Enter this task into a `held` state.  It will return when the task has
    /// been requested to exit or when no request for a hold remains.
    pub fn hold(&self) {
        let mut guard = self.0.ctrl.lock().unwrap();
        if guard.should_exit || !guard.should_hold {
            return;
        }
        assert!(!guard.is_held, "task already held");
        guard.is_held = true;
        let cv = &self.0.cv;
        cv.notify_all();
        let mut guard =
            cv.wait_while(guard, |g| g.should_hold && !g.should_exit).unwrap();
        guard.is_held = false;
    }

    /// Immediately force the task into a `held` state, as if the [TaskCtrl] had
    /// requested such a hold.  Like [TaskHdl::hold], it will return when an
    /// exit is requested, or the requested hold is cleared.
    pub fn force_hold(&self) {
        let mut guard = self.0.ctrl.lock().unwrap();
        if guard.should_exit {
            return;
        }
        // induce ourself into the held state
        guard.should_hold = true;
        assert!(!guard.is_held, "task already held");
        guard.is_held = true;
        let cv = &self.0.cv;
        cv.notify_all();
        let mut guard =
            cv.wait_while(guard, |g| g.should_hold && !g.should_exit).unwrap();
        guard.is_held = false;
    }

    /// Async interface equivalent to [TaskHdl::hold]
    ///
    /// Places the task in the `held` state while the future is pending.
    ///
    /// The future will become [Ready](Poll::Ready) when the task is requested
    /// to exit, or the hold request is cleared.
    pub async fn wait_held(&mut self) {
        Held::new(self).await
    }

    /// Async interface equivalent to [TaskHdl::pending_event]
    ///
    /// Emits a result when a `hold` or `exit` request is made on this task.
    pub async fn get_event(&mut self) -> Event {
        GetEvent::new(self).await
    }
}

/// Future backing [TaskHdl::wait_held].
struct Held<'a> {
    hdl: &'a mut TaskHdl,
    /// Whether this future has marked the task as held (and not yet undone it)
    held: bool,
}
impl<'a> Held<'a> {
    fn new(hdl: &'a mut TaskHdl) -> Self {
        Self { hdl, held: false }
    }
}
impl Future for Held<'_> {
    type Output = ();

    /// Keeps the task marked as held for as long as a hold is requested and
    /// no exit is pending, resolving once that is no longer the case.
    fn poll(
        self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Self::Output> {
        let this = self.get_mut();
        let inner = &this.hdl.0;
        let mut guard = inner.ctrl.lock().unwrap();

        let released = guard.should_exit || !guard.should_hold;
        if !released {
            // (Re)register our waker and announce the held state
            inner.set_held(&mut guard, Some(cx.waker().clone()));
            drop(guard);
            this.held = true;
            return Poll::Pending;
        }

        if this.held {
            inner.set_unheld(&mut guard);
            drop(guard);
            this.held = false;
        }
        Poll::Ready(())
    }
}
impl Drop for Held<'_> {
    /// Undo the held state if the future is dropped while still holding.
    fn drop(&mut self) {
        if !self.held {
            return;
        }
        let inner = &self.hdl.0;
        let mut guard = inner.ctrl.lock().unwrap();
        inner.set_unheld(&mut guard);
    }
}

/// Future backing [TaskHdl::get_event].
struct GetEvent<'a> {
    hdl: &'a mut TaskHdl,
    /// Whether this future has registered its waker with the task state
    waiting: bool,
}
impl<'a> GetEvent<'a> {
    fn new(hdl: &'a mut TaskHdl) -> Self {
        Self { hdl, waiting: false }
    }
}
impl Future for GetEvent<'_> {
    type Output = Event;

    /// Resolves with the pending event (exit taking precedence over hold), or
    /// registers the waker and stays pending if there is none.
    fn poll(
        self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Self::Output> {
        let this = self.get_mut();
        let mut guard = this.hdl.0.ctrl.lock().unwrap();

        let pending = if guard.should_exit {
            Some(Event::Exit)
        } else if guard.should_hold {
            Some(Event::Hold)
        } else {
            None
        };
        match pending {
            Some(event) => Poll::Ready(event),
            None => {
                guard.waker_task = Some(cx.waker().clone());
                drop(guard);
                this.waiting = true;
                Poll::Pending
            }
        }
    }
}
impl Drop for GetEvent<'_> {
    /// Remove the registered waker, if this future ever registered one.
    fn drop(&mut self) {
        if !self.waiting {
            return;
        }
        self.hdl.0.ctrl.lock().unwrap().waker_task = None;
    }
}
/// Errors reported by [TaskCtrl] operations.
#[derive(Debug, Eq, PartialEq, Error)]
pub enum Error {
    /// The task's shared state no longer exists (the [TaskHdl] was dropped).
    #[error("task already exited")]
    Exited,
    /// An exit has been requested of the task.
    #[error("task is marked to exit")]
    ExitInProgress,
    /// The hold request was cleared while waiting for the task to become held.
    #[error("task marked to run while waiting for hold")]
    HoldInterrupted,
}

/// Exercise control of a task via requests through its [TaskHdl]
pub struct TaskCtrl(Weak<Inner>);
impl TaskCtrl {
    /// Clear any requested hold on the task.  It will fail if the task has been
    /// requested to exit (via [TaskCtrl::exit]) or has exited on its own.
    pub fn run(&mut self) -> Result<(), Error> {
        let inner = self.0.upgrade().ok_or_else(|| Error::Exited)?;
        let mut guard = inner.ctrl.lock().unwrap();
        if guard.should_exit {
            Err(Error::ExitInProgress)
        } else {
            inner.unrequest_hold(&mut guard);
            Ok(())
        }
    }

    /// Request that the task enter the `held` state.  This will return when the
    /// task enters such a state, or (for whatever reason) has exited.
    pub fn hold(&mut self) -> Result<(), Error> {
        let inner = self.0.upgrade().ok_or_else(|| Error::Exited)?;
        let mut guard = inner.ctrl.lock().unwrap();
        if guard.should_exit {
            Err(Error::ExitInProgress)
        } else {
            guard = inner.request_hold(guard, None);
            guard = inner
                .cv
                .wait_while(guard, |g| {
                    (!g.is_held && g.should_hold) && !g.should_exit
                })
                .unwrap();
            if guard.should_exit {
                Err(Error::ExitInProgress)
            } else if !guard.should_hold {
                // Someone else swooped into to clear the hold
                Err(Error::HoldInterrupted)
            } else {
                Ok(())
            }
        }
    }
    /// Request that the task exit.  Returns immediately, without waiting for
    /// said exit to actually occur.
    pub fn exit(&mut self) {
        if let Some(inner) = self.0.upgrade() {
            let guard = inner.ctrl.lock().unwrap();
            inner.request_exit(guard);
        }
    }

    /// Async interface equivalent to [TaskCtrl::hold]
    ///
    /// Requests that the task enter the `held` state.  The future becomes
    /// [ready](Poll::Ready) when the task enters `held` state or exits.
    pub async fn held(&mut self) -> Result<(), Error> {
        CtrlHeld::new(self).await
    }
}

/// Future backing [TaskCtrl::held].
struct CtrlHeld<'a> {
    tc: &'a mut TaskCtrl,
    /// Whether this future has made a hold request which it has not yet seen
    /// honored
    pending_request: bool,
}
impl<'a> CtrlHeld<'a> {
    fn new(tc: &'a mut TaskCtrl) -> Self {
        Self { tc, pending_request: false }
    }
}
impl Future for CtrlHeld<'_> {
    type Output = Result<(), Error>;

    /// Resolves with:
    /// - `Err(Error::Exited)` if the task's shared state is gone
    /// - `Err(Error::ExitInProgress)` if an exit has been requested
    /// - `Ok(())` once the task is held
    /// - `Err(Error::HoldInterrupted)` if the hold requested by this future
    ///   was cleared before the task became held
    fn poll(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Self::Output> {
        if let Some(inner) = self.tc.0.upgrade() {
            let mut ctrl = inner.ctrl.lock().unwrap();
            if ctrl.should_exit {
                ctrl.waker_ctrl = None;
                self.pending_request = false;
                return Poll::Ready(Err(Error::ExitInProgress));
            }
            if ctrl.is_held {
                ctrl.waker_ctrl = None;
                self.pending_request = false;
                return Poll::Ready(Ok(()));
            }

            if ctrl.should_hold {
                // A hold is already outstanding, whether requested by an
                // earlier poll or in place beforehand (such as for a task
                // created in the held state).  Register the waker from this
                // poll so that the task entering the held state wakes the
                // most recent poller.
                ctrl.waker_ctrl = Some(cx.waker().clone());
                return Poll::Pending;
            }

            if self.pending_request {
                // Someone else swooped into to clear the hold
                return Poll::Ready(Err(Error::HoldInterrupted));
            }
            let _ctrl = inner.request_hold(ctrl, Some(cx.waker().clone()));
            self.pending_request = true;
            Poll::Pending
        } else {
            Poll::Ready(Err(Error::Exited))
        }
    }
}
impl Drop for CtrlHeld<'_> {
    fn drop(&mut self) {
        if let Some(inner) = self.tc.0.upgrade() {
            let mut ctrl = inner.ctrl.lock().unwrap();
            if self.pending_request {
                // Clean up the hold request if being cancelled
                inner.unrequest_hold(&mut ctrl);
            } else {
                // No request of ours to withdraw, but a waker may have been
                // registered while waiting on a hold which was already in
                // place.
                ctrl.waker_ctrl = None;
            }
        }
    }
}

/// Holds a group of tokio task [task::JoinHandle]s to be later joined as a
/// group when they have all concluded.
pub struct TaskGroup(Mutex<Vec<task::JoinHandle<()>>>);
impl TaskGroup {
    pub fn new() -> Self {
        Self(Mutex::new(Vec::new()))
    }

    /// Add to the group of contained tasks
    pub fn extend<I>(&self, tasks: I)
    where
        I: Iterator<Item = task::JoinHandle<()>>,
    {
        let mut guard = self.0.lock().unwrap();
        guard.extend(tasks);
    }

    /// Waits until all the workers in this task group have completed.
    ///
    /// # Return value
    ///
    /// `None` if all the tasks completed successfully. `Some` if at least one
    /// task failed; the wrapped value is a `Vec` of all of the returned errors.
    pub async fn join_all(&self) -> Option<Vec<task::JoinError>> {
        let workers = {
            let mut guard = self.0.lock().unwrap();
            std::mem::replace(&mut *guard, Vec::new())
        };

        if workers.is_empty() {
            return None;
        }

        let errors = FuturesUnordered::from_iter(workers)
            .filter_map(|res| futures::future::ready(res.err()))
            .collect::<Vec<_>>()
            .await;

        if errors.is_empty() {
            None
        } else {
            Some(errors)
        }
    }
}

/// Convenience type alias for the [thread::JoinHandle::join()] error type
pub type ThreadErr = Box<dyn Any + Send + 'static>;

/// Holds a group of thread [thread::JoinHandle]s to be later joined as a group
/// when they have all concluded.
pub struct ThreadGroup(Mutex<Vec<thread::JoinHandle<()>>>);
impl ThreadGroup {
    /// Create an empty group.
    pub fn new() -> Self {
        Self(Mutex::default())
    }

    /// Add to group of contained threads
    ///
    /// Every item of `threads` is consumed.  Each [thread::JoinHandle] is added
    /// to the group, even when other items are errors, and the first
    /// [std::io::Error] encountered (if any) becomes the result of this call.
    pub fn extend<I>(&self, threads: I) -> std::io::Result<()>
    where
        I: Iterator<Item = std::io::Result<thread::JoinHandle<()>>>,
    {
        let mut members = self.0.lock().unwrap();
        let mut first_error = None;
        for outcome in threads {
            match outcome {
                Ok(handle) => members.push(handle),
                Err(err) => {
                    // only the first error is reported
                    if first_error.is_none() {
                        first_error = Some(err);
                    }
                }
            }
        }
        match first_error {
            Some(err) => Err(err),
            None => Ok(()),
        }
    }

    /// Block until all contained [thread::JoinHandle]s have been joined,
    /// returning any resulting [ThreadErr]s after doing so.
    ///
    /// The group is left empty.  `None` is returned if no thread yielded an
    /// error when joined.
    pub fn block_until_joined(&self) -> Option<Vec<ThreadErr>> {
        let mut members = self.0.lock().unwrap();
        let mut failures = Vec::new();
        for handle in members.drain(..) {
            if let Err(payload) = handle.join() {
                failures.push(payload);
            }
        }
        if failures.is_empty() {
            None
        } else {
            Some(failures)
        }
    }
}


================================================
FILE: lib/propolis/src/util/aspace.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Utilities to construct and manipulate an address space.

use std::collections::{btree_map, BTreeMap};
use std::io::{Error as IoError, ErrorKind};
use std::ops::Bound::{Excluded, Included, Unbounded};
use std::ops::RangeBounds;

use thiserror::Error;

/// Generic container storing items in a region representing an address space.
///
/// Stores ranges by (start, length), but also allows association
/// of generic objects with each region.
#[derive(Debug)]
pub struct ASpace<T> {
    /// Lowest address in the space (inclusive)
    start: usize,
    /// Highest address in the space (inclusive)
    end: usize,
    /// Registered regions, keyed by starting address and holding the region's
    /// length along with its associated item
    map: BTreeMap<usize, (usize, T)>,
}

/// Errors reported by [ASpace] operations.
#[derive(Debug, Eq, PartialEq, Error)]
pub enum Error {
    /// An address falls outside the bounds of the address space.
    #[error("address outside acceptable range")]
    OutOfRange,
    /// A region length is zero, or the region's end is not representable.
    #[error("zero or overflowing length")]
    BadLength,
    /// A region overlaps one which is already registered.
    #[error("would conflict with existing entry")]
    Conflict,
    /// No matching region is registered.
    #[error("entry not found")]
    NotFound,
}
// Allows these errors to be propagated (via `?`) as `std::io::Error`, where
// they are wrapped with `ErrorKind::Other`.
impl From<Error> for IoError {
    fn from(e: Error) -> Self {
        IoError::new(ErrorKind::Other, e)
    }
}

/// Result type whose error is this module's [Error].
pub type Result<T> = std::result::Result<T, Error>;

/// Represents (start: usize, len: usize, item: &T) region in the space
type SpaceItem<'a, T> = (usize, usize, &'a T);

impl<T> ASpace<T> {
    /// Create a instance with inclusive range [`start`, `end`]
    ///
    /// # Panics
    ///
    /// - Panics if start > end.
    pub const fn new(start: usize, end: usize) -> ASpace<T> {
        assert!(start <= end);
        Self { start, end, map: BTreeMap::new() }
    }

    /// Register a region of `len` addresses beginning at `start`, associating
    /// `item` with it.
    ///
    /// # Errors
    ///
    /// - [Error::BadLength] if `len` is zero or the end of the region is not
    ///   representable.
    /// - [Error::OutOfRange] if the region extends beyond the start/end of the
    ///   address space.
    /// - [Error::Conflict] if it overlaps any existing registration.
    pub fn register(
        &mut self,
        start: usize,
        len: usize,
        item: T,
    ) -> Result<()> {
        let end = safe_end(start, len).ok_or(Error::BadLength)?;
        if start < self.start || start > self.end || end > self.end {
            return Err(Error::OutOfRange);
        }

        // Do any entries conflict with the registration?
        if self.covered_by((Included(start), Included(end))).next().is_some() {
            return Err(Error::Conflict);
        }

        let was_overlap = self.map.insert(start, (len, item));
        assert!(was_overlap.is_none());
        Ok(())
    }

    /// Unregister region which begins at `start`, returning its item.
    ///
    /// Fails with [Error::NotFound] if no region begins exactly at `start`.
    pub fn unregister(&mut self, start: usize) -> Result<T> {
        match self.map.remove(&start) {
            Some((_len, item)) => Ok(item),
            None => Err(Error::NotFound),
        }
    }

    /// Search for region which contains `point`
    ///
    /// On success, yields the start, length, and item of that region.  Fails
    /// with [Error::OutOfRange] if `point` is outside the address space, or
    /// [Error::NotFound] if no registered region contains it.
    pub fn region_at(&self, point: usize) -> Result<SpaceItem<'_, T>> {
        if point < self.start || point > self.end {
            return Err(Error::OutOfRange);
        }
        // Only the closest region starting at or before `point` can contain it
        if let Some((start, ent)) =
            self.map.range((Unbounded, Included(&point))).next_back()
        {
            if safe_end(*start, ent.0).unwrap() >= point {
                return Ok((*start, ent.0, &ent.1));
            }
        }
        Err(Error::NotFound)
    }

    /// Get an iterator for items in the space, sorted by starting point
    pub fn iter(&self) -> Iter<'_, T> {
        Iter { inner: self.map.iter() }
    }

    /// Get the starting address of the lowest region whose item satisfies
    /// `predicate`, if there is one.
    pub fn lowest_addr<P>(&self, mut predicate: P) -> Option<usize>
    where
        P: FnMut(&T) -> bool,
    {
        let (&k, _) =
            self.map.iter().find(|(_, (_, entry))| predicate(entry))?;
        Some(k)
    }

    /// Get the last (inclusive) address of the highest region whose item
    /// satisfies `predicate`, if there is one.
    pub fn highest_addr<P>(&self, mut predicate: P) -> Option<usize>
    where
        P: FnMut(&T) -> bool,
    {
        let (&k, &(len, _)) =
            self.map.iter().rev().find(|(_, (_, entry))| predicate(entry))?;
        // Registration guarantees `len >= 1` and that the inclusive end fits
        // in a `usize`.  Subtract before adding: `k + len` itself overflows
        // for a region which ends at `usize::MAX`.
        Some(k + (len - 1))
    }

    /// Get an iterator for all empty space, sorted by starting point
    ///
    /// Returns all space which does not overlap with registered regions.
    ///
    /// NOTE(review): the scan begins at address 0 rather than at the start of
    /// this address space - confirm that is intended for spaces which do not
    /// begin at 0.
    pub fn inverse_iter(&self) -> InverseIter<'_, T> {
        InverseIter { inner: self.map.iter(), next: 0, end: self.end }
    }

    /// Get iterator for regions which are (partially or totally) covered by a range
    pub fn covered_by<R>(&self, range: R) -> Range<'_, T>
    where
        R: RangeBounds<usize>,
    {
        // The front bound needs to be adjusted to search for any region which preceeds the start
        // point, since that region may extend into the target search range.
        let fixed_front = match range.start_bound() {
            Unbounded => Unbounded,
            Excluded(pos) => {
                // The first address inside the range is `pos + 1`.  No such
                // address exists when `pos` is `usize::MAX`, so there is
                // nothing to look up in that case.
                match pos.checked_add(1).map(|first| self.region_at(first)) {
                    Some(Ok((start, _, _))) => Included(start),
                    _ => Excluded(*pos),
                }
            }
            Included(pos) => match self.region_at(*pos) {
                Ok((start, _, _)) => Included(start),
                // No region contains `pos`, so none starts there either and
                // keeping the bound inclusive selects the same entries as
                // excluding it would.  Unlike an exclusive bound, it cannot
                // pair with an `Excluded(pos)` tail (an empty range such as
                // `pos..pos`) into bounds on which `BTreeMap::range` panics.
                Err(_) => Included(*pos),
            },
        };
        let tail = match range.end_bound() {
            Unbounded => Unbounded,
            Excluded(a) => Excluded(*a),
            Included(a) => Included(*a),
        };
        Range { inner: self.map.range((fixed_front, tail)) }
    }

    /// Clear all items from the space
    pub fn clear(&mut self) {
        self.map.clear();
    }
}

// Compute the last address of the inclusive range which begins at `start` and
// spans `len` addresses.
//
// Yields `None` when `len` is zero (such a range has no last address) or when
// the last address would not fit in a `usize`.
fn safe_end(start: usize, len: usize) -> Option<usize> {
    // The last address lies `len - 1` beyond the first.
    let distance = len.checked_sub(1)?;
    start.checked_add(distance)
}

// Exercise `safe_end` at the extremes of the `usize` range.
#[test]
fn safe_end_bounds() {
    // An inclusive region of size zero is nonsense.
    assert_eq!(safe_end(0, 0), None);
    assert_eq!(safe_end(1, 0), None);

    // An inclusive region of size one can exist anywhere.
    assert_eq!(safe_end(0, 1), Some(0));
    assert_eq!(safe_end(16, 1), Some(16));
    assert_eq!(safe_end(usize::MAX, 1), Some(usize::MAX));

    // Given `[0, usize::MAX]` as possible addresses, the size of that set is
    // actually `usize::MAX + 1`. This means:
    // * there is no `len` that covers the entire possible span
    // * start=0, len=usize::MAX is a span that ends at usize::MAX-1
    assert_eq!(safe_end(0, usize::MAX), Some(usize::MAX - 1));
    assert_eq!(safe_end(1, usize::MAX), Some(usize::MAX));
}

// Flatten a map entry, keyed by region start and holding (length, item), into
// a single (start, length, &item) tuple.
fn kv_flatten<'a, T>(i: (&'a usize, &'a (usize, T))) -> SpaceItem<'a, T> {
    let (start, entry) = i;
    let (len, item) = entry;
    (*start, *len, item)
}

/// Iterator for all items in an [ASpace], constructed by [ASpace::iter].
pub struct Iter<'a, T> {
    inner: btree_map::Iter<'a, usize, (usize, T)>,
}

impl<'a, T> Iterator for Iter<'a, T> {
    /// Item represents (start, end, &item)
    type Item = SpaceItem<'a, T>;

    fn next(&mut self) -> Option<Self::Item> {
        self.inner.next().map(kv_flatten)
    }
}

/// Represents a region in the space.
#[derive(Clone, Copy, Debug, PartialEq)]
pub struct Extent {
    start: usize,
    len: usize,
}

impl Extent {
    /// First address of the region
    pub fn start(&self) -> usize {
        self.start
    }
    /// Number of addresses in the region
    pub fn len(&self) -> usize {
        self.len
    }
}

/// Iterator for empty space in an [ASpace], created by [ASpace::inverse_iter].
pub struct InverseIter<'a, T> {
    inner: btree_map::Iter<'a, usize, (usize, T)>,
    // Next potential empty region starting address.
    next: usize,
    // Address at which the search for empty space stops.
    end: usize,
}

impl<T> Iterator for InverseIter<'_, T> {
    /// Item represents unregistered region in address mapping.
    type Item = Extent;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if self.next >= self.end {
                return None;
            }
            let gap_start = self.next;
            match self.inner.next() {
                None => {
                    // With no registered regions remaining, everything up to
                    // the end of the address space is empty.
                    self.next = self.end;
                    return Some(Extent {
                        start: gap_start,
                        len: self.end - gap_start,
                    });
                }
                Some((&region_start, &(region_len, _))) => {
                    // Whether or not a gap precedes it, the search resumes
                    // just past this registered region.
                    self.next = region_start + region_len;
                    if gap_start < region_start {
                        return Some(Extent {
                            start: gap_start,
                            len: region_start - gap_start,
                        });
                    }
                }
            }
        }
    }
}

/// Iterator for items in an [ASpace] overlapping with a range, constructed by [ASpace::covered_by].
pub struct Range<'a, T> {
    inner: btree_map::Range<'a, usize, (usize, T)>,
}

impl<'a, T> Iterator for Range<'a, T> {
    /// Item represents (start, end, &item)
    type Item = SpaceItem<'a, T>;

    fn next(&mut self) -> Option<Self::Item> {
        self.inner.next().map(kv_flatten)
    }
}

#[cfg(test)]
mod test {
    use super::*;

    // A space whose start and end coincide holds exactly one address.
    #[test]
    fn create_one_elem() {
        let mut s: ASpace<u32> = ASpace::new(0, 0);
        s.register(0, 1, 0xaa)
            .expect("can register an element in one-elem ASpace");
        assert_eq!(s.register(0, 2, 0x11), Err(Error::OutOfRange));
        assert_eq!(s.register(1, 2, 0x22), Err(Error::OutOfRange));
        assert_eq!(s.region_at(0), Ok((0, 1, &0xaa)));
    }
    // A space spanning the entire `usize` range can be constructed.
    #[test]
    fn create_max() {
        let _s: ASpace<u32> = ASpace::new(0, usize::MAX);
    }
    #[test]
    fn create_normal() {
        let _s: ASpace<u32> = ASpace::new(0x1000, 0xffff);
    }
    #[test]
    fn register_plain() {
        let mut s: ASpace<u32> = ASpace::new(0, 0xffff);

        assert!(s.register(0, 0x1000, 0).is_ok());
    }
    // Zero-length and overflowing registrations are rejected as BadLength.
    #[test]
    fn register_invalid() {
        let mut s: ASpace<u32> = ASpace::new(0, 0x1000);

        assert_eq!(s.register(0x100, 0, 0), Err(Error::BadLength));
        assert_eq!(
            s.register(0x100, usize::MAX - 0x50, 0),
            Err(Error::BadLength)
        );
    }
    // Registrations reaching below the start or beyond the end of the space
    // are rejected as OutOfRange.
    #[test]
    fn register_outside() {
        let start = 0x100;
        let end = 0x1ff;
        let len = end - start + 1;
        let mut s: ASpace<u32> = ASpace::new(start, end);

        let expect: Result<()> = Err(Error::OutOfRange);
        assert_eq!(s.register(0, start - 1, 0), expect);
        assert_eq!(s.register(0, start, 0), expect);
        assert_eq!(s.register(0, start + 1, 0), expect);

        assert_eq!(s.register(start + 1, len, 0), expect);
        assert_eq!(s.register(start + len, 1, 0), expect);
    }
    // Any registration intersecting an existing one is rejected as Conflict.
    #[test]
    fn register_overlaps() {
        let mut s: ASpace<u32> = ASpace::new(0, 0xffff);
        assert!(s.register(0, 0x1000, 0).is_ok());
        assert!(s.register(0x2000, 0x1000, 0).is_ok());
        assert!(s.register(0xf000, 0x1000, 0).is_ok());
        let expect: Result<()> = Err(Error::Conflict);

        // direct overlap
        assert_eq!(s.register(0, 0x1000, 0), expect);
        assert_eq!(s.register(0x2000, 0x1000, 0), expect);
        assert_eq!(s.register(0xf000, 0x1000, 0), expect);

        // tail overlap
        assert_eq!(s.register(0x1ff0, 0x0011, 0), expect);
        assert_eq!(s.register(0x1ff0, 0x0012, 0), expect);
        assert_eq!(s.register(0x1ff0, 0x1000, 0), expect);
        assert_eq!(s.register(0xefff, 0x0002, 0), expect);
        assert_eq!(s.register(0xefff, 0x0003, 0), expect);
        assert_eq!(s.register(0xefff, 0x1000, 0), expect);

        // head overlap
        assert_eq!(s.register(0x0ffe, 0x10, 0), expect);
        assert_eq!(s.register(0x0fff, 0x10, 0), expect);
        assert_eq!(s.register(0x2ffe, 0x10, 0), expect);
        assert_eq!(s.register(0x2fff, 0x10, 0), expect);

        // total overlap
        assert_eq!(s.register(0x1ff0, 0x1010, 0), expect);
    }
    // Lookups past the end of the space report OutOfRange.
    #[test]
    fn region_at_outside() {
        let end = 0xffff;
        let mut s: ASpace<u32> = ASpace::new(0, end);

        assert!(s.register(0x1000, 0x1000, 0).is_ok());
        assert_eq!(s.region_at(end + 1), Err(Error::OutOfRange));
        assert_eq!(s.region_at(end + 10), Err(Error::OutOfRange));
    }
    // Lookups resolve to the containing region, NotFound in unregistered
    // space, and OutOfRange past the end of the space.
    #[test]
    fn region_at_normal() {
        let end = 0xffff;
        let mut s: ASpace<u32> = ASpace::new(0, end);

        let ent: [(usize, usize, &u32); 3] = [
            (0x100, 0x100, &0),
            (0x2000, 0x1000, &1),
            (end - 0xfff, 0x1000, &2),
        ];
        for (a, b, c) in ent.iter() {
            assert!(s.register(*a, *b, **c).is_ok());
        }

        assert_eq!(s.region_at(0x100), Ok(ent[0]));
        assert_eq!(s.region_at(0x110), Ok(ent[0]));
        assert_eq!(s.region_at(0x1ff), Ok(ent[0]));
        assert_eq!(s.region_at(0x2990), Ok(ent[1]));
        assert_eq!(s.region_at(0x2fff), Ok(ent[1]));
        assert_eq!(s.region_at(0xfff0), Ok(ent[2]));
        assert_eq!(s.region_at(end), Ok(ent[2]));

        assert_eq!(s.region_at(0), Err(Error::NotFound));
        assert_eq!(s.region_at(0x200), Err(Error::NotFound));
        assert_eq!(s.region_at(0x5000), Err(Error::NotFound));

        assert_eq!(s.region_at(end + 1), Err(Error::OutOfRange));
        assert_eq!(s.region_at(end + 10), Err(Error::OutOfRange));
    }
    // The lowest/highest accessors report the first address of the lowest
    // region and the last address of the highest region.
    #[test]
    fn bounds_accessors() {
        let start = 0x1000;
        let end = 0xffff;
        let mut s: ASpace<u32> = ASpace::new(start, end);

        assert!(s.register(0x2000, 0x1000, 0).is_ok());
        assert!(s.register(0x3000, 0x1000, 1).is_ok());
        assert!(s.register(0x5000, 0x1000, 2).is_ok());
        assert!(s.register(0xa000, 0x1000, 3).is_ok());

        assert_eq!(s.lowest_addr(|_| true), Some(0x2000));
        assert_eq!(s.highest_addr(|_| true), Some(0xafff));
    }
    #[test]
    fn inverse_iterator_alloc_middle() {
        let end = 100;
        let mut s: ASpace<()> = ASpace::new(0, end);

        // Registrations in the middle of the mapping, with free space at the
        // edges of the address space.
        assert!(s.register(10, 10, ()).is_ok());
        assert!(s.register(30, 10, ()).is_ok());

        let mut iter = s.inverse_iter();
        assert_eq!(Extent { start: 0, len: 10 }, iter.next().unwrap());
        assert_eq!(Extent { start: 20, len: 10 }, iter.next().unwrap());
        assert_eq!(Extent { start: 40, len: 60 }, iter.next().unwrap());
        assert!(iter.next().is_none());
    }

    #[test]
    fn inverse_iterator_alloc_ends() {
        let end = 100;
        let mut s: ASpace<()> = ASpace::new(0, end);

        // Registrations at the edges of the address space.
        assert!(s.register(0, 10, ()).is_ok());
        assert!(s.register(40, 20, ()).is_ok());
        assert!(s.register(90, 10, ()).is_ok());

        let mut iter = s.inverse_iter();
        assert_eq!(Extent { start: 10, len: 30 }, iter.next().unwrap());
        assert_eq!(Extent { start: 60, len: 30 }, iter.next().unwrap());
        assert!(iter.next().is_none());
    }

    #[test]
    fn inverse_iterator_sequential_registrations() {
        let end = 100;
        let mut s: ASpace<()> = ASpace::new(0, end);

        // Back-to-back registrations within the address space.
        assert!(s.register(10, 10, ()).is_ok());
        assert!(s.register(20, 10, ()).is_ok());
        assert!(s.register(30, 10, ()).is_ok());

        let mut iter = s.inverse_iter();
        assert_eq!(Extent { start: 0, len: 10 }, iter.next().unwrap());
        assert_eq!(Extent { start: 40, len: 60 }, iter.next().unwrap());
        assert!(iter.next().is_none());
    }

    // "Empty" refers to the iterator: nothing is left unregistered.
    #[test]
    fn inverse_iterator_empty() {
        let end = 100;
        let mut s: ASpace<()> = ASpace::new(0, end);

        // Entire address space occupied.
        assert!(s.register(0, 100, ()).is_ok());

        let mut iter = s.inverse_iter();
        assert!(iter.next().is_none());
    }

    // "Full" refers to the iterator: one extent covers everything.
    #[test]
    fn inverse_iterator_full() {
        let end = 100;
        let s: ASpace<()> = ASpace::new(0, end);

        // Entire address space empty
        let mut iter = s.inverse_iter();
        assert_eq!(Extent { start: 0, len: 100 }, iter.next().unwrap());
        assert!(iter.next().is_none());
    }
}


================================================
FILE: lib/propolis/src/util/id.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// Define a newtype for a kind of run-time identifier. Exact semantics for a
// given identifier depend on what uses it, but they are expected to be used
// only in places internal to a VM's instantiation. DTrace probes for VM
// operations, device state that is expected to be lost across migration, that
// kind of thing.
//
// As an example of what not to do, these identifiers must not be used as
// fields on Oximeter metrics: because the identifier for some item in the VM
// may change across instantiations, the field may become meaningless across
// stop/start or migration. In such cases a more persistent identifier (PCI BDF,
// NIC MAC, vCPU ACPI ID, etc.) must be used instead.
//
// If an item needs stable identifiers, this is not the tool to use.
//
// This macro takes syntax matching the newtype definition primarily so that
// grepping for the newtype like `struct DeviceId` finds corresponding macro
// invocations.
macro_rules! define_id {
    // Accepts a single tuple-struct definition wrapping a `u32`, with any
    // leading attributes and an optional visibility on the inner field.
    {
        $(#[$meta_items:meta])*
        pub struct $id_name:ident($visibility:vis u32);
    } => {
        // `paste` is used to build a distinct, upper-cased name for the
        // counter static of each identifier type.
        ::paste::paste! {
            // Re-emit the newtype exactly as written at the invocation site.
            $(#[$meta_items])*
            pub struct $id_name($visibility u32);

            impl $id_name {
                // Sentinel identifier, defined as `u32::MAX`.
                pub const INVALID: $id_name = $id_name(u32::MAX);
                // Returns a fresh identifier by post-incrementing a counter
                // which is shared by every caller for this identifier type
                // and which starts at zero.
                //
                // NOTE: `fetch_add` wraps on overflow, so after `u32::MAX`
                // allocations this would hand out `INVALID` and then begin
                // repeating values.
                pub fn new() -> Self {
                    static [<_NEXT_ $id_name:upper>]: ::std::sync::atomic::AtomicU32 =
                        ::std::sync::atomic::AtomicU32::new(0);

                    let id = [<_NEXT_ $id_name:upper>].fetch_add(
                        1,
                        ::std::sync::atomic::Ordering::Relaxed
                    );
                    $id_name(id)
                }
            }
        }
    }
}
pub(crate) use define_id;


================================================
FILE: lib/propolis/src/util/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

pub mod aspace;
pub mod id;
pub mod regmap;

mod ioctl {
    use std::io::{Error, Result};
    use std::os::fd::RawFd;
    use std::os::raw::c_void;

    /// Thin wrapper around `libc::ioctl` which converts its `-1` failure
    /// return into the last OS error and passes any other return value
    /// through unchanged.
    ///
    /// # Safety
    ///
    /// The arguments are forwarded to `libc::ioctl` unchecked, so the caller
    /// must ensure that `data` is appropriate for `cmd` when issued on `fd`.
    pub(crate) unsafe fn ioctl(
        fd: RawFd,
        cmd: i32,
        data: *mut c_void,
    ) -> Result<i32> {
        // Paper over differing type for request parameter
        let rc = libc::ioctl(fd, cmd as _, data);
        if rc == -1 {
            Err(Error::last_os_error())
        } else {
            Ok(rc)
        }
    }
}
pub(crate) use ioctl::ioctl;


================================================
FILE: lib/propolis/src/util/regmap.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::cmp::Ordering;
use std::ops::Bound::Included;

use super::aspace::ASpace;
use crate::common::*;

/// Definition of a single register: the caller-chosen identifier together
/// with the [Flags] which govern how accesses to it are handled.
#[derive(Debug)]
struct RegDef<ID> {
    /// Identifier passed to the access callback for this register.
    id: ID,
    /// Access-handling behavior for this register.
    flags: Flags,
}

/// Represents a mapping of registers within an address space.
#[derive(Debug)]
pub struct RegMap<ID> {
    /// Total size of the mapped space; accesses must fall within `0..len`.
    len: usize,
    /// Register definitions, indexed by their offset within the map.
    space: ASpace<RegDef<ID>>,
}

bitflags! {
    /// Per-register flags, consulted when an access does not cover a register
    /// exactly.
    #[derive(Default, Debug)]
    pub struct Flags: u8 {
        /// No special handling: accesses narrower than the register are
        /// widened to cover the whole register.
        const DEFAULT = 0;
        /// Pass reads narrower than the register to the handler as-is, rather
        /// than servicing them from a read of the whole register.
        const NO_READ_EXTEND = 0b00000001;
        /// Pass writes narrower than the register to the handler as-is,
        /// rather than widening them into a write of the whole register.
        const NO_WRITE_EXTEND = 0b00000010;
        /// When widening a narrow write, skip the read which would otherwise
        /// populate the bytes not covered by the write (they are left zeroed).
        const NO_READ_MOD_WRITE = 0b00000100;
        /// Both `NO_READ_EXTEND` and `NO_WRITE_EXTEND`.
        const PASSTHRU = Self::NO_READ_EXTEND.bits() |
            Self::NO_WRITE_EXTEND.bits();
    }
}

/// Describes the portion of a larger operation which lands on one register.
struct RegXfer<'a, ID> {
    /// Definition of the register being accessed.
    reg: &'a RegDef<ID>,
    /// Full length of that register.
    reg_len: usize,
    /// Offset within the register at which the access begins.
    offset: usize,
    /// Index into the parent operation's buffer where this piece begins.
    skip_front_idx: usize,
    /// Index into the parent operation's buffer where this piece ends
    /// (exclusive).
    split_back_idx: usize,
}

impl<ID> RegMap<ID> {
    /// Creates an empty register map covering offsets `0..len`.
    ///
    /// # Panics
    ///
    /// Panics if `len` is zero, as the map must cover at least one byte.
    pub const fn new(len: usize) -> Self {
        // Guard the `len - 1` below: without overflow checks it would wrap
        // and produce a map spanning the entire address space.
        assert!(len != 0, "RegMap must cover at least one byte");
        Self { len, space: ASpace::new(0, len - 1) }
    }

    /// Defines register `id` occupying `len` bytes at offset `start`, using
    /// the default access flags.
    ///
    /// See [RegMap::define_with_flags] for the conditions under which this
    /// panics.
    pub fn define(&mut self, start: usize, len: usize, id: ID) {
        let flags = Flags::DEFAULT;
        self.define_with_flags(start, len, id, flags)
    }

    /// Defines register `id` occupying `len` bytes at offset `start`, with
    /// the access behavior described by `flags`.
    ///
    /// # Panics
    ///
    /// Panics if the underlying address space rejects the registration.
    pub fn define_with_flags(
        &mut self,
        start: usize,
        len: usize,
        id: ID,
        flags: Flags,
    ) {
        let def = RegDef { id, flags };
        self.space.register(start, len, def).unwrap();
    }

    /// Dispatches `op` to [RegMap::read] or [RegMap::write] according to its
    /// direction, invoking `f` for each register involved.
    pub fn process<F>(&self, op: &mut RWOp<'_, '_>, mut f: F)
    where
        F: FnMut(&ID, RWOp),
    {
        let handler = &mut f;
        match op {
            RWOp::Read(ro) => self.read(ro, handler),
            RWOp::Write(wo) => self.write(wo, handler),
        }
    }
    /// Services a read by splitting it across every register it overlaps and
    /// invoking `f` for each of them.
    ///
    /// # Panics
    ///
    /// Panics if the read is empty or extends beyond the end of the map.
    pub fn read<F>(&self, ro: &mut ReadOp, f: &mut F)
    where
        F: FnMut(&ID, RWOp),
    {
        assert!(ro.len() != 0);
        assert!(ro.offset() + ro.len() - 1 < self.len);

        let (start, size) = (ro.offset(), ro.len());
        self.iterate_transfers(start, size, |xfer: &RegXfer<ID>| {
            // Carve out the slice of the parent read which lands on this
            // register.
            let span = xfer.skip_front_idx..xfer.split_back_idx;
            let mut copy_op = ReadOp::new_child(xfer.offset, ro, span);

            debug_assert!(copy_op.len() != 0);
            Self::reg_read(xfer.reg, xfer.reg_len, &mut copy_op, f);
        })
    }

    /// Services a write by splitting it across every register it overlaps
    /// and invoking `f` for each of them.
    ///
    /// # Panics
    ///
    /// Panics if the write is empty or extends beyond the end of the map.
    pub fn write<F>(&self, wo: &mut WriteOp, f: &mut F)
    where
        F: FnMut(&ID, RWOp),
    {
        assert!(wo.len() != 0);
        assert!(wo.offset() + wo.len() - 1 < self.len);

        let (start, size) = (wo.offset(), wo.len());
        self.iterate_transfers(start, size, |xfer| {
            // Carve out the slice of the parent write which lands on this
            // register.
            let span = xfer.skip_front_idx..xfer.split_back_idx;
            let mut copy_op = WriteOp::new_child(xfer.offset, wo, span);

            debug_assert!(copy_op.len() != 0);
            Self::reg_write(xfer.reg, xfer.reg_len, &mut copy_op, f);
        })
    }

    /// Performs the read described by `copy_op` against a single register.
    ///
    /// A read covering the register exactly, or any read of a register
    /// flagged `NO_READ_EXTEND`, is handed to `f` directly.  Otherwise the
    /// whole register is read into a scratch buffer via `f` and the requested
    /// bytes are copied out of it.
    fn reg_read<F>(
        reg: &RegDef<ID>,
        reg_len: usize,
        copy_op: &mut ReadOp,
        f: &mut F,
    ) where
        F: FnMut(&ID, RWOp),
    {
        let exact = reg_len == copy_op.len();
        if exact {
            debug_assert!(copy_op.offset() == 0);
        }
        if exact || reg.flags.contains(Flags::NO_READ_EXTEND) {
            f(&reg.id, RWOp::Read(copy_op));
            return;
        }

        // Widen the access: read the full register, then pick out the part
        // which was actually requested.
        let mut scratch = vec![0; reg_len];
        let mut sro = ReadOp::from_buf(0, &mut scratch);
        f(&reg.id, RWOp::Read(&mut sro));

        let begin = copy_op.offset();
        let end = begin + copy_op.len();
        copy_op.write_bytes(&scratch[begin..end]);
    }

    /// Performs the write described by `copy_op` against a single register.
    ///
    /// A write covering the register exactly, or any write to a register
    /// flagged `NO_WRITE_EXTEND`, is handed to `f` directly.  Otherwise the
    /// write is widened to the whole register: the current contents are read
    /// via `f` (unless `NO_READ_MOD_WRITE` is set, in which case the other
    /// bytes are zero), the written bytes are overlaid, and the result is
    /// written back via `f`.
    fn reg_write<F>(
        reg: &RegDef<ID>,
        reg_len: usize,
        copy_op: &mut WriteOp,
        f: &mut F,
    ) where
        F: FnMut(&ID, RWOp),
    {
        let exact = reg_len == copy_op.len();
        if exact {
            debug_assert!(copy_op.offset() == 0);
        }
        if exact || reg.flags.contains(Flags::NO_WRITE_EXTEND) {
            f(&reg.id, RWOp::Write(copy_op));
            return;
        }

        let mut scratch = vec![0; reg_len];

        // Read half of the read-modify-write, when permitted.
        if !reg.flags.contains(Flags::NO_READ_MOD_WRITE) {
            let mut sro = ReadOp::from_buf(0, &mut scratch);
            f(&reg.id, RWOp::Read(&mut sro));
        }

        // Overlay the bytes being written onto the full-register image.
        let begin = copy_op.offset();
        let end = begin + copy_op.len();
        copy_op.read_bytes(&mut scratch[begin..end]);

        let mut swo = WriteOp::from_buf(0, &scratch);
        f(&reg.id, RWOp::Write(&mut swo));
    }

    /// Walks every register overlapping `offset..offset + len`, calling
    /// `do_xfer` once per register with a [RegXfer] describing the overlap.
    ///
    /// Portions of the range which fall in a gap between registers are
    /// skipped; no callback is made for them.
    ///
    /// # Panics
    ///
    /// Panics if `len` is zero or if the range extends beyond the map.
    fn iterate_transfers<F>(&self, offset: usize, len: usize, mut do_xfer: F)
    where
        F: FnMut(&RegXfer<'_, ID>),
    {
        // Validate `len` before it is used to compute the last position, so
        // that a zero-length request trips this assertion instead of
        // underflowing the subtraction below.
        assert!(len != 0);
        let last_position = offset + len - 1;
        assert!(last_position < self.len);

        // Absolute position within the map of the next unprocessed byte.
        let mut position = offset;

        for (reg_start, reg_len, reg) in
            self.space.covered_by((Included(offset), Included(last_position)))
        {
            // Bytes ahead of this register which belong to no register.
            let mut skip_front = 0;
            // Bytes which extend past the end of this register.
            let mut split_back = 0;
            // Offset into the register at which the access begins.
            let mut reg_offset = 0;

            let consumed = position - offset;
            let remain = len - consumed;

            match position.cmp(&reg_start) {
                Ordering::Equal => {
                    if remain > reg_len {
                        split_back = remain - reg_len;
                    }
                }
                Ordering::Less => {
                    // A gap precedes this register: step over it.
                    debug_assert!(position + remain > reg_start);
                    skip_front = reg_start - position;
                    if remain - skip_front > reg_len {
                        split_back = remain - (skip_front + reg_len);
                    }
                }
                Ordering::Greater => {
                    // The access begins part-way into this register.
                    reg_offset = position - reg_start;
                    if reg_offset + remain > reg_len {
                        split_back = reg_offset + remain - reg_len;
                    }
                }
            };
            let xfer_len = remain - (skip_front + split_back);
            debug_assert!(xfer_len <= reg_len);

            do_xfer(&RegXfer {
                reg,
                reg_len,
                offset: reg_offset,
                skip_front_idx: consumed + skip_front,
                split_back_idx: consumed + skip_front + xfer_len,
            });

            position = reg_start + reg_offset + xfer_len;
        }
    }
}
impl<ID: Copy + Eq> RegMap<ID> {
    /// Builds a map of `size` bytes in which the registers listed in `regdef`
    /// (as `(id, size)` pairs) are laid out back-to-back from offset 0.
    ///
    /// Registers whose ID equals `resv_reg` are given the `NO_READ_EXTEND`
    /// and `NO_WRITE_EXTEND` flags; all others use the default flags.
    ///
    /// # Panics
    ///
    /// Panics if the register sizes do not sum to exactly `size`.
    pub fn create_packed(
        size: usize,
        regdef: &[(ID, usize)],
        resv_reg: Option<ID>,
    ) -> Self {
        let mut map = RegMap::new(size);
        let mut off = 0;
        for &(id, reg_size) in regdef {
            let flags = if resv_reg == Some(id) {
                Flags::NO_READ_EXTEND | Flags::NO_WRITE_EXTEND
            } else {
                Flags::DEFAULT
            };
            map.define_with_flags(off, reg_size, id, flags);
            off += reg_size;
        }
        assert_eq!(size, off);

        map
    }
    /// Builds a map of `size` bytes in which the registers listed in `regdef`
    /// (as `(id, size)` pairs) are laid out back-to-back from offset 0, each
    /// flagged [Flags::PASSTHRU].
    ///
    /// # Panics
    ///
    /// Panics if the register sizes do not sum to exactly `size`.
    pub fn create_packed_passthru(size: usize, regdef: &[(ID, usize)]) -> Self {
        let mut map = RegMap::new(size);
        let total = regdef.iter().fold(0, |off, &(id, reg_size)| {
            map.define_with_flags(off, reg_size, id, Flags::PASSTHRU);
            off + reg_size
        });
        assert_eq!(size, total);

        map
    }
}

#[cfg(test)]
mod test {
    use super::*;

    // Direction of a transfer observed by the register callback.
    #[derive(Clone, Copy, Eq, PartialEq, Debug)]
    enum XferDir {
        Read,
        Write,
    }
    // Record of one callback invocation: which register was accessed, in
    // which direction, and at what offset/length within that register.
    #[derive(Clone, Copy, Eq, PartialEq, Debug)]
    struct Xfer<ID: Copy + Eq> {
        dir: XferDir,
        reg: ID,
        off: usize,
        len: usize,
    }
    impl<ID: Copy + Eq> Xfer<ID> {
        // Capture the details of an operation as delivered to the callback.
        fn from_rwo(id: &ID, rwo: RWOp) -> Self {
            let dir = match rwo {
                RWOp::Read(_) => XferDir::Read,
                RWOp::Write(_) => XferDir::Write,
            };
            Xfer { dir, reg: *id, off: rwo.offset(), len: rwo.len() }
        }
        // Expected-value constructor for a read.
        fn read(id: ID, off: usize, len: usize) -> Self {
            Xfer { dir: XferDir::Read, reg: id, off, len }
        }
        // Expected-value constructor for a write.
        #[allow(unused)]
        fn write(id: ID, off: usize, len: usize) -> Self {
            Xfer { dir: XferDir::Write, reg: id, off, len }
        }
    }

    // Run `cb` with a read operation of `len` bytes at `off`, backed by a
    // zeroed temporary buffer.
    fn read(off: usize, len: usize, cb: impl FnOnce(RWOp)) {
        let mut buf = vec![0; len];
        let mut ro = ReadOp::from_buf(off, &mut buf);
        cb(RWOp::Read(&mut ro))
    }
    // Run `cb` with a write operation of `len` bytes at `off`, backed by a
    // zeroed temporary buffer.
    #[allow(unused)]
    fn write(off: usize, len: usize, cb: impl FnOnce(&mut RWOp)) {
        let mut buf = vec![0; len];
        let mut wo = WriteOp::from_buf(off, &buf);
        cb(&mut RWOp::Write(&mut wo))
    }
    // Issue each `(offset, len)` read against `map`, collecting every
    // per-register transfer which the map hands to its callback.
    fn drive_reads<ID: Copy + Eq>(
        xfers: &[(usize, usize)],
        map: &RegMap<ID>,
    ) -> Vec<Xfer<ID>> {
        let mut res = Vec::new();

        for &(off, len) in xfers {
            read(off, len, |mut rwo| {
                map.process(&mut rwo, |id, rwo| {
                    res.push(Xfer::from_rwo(id, rwo))
                })
            })
        }
        res
    }

    #[test]
    fn simple() {
        // Simple map with varied sizing
        let map = RegMap::create_packed(
            0x10,
            &[('a', 1), ('b', 1), ('c', 2), ('d', 4), ('e', 8)],
            None,
        );
        let expected = &[
            Xfer::read('a', 0, 1),
            Xfer::read('b', 0, 1),
            Xfer::read('c', 0, 2),
            Xfer::read('d', 0, 4),
            Xfer::read('e', 0, 8),
        ];
        // Each field individually
        let reads = [(0, 1), (1, 1), (2, 2), (4, 4), (8, 8)];
        let res = drive_reads(&reads, &map);
        assert_eq!(&res, expected);
        // One big op, covering all
        let reads = [(0, 0x10)];
        let res = drive_reads(&reads, &map);
        assert_eq!(&res, expected);
    }
    #[test]
    fn misaligned() {
        // Map shaped like virtio-net config with weird offsets due to mac addr
        let map = RegMap::create_packed(
            12,
            &[('a', 6), ('b', 2), ('c', 2), ('d', 2)],
            None,
        );

        // Register 'a' appears twice: both the first and second 4-byte reads
        // touch it, and each partial access is widened to the full register.
        let expected = &[
            Xfer::read('a', 0, 6),
            Xfer::read('a', 0, 6),
            Xfer::read('b', 0, 2),
            Xfer::read('c', 0, 2),
            Xfer::read('d', 0, 2),
        ];
        // Each field individually with 4-byte reads
        let reads = [(0, 4), (4, 4), (8, 4)];
        let res = drive_reads(&reads, &map);
        assert_eq!(&res, expected);
    }
}


================================================
FILE: lib/propolis/src/vcpu.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Virtual CPU functionality.

use std::io::Result;
use std::sync::Arc;

use crate::common::GuestData;
use crate::common::Lifecycle;
use crate::common::VcpuId;
use crate::cpuid;
use crate::enlightenment::Enlightenment;
use crate::exits::*;
use crate::migrate::*;
use crate::mmio::MmioBus;
use crate::msr::MsrId;
use crate::msr::RdmsrOutcome;
use crate::msr::WrmsrOutcome;
use crate::pio::PioBus;
use crate::tasks;
use crate::vmm::VmmHdl;
use cpuid_utils::{CpuidMapConversionError, CpuidSet};
use migrate::VcpuReadWrite;
use thiserror::Error;

use bhyve_api::ApiVersion;
use propolis_types::{CpuidIdent, CpuidValues, CpuidVendor};

// USDT probes fired around guest entry and exit in `Vcpu::run`.
#[usdt::provider(provider = "propolis")]
mod probes {
    // Fired immediately before the `VM_RUN` ioctl is issued for a vCPU.
    fn vm_entry(vcpuid: u32) {}
    // Fired after a successful `VM_RUN` ioctl, with the `rip` and exit code
    // taken from the exit information.
    fn vm_exit(vcpuid: u32, rip: u64, code: u32) {}
}

/// vCPU limit, taken from `bhyve_api::VM_MAXCPU` when not built with the
/// `omicron-build` feature.
#[cfg(not(feature = "omicron-build"))]
pub const MAXCPU: usize = bhyve_api::VM_MAXCPU;

// Helios (stlouis) is built with an expanded limit of 254
/// vCPU limit when built with the `omicron-build` feature.
#[cfg(feature = "omicron-build")]
pub const MAXCPU: usize = 254;

/// Errors which can occur while reading a vCPU's `cpuid` configuration with
/// [Vcpu::get_cpuid].
#[derive(Debug, Error)]
pub enum GetCpuidError {
    /// A `VM_GET_CPUID` ioctl failed.
    #[error("failed to read CPUID values from bhyve")]
    ReadIoctlFailed(#[source] std::io::Error),

    /// The entries read from the kernel could not be assembled into a
    /// [CpuidSet], for instance because a leaf appeared more than once.
    #[error("failed to build a map of CPUID entries")]
    MapConversion(#[from] CpuidMapConversionError),

    /// The vendor reported by the host CPU was not recognized.  The host is
    /// only queried when legacy `cpuid` handling is configured.
    #[error("unsupported CPUID vendor string: {0}")]
    UnsupportedVendor(&'static str),
}

/// A handle to a virtual CPU.
pub struct Vcpu {
    /// Handle to the VMM through which ioctls for this vCPU are issued.
    hdl: Arc<VmmHdl>,
    /// ID of the virtual CPU, as passed to the kernel in those ioctls.
    pub id: i32,
    /// Shared handle to the MMIO bus.
    pub bus_mmio: Arc<MmioBus>,
    /// Shared handle to the port IO bus.
    pub bus_pio: Arc<PioBus>,
    /// Shared handle to the [Enlightenment] implementation.
    // NOTE(review): presumably the hypervisor interface presented to the
    // guest; it is not used in the visible portion of this file -- confirm.
    pub guest_hv: Arc<dyn Enlightenment>,

    /// Vendor of the underlying CPU hardware
    hardware_vendor: CpuidVendor,
}

impl Vcpu {
    /// Creates a handle to a virtual CPU.
    ///
    /// The vendor of the host CPU is queried once, here, and recorded in the
    /// returned handle.
    pub(crate) fn new(
        hdl: Arc<VmmHdl>,
        id: i32,
        bus_mmio: Arc<MmioBus>,
        bus_pio: Arc<PioBus>,
        guest_hv: Arc<dyn Enlightenment>,
    ) -> Arc<Self> {
        // Identify the host vendor from CPUID leaf 0.
        #[cfg(target_arch = "x86_64")]
        fn query_hardware_vendor() -> CpuidVendor {
            let leaf0 = unsafe { core::arch::x86_64::__cpuid(0) };
            let values = CpuidValues::from(leaf0);
            values.try_into().expect("CPU vendor is recognized")
        }

        #[cfg(not(target_arch = "x86_64"))]
        fn query_hardware_vendor() -> CpuidVendor {
            // Just default to AMD when building for tests/etc on non-x86
            CpuidVendor::Amd
        }

        let hardware_vendor = query_hardware_vendor();
        Arc::new(Self { hdl, id, bus_mmio, bus_pio, guest_hv, hardware_vendor })
    }

    /// ID of the virtual CPU.
    pub fn cpuid(&self) -> i32 {
        self.id
    }

    /// Returns `true` if this is vCPU 0, which is treated as the bootstrap
    /// processor (BSP).
    pub fn is_bsp(&self) -> bool {
        matches!(self.id, 0)
    }

    /// Applies the default capability settings to the virtual CPU.
    ///
    /// At present this enables only the halt-exit capability.
    pub fn set_default_capabs(&self) -> Result<()> {
        // Enable exit-on-HLT so the host CPU does not spin in VM context when
        // the guest enters a HLT instruction.
        let captype = bhyve_api::vm_cap_type::VM_CAP_HALT_EXIT as i32;
        let mut halt_exit = bhyve_api::vm_capability {
            cpuid: self.id,
            captype,
            capval: 1,
            allcpus: 0,
        };
        unsafe { self.hdl.ioctl(bhyve_api::VM_SET_CAPABILITY, &mut halt_exit) }
    }

    /// Writes `val` into register `reg` of this vCPU.
    pub fn set_reg(&self, reg: bhyve_api::vm_reg_name, val: u64) -> Result<()> {
        let mut request = bhyve_api::vm_register {
            cpuid: self.id,
            regnum: reg as i32,
            regval: val,
        };

        unsafe { self.hdl.ioctl(bhyve_api::VM_SET_REGISTER, &mut request) }
    }

    /// Reads the current value of register `reg` of this vCPU.
    pub fn get_reg(&self, reg: bhyve_api::vm_reg_name) -> Result<u64> {
        let mut request = bhyve_api::vm_register {
            cpuid: self.id,
            regnum: reg as i32,
            regval: 0,
        };

        // On success the kernel has filled in `regval`.
        let outcome =
            unsafe { self.hdl.ioctl(bhyve_api::VM_GET_REGISTER, &mut request) };
        outcome.map(|_| request.regval)
    }

    /// Loads the descriptor `seg` into segment register `reg`.
    ///
    /// An error is returned if `reg` does not name a valid segment
    /// register.
    pub fn set_segreg(
        &self,
        reg: bhyve_api::vm_reg_name,
        seg: &bhyve_api::seg_desc,
    ) -> Result<()> {
        let mut request = bhyve_api::vm_seg_desc {
            cpuid: self.id,
            regnum: reg as i32,
            desc: *seg,
        };

        unsafe {
            self.hdl.ioctl(bhyve_api::VM_SET_SEGMENT_DESCRIPTOR, &mut request)
        }
    }

    /// Fetches the descriptor currently held in segment register `reg`.
    ///
    /// An error is returned if `reg` does not name a valid segment
    /// register.
    pub fn get_segreg(
        &self,
        reg: bhyve_api::vm_reg_name,
    ) -> Result<bhyve_api::seg_desc> {
        let mut request = bhyve_api::vm_seg_desc {
            cpuid: self.id,
            regnum: reg as i32,
            desc: bhyve_api::seg_desc::default(),
        };

        // On success the kernel has filled in `desc`.
        let outcome = unsafe {
            self.hdl.ioctl(bhyve_api::VM_GET_SEGMENT_DESCRIPTOR, &mut request)
        };
        outcome.map(|_| request.desc)
    }

    /// Configure the (in-kernel) `cpuid` emulation state for this vCPU.
    ///
    /// An empty `values` selects legacy emulation handling.  Otherwise the
    /// entries are sorted and handed to the kernel, with the Intel fallback
    /// flag set when the set's vendor is Intel.
    pub fn set_cpuid(&self, values: CpuidSet) -> Result<()> {
        let mut config = bhyve_api::vm_vcpu_cpuid_config {
            vvcc_vcpuid: self.id,
            ..Default::default()
        };

        // No explicit entries: fall back to legacy handling.
        if values.is_empty() {
            config.vvcc_flags = bhyve_api::VCC_FLAG_LEGACY_HANDLING;
            return unsafe {
                self.hdl.ioctl(bhyve_api::VM_SET_CPUID, &mut config)
            };
        }

        if values.vendor().is_intel() {
            config.vvcc_flags |= bhyve_api::VCC_FLAG_INTEL_FALLBACK;
        }
        // `entries` backs the pointer stored in `config`, so it must remain
        // alive until the ioctl below has returned.
        let mut entries: Vec<bhyve_api::vcpu_cpuid_entry> = values.into();
        entries.sort_by(bhyve_api::vcpu_cpuid_entry::eval_sort);
        config.vvcc_nent = entries.len() as u32;
        config.vvcc_entries = entries.as_mut_ptr() as *mut libc::c_void;

        unsafe { self.hdl.ioctl(bhyve_api::VM_SET_CPUID, &mut config) }
    }

    /// Query the configured (in-kernel) `cpuid` emulation state for this vCPU.
    ///
    /// If legacy cpuid handling is configured, the resulting [Set](CpuidSet)
    /// will contain no entries.
    ///
    /// # Errors
    ///
    /// Returns [GetCpuidError::ReadIoctlFailed] if either `VM_GET_CPUID`
    /// ioctl fails, [GetCpuidError::UnsupportedVendor] if legacy handling is
    /// configured and the host vendor is not recognized, and
    /// [GetCpuidError::MapConversion] if the entries read from the kernel
    /// cannot be assembled into a set.
    pub fn get_cpuid(&self) -> std::result::Result<CpuidSet, GetCpuidError> {
        let mut config = bhyve_api::vm_vcpu_cpuid_config {
            vvcc_vcpuid: self.id,
            vvcc_nent: 0,
            ..Default::default()
        };
        // Query the number of entries configured in-kernel
        //
        // We expect an error (E2BIG) when attempting a VM_GET_CPUID with a
        // vvcc_nent which falls below the number of entries stored in the
        // kernel.  When that occurs, vvcc_nent will be updated with that
        // existing count so we may allocate an array to receive it on a
        // subsequent ioctl.
        let count = match unsafe {
            self.hdl.ioctl(bhyve_api::VM_GET_CPUID, &mut config)
        } {
            Err(_) if config.vvcc_nent != 0 => Ok(config.vvcc_nent),
            Ok(_) => {
                assert_eq!(config.vvcc_nent, 0);
                Ok(0)
            }
            Err(e) => Err(e),
        }
        .map_err(GetCpuidError::ReadIoctlFailed)?;

        // The kernel fills this buffer through a raw pointer, so it must hold
        // `count` initialized elements up front.  Reserving capacity alone is
        // not enough: the vector's length would stay at zero, and none of the
        // entries written by the kernel would be visible to the loop below.
        let mut entries =
            vec![bhyve_api::vcpu_cpuid_entry::default(); count as usize];
        config.vvcc_entries = entries.as_mut_ptr() as *mut libc::c_void;
        unsafe {
            self.hdl
                .ioctl(bhyve_api::VM_GET_CPUID, &mut config)
                .map_err(GetCpuidError::ReadIoctlFailed)?;
        }

        if config.vvcc_flags & bhyve_api::VCC_FLAG_LEGACY_HANDLING != 0 {
            // Since the legacy handling takes care of vendor-specific handling
            // (by nature of doing the cpuid queries against the host CPU) it
            // ignores the INTEL_FALLBACK flag.  We must determine the vendor
            // kind by querying it.
            let vendor = CpuidVendor::try_from(cpuid_utils::host::query(
                CpuidIdent::leaf(0),
            ))
            .map_err(GetCpuidError::UnsupportedVendor)?;

            return Ok(CpuidSet::new(vendor));
        }
        // The vendor is not stored directly; it is inferred from whether the
        // Intel fallback flag was configured.
        let intel_fallback =
            config.vvcc_flags & bhyve_api::VCC_FLAG_INTEL_FALLBACK != 0;
        let mut set = CpuidSet::new(match intel_fallback {
            true => CpuidVendor::Intel,
            false => CpuidVendor::Amd,
        });

        for entry in entries {
            let (ident, value) = cpuid::from_raw(entry);
            let conflict = set
                .insert(ident, value)
                .map_err(CpuidMapConversionError::SubleafConflict)?;

            // A displaced value means the kernel reported this leaf twice.
            if conflict.is_some() {
                return Err(CpuidMapConversionError::DuplicateLeaf(
                    ident.leaf,
                    ident.subleaf,
                )
                .into());
            }
        }
        Ok(set)
    }

    /// Asks the kernel to reset all state of the virtual CPU (including
    /// registers and pending interrupts).
    pub fn reboot_state(&self) -> Result<()> {
        let kind = bhyve_api::vcpu_reset_kind::VRK_RESET as u32;
        let mut request = bhyve_api::vm_vcpu_reset { vcpuid: self.id, kind };

        unsafe { self.hdl.ioctl(bhyve_api::VM_RESET_CPU, &mut request) }
    }
    /// Marks the virtual CPU as active.
    ///
    /// Fails if the CPU has already been activated.
    pub fn activate(&self) -> Result<()> {
        // The ioctl takes the vCPU ID by pointer, so hand it a local copy.
        let mut vcpuid = self.id;

        unsafe { self.hdl.ioctl(bhyve_api::VM_ACTIVATE_CPU, &mut vcpuid) }
    }

    /// Sets the run state of the virtual CPU.
    ///
    /// When `sipi_vector` is `None`, a vector of 0 is passed to the kernel.
    pub fn set_run_state(
        &self,
        state: u32,
        sipi_vector: Option<u8>,
    ) -> Result<()> {
        let sipi_vector = sipi_vector.unwrap_or(0);
        let mut request = bhyve_api::vm_run_state {
            vcpuid: self.id,
            state,
            sipi_vector,
            ..Default::default()
        };
        unsafe { self.hdl.ioctl(bhyve_api::VM_SET_RUN_STATE, &mut request) }
    }

    /// Get the run state of the virtual CPU.
    ///
    /// Returns the `vm_run_state` structure as filled in by the
    /// `VM_GET_RUN_STATE` ioctl.
    pub fn get_run_state(&self) -> Result<bhyve_api::vm_run_state> {
        let mut req = bhyve_api::vm_run_state {
            vcpuid: self.id,
            ..Default::default()
        };
        unsafe { self.hdl.ioctl(bhyve_api::VM_GET_RUN_STATE, &mut req) }?;
        Ok(req)
    }

    /// Executes the guest by running the virtual CPU.
    ///
    /// Blocks the calling thread until the vCPU returns execution,
    /// and returns the reason for exiting ([`VmExit`]).
    ///
    /// When `exit_when_consistent` is asserted, it will instruct the in-kernel
    /// logic to force a [`VmExitKind::Bogus`] exit when the vCPU reaches a
    /// consistent state.  Other exit conditions, such as pending instruction
    /// emulation will take precedence until they are resolved.
    ///
    /// # Errors
    ///
    /// Fails if the API version cannot be queried or if the `VM_RUN` ioctl
    /// itself reports an error.
    pub fn run(
        &self,
        entry: &VmEntry,
        exit_when_consistent: bool,
    ) -> Result<VmExit> {
        let mut raw_exit = bhyve_api::vm_exit::default();
        let mut raw_entry = entry.to_raw(self.id, &mut raw_exit);

        let api_version = self.hdl.api_version()?;

        // On older platforms without EXIT_CONSISTENT, we may spend more time
        // inside VM_RUN than desired, but there is little else that can be
        // done, so the flag is only set where it is supported.
        if exit_when_consistent && api_version >= ApiVersion::V15 {
            raw_entry.cmd |=
                bhyve_api::vm_entry_cmds::VEC_FLAG_EXIT_CONSISTENT as u32;
        }

        probes::vm_entry!(|| self.id as u32);
        unsafe { self.hdl.ioctl(bhyve_api::VM_RUN, &mut raw_entry) }?;
        probes::vm_exit!(|| (
            self.id as u32,
            raw_exit.rip,
            raw_exit.exitcode as u32
        ));

        Ok(VmExit::parse(&raw_exit, api_version))
    }

    /// Issue a "barrier" for the vCPU, forcing an exit from guest context
    ///
    /// Uses `VM_VCPU_BARRIER` when the API version is at least V16, and falls
    /// back to a register read on older versions.
    pub fn barrier(&self) -> Result<()> {
        let has_barrier_op = self.hdl.api_version()? >= ApiVersion::V16;
        if has_barrier_op {
            // Use the official barrier operation, if available
            self.hdl
                .ioctl_usize(bhyve_api::VM_VCPU_BARRIER, self.id as usize)?;
            return Ok(());
        }

        // Prior to first-class support for a barrier, just force the vCPU out
        // of guest context by reading %rax.  If the vCPU thread happens to be
        // on its way into VM_RUN, but not already there, this old method can
        // fail to incur a proper exit.
        let mut rax_read = bhyve_api::vm_register {
            cpuid: self.id,
            regnum: bhyve_api::vm_reg_name::VM_REG_GUEST_RAX as i32,
            regval: 0,
        };
        unsafe { self.hdl.ioctl(bhyve_api::VM_GET_REGISTER, &mut rax_read) }?;
        Ok(())
    }

    /// Emit a barrier `Fn`, suitable for use as a
    /// [`TaskHdl`](tasks::TaskHdl) notifier to kick a vCPU out of VMM
    /// context so it can undergo state changes in userspace.
    ///
    /// The returned closure holds only a weak reference to the vCPU: once the
    /// vCPU has been dropped, invoking it does nothing.  Errors from
    /// [`Vcpu::barrier`] are ignored.
    pub fn barrier_fn(self: &Arc<Self>) -> Box<tasks::NotifyFn> {
        let weak_vcpu = Arc::downgrade(self);
        Box::new(move || {
            let _ = weak_vcpu.upgrade().map(|vcpu| vcpu.barrier());
        })
    }

    /// Send a Non Maskable Interrupt (NMI) to the vcpu.
    ///
    /// Returns the result of the `VM_INJECT_NMI` ioctl.
    pub fn inject_nmi(&self) -> Result<()> {
        let cpuid = self.cpuid();
        let mut nmi_req = bhyve_api::vm_nmi { cpuid };
        unsafe { self.hdl.ioctl(bhyve_api::VM_INJECT_NMI, &mut nmi_req) }
    }

    /// Inject a general-protection exception (`IDT_GP`) into the vcpu.
    ///
    /// The exception is submitted through `VM_INJECT_EXCEPTION` without a
    /// valid error code and with `restart_instruction` set.
    pub fn inject_gp(&self) -> Result<()> {
        let cpuid = self.cpuid();
        let vector = i32::from(bits::IDT_GP);
        let mut gp_req = bhyve_api::vm_exception {
            cpuid,
            vector,
            error_code: 0,
            error_code_valid: 0,
            restart_instruction: 1,
        };
        unsafe { self.hdl.ioctl(bhyve_api::VM_INJECT_EXCEPTION, &mut gp_req) }
    }

    /// Process [`VmExit`] in the context of this vCPU, emitting a [`VmEntry`]
    /// if the parameters of the exit were such that they could be handled.
    pub fn process_vmexit(&self, exit: &VmExit) -> Option<VmEntry> {
        match exit.kind {
            VmExitKind::Bogus => Some(VmEntry::Run),
            VmExitKind::Inout(io) => match io {
                InoutReq::Out(io, val) => self
                    .bus_pio
                    .handle_out(io.port, io.bytes, val)
                    .map(|_| VmEntry::InoutFulfill(InoutRes::Out(io)))
                    .ok(),
                InoutReq::In(io) => self
                    .bus_pio
                    .handle_in(io.port, io.bytes)
                    .map(|val| VmEntry::InoutFulfill(InoutRes::In(io, val)))
                    .ok(),
            },
            VmExitKind::Mmio(mmio) => match mmio {
                MmioReq::Read(read) => self
                    .bus_mmio
                    .handle_read(read.addr as usize, read.bytes)
                    .map(|val| {
                        VmEntry::MmioFulfill(MmioRes::Read(MmioReadRes {
                            addr: read.addr,
                            bytes: read.bytes,
                            data: val,
                        }))
                    })
                    .ok(),
                MmioReq::Write(write) => self
                    .bus_mmio
                    .handle_write(write.addr as usize, write.bytes, write.data)
                    .map(|_| {
                        VmEntry::MmioFulfill(MmioRes::Write(MmioWriteRes {
                            addr: write.addr,
                            bytes: write.bytes,
                        }))
                    })
                    .ok(),
            },
            VmExitKind::Rdmsr(msr) => {
                match self.guest_hv.rdmsr(VcpuId::from(self.id), MsrId(msr)) {
                    RdmsrOutcome::NotHandled => None,
                    RdmsrOutcome::Handled(val) => {
                        let eax = val & 0xFFFF_FFFF;
                        let edx = val >> 32;
                        self.set_reg(
                            bhyve_api::vm_reg_name::VM_REG_GUEST_RAX,
                            eax,
                        )
                        .expect("setting eax should always succeed");
                        self.set_reg(
                            bhyve_api::vm_reg_name::VM_REG_GUEST_RDX,
                            edx,
                        )
                        .expect("setting edx should always succeed");
                        Some(VmEntry::Run)
                    }
                    RdmsrOutcome::GpException => {
                        self.inject_gp()
                            .expect("injecting #GP should always succeed");
                        Some(VmEntry::Run)
                    }
                }
            }
            VmExitKind::Wrmsr(msr, val) => {
                match self.guest_hv.wrmsr(
                    VcpuId::from(self.id),
                    MsrId(msr),
                    val,
                ) {
                    WrmsrOutcome::NotHandled => None,
                    WrmsrOutcome::Handled => Some(VmEntry::Run),
                    WrmsrOutcome::GpException => {
                        self.inject_gp()
                            .expect("injecting #GP should always succeed");
                        Some(VmEntry::Run)
                    }
                }
            }
            VmExitKind::Debug => {
                // Until there is an interface to delay until a vCPU is no
                // longer under control of the debugger, we have no choice but
                // attempt reentry (and probably spin until the debugger is
                // detached from this vCPU).
                Some(VmEntry::Run)
            }
            VmExitKind::Suspended(_) => None,

            VmExitKind::InstEmul(_)
            | VmExitKind::Paging(_, _)
            | VmExitKind::VmxError(_)
            | VmExitKind::SvmError(_) => None,
            _ => None,
        }
    }
}

// Lifecycle hooks for a vCPU.  Only the type name and the migration entry
// point are provided here.
impl Lifecycle for Vcpu {
    /// Static name under which vCPUs identify themselves.
    fn type_name(&self) -> &'static str {
        "bhyve-vcpu"
    }
    /// vCPU state is migrated as several payloads, so the multi-payload
    /// migrator (backed by the `MigrateMulti` impl for `Vcpu`) is returned.
    fn migrate(&self) -> Migrator<'_> {
        Migrator::Multi(self)
    }

    // The consumer is expected to handle run/pause/halt events directly, since
    // the vCPUs are most likely to be driven in a manner separate from the
    // other emulated devices.
}
impl MigrateMulti for Vcpu {
    /// Read every piece of vCPU state and append it to `output`.
    ///
    /// Each payload is read and pushed in turn, so a failure part-way through
    /// leaves the earlier payloads already pushed.  The AMD PMU payload is
    /// only emitted when the API version is at least V18 and the hardware
    /// vendor is AMD.
    fn export(
        &self,
        output: &mut PayloadOutputs,
        _ctx: &MigrateCtx,
    ) -> std::result::Result<(), MigrateStateError> {
        let run_state = migrate::VcpuRunStateV1::read(self)?;
        output.push(run_state.into())?;

        let gp_regs = migrate::VcpuGpRegsV1::read(self)?;
        output.push(gp_regs.into())?;

        let ctrl_regs = migrate::VcpuCtrlRegsV1::read(self)?;
        output.push(ctrl_regs.into())?;

        let dbg_regs = migrate::VcpuDbgRegsV1::read(self)?;
        output.push(dbg_regs.into())?;

        let seg_regs = migrate::VcpuSegRegsV1::read(self)?;
        output.push(seg_regs.into())?;

        let msrs = migrate::VcpuMsrsV1::read(self)?;
        output.push(msrs.into())?;

        let fpu = migrate::FpuStateV1::read(self)?;
        output.push(fpu.into())?;

        let lapic = migrate::LapicV1::read(self)?;
        output.push(lapic.into())?;

        // PMU was introduced in V18
        let export_pmu = bhyve_api::api_version()? >= ApiVersion::V18
            && self.hardware_vendor == CpuidVendor::Amd;
        if export_pmu {
            let pmu = migrate::PmuAmdV1::read(self)?;
            output.push(pmu.into())?;
        }

        Ok(())
    }

    /// Take every vCPU payload from `offer` and load it into this vCPU.
    ///
    /// All mandatory payloads are taken before any of them is written, so a
    /// missing payload is reported before vCPU state is modified.
    fn import(
        &self,
        offer: &mut PayloadOffers,
        _ctx: &MigrateCtx,
    ) -> std::result::Result<(), MigrateStateError> {
        let run_state = offer.take::<migrate::VcpuRunStateV1>()?;
        let gp_regs = offer.take::<migrate::VcpuGpRegsV1>()?;
        let ctrl_regs = offer.take::<migrate::VcpuCtrlRegsV1>()?;
        let dbg_regs = offer.take::<migrate::VcpuDbgRegsV1>()?;
        let seg_regs = offer.take::<migrate::VcpuSegRegsV1>()?;
        let msrs = offer.take::<migrate::VcpuMsrsV1>()?;
        let fpu = offer.take::<migrate::FpuStateV1>()?;
        let lapic = offer.take::<migrate::LapicV1>()?;

        run_state.write(self)?;
        gp_regs.write(self)?;
        ctrl_regs.write(self)?;
        dbg_regs.write(self)?;
        seg_regs.write(self)?;
        msrs.write(self)?;
        fpu.write(self)?;
        lapic.write(self)?;

        // The PMU payload is optional: it is simply skipped when it cannot be
        // taken from the offer.
        // NOTE(review): every `take` error is treated as "not present" here,
        // which presumably also hides a malformed PMU payload -- confirm this
        // is intended.
        if let Ok(pmu) = offer.take::<migrate::PmuAmdV1>() {
            pmu.write(self)?;
        }

        Ok(())
    }
}

pub mod migrate {
    use std::io::Result;
    use std::{convert::TryInto, io};

    use super::Vcpu;
    use crate::migrate::*;

    use bhyve_api::{vdi_field_entry_v1, vm_reg_name, ApiVersion};
    use serde::{Deserialize, Serialize};

    /// Implemented by each vCPU payload type so that it can be captured from,
    /// and loaded back into, a [`Vcpu`].
    pub(super) trait VcpuReadWrite: Sized {
        /// Capture this piece of state from `vcpu`.
        fn read(vcpu: &Vcpu) -> Result<Self>;
        /// Load this piece of state into `vcpu`, consuming the payload.
        fn write(self, vcpu: &Vcpu) -> Result<()>;
    }

    /// Run state and pending interrupt/exception state of a vCPU.
    #[derive(Clone, Default, Deserialize, Serialize)]
    pub struct VcpuRunStateV1 {
        /// `state` field of the vCPU's `vm_run_state`.
        pub run_state: u32,
        /// `sipi_vector` field of the vCPU's `vm_run_state`.
        pub sipi_vector: u8,

        /// Whether `VM_REG_GUEST_INTR_SHADOW` held a non-zero value.
        pub intr_shadow: bool,
        /// Whether the `VAI_PEND_NMI` field was non-zero.
        pub pending_nmi: bool,
        /// Whether the `VAI_PEND_EXTINT` field was non-zero.
        pub pending_extint: bool,
        /// Raw value of the `VAI_PEND_EXCP` field.
        pub pending_exception: u64,
        /// Raw value of the `VAI_PEND_INTINFO` field.
        pub pending_intinfo: u64,
    }
    impl Schema<'_> for VcpuRunStateV1 {
        // (name, version) identifying this payload schema
        fn id() -> SchemaId {
            ("bhyve-x86-vcpu-runstate", 1)
        }
    }

    /// General-purpose registers of a vCPU, along with `%rip` and `%rflags`.
    ///
    /// Each field holds the value of the `VM_REG_GUEST_*` register of the same
    /// name.
    #[derive(Copy, Clone, Default, Deserialize, Serialize)]
    pub struct VcpuGpRegsV1 {
        pub rax: u64,
        pub rcx: u64,
        pub rdx: u64,
        pub rbx: u64,
        pub rsp: u64,
        pub rbp: u64,
        pub rsi: u64,
        pub rdi: u64,
        pub r8: u64,
        pub r9: u64,
        pub r10: u64,
        pub r11: u64,
        pub r12: u64,
        pub r13: u64,
        pub r14: u64,
        pub r15: u64,

        pub rip: u64,
        pub rflags: u64,
    }
    impl Schema<'_> for VcpuGpRegsV1 {
        // (name, version) identifying this payload schema
        fn id() -> SchemaId {
            ("bhyve-x86-vcpu-gpregs", 1)
        }
    }

    /// Control registers of a vCPU (`%cr0`, `%cr2`, `%cr3`, `%cr4`, `%xcr0`)
    /// plus the EFER MSR.
    #[derive(Copy, Clone, Default, Deserialize, Serialize)]
    pub struct VcpuCtrlRegsV1 {
        pub cr0: u64,
        pub cr2: u64,
        pub cr3: u64,
        pub cr4: u64,
        pub xcr0: u64,

        /// EFER MSR contents
        ///
        /// We count it among the control registers, rather than the rest of the
        /// MSRs, because of its involvement in configuring long mode.
        pub efer: u64,
    }
    impl Schema<'_> for VcpuCtrlRegsV1 {
        // (name, version) identifying this payload schema
        fn id() -> SchemaId {
            ("bhyve-x86-vcpu-ctrlregs", 1)
        }
    }

    /// Debug registers of a vCPU (`%dr0`-`%dr3`, `%dr6`, `%dr7`) plus the
    /// DEBUGCTL MSR.
    #[derive(Copy, Clone, Default, Deserialize, Serialize)]
    pub struct VcpuDbgRegsV1 {
        pub dr0: u64,
        pub dr1: u64,
        pub dr2: u64,
        pub dr3: u64,
        pub dr6: u64,
        pub dr7: u64,
        /// DEBUGCTL MSR
        ///
        /// Not yet wired up: the `VcpuReadWrite` impl for this type always
        /// reads it as 0 and does not write it back.
        pub debugctl: u64,
    }
    impl Schema<'_> for VcpuDbgRegsV1 {
        // (name, version) identifying this payload schema
        fn id() -> SchemaId {
            ("bhyve-x86-vcpu-dbgregs", 1)
        }
    }

    /// Segment and descriptor-table registers of a vCPU.
    ///
    /// `gdtr` and `idtr` have no selector register: their `selector` is
    /// recorded as 0 on read and is not loaded on write.
    #[derive(Copy, Clone, Default, Deserialize, Serialize)]
    pub struct VcpuSegRegsV1 {
        pub cs: SegDesc,
        pub ds: SegDesc,
        pub es: SegDesc,
        pub fs: SegDesc,
        pub gs: SegDesc,
        pub ss: SegDesc,
        pub gdtr: SegDesc,
        pub idtr: SegDesc,
        pub ldtr: SegDesc,
        pub tr: SegDesc,
    }
    impl Schema<'_> for VcpuSegRegsV1 {
        // (name, version) identifying this payload schema
        fn id() -> SchemaId {
            ("bhyve-x86-vcpu-segregs", 1)
        }
    }

    /// A segment descriptor (the fields of `bhyve_api::seg_desc`) together
    /// with the selector value of the corresponding register.
    #[derive(Copy, Clone, Default, Deserialize, Serialize)]
    pub struct SegDesc {
        /// `base` field of the descriptor.
        pub base: u64,
        /// `limit` field of the descriptor.
        pub limit: u32,
        /// `access` field of the descriptor.
        pub access: u32,
        /// Selector value held in the segment register.
        pub selector: u16,
    }

    /// MSR contents of a vCPU, as a list of (identifier, value) entries.
    ///
    /// EFER and DEBUGCTL are excluded (see `valid_msr`), as they are carried
    /// by [`VcpuCtrlRegsV1`] and [`VcpuDbgRegsV1`] respectively.
    #[derive(Clone, Default, Deserialize, Serialize)]
    pub struct VcpuMsrsV1(Vec<MsrEntry>);
    impl Schema<'_> for VcpuMsrsV1 {
        // (name, version) identifying this payload schema
        fn id() -> SchemaId {
            ("bhyve-x86-cpu-msregs", 1)
        }
    }

    /// A single MSR and its value.
    #[derive(Copy, Clone, Default, Deserialize, Serialize)]
    pub struct MsrEntry {
        /// MSR identifier (`vfe_ident` of the kernel's field entry).
        pub ident: u32,
        /// MSR contents (`vfe_value` of the kernel's field entry).
        pub value: u64,
    }

    /// FPU state of a vCPU, held as an uninterpreted byte buffer.
    #[derive(Clone, Default, Deserialize, Serialize)]
    pub struct FpuStateV1 {
        /// Raw FPU save area contents.
        // NOTE(review): the length presumably matches the `vfd_req_size`
        // reported by `VM_DESC_FPU_AREA` -- confirm against the
        // `VcpuReadWrite` impl.
        pub blob: Vec<u8>,
    }
    impl Schema<'_> for FpuStateV1 {
        // (name, version) identifying this payload schema
        fn id() -> SchemaId {
            ("bhyve-x86-cpu-fpu", 1)
        }
    }

    /// Local APIC state of a vCPU, mirroring `bhyve_api::vdi_lapic_v1`.
    #[derive(Clone, Default, Deserialize, Serialize)]
    pub struct LapicV1 {
        /// Mirrors `vl_lapic`.
        pub page: LapicPageV1,
        /// Mirrors `vl_msr_apicbase`.
        pub msr_apicbase: u64,
        /// Mirrors `vl_timer_target`.
        pub timer_target: i64,
        /// Mirrors `vl_esr_pending`.
        pub esr_pending: u32,
    }
    impl Schema<'_> for LapicV1 {
        // (name, version) identifying this payload schema
        fn id() -> SchemaId {
            ("bhyve-x86-lapic", 1)
        }
    }

    /// Local APIC register contents, mirroring
    /// `bhyve_api::vdi_lapic_page_v1`.
    ///
    /// Each field corresponds to the `vlp_`-prefixed field of the same name in
    /// the kernel structure.
    #[derive(Clone, Default, Deserialize, Serialize)]
    pub struct LapicPageV1 {
        pub id: u32,
        pub version: u32,
        pub tpr: u32,
        pub apr: u32,
        pub ldr: u32,
        pub dfr: u32,
        pub svr: u32,
        pub isr: [u32; 8],
        pub tmr: [u32; 8],
        pub irr: [u32; 8],
        pub esr: u32,
        pub lvt_cmci: u32,
        pub icr: u64,
        pub lvt_timer: u32,
        pub lvt_thermal: u32,
        pub lvt_pcint: u32,
        pub lvt_lint0: u32,
        pub lvt_lint1: u32,
        pub lvt_error: u32,
        pub icr_timer: u32,
        pub dcr_timer: u32,
    }

    /// AMD performance-monitoring state of a vCPU, mirroring
    /// `bhyve_api::vdi_pmu_amd_v1`.
    #[derive(Clone, Deserialize, Serialize)]
    pub struct PmuAmdV1 {
        /// Mirrors `vpa_evtsel`.
        pub evtsel: [u64; 6],
        /// Mirrors `vpa_ctr`.
        pub counter: [u64; 6],
    }
    impl Schema<'_> for PmuAmdV1 {
        // (name, version) identifying this payload schema
        fn id() -> SchemaId {
            ("bhyve-x86-pmu-amd", 1)
        }
    }

    impl From<(bhyve_api::seg_desc, u16)> for SegDesc {
        /// Combine a kernel segment descriptor and a selector value into a
        /// [`SegDesc`].
        fn from((raw, selector): (bhyve_api::seg_desc, u16)) -> Self {
            let bhyve_api::seg_desc { base, limit, access, .. } = raw;
            Self { base, limit, access, selector }
        }
    }
    // Implemented as `From` rather than `Into`: the blanket impl in `core`
    // then provides `Into<(bhyve_api::seg_desc, u16)> for SegDesc`, so
    // existing `.into()` callers keep working while `<(_, _)>::from(seg)`
    // becomes available too.
    impl From<SegDesc> for (bhyve_api::seg_desc, u16) {
        /// Split a [`SegDesc`] into the kernel segment descriptor and the
        /// selector value.
        fn from(seg: SegDesc) -> Self {
            (
                bhyve_api::seg_desc {
                    base: seg.base,
                    limit: seg.limit,
                    access: seg.access,
                },
                seg.selector,
            )
        }
    }

    impl From<vdi_field_entry_v1> for MsrEntry {
        /// Keep only the identifier and value of a kernel field entry.
        fn from(field: vdi_field_entry_v1) -> Self {
            let (ident, value) = (field.vfe_ident, field.vfe_value);
            Self { ident, value }
        }
    }
    impl From<MsrEntry> for vdi_field_entry_v1 {
        fn from(entry: MsrEntry) -> Self {
            vdi_field_entry_v1::new(entry.ident, entry.value)
        }
    }

    impl From<bhyve_api::vdi_lapic_v1> for LapicV1 {
        fn from(value: bhyve_api::vdi_lapic_v1) -> Self {
            Self {
                page: value.vl_lapic.into(),
                msr_apicbase: value.vl_msr_apicbase,
                timer_target: value.vl_timer_target,
                esr_pending: value.vl_esr_pending,
            }
        }
    }
    impl From<LapicV1> for bhyve_api::vdi_lapic_v1 {
        fn from(value: LapicV1) -> Self {
            bhyve_api::vdi_lapic_v1 {
                vl_lapic: value.page.into(),
                vl_msr_apicbase: value.msr_apicbase,
                vl_timer_target: value.timer_target,
                vl_esr_pending: value.esr_pending,
            }
        }
    }
    impl From<bhyve_api::vdi_lapic_page_v1> for LapicPageV1 {
        /// Copy every `vlp_*` field of the kernel LAPIC page into the payload
        /// field of the same name.
        fn from(raw: bhyve_api::vdi_lapic_page_v1) -> Self {
            Self {
                id: raw.vlp_id,
                version: raw.vlp_version,
                tpr: raw.vlp_tpr,
                apr: raw.vlp_apr,
                ldr: raw.vlp_ldr,
                dfr: raw.vlp_dfr,
                svr: raw.vlp_svr,
                isr: raw.vlp_isr,
                tmr: raw.vlp_tmr,
                irr: raw.vlp_irr,
                esr: raw.vlp_esr,
                lvt_cmci: raw.vlp_lvt_cmci,
                icr: raw.vlp_icr,
                lvt_timer: raw.vlp_lvt_timer,
                lvt_thermal: raw.vlp_lvt_thermal,
                lvt_pcint: raw.vlp_lvt_pcint,
                lvt_lint0: raw.vlp_lvt_lint0,
                lvt_lint1: raw.vlp_lvt_lint1,
                lvt_error: raw.vlp_lvt_error,
                icr_timer: raw.vlp_icr_timer,
                dcr_timer: raw.vlp_dcr_timer,
            }
        }
    }
    impl From<LapicPageV1> for bhyve_api::vdi_lapic_page_v1 {
        fn from(value: LapicPageV1) -> Self {
            bhyve_api::vdi_lapic_page_v1 {
                vlp_id: value.id,
                vlp_version: value.version,
                vlp_tpr: value.tpr,
                vlp_apr: value.apr,
                vlp_ldr: value.ldr,
                vlp_dfr: value.dfr,
                vlp_svr: value.svr,
                vlp_isr: value.isr,
                vlp_tmr: value.tmr,
                vlp_irr: value.irr,
                vlp_esr: value.esr,
                vlp_lvt_cmci: value.lvt_cmci,
                vlp_icr: value.icr,
                vlp_lvt_timer: value.lvt_timer,
                vlp_lvt_thermal: value.lvt_thermal,
                vlp_lvt_pcint: value.lvt_pcint,
                vlp_lvt_lint0: value.lvt_lint0,
                vlp_lvt_lint1: value.lvt_lint1,
                vlp_lvt_error: value.lvt_error,
                vlp_icr_timer: value.icr_timer,
                vlp_dcr_timer: value.dcr_timer,
            }
        }
    }
    impl From<bhyve_api::vdi_pmu_amd_v1> for PmuAmdV1 {
        /// Convert the kernel AMD PMU structure into its payload form.
        fn from(raw: bhyve_api::vdi_pmu_amd_v1) -> Self {
            let (evtsel, counter) = (raw.vpa_evtsel, raw.vpa_ctr);
            Self { evtsel, counter }
        }
    }
    impl From<PmuAmdV1> for bhyve_api::vdi_pmu_amd_v1 {
        fn from(value: PmuAmdV1) -> Self {
            bhyve_api::vdi_pmu_amd_v1 {
                vpa_evtsel: value.evtsel,
                vpa_ctr: value.counter,
            }
        }
    }

    impl VcpuReadWrite for VcpuRunStateV1 {
        fn read(vcpu: &Vcpu) -> Result<Self> {
            let run_state = vcpu.get_run_state()?;

            let vmm_arch: Vec<bhyve_api::vdi_field_entry_v1> = vcpu
                .hdl
                .data_op(bhyve_api::VDC_VMM_ARCH, 1)
                .for_vcpu(vcpu.id)
                .read_all()?;

            // Load all of the pending interrupt/exception state
            //
            // If illumos#15143 support is missing, none of these fields will be
            // present, so the values will remain false/zeroed.  Such an outcome
            // is fine for now.
            let (
                mut pending_nmi,
                mut pending_extint,
                mut pending_exception,
                mut pending_intinfo,
            ) = (false, false, 0, 0);
            for ent in vmm_arch.iter() {
                match ent.vfe_ident {
                    bhyve_api::VAI_PEND_NMI => pending_nmi = ent.vfe_value != 0,
                    bhyve_api::VAI_PEND_EXTINT => {
                        pending_extint = ent.vfe_value != 0
                    }
                    bhyve_api::VAI_PEND_EXCP => {
                        pending_exception = ent.vfe_value
                    }
                    bhyve_api::VAI_PEND_INTINFO => {
                        pending_intinfo = ent.vfe_value
                    }
                    _ => {}
                }
            }
            let intr_shadow =
                vcpu.get_reg(vm_reg_name::VM_REG_GUEST_INTR_SHADOW)? != 0;

            Ok(Self {
                run_state: run_state.state,
                sipi_vector: run_state.sipi_vector,
                intr_shadow,
                pending_nmi,
                pending_extint,
                pending_exception,
                pending_intinfo,
            })
        }

        fn write(self, vcpu: &Vcpu) -> Result<()> {
            vcpu.set_run_state(self.run_state, Some(self.sipi_vector))?;
            vcpu.set_reg(
                vm_reg_name::VM_REG_GUEST_INTR_SHADOW,
                u64::from(self.intr_shadow),
            )?;

            let ents = [
                vdi_field_entry_v1::new(
                    bhyve_api::VAI_PEND_NMI,
                    u64::from(self.pending_nmi),
                ),
                vdi_field_entry_v1::new(
                    bhyve_api::VAI_PEND_EXTINT,
                    u64::from(self.pending_extint),
                ),
                vdi_field_entry_v1::new(
                    bhyve_api::VAI_PEND_EXCP,
                    self.pending_exception,
                ),
                vdi_field_entry_v1::new(
                    bhyve_api::VAI_PEND_INTINFO,
                    self.pending_intinfo,
                ),
            ];

            // Do not attempt to import interrupt/exception state unless there
            // is proper support for it on the host we are running upon.
            //
            // When hosts with illumos#15143 integrated become common, the
            // overall required version for propolis can grow to encompass V10
            // and this check can be elided.
            if bhyve_api::api_version()? >= ApiVersion::V10 {
                vcpu.hdl
                    .data_op(bhyve_api::VDC_VMM_ARCH, 1)
                    .for_vcpu(vcpu.id)
                    .write_many(&ents)?;
            }

            Ok(())
        }
    }

    impl VcpuReadWrite for VcpuGpRegsV1 {
        fn read(vcpu: &Vcpu) -> Result<Self> {
            Ok(Self {
                rax: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_RAX)?,
                rcx: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_RCX)?,
                rdx: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_RDX)?,
                rbx: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_RBX)?,
                rsp: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_RSP)?,
                rbp: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_RBP)?,
                rsi: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_RSI)?,
                rdi: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_RDI)?,
                r8: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_R8)?,
                r9: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_R9)?,
                r10: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_R10)?,
                r11: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_R11)?,
                r12: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_R12)?,
                r13: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_R13)?,
                r14: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_R14)?,
                r15: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_R15)?,
                rip: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_RIP)?,
                rflags: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_RFLAGS)?,
            })
        }

        fn write(self, vcpu: &Vcpu) -> Result<()> {
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_RAX, self.rax)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_RCX, self.rcx)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_RDX, self.rdx)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_RBX, self.rbx)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_RSP, self.rsp)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_RBP, self.rbp)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_RSI, self.rsi)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_RDI, self.rdi)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_R8, self.r8)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_R9, self.r9)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_R10, self.r10)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_R11, self.r11)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_R12, self.r12)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_R13, self.r13)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_R14, self.r14)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_R15, self.r15)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_RIP, self.rip)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_RFLAGS, self.rflags)?;
            Ok(())
        }
    }

    impl VcpuReadWrite for VcpuCtrlRegsV1 {
        fn read(vcpu: &Vcpu) -> Result<Self> {
            Ok(Self {
                cr0: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_CR0)?,
                cr2: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_CR2)?,
                cr3: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_CR3)?,
                cr4: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_CR4)?,
                efer: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_EFER)?,
                xcr0: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_XCR0)?,
            })
        }

        fn write(self, vcpu: &Vcpu) -> Result<()> {
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_CR0, self.cr0)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_CR2, self.cr2)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_CR3, self.cr3)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_CR4, self.cr4)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_EFER, self.efer)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_XCR0, self.xcr0)?;
            Ok(())
        }
    }
    impl VcpuReadWrite for VcpuDbgRegsV1 {
        fn read(vcpu: &Vcpu) -> Result<Self> {
            Ok(Self {
                dr0: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_DR0)?,
                dr1: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_DR1)?,
                dr2: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_DR2)?,
                dr3: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_DR3)?,
                dr6: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_DR6)?,
                dr7: vcpu.get_reg(vm_reg_name::VM_REG_GUEST_DR7)?,
                // TODO: populate from MSR
                debugctl: 0,
            })
        }

        fn write(self, vcpu: &Vcpu) -> Result<()> {
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_DR0, self.dr0)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_DR1, self.dr1)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_DR2, self.dr2)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_DR3, self.dr3)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_DR6, self.dr6)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_DR7, self.dr7)?;
            // TODO: set debugctl MSR
            Ok(())
        }
    }

    impl VcpuReadWrite for VcpuSegRegsV1 {
        /// Read the descriptor and selector of every segment register from
        /// `vcpu`.  For each register the descriptor is fetched first, then
        /// the selector (truncated to 16 bits).
        fn read(vcpu: &Vcpu) -> Result<Self> {
            Ok(Self {
                cs: SegDesc::from((
                    vcpu.get_segreg(vm_reg_name::VM_REG_GUEST_CS)?,
                    vcpu.get_reg(vm_reg_name::VM_REG_GUEST_CS)? as u16,
                )),
                ds: SegDesc::from((
                    vcpu.get_segreg(vm_reg_name::VM_REG_GUEST_DS)?,
                    vcpu.get_reg(vm_reg_name::VM_REG_GUEST_DS)? as u16,
                )),
                es: SegDesc::from((
                    vcpu.get_segreg(vm_reg_name::VM_REG_GUEST_ES)?,
                    vcpu.get_reg(vm_reg_name::VM_REG_GUEST_ES)? as u16,
                )),
                fs: SegDesc::from((
                    vcpu.get_segreg(vm_reg_name::VM_REG_GUEST_FS)?,
                    vcpu.get_reg(vm_reg_name::VM_REG_GUEST_FS)? as u16,
                )),
                gs: SegDesc::from((
                    vcpu.get_segreg(vm_reg_name::VM_REG_GUEST_GS)?,
                    vcpu.get_reg(vm_reg_name::VM_REG_GUEST_GS)? as u16,
                )),
                ss: SegDesc::from((
                    vcpu.get_segreg(vm_reg_name::VM_REG_GUEST_SS)?,
                    vcpu.get_reg(vm_reg_name::VM_REG_GUEST_SS)? as u16,
                )),
                gdtr: SegDesc::from((
                    vcpu.get_segreg(vm_reg_name::VM_REG_GUEST_GDTR)?,
                    // GDT has no selector register
                    0,
                )),
                idtr: SegDesc::from((
                    vcpu.get_segreg(vm_reg_name::VM_REG_GUEST_IDTR)?,
                    // IDT has no selector register
                    0,
                )),
                ldtr: SegDesc::from((
                    vcpu.get_segreg(vm_reg_name::VM_REG_GUEST_LDTR)?,
                    vcpu.get_reg(vm_reg_name::VM_REG_GUEST_LDTR)? as u16,
                )),
                tr: SegDesc::from((
                    vcpu.get_segreg(vm_reg_name::VM_REG_GUEST_TR)?,
                    vcpu.get_reg(vm_reg_name::VM_REG_GUEST_TR)? as u16,
                )),
            })
        }

        /// Load the descriptor and selector of every segment register into
        /// `vcpu`.  GDTR and IDTR only have their descriptor loaded.
        fn write(self, vcpu: &Vcpu) -> Result<()> {
            type RawSeg = (bhyve_api::seg_desc, u16);

            let (desc, sel): RawSeg = self.cs.into();
            vcpu.set_segreg(vm_reg_name::VM_REG_GUEST_CS, &desc)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_CS, u64::from(sel))?;

            let (desc, sel): RawSeg = self.ds.into();
            vcpu.set_segreg(vm_reg_name::VM_REG_GUEST_DS, &desc)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_DS, u64::from(sel))?;

            let (desc, sel): RawSeg = self.es.into();
            vcpu.set_segreg(vm_reg_name::VM_REG_GUEST_ES, &desc)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_ES, u64::from(sel))?;

            let (desc, sel): RawSeg = self.fs.into();
            vcpu.set_segreg(vm_reg_name::VM_REG_GUEST_FS, &desc)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_FS, u64::from(sel))?;

            let (desc, sel): RawSeg = self.gs.into();
            vcpu.set_segreg(vm_reg_name::VM_REG_GUEST_GS, &desc)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_GS, u64::from(sel))?;

            let (desc, sel): RawSeg = self.ss.into();
            vcpu.set_segreg(vm_reg_name::VM_REG_GUEST_SS, &desc)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_SS, u64::from(sel))?;

            // The descriptor tables have no selector to restore.
            let (desc, _): RawSeg = self.gdtr.into();
            vcpu.set_segreg(vm_reg_name::VM_REG_GUEST_GDTR, &desc)?;

            let (desc, _): RawSeg = self.idtr.into();
            vcpu.set_segreg(vm_reg_name::VM_REG_GUEST_IDTR, &desc)?;

            let (desc, sel): RawSeg = self.ldtr.into();
            vcpu.set_segreg(vm_reg_name::VM_REG_GUEST_LDTR, &desc)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_LDTR, u64::from(sel))?;

            let (desc, sel): RawSeg = self.tr.into();
            vcpu.set_segreg(vm_reg_name::VM_REG_GUEST_TR, &desc)?;
            vcpu.set_reg(vm_reg_name::VM_REG_GUEST_TR, u64::from(sel))?;

            Ok(())
        }
    }
    impl VcpuMsrsV1 {
        const fn valid_msr(ident: u32) -> bool {
            use super::bits::*;

            match ident {
                // EFER is held in CtrlRegs
                MSR_EFER => false,
                // DEBUGCTL is held in DbgRegs
                MSR_DEBUGCTL => false,

                _ => true,
            }
        }
    }

    impl VcpuReadWrite for VcpuMsrsV1 {
        /// Reads all MSR entries for `vcpu` via the `VDC_MSR` (version 1) data
        /// class, drops those rejected by `Self::valid_msr`, and returns the
        /// remainder ordered by identifier.
        fn read(vcpu: &Vcpu) -> Result<Self> {
            let kernel_entries: Vec<bhyve_api::vdi_field_entry_v1> = vcpu
                .hdl
                .data_op(bhyve_api::VDC_MSR, 1)
                .for_vcpu(vcpu.id)
                .read_all()?;

            let mut msrs: Vec<MsrEntry> = kernel_entries
                .into_iter()
                .filter(|ent| Self::valid_msr(ent.vfe_ident))
                .map(Into::into)
                .collect();

            // The ordering only exists for the benefit of anyone reading the
            // device payloads by eye.
            msrs.sort_unstable_by_key(|msr| msr.ident);

            Ok(Self(msrs))
        }

        /// Writes the contained MSR entries to `vcpu` via the `VDC_MSR`
        /// (version 1) data class.
        ///
        /// Entries rejected by `Self::valid_msr` are silently skipped rather
        /// than reported as an error.
        fn write(self, vcpu: &Vcpu) -> Result<()> {
            // Belt-and-suspenders: re-check that each provided MSR is
            // acceptable for loading into the kernel vmm.
            let kernel_entries: Vec<bhyve_api::vdi_field_entry_v1> = self
                .0
                .into_iter()
                .filter(|ent| Self::valid_msr(ent.ident))
                .map(Into::into)
                .collect();

            vcpu.hdl
                .data_op(bhyve_api::VDC_MSR, 1)
                .for_vcpu(vcpu.id)
                .write_many(&kernel_entries)?;

            Ok(())
        }
    }

    impl VcpuReadWrite for FpuStateV1 {
        /// Reads the FPU state of `vcpu` into an opaque byte blob.
        ///
        /// The required buffer size is first queried with `VM_DESC_FPU_AREA`,
        /// then a zeroed buffer of that size is filled by `VM_GET_FPU`.
        fn read(vcpu: &Vcpu) -> Result<Self> {
            let mut area_desc = bhyve_api::vm_fpu_desc::default();
            unsafe {
                vcpu.hdl
                    .ioctl(bhyve_api::VM_DESC_FPU_AREA, &mut area_desc)?;
            }

            let mut blob = vec![0u8; area_desc.vfd_req_size as usize];

            let mut req = bhyve_api::vm_fpu_state {
                vcpuid: vcpu.cpuid(),
                buf: blob.as_mut_ptr() as *mut libc::c_void,
                len: area_desc.vfd_req_size,
            };
            // NOTE(review): `buf`/`len` describe `blob`, which stays alive and
            // unmoved until after the ioctl returns.
            unsafe {
                vcpu.hdl.ioctl(bhyve_api::VM_GET_FPU, &mut req)?;
            }

            Ok(Self { blob })
        }

        /// Loads the stored FPU blob into `vcpu` with `VM_SET_FPU`.
        ///
        /// Fails with an `Other` error if the blob length does not fit in the
        /// request's length field.
        fn write(mut self, vcpu: &Vcpu) -> Result<()> {
            let blob_len = self.blob.len().try_into().map_err(|_| {
                io::Error::new(io::ErrorKind::Other, "fpu blob size too large")
            })?;

            let mut req = bhyve_api::vm_fpu_state {
                vcpuid: vcpu.cpuid(),
                buf: self.blob.as_mut_ptr() as *mut _,
                len: blob_len,
            };
            unsafe {
                vcpu.hdl.ioctl(bhyve_api::VM_SET_FPU, &mut req)?;
            }
            Ok(())
        }
    }
    impl VcpuReadWrite for LapicV1 {
        /// Reads the LAPIC state of `vcpu` via the `VDC_LAPIC` (version 1)
        /// data class.
        ///
        /// A non-zero timer target paired with a zero ICR timer value is
        /// treated as invalid: on API versions up to and including V16 the
        /// target is cleared, on anything newer the read fails with
        /// `InvalidData`.
        fn read(vcpu: &Vcpu) -> Result<Self> {
            let mut vdi = vcpu
                .hdl
                .data_op(bhyve_api::VDC_LAPIC, 1)
                .for_vcpu(vcpu.id)
                .read::<bhyve_api::vdi_lapic_v1>()?;

            // A timer target without a value in ICR is nonsensical
            let target_without_icr =
                vdi.vl_timer_target != 0 && vdi.vl_lapic.vlp_icr_timer == 0;
            if target_without_icr {
                let predates_fix =
                    vcpu.hdl.api_version()? <= ApiVersion::V16;
                if !predates_fix {
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        "post-illumos#16183 kernel emitting bad ICR timer data",
                    ));
                }
                // Kernels predating the illumos#16183 fix can emit this
                // combination; repair it here.
                vdi.vl_timer_target = 0;
            }

            Ok(vdi.into())
        }

        /// Writes this LAPIC state to `vcpu` via the `VDC_LAPIC` (version 1)
        /// data class, refusing payloads that carry a timer target but no ICR
        /// timer value.
        fn write(self, vcpu: &Vcpu) -> Result<()> {
            // Be wary of illumos#16183 payloads
            let target_without_icr =
                self.timer_target != 0 && self.page.icr_timer == 0;
            if target_without_icr {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidData,
                    "ICR-timer does not match timer target time",
                ));
            }

            vcpu.hdl
                .data_op(bhyve_api::VDC_LAPIC, 1)
                .for_vcpu(vcpu.id)
                .write::<bhyve_api::vdi_lapic_v1>(&self.into())?;

            Ok(())
        }
    }
    impl VcpuReadWrite for PmuAmdV1 {
        /// Reads the AMD PMU state of `vcpu` via the `VDC_PMU_AMD` (version 1)
        /// data class.
        fn read(vcpu: &Vcpu) -> Result<Self> {
            let vdi: bhyve_api::vdi_pmu_amd_v1 = vcpu
                .hdl
                .data_op(bhyve_api::VDC_PMU_AMD, 1)
                .for_vcpu(vcpu.id)
                .read()?;

            Ok(vdi.into())
        }

        /// Writes this AMD PMU state to `vcpu` via the `VDC_PMU_AMD`
        /// (version 1) data class.
        fn write(self, vcpu: &Vcpu) -> Result<()> {
            let vdi: bhyve_api::vdi_pmu_amd_v1 = self.into();

            vcpu.hdl
                .data_op(bhyve_api::VDC_PMU_AMD, 1)
                .for_vcpu(vcpu.id)
                .write(&vdi)?;

            Ok(())
        }
    }
}

/// Raw numeric constants referenced by the vCPU code in this file.
mod bits {
    /// Identifier of the DEBUGCTL MSR.
    pub const MSR_DEBUGCTL: u32 = 0x1d9;
    /// Identifier of the EFER MSR.
    pub const MSR_EFER: u32 = 0xc0000080;

    /// Presumably the IDT vector number of the general-protection fault (#GP)
    /// -- confirm against its users.
    pub const IDT_GP: u8 = 0xd;
}

/// Pretty-printable diagnostic information about the state of a vCPU.
///
/// Each register group is stored as a `Result`, so a failure to read one
/// group is retained alongside whatever other groups were read.
pub struct Diagnostics {
    /// General-purpose registers, or the error hit while reading them.
    gp_regs: Result<GuestData<migrate::VcpuGpRegsV1>>,
    /// Segment registers, or the error hit while reading them.
    seg_regs: Result<GuestData<migrate::VcpuSegRegsV1>>,
    /// Control registers, or the error hit while reading them.
    ctrl_regs: Result<GuestData<migrate::VcpuCtrlRegsV1>>,
}

impl Diagnostics {
    /// Snapshots the general-purpose, segment, and control registers of
    /// `vcpu`, in that order.
    ///
    /// This never fails as a whole: each group's read result (success or
    /// error) is stored in the returned value.
    pub fn capture(vcpu: &Vcpu) -> Self {
        let gp_regs = migrate::VcpuGpRegsV1::read(vcpu).map(GuestData::from);
        let seg_regs = migrate::VcpuSegRegsV1::read(vcpu).map(GuestData::from);
        let ctrl_regs =
            migrate::VcpuCtrlRegsV1::read(vcpu).map(GuestData::from);

        Self { gp_regs, seg_regs, ctrl_regs }
    }
}

impl std::fmt::Display for migrate::VcpuGpRegsV1 {
    /// Prints the general-purpose registers as zero-padded hex: a two-column
    /// table for %rax..%r15, a blank line, then %rip, %rbp, %rsp and %rflags
    /// one per line.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            rax,
            rbx,
            rcx,
            rdx,
            rsi,
            rdi,
            r8,
            r9,
            r10,
            r11,
            r12,
            r13,
            r14,
            r15,
            rip,
            rbp,
            rsp,
            rflags,
            ..
        } = self;

        writeln!(f, "%rax = {rax:#018x}\t%r9  = {r9:#018x}")?;
        writeln!(f, "%rbx = {rbx:#018x}\t%r10 = {r10:#018x}")?;
        writeln!(f, "%rcx = {rcx:#018x}\t%r11 = {r11:#018x}")?;
        writeln!(f, "%rdx = {rdx:#018x}\t%r12 = {r12:#018x}")?;
        writeln!(f, "%rsi = {rsi:#018x}\t%r13 = {r13:#018x}")?;
        writeln!(f, "%rdi = {rdi:#018x}\t%r14 = {r14:#018x}")?;
        writeln!(f, "%r8  = {r8:#018x}\t%r15 = {r15:#018x}")?;
        writeln!(f)?;
        writeln!(f, "%rip = {rip:#018x}")?;
        writeln!(f, "%rbp = {rbp:#018x}")?;
        writeln!(f, "%rsp = {rsp:#018x}")?;
        writeln!(f, "%rflags = {rflags:#018x}")
    }
}

impl std::fmt::Display for migrate::SegDesc {
    /// Prints the selector, base, limit and access fields on a single line
    /// (no trailing newline), tab-separated and in zero-padded hex.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { selector, base, limit, access, .. } = self;
        write!(
            f,
            "sel = {selector:#06x}\tbase = {base:#018x}\tlimit = {limit:#010x}\taccess = {access:#010x}"
        )
    }
}

impl std::fmt::Display for migrate::VcpuSegRegsV1 {
    /// Prints one labelled line per segment/descriptor-table register, in the
    /// order cs, ds, es, fs, gs, ss, gdtr, idtr, ldtr, tr.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Labels are padded to a common width so the descriptors line up.
        let rows: [(&str, &dyn std::fmt::Display); 10] = [
            ("%cs:  ", &self.cs),
            ("%ds:  ", &self.ds),
            ("%es:  ", &self.es),
            ("%fs:  ", &self.fs),
            ("%gs:  ", &self.gs),
            ("%ss:  ", &self.ss),
            ("%gdtr:", &self.gdtr),
            ("%idtr:", &self.idtr),
            ("%ldtr:", &self.ldtr),
            ("%tr:  ", &self.tr),
        ];

        for (label, desc) in rows {
            writeln!(f, "{label} {desc}")?;
        }
        Ok(())
    }
}

impl std::fmt::Display for migrate::VcpuCtrlRegsV1 {
    /// Prints %cr0/%cr2, %cr3/%cr4 and %xcr0/%efer as three two-column lines
    /// of zero-padded hex.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self { cr0, cr2, cr3, cr4, xcr0, efer, .. } = self;

        writeln!(f, "%cr0  = {cr0:#018x}\t%cr2  = {cr2:#018x}")?;
        writeln!(f, "%cr3  = {cr3:#018x}\t%cr4  = {cr4:#018x}")?;
        writeln!(f, "%xcr0 = {xcr0:#018x}\t%efer = {efer:#018x}")
    }
}

impl std::fmt::Display for Diagnostics {
    /// Prints a leading blank line followed by the general-purpose, segment
    /// and control register groups.
    ///
    /// A group that could not be read is replaced by a one-line message
    /// carrying the read error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f)?;

        let gp = match self.gp_regs.as_ref() {
            Ok(regs) => regs.to_string(),
            Err(e) => format!("error reading general-purpose registers: {e}"),
        };
        writeln!(f, "{gp}")?;

        let seg = match self.seg_regs.as_ref() {
            Ok(regs) => regs.to_string(),
            Err(e) => format!("error reading segment registers: {e}"),
        };
        writeln!(f, "{seg}")?;

        let ctrl = match self.ctrl_regs.as_ref() {
            Ok(regs) => regs.to_string(),
            Err(e) => format!("error reading control registers: {e}"),
        };
        writeln!(f, "{ctrl}")
    }
}


================================================
FILE: lib/propolis/src/vmm/hdl.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Module responsible for communicating with the kernel's VMM.
//!
//! Responsible for issuing commands to the bhyve
//! kernel controller to create and destroy VMs.
//!
//! Additionally, contains a wrapper struct ([`VmmHdl`])
//! for encapsulating commands to the underlying kernel
//! object which represents a single VM.

use std::fs::File;
use std::io::{Error, ErrorKind, Result, Write};
use std::os::raw::c_void;
use std::os::unix::io::{AsRawFd, RawFd};
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;

use crate::common::PAGE_SIZE;
use crate::vmm::mem::Prot;

/// Configurable options for VMM instance creation
///
/// # Options:
/// - `force`: If a VM with the name `name` already exists, attempt
///   to destroy the VM before creating it.
/// - `use_reservoir`: Allocate guest memory (only) from the VMM reservoir.  If
///   this is enabled, and memory in excess of what is available from the
///   reservoir is requested, creation of that guest memory resource will fail.
/// - `track_dirty`: Request the `VCF_TRACK_DIRTY` flag when the VM is created.
#[derive(Default, Copy, Clone)]
pub struct CreateOpts {
    pub force: bool,
    pub use_reservoir: bool,
    pub track_dirty: bool,
}

/// Creates a new virtual machine with the provided `name`.
///
/// Operates on the bhyve controller object at `/dev/vmmctl`,
/// which acts as an interface to the kernel module, and opens
/// an object at `/dev/vmm/{name}`.
///
/// If creation fails because the name already exists and `opts.force` is set,
/// the existing VM is destroyed (a `NotFound` result from that destroy is
/// tolerated) and creation is attempted exactly once more.
///
/// # Arguments
/// - `name`: The name of the VM to create.
/// - `opts`: Creation options (detailed in `CreateOpts`)
pub(crate) fn create_vm(name: &str, opts: CreateOpts) -> Result<VmmHdl> {
    let ctl = bhyve_api::VmmCtlFd::open()?;

    let mut req = bhyve_api::vm_create_req::new(name.as_bytes())?;
    if opts.use_reservoir {
        req.flags |= bhyve_api::VCF_RESERVOIR_MEM;
    }
    if opts.track_dirty {
        req.flags |= bhyve_api::VCF_TRACK_DIRTY;
    }

    let first_try = unsafe { ctl.ioctl(bhyve_api::VMM_CREATE_VM, &mut req) };
    if let Err(create_err) = first_try {
        let name_taken = create_err.kind() == ErrorKind::AlreadyExists;
        if !(name_taken && opts.force) {
            return Err(create_err);
        }

        // try to nuke(!) the existing vm; it having vanished in the meantime
        // is fine
        match ctl.vm_destroy(name.as_bytes()) {
            Err(e) if e.kind() != ErrorKind::NotFound => return Err(e),
            _ => {}
        }

        // now attempt to create in its presumed absence
        let _ = unsafe { ctl.ioctl(bhyve_api::VMM_CREATE_VM, &mut req) }?;
    }

    let inner = bhyve_api::VmmFd::open(name)?;

    Ok(VmmHdl {
        inner,
        destroyed: AtomicBool::new(false),
        name: name.to_string(),
        #[cfg(test)]
        is_test_hdl: false,
    })
}

/// A wrapper around a file which must uphold the guarantee that the underlying
/// structure may not be truncated.
///
/// The guarantee is not checked here; it is the obligation placed on callers
/// of the `unsafe` constructor [`VmmFile::new`].
pub struct VmmFile(File);

impl VmmFile {
    /// Constructs a new `VmmFile`.
    ///
    /// # Safety
    ///
    /// The caller must guarantee that the provided file cannot be truncated.
    pub unsafe fn new(f: File) -> Self {
        VmmFile(f)
    }

    /// Accesses the VMM as a raw fd.
    pub fn fd(&self) -> RawFd {
        self.0.as_raw_fd()
    }
}

/// A handle to an existing virtual machine monitor.
pub struct VmmHdl {
    /// Underlying VMM file descriptor wrapper that ioctls are issued through.
    pub(super) inner: bhyve_api::VmmFd,
    /// Set once [`VmmHdl::destroy`] has been called; the ioctl helpers refuse
    /// to operate afterwards.
    destroyed: AtomicBool,
    /// Name the VM was created under, used when destroying it via vmmctl.
    name: String,

    #[cfg(test)]
    /// Track if this VmmHdl belongs to a wholly fictitious Instance/Machine.
    is_test_hdl: bool,
}
impl VmmHdl {
    /// Accesses the raw file descriptor behind the VMM.
    ///
    /// The descriptor is borrowed from `self.inner`; this call does not
    /// duplicate it.
    pub fn fd(&self) -> RawFd {
        self.inner.as_raw_fd()
    }
    /// Sends an ioctl to the underlying VMM.
    ///
    /// Returns a `NotFound` error without touching the kernel if this handle
    /// has already been destroyed.  In test builds, handles created for unit
    /// tests report success without issuing anything.
    ///
    /// # Safety
    ///
    /// NOTE(review): presumably `data` must be valid for whatever access the
    /// kernel performs for `cmd` -- confirm against `bhyve_api::VmmFd::ioctl`.
    pub unsafe fn ioctl<T>(&self, cmd: i32, data: *mut T) -> Result<()> {
        if self.destroyed.load(Ordering::Acquire) {
            return Err(Error::new(ErrorKind::NotFound, "instance destroyed"));
        }

        #[cfg(test)]
        if self.is_test_hdl {
            // Lie about all ioctl results, since there is no real vmm resource
            // underlying this handle.
            return Ok(());
        }

        // Any value returned by the ioctl on success is discarded.
        self.inner.ioctl(cmd, data)?;
        Ok(())
    }

    /// Sends an ioctl (with usize param) to the underlying VMM.
    ///
    /// Returns a `NotFound` error without touching the kernel if this handle
    /// has already been destroyed.  In test builds, handles created for unit
    /// tests report success without issuing anything.
    pub fn ioctl_usize(&self, cmd: i32, data: usize) -> Result<()> {
        if self.destroyed.load(Ordering::Acquire) {
            return Err(Error::new(ErrorKind::NotFound, "instance destroyed"));
        }

        #[cfg(test)]
        if self.is_test_hdl {
            // Lie about all ioctl results, since there is no real vmm resource
            // underlying this handle.
            return Ok(());
        }

        // Any value returned by the ioctl on success is discarded.
        self.inner.ioctl_usize(cmd, data)?;
        Ok(())
    }

    /// Query the API version exposed by the kernel VMM.
    ///
    /// This delegates directly to `self.inner` and, unlike the ioctl helpers
    /// on this type, does not consult the `destroyed` flag.
    pub fn api_version(&self) -> Result<u32> {
        self.inner.api_version()
    }

    /// Allocate a memory segment within the VM.
    ///
    /// # Arguments
    /// - `segid`: The segment ID of the requested memory.
    /// - `size`: The size of the memory region, in bytes.
    /// - `segname`: The (optional) name of the memory segment.
    ///
    /// # Panics
    ///
    /// Panics if `segname` is not strictly shorter (in bytes) than
    /// `VM_MAX_SEG_NAMELEN`.
    pub fn create_memseg(
        &self,
        segid: i32,
        size: usize,
        segname: Option<&str>,
    ) -> Result<()> {
        // The name buffer starts zeroed; the length check below guarantees at
        // least one zero byte remains after the copied name.
        let mut name = [0u8; bhyve_api::VM_MAX_SEG_NAMELEN];
        if let Some(segname) = segname {
            let raw = segname.as_bytes();

            assert!(raw.len() < bhyve_api::VM_MAX_SEG_NAMELEN);
            (&mut name[..]).write_all(raw)?;
        }

        let mut req = bhyve_api::vm_memseg { segid, len: size, name };
        unsafe { self.ioctl(bhyve_api::VM_ALLOC_MEMSEG, &mut req) }
    }

    /// Maps a memory segment within the guest address space.
    ///
    /// # Arguments
    /// - `segid`: The segment ID to be mapped.
    /// - `gpa`: The "Guest Physical Address" to be mapped.
    /// - `len`: The length of the mapping, in bytes. Must be page aligned.
    /// - `segoff`: Offset at which the mapping should begin; it is passed to
    ///   the kernel as the `segoff` of the request (presumably an offset into
    ///   the segment rather than into `gpa` -- confirm). Must be page aligned.
    /// - `prot`: Memory protections to apply to the guest mapping.
    ///
    /// # Panics
    ///
    /// Panics if `segoff` exceeds `i64::MAX`.
    pub fn map_memseg(
        &self,
        segid: i32,
        gpa: usize,
        len: usize,
        segoff: usize,
        prot: Prot,
    ) -> Result<()> {
        // The request carries the offset as a signed 64-bit value.
        assert!(segoff <= i64::MAX as usize);
        let segoff = segoff as i64;
        let gpa = gpa as u64;
        let prot = i32::from(prot.bits());

        let mut req =
            bhyve_api::vm_memmap { gpa, segid, segoff, len, prot, flags: 0 };
        unsafe { self.ioctl(bhyve_api::VM_MMAP_MEMSEG, &mut req) }
    }

    /// Looks up a segment by `segid` and returns the offset reported for it
    /// by the `VM_DEVMEM_GETOFFSET` ioctl.
    ///
    /// # Panics
    ///
    /// Panics if the kernel reports a negative offset.
    pub fn devmem_offset(&self, segid: i32) -> Result<usize> {
        let mut query = bhyve_api::vm_devmem_offset { segid, offset: 0 };
        unsafe {
            self.ioctl(bhyve_api::VM_DEVMEM_GETOFFSET, &mut query)?;
        }

        let offset = query.offset;
        assert!(offset >= 0);
        Ok(offset as usize)
    }

    /// Tracks dirty pages in the guest's physical address space, clearing any
    /// dirty bits set on pages in the tracked range.
    ///
    /// The length of the tracked range is derived from the size of `bitmap`:
    /// each bit accounts for one page.
    ///
    /// # Arguments:
    /// - `start_gpa`: The start of the guest physical address range to track.
    ///   Must be page aligned.
    /// - `bitmap`: A mutable bitmap of dirty pages, one bit per guest PFN
    ///   relative to `start_gpa`.
    pub fn track_dirty_pages(
        &self,
        start_gpa: u64,
        bitmap: &mut [u8],
    ) -> Result<()> {
        let tracked_len = page_bitmap_len(bitmap);
        let mut req = bhyve_api::vmm_dirty_tracker {
            vdt_start_gpa: start_gpa,
            vdt_len: tracked_len,
            vdt_pfns: bitmap.as_mut_ptr() as *mut c_void,
        };
        unsafe { self.ioctl(bhyve_api::VM_TRACK_DIRTY_PAGES, &mut req) }
    }

    /// Set the dirty bits on pages in the guest's physical address space.
    ///
    /// This method takes a bitmap in which each bit represents a page. For each
    /// bit that's set in the bitmap, the corresponding guest page will have its
    /// dirty bit set.
    ///
    /// # Arguments:
    /// - `start_gpa`: The start of the guest physical address range to track.
    ///   Must be page aligned.
    /// - `bitmap`: A bitmap indicating which pages to set dirty bits for, one
    ///   bit per guest PFN relative to `start_gpa`.
    ///
    /// # Supported Bhyve Versions
    ///
    /// This method is only available on bhyve [v17](bhyve_api::ApiVersion::V17)
    /// and later. If the bhyve API version is older than that, this method will
    /// return an `Unsupported` error. The [`VmmHdl::can_npt_operate`] method
    /// returns `true` if this method is supported.
    pub fn set_dirty_pages(&self, start_gpa: u64, bitmap: &[u8]) -> Result<()> {
        let supported = self.can_npt_operate();
        if !supported {
            return Err(Error::new(
                ErrorKind::Unsupported,
                "VmmHdl::set_dirty_pages requires bhyve v17 or later",
            ));
        }

        let covered_len = page_bitmap_len(bitmap) as u64;
        let operation =
            bhyve_api::VNO_OP_SET_DIRTY | bhyve_api::VNO_FLAG_BITMAP_IN;

        let mut req = bhyve_api::vm_npt_operation {
            vno_gpa: start_gpa,
            vno_len: covered_len,
            vno_operation: operation,
            // The request field is a mutable pointer even though the bitmap
            // is flagged as an input (`VNO_FLAG_BITMAP_IN`).
            vno_bitmap: bitmap.as_ptr() as *mut _,
        };
        unsafe { self.ioctl(bhyve_api::VM_NPT_OPERATION, &mut req) }
    }

    /// Returns `true` if the current bhyve version supports the
    /// `VM_NPT_OPERATION` ioctl, used by the [`VmmHdl::set_dirty_pages`]
    /// method.
    ///
    /// If the API version cannot be read, the operation is assumed to be
    /// unsupported and `false` is returned.
    pub fn can_npt_operate(&self) -> bool {
        matches!(
            self.api_version(),
            Ok(version) if version >= bhyve_api::ApiVersion::V17
        )
    }

    /// Issues a request to update the virtual RTC time.
    ///
    /// This delegates directly to `self.inner` and, unlike the ioctl helpers
    /// on this type, does not consult the `destroyed` flag.
    pub fn rtc_settime(&self, time: Duration) -> Result<()> {
        self.inner.rtc_settime(time)
    }
    /// Writes `value` to the register at `offset` within the RTC device.
    pub fn rtc_write(&self, offset: u8, value: u8) -> Result<()> {
        let offset = i32::from(offset);
        let mut req = bhyve_api::vm_rtc_data { offset, value };
        unsafe { self.ioctl(bhyve_api::VM_RTC_WRITE, &mut req) }
    }
    /// Reads the register at `offset` within the RTC device and returns its
    /// value.
    pub fn rtc_read(&self, offset: u8) -> Result<u8> {
        let offset = i32::from(offset);
        // `value` is filled in by the ioctl.
        let mut req = bhyve_api::vm_rtc_data { offset, value: 0 };
        unsafe {
            self.ioctl(bhyve_api::VM_RTC_READ, &mut req)?;
        }
        Ok(req.value)
    }

    /// Asserts the requested IRQ for the virtual interrupt controller.
    ///
    /// `pic_irq` sends a request to the legacy 8259 PIC.
    /// `ioapic_irq` (if supplied) sends a request to the IOAPIC; when it is
    /// `None`, `-1` is passed in its place.
    pub fn isa_assert_irq(
        &self,
        pic_irq: u8,
        ioapic_irq: Option<u8>,
    ) -> Result<()> {
        let mut req = bhyve_api::vm_isa_irq {
            atpic_irq: pic_irq.into(),
            ioapic_irq: ioapic_irq.map_or(-1, i32::from),
        };
        unsafe { self.ioctl(bhyve_api::VM_ISA_ASSERT_IRQ, &mut req) }
    }
    /// Deasserts the requested IRQ.
    ///
    /// `pic_irq` addresses the PIC; `ioapic_irq` (if supplied) addresses the
    /// IOAPIC, with `-1` passed in its place when it is `None`.
    pub fn isa_deassert_irq(
        &self,
        pic_irq: u8,
        ioapic_irq: Option<u8>,
    ) -> Result<()> {
        let mut req = bhyve_api::vm_isa_irq {
            atpic_irq: pic_irq.into(),
            ioapic_irq: ioapic_irq.map_or(-1, i32::from),
        };
        unsafe { self.ioctl(bhyve_api::VM_ISA_DEASSERT_IRQ, &mut req) }
    }
    /// Pulses the requested IRQ, turning it on then off.
    ///
    /// `pic_irq` addresses the PIC; `ioapic_irq` (if supplied) addresses the
    /// IOAPIC, with `-1` passed in its place when it is `None`.
    pub fn isa_pulse_irq(
        &self,
        pic_irq: u8,
        ioapic_irq: Option<u8>,
    ) -> Result<()> {
        let mut req = bhyve_api::vm_isa_irq {
            atpic_irq: pic_irq.into(),
            ioapic_irq: ioapic_irq.map_or(-1, i32::from),
        };
        unsafe { self.ioctl(bhyve_api::VM_ISA_PULSE_IRQ, &mut req) }
    }
    /// Sets the trigger mode of the PIC IRQ `vec`: `1` is passed for
    /// `level_mode == true`, `0` otherwise.
    #[allow(unused)]
    pub fn isa_set_trigger_mode(
        &self,
        vec: u8,
        level_mode: bool,
    ) -> Result<()> {
        let trigger = match level_mode {
            true => 1,
            false => 0,
        };
        let mut req = bhyve_api::vm_isa_irq_trigger {
            atpic_irq: i32::from(vec),
            trigger,
        };
        unsafe { self.ioctl(bhyve_api::VM_ISA_SET_IRQ_TRIGGER, &mut req) }
    }

    /// Asserts IOAPIC pin `irq`.
    #[allow(unused)]
    pub fn ioapic_assert_irq(&self, irq: u8) -> Result<()> {
        let irq = i32::from(irq);
        let mut req = bhyve_api::vm_ioapic_irq { irq };
        unsafe { self.ioctl(bhyve_api::VM_IOAPIC_ASSERT_IRQ, &mut req) }
    }
    /// Deasserts IOAPIC pin `irq`.
    #[allow(unused)]
    pub fn ioapic_deassert_irq(&self, irq: u8) -> Result<()> {
        let irq = i32::from(irq);
        let mut req = bhyve_api::vm_ioapic_irq { irq };
        unsafe { self.ioctl(bhyve_api::VM_IOAPIC_DEASSERT_IRQ, &mut req) }
    }
    /// Pulses IOAPIC pin `irq`.
    #[allow(unused)]
    pub fn ioapic_pulse_irq(&self, irq: u8) -> Result<()> {
        let irq = i32::from(irq);
        let mut req = bhyve_api::vm_ioapic_irq { irq };
        unsafe { self.ioctl(bhyve_api::VM_IOAPIC_PULSE_IRQ, &mut req) }
    }
    #[allow(unused)]
    pub fn ioapic_pin_count(&self) -> Result<u8> {
        let mut data = 0u32;
        unsafe {
            self.ioctl(bhyve_api::VM_IOAPIC_PINCOUNT, &mut data)?;
        }
        Ok(data as u8)
    }

    /// Issues a `VM_LAPIC_MSI` request carrying the given MSI address and
    /// message.
    pub fn lapic_msi(&self, addr: u64, msg: u64) -> Result<()> {
        let mut req = bhyve_api::vm_lapic_msi { msg, addr };
        unsafe { self.ioctl(bhyve_api::VM_LAPIC_MSI, &mut req) }
    }

    /// Issues `VM_PMTMR_LOCATE` with `port` as the ioctl argument.
    ///
    /// The port is passed by value, so this goes through the safe
    /// [`VmmHdl::ioctl_usize`] helper instead of casting the integer to a
    /// pointer for the `unsafe` pointer-taking variant.
    pub fn pmtmr_locate(&self, port: u16) -> Result<()> {
        self.ioctl_usize(bhyve_api::VM_PMTMR_LOCATE, usize::from(port))
    }

    /// Issues a `VM_SUSPEND` request of kind `how`.
    ///
    /// `source` is forwarded as given; when it is `None`, `-1` is passed in
    /// its place.
    pub fn suspend(
        &self,
        how: bhyve_api::vm_suspend_how,
        source: Option<i32>,
    ) -> Result<()> {
        let source = source.unwrap_or(-1);
        let mut req = bhyve_api::vm_suspend { how: how as u32, source };
        unsafe { self.ioctl(bhyve_api::VM_SUSPEND, &mut req) }
    }

    /// Issues a `VM_REINIT` request, setting `VM_REINIT_F_FORCE_SUSPEND` in
    /// its flags when `force_suspend` is true.
    pub fn reinit(&self, force_suspend: bool) -> Result<()> {
        let flags = if force_suspend {
            bhyve_api::VM_REINIT_F_FORCE_SUSPEND
        } else {
            0
        };
        let mut req = bhyve_api::vm_reinit { flags };
        unsafe { self.ioctl(bhyve_api::VM_REINIT, &mut req) }
    }

    /// Pause device emulation logic for the instance (such as timers, etc).
    /// This allows a consistent snapshot to be taken or loaded.
    ///
    /// Issued as the `VM_PAUSE` ioctl with an argument of 0.
    pub fn pause(&self) -> Result<()> {
        self.ioctl_usize(bhyve_api::VM_PAUSE, 0)
    }

    /// Resume device emulation logic from a prior [VmmHdl::pause] call.
    ///
    /// Issued as the `VM_RESUME` ioctl with an argument of 0.
    pub fn resume(&self) -> Result<()> {
        self.ioctl_usize(bhyve_api::VM_RESUME, 0)
    }

    /// Destroys the VMM.
    ///
    /// The handle is marked destroyed first, so a second call fails with
    /// `NotFound`.  Destruction is attempted through the handle itself
    /// (`VM_DESTROY_SELF`) and, if that fails, through the vmmctl device by
    /// name; a `NotFound` result from the latter counts as success.
    // TODO: Should this take "mut self", to consume the object?
    pub fn destroy(&self) -> Result<()> {
        let already_destroyed = self.destroyed.swap(true, Ordering::SeqCst);
        if already_destroyed {
            return Err(Error::new(ErrorKind::NotFound, "already destroyed"));
        }

        // Attempt destruction via the handle (rather than going through
        // vmmctl).  This calls `ioctl_usize` on the inner fd directly rather
        // than [Self::ioctl_usize], since the latter rejects attempted
        // operations after `destroyed` is set.
        let self_destroyed =
            self.inner.ioctl_usize(bhyve_api::VM_DESTROY_SELF, 0).is_ok();
        if self_destroyed {
            return Ok(());
        }

        // If that failed (which may occur on older platforms without
        // self-destruction), then fall back to performing the destroy through
        // the vmmctl device.
        let ctl = bhyve_api::VmmCtlFd::open()?;
        match ctl.vm_destroy(self.name.as_bytes()) {
            Err(e) if e.kind() != ErrorKind::NotFound => Err(e),
            _ => Ok(()),
        }
    }

    /// Set whether instance should auto-destruct when closed
    ///
    /// Issued as the `VM_SET_AUTODESTRUCT` ioctl, with the flag passed as 1
    /// (enable) or 0 (disable).
    pub fn set_autodestruct(&self, enable_autodestruct: bool) -> Result<()> {
        self.ioctl_usize(
            bhyve_api::VM_SET_AUTODESTRUCT,
            usize::from(enable_autodestruct),
        )
    }

    /// Starts a VMM data operation for the given data `class` and `version`.
    ///
    /// This delegates directly to `self.inner` and, unlike the ioctl helpers
    /// on this type, does not consult the `destroyed` flag.
    pub fn data_op(
        &self,
        class: u16,
        version: u16,
    ) -> bhyve_api::VmmDataOp<'_> {
        self.inner.data_op(class, version)
    }
}

#[cfg(test)]
impl VmmHdl {
    /// Build a VmmHdl instance suitable for unit tests, but nothing else, since
    /// it will not be backed by any real vmm resources.
    ///
    /// The handle wraps an anonymous temporary file sized to `mem_size`
    /// bytes.
    ///
    /// # Errors
    ///
    /// Returns any error from creating or sizing the temporary file.
    pub(crate) fn new_test(mem_size: usize) -> Result<Self> {
        use tempfile::tempfile;
        let fp = tempfile()?;
        // Propagate sizing failures like any other I/O error here, rather
        // than panicking inside a fallible constructor.
        fp.set_len(mem_size as u64)?;
        let inner = unsafe { bhyve_api::VmmFd::new_raw(fp) };
        Ok(Self {
            inner,
            destroyed: AtomicBool::new(false),
            name: "TEST-ONLY VMM INSTANCE".to_string(),
            is_test_hdl: true,
        })
    }
}

/// Queries the VMM reservoir through the vmmctl device
/// (`VMM_RESV_QUERY`) and returns the filled-in result.
pub fn query_reservoir() -> Result<bhyve_api::vmm_resv_query> {
    let mut query = bhyve_api::vmm_resv_query::default();
    let ctl = bhyve_api::VmmCtlFd::open()?;
    let _ = unsafe { ctl.ioctl(bhyve_api::VMM_RESV_QUERY, &mut query) }?;
    Ok(query)
}

/// Returns the number of bytes of guest memory covered by `bitmap`, where
/// every bit stands for one page of `PAGE_SIZE` bytes.
fn page_bitmap_len(bitmap: &[u8]) -> usize {
    const BITS_PER_BYTE: usize = 8;

    let pages = bitmap.len() * BITS_PER_BYTE;
    pages * PAGE_SIZE
}


================================================
FILE: lib/propolis/src/vmm/machine.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Representation of a virtual machine's hardware.

use std::io::{Error, ErrorKind, Result};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

use crate::accessors::*;
use crate::enlightenment::bhyve::BhyveGuestInterface;
use crate::enlightenment::Enlightenment;
use crate::mmio::MmioBus;
use crate::pio::PioBus;
use crate::vcpu::{Vcpu, MAXCPU};
use crate::vmm::{create_vm, CreateOpts, PhysMap, VmmHdl};

/// Arbitrary limit for the top of the physical memory map.
///
/// For now it corresponds to the top address described by the DSDT in the
/// static tables of the "blessed" OVMF ROM that ships with propolis.
/// That DSDT entry describes the 64-bit MMIO region.
///
/// When MMIO and the physmap in general are made more robust, this should be
/// eliminated completely.
pub const MAX_PHYSMEM: usize = 0x1000_0000_0000;

/// The aggregate representation of a virtual machine.
pub struct Machine {
    /// Handle to the kernel VMM instance backing this machine.
    pub hdl: Arc<VmmHdl>,
    /// The machine's vCPUs; NMIs are injected via the first entry.
    pub vcpus: Vec<Arc<Vcpu>>,

    /// Guest physical memory map.
    pub map_physmem: PhysMap,
    /// Bus on which MMIO handlers are registered.
    pub bus_mmio: Arc<MmioBus>,
    /// Bus on which port-IO handlers are registered.
    pub bus_pio: Arc<PioBus>,
    /// Hypervisor interface (enlightenment) presented to the guest.
    pub guest_hv_interface: Arc<dyn Enlightenment>,

    /// Root accessor for guest memory; vacated when the machine is destroyed.
    pub acc_mem: MemAccessor,
    /// Root accessor for MSI delivery; vacated when the machine is destroyed.
    pub acc_msi: MsiAccessor,

    // Track if machine has been destroyed prior to drop
    destroyed: AtomicBool,
}

impl Machine {
    /// Reinitializes the kernel VMM instance (forcing a suspend) and then
    /// lets the physical memory map perform its post-reinit work.
    pub fn reinitialize(&self) -> Result<()> {
        self.hdl.reinit(true)?;
        self.map_physmem.post_reinit()?;
        Ok(())
    }

    /// (Re)Initialize vCPUs per x86 spec
    ///
    /// Every vCPU is activated and put into its reboot state.  The BSP is
    /// additionally set running with %rip at `0xfff0`.
    pub fn vcpu_x86_setup(&self) -> Result<()> {
        for vcpu in &self.vcpus {
            vcpu.activate()?;
            vcpu.reboot_state()?;

            if !vcpu.is_bsp() {
                continue;
            }
            vcpu.set_run_state(bhyve_api::VRS_RUN, None)?;
            vcpu.set_reg(bhyve_api::vm_reg_name::VM_REG_GUEST_RIP, 0xfff0)?;
        }
        Ok(())
    }

    /// Destroy the `Machine` and its associated resources.  Returns the
    /// underlying [VmmHdl] for the caller to do further
    /// cleanup, such as destroy the kernel VMM instance.
    ///
    /// # Panics
    ///
    /// Panics if the machine was already destroyed.
    pub fn destroy(mut self) -> Arc<VmmHdl> {
        self.do_destroy().expect("machine not already destroyed")
    }

    /// Tears down the machine's resources at most once.
    ///
    /// Returns the VMM handle on the first call and `None` on every later
    /// one.
    fn do_destroy(&mut self) -> Option<Arc<VmmHdl>> {
        let already_destroyed = self.destroyed.swap(true, Ordering::Relaxed);
        if already_destroyed {
            return None;
        }

        // Remove the accessor roots so they may not be used further.
        self.acc_mem.remove_resource().expect("memory accessor not vacated");
        self.acc_msi.remove_resource().expect("MSI accessor not vacated");

        // Clear out registrations in the PIO/MMIO buses to reduce the
        // chances that they perpetuate a cyclic reference.
        self.bus_pio.clear();
        self.bus_mmio.clear();

        // Clear all of the entries from the physmem map so their associated
        // mappings in the process address space are munmapped.  This relies
        // on acc_mem being removed already in order to succeed.
        // TODO: more verification
        self.map_physmem.destroy();

        Some(Arc::clone(&self.hdl))
    }

    /// Injects an NMI into the machine.
    pub fn inject_nmi(&self) -> Result<()> {
        // When the Machine is created, we're guaranteed at least one vcpu, so
        // just send the NMI to the first one.
        let first_vcpu = &self.vcpus[0];
        first_vcpu.inject_nmi()
    }
}
impl Drop for Machine {
    /// Tears the machine down if that has not happened yet, and in that case
    /// also destroys the kernel VMM instance.
    fn drop(&mut self) {
        // If consumer has not opted to own the fate of the contained VmmHdl
        // through an explicit call to Machine::destroy(), then clean up the
        // kernel VMM resource backing it automatically.
        if let Some(hdl) = self.do_destroy() {
            // Errors cannot be reported from drop, so they are discarded.
            let _ = hdl.destroy();
        }
    }
}

#[cfg(test)]
impl Machine {
    /// Builds a `Machine` for unit tests on top of a test-only [`VmmHdl`].
    ///
    /// The machine has a single vCPU, a 1M test ROM region at address 0 and
    /// a 1M test RAM region directly after it.
    pub(crate) fn new_test() -> Result<Self> {
        // Create a test handle with 2M tempfile to use as our VM "memory"
        let hdl = Arc::new(VmmHdl::new_test(2 * 1024 * 1024)?);

        let mut map = PhysMap::new(MAX_PHYSMEM, hdl.clone());
        // TODO: meaningfully populate these
        map.add_test_rom("test-rom".to_string(), 0, 1024 * 1024)?;
        map.add_test_mem("test-ram".to_string(), 1024 * 1024, 1024 * 1024)?;

        let bus_mmio = Arc::new(MmioBus::new(MAX_PHYSMEM));
        let bus_pio = Arc::new(PioBus::new());
        let guest_hv_interface = Arc::new(BhyveGuestInterface);

        // A single vCPU with ID 0
        let vcpus = vec![Vcpu::new(
            hdl.clone(),
            0,
            bus_mmio.clone(),
            bus_pio.clone(),
            guest_hv_interface.clone(),
        )];

        let acc_mem = map.finalize();
        let acc_msi = MsiAccessor::new(hdl.clone());

        Ok(Machine {
            hdl: hdl.clone(),
            vcpus,

            map_physmem: map,

            acc_mem,
            acc_msi,

            bus_mmio,
            bus_pio,
            guest_hv_interface,

            destroyed: AtomicBool::new(false),
        })
    }
}

/// Builder object used to initialize a [`Machine`].
///
/// # Example
///
/// ```no_run
/// use propolis::vmm::{Builder, CreateOpts};
///
/// let opts = CreateOpts {
///     // Override any desired VM creation options
///     ..Default::default()
/// };
/// let builder = Builder::new("my-machine", opts).unwrap()
///     .max_cpus(4).unwrap()
///     .add_mem_region(0, 0xc000_0000, "lowmem").unwrap()
///     .add_mem_region(0x1_0000_0000, 0xc000_0000, "highmem").unwrap()
///     .add_rom_region(0xffe0_0000, 0x20_0000, "bootrom").unwrap()
///     .add_mmio_region(0xc0000000, 0x20000000, "dev32").unwrap();
/// let machine = builder.finalize().unwrap();
/// ```
pub struct Builder {
    /// Handle to the VM created in [`Builder::new`].
    inner_hdl: Option<Arc<VmmHdl>>,
    /// Physical memory map that the `add_*_region` methods populate.
    physmap: Option<PhysMap>,
    /// Number of vCPUs to create; starts at 1.
    max_cpu: u8,
    /// Guest hypervisor interface, if one has been selected.
    guest_hv_interface: Option<Arc<dyn Enlightenment>>,
}
impl Builder {
    /// Constructs a new builder object which may be used
    /// to produce a VM.
    ///
    /// In the construction of this object, the builder
    /// attempts to access the vmm controller at "/dev/vmmctl",
    /// and issues commands to begin construction of the VM.
    ///
    /// The builder starts out configured for a single vCPU and with no
    /// guest-hypervisor interface selected.
    ///
    /// # Arguments
    /// - `name`: The name for the new instance.
    /// - `opts`: VM creation options, forwarded to `create_vm`.
    ///
    /// # Errors
    /// Returns any error emitted by `create_vm`.
    pub fn new(name: &str, opts: CreateOpts) -> Result<Self> {
        let hdl = Arc::new(create_vm(name, opts)?);
        let physmap = PhysMap::new(MAX_PHYSMEM, Arc::clone(&hdl));
        Ok(Self {
            inner_hdl: Some(hdl),
            physmap: Some(physmap),
            max_cpu: 1,
            guest_hv_interface: None,
        })
    }

    /// Adds a system-memory segment of `len` bytes at guest-physical address
    /// `start`, creating it and mapping it into the guest's address space.
    ///
    /// On failure the builder is consumed and the error is returned.
    pub fn add_mem_region(
        mut self,
        start: usize,
        len: usize,
        name: &str,
    ) -> Result<Self> {
        let physmap = self.physmap.as_mut().unwrap();
        physmap.add_mem(String::from(name), start, len)?;
        Ok(self)
    }

    /// Adds a ROM segment of `len` bytes at guest-physical address `start`,
    /// creating it and mapping it into the guest's address space.
    ///
    /// On failure the builder is consumed and the error is returned.
    pub fn add_rom_region(
        mut self,
        start: usize,
        len: usize,
        name: &str,
    ) -> Result<Self> {
        let physmap = self.physmap.as_mut().unwrap();
        physmap.add_rom(String::from(name), start, len)?;
        Ok(self)
    }
    /// Reserves `len` bytes of the guest address space, starting at `start`,
    /// for MMIO.
    ///
    /// On failure the builder is consumed and the error is returned.
    pub fn add_mmio_region(
        mut self,
        start: usize,
        len: usize,
        name: &str,
    ) -> Result<Self> {
        let physmap = self.physmap.as_mut().unwrap();
        physmap.add_mmio_reservation(String::from(name), start, len)?;
        Ok(self)
    }
    /// Sets the maximum number of CPUs for the machine.
    ///
    /// # Errors
    /// Returns `ErrorKind::InvalidInput` when `max` is zero or exceeds
    /// `MAXCPU`.
    pub fn max_cpus(mut self, max: u8) -> Result<Self> {
        let out_of_range = max == 0 || max > MAXCPU as u8;
        if out_of_range {
            return Err(Error::new(
                ErrorKind::InvalidInput,
                "maxcpu out of range",
            ));
        }
        self.max_cpu = max;
        Ok(self)
    }

    /// Selects the guest-hypervisor interface which will be supplied to this
    /// machine's vCPUs, replacing any previously selected one.
    pub fn guest_hypervisor_interface(
        mut self,
        interface: Arc<dyn Enlightenment>,
    ) -> Self {
        self.guest_hv_interface = interface.into();
        self
    }

    /// Consumes `self` and creates a new [`Machine`] based
    /// on the provided memory regions.
    pub fn finalize(mut self) -> Result<Machine> {
        let hdl = self.inner_hdl.take().unwrap();
        let mut map = self.physmap.take().unwrap();

        let bus_mmio = Arc::new(MmioBus::new(MAX_PHYSMEM));
        let bus_pio = Arc::new(PioBus::new());
        let acc_mem = map.finalize();
        let acc_msi = MsiAccessor::new(hdl.clone());

        let guest_hv_interface = self
            .guest_hv_interface
            .take()
            .unwrap_or(Arc::new(BhyveGuestInterface));

        guest_hv_interface.attach(&acc_mem, hdl.clone());
        let vcpus = (0..self.max_cpu)
            .map(|id| {
                Vcpu::new(
                    hdl.clone(),
                    i32::from(id),
                    bus_mmio.clone(),
                    bus_pio.clone(),
                    guest_hv_interface.clone(),
                )
            })
            .collect();

        let machine = Machine {
            hdl: hdl.clone(),
            vcpus,

            map_physmem: map,

            acc_mem,
            acc_msi,

            bus_mmio,
            bus_pio,
            guest_hv_interface,

            destroyed: AtomicBool::new(false),
        };
        Ok(machine)
    }
}

impl Drop for Builder {
    /// Destroys the VM if the builder still holds its handle, i.e. it was
    /// not handed off through `finalize()`.
    ///
    /// # Panics
    /// Panics if destroying the VM fails.
    fn drop(&mut self) {
        let Some(hdl) = self.inner_hdl.as_ref() else {
            return;
        };
        // Do not allow the vmm device to persist
        hdl.destroy().unwrap();
    }
}


================================================
FILE: lib/propolis/src/vmm/mem.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Module for managing guest memory mappings.

use std::io::{Error, ErrorKind, Result};
use std::marker::PhantomData;
use std::mem::{size_of, size_of_val, MaybeUninit};
use std::ops::RangeInclusive;
use std::os::unix::io::{AsRawFd, RawFd};
use std::ptr::{copy_nonoverlapping, NonNull};
use std::sync::Arc;

use libc::iovec;

use crate::accessors::MemAccessor;
use crate::common::{
    GuestAddr, GuestData, GuestRegion, PAGE_MASK, PAGE_SHIFT, PAGE_SIZE,
};
use crate::util::aspace::ASpace;
use crate::vmm::VmmHdl;

use zerocopy::{FromBytes, IntoBytes};

bitflags! {
    /// Bitflags representing memory protections.
    // The individual bits reuse the values of the `libc::PROT_*` constants,
    // narrowed to a `u8`.
    #[derive(Debug, Copy, Clone)]
    pub struct Prot: u8 {
        // No access at all.
        const NONE = 0;
        const READ = libc::PROT_READ as u8;
        const WRITE = libc::PROT_WRITE as u8;
        const EXEC = libc::PROT_EXEC as u8;
        // Convenience combination: read + write.
        const RW = (libc::PROT_READ
                    | libc::PROT_WRITE) as u8;
        // Convenience combination: read + write + execute.
        const ALL = (libc::PROT_READ
                    | libc::PROT_WRITE
                    | libc::PROT_EXEC) as u8;
    }
}

/// Process-virtual mappings (and segment id) backing one DRAM or ROM entry in
/// the guest-physical map.
pub(crate) struct MapSeg {
    /// Segment id used for the `VmmHdl` memseg calls.  Test-only entries use
    /// `-1`.
    id: i32,

    /// Mapping of the guest (physical) address space, subject to the protection
    /// restrictions enforced on the guest.
    map_guest: Arc<Mapping>,

    /// Mapping of the memory segment object itself, with full (R/W) access to
    /// its contents.
    map_seg: Arc<Mapping>,
}

/// What an entry in the guest-physical map is backed by.
pub(crate) enum MapKind {
    /// System memory, backed by a memory segment.
    Dram(MapSeg),
    /// ROM, backed by a memory segment.
    Rom(MapSeg),
    /// Address range reserved for MMIO; no segment is associated with it.
    MmioReserve,
}

/// A single named entry registered in the guest-physical map.
pub(crate) struct MapEnt {
    /// Name supplied when the region was added.
    name: String,
    /// Backing of the region (DRAM, ROM, or MMIO reservation).
    kind: MapKind,
}

impl MapEnt {
    /// Reports the public [`MapType`] which corresponds to this entry's
    /// internal [`MapKind`].
    fn map_type(&self) -> MapType {
        match self.kind {
            MapKind::Dram(..) => MapType::Dram,
            MapKind::Rom(..) => MapType::Rom,
            MapKind::MmioReserve => MapType::Mmio,
        }
    }
}

/// Category of a region in the guest-physical map, as reported by
/// [`PhysMap::mappings`].
pub enum MapType {
    /// System memory.
    Dram,
    /// ROM.
    Rom,
    /// Region reserved for MMIO.
    Mmio,
}

/// Guest-physical address map: tracks the DRAM, ROM and MMIO-reserved regions
/// registered for a VM.
pub struct PhysMap {
    /// Registered regions.  Mutable access requires that no other reference
    /// to the `Arc` exists (see `map_mut()`); `finalize()` shares it with the
    /// `MemCtx` it creates.
    map: Arc<ASpace<MapEnt>>,
    /// Handle of the VM whose memory segments are created and mapped.
    hdl: Arc<VmmHdl>,
    /// Segment id to use for the next memory segment which is created.
    next_segid: i32,
}
impl PhysMap {
    /// Creates an empty map covering guest-physical addresses `0..size`.
    ///
    /// # Panics
    /// Panics if `size` is zero or is not a multiple of `PAGE_SIZE`.
    pub(crate) fn new(size: usize, hdl: Arc<VmmHdl>) -> Self {
        assert!(size != 0);
        // Alignment must be judged on every bit below the page size, not just
        // on the single bit which `PAGE_SIZE` itself occupies.
        assert!(size.is_multiple_of(PAGE_SIZE), "size must be page-aligned");

        Self { map: Arc::new(ASpace::new(0, size - 1)), hdl, next_segid: 0 }
    }

    /// Grants mutable access to the underlying address-space map.
    ///
    /// # Panics
    /// Panics if any other reference to the map exists, as is the case once
    /// `finalize()` has shared it.
    pub(crate) fn map_mut(&mut self) -> &mut ASpace<MapEnt> {
        match Arc::get_mut(&mut self.map) {
            Some(map) => map,
            None => panic!(
                "map should not be accessed mutably after PhysMap finalization"
            ),
        }
    }

    /// Creates a memory segment of `size` bytes, maps it for the guest at
    /// `addr`, and registers it in the map as DRAM under `name`.
    pub(crate) fn add_mem(
        &mut self,
        name: String,
        addr: usize,
        size: usize,
    ) -> Result<()> {
        let (id, map_guest, map_seg) = self.seg_create_map(addr, size, None)?;

        let ent = MapEnt {
            name,
            kind: MapKind::Dram(MapSeg { id, map_guest, map_seg }),
        };
        self.map_mut().register(addr, size, ent).map_err(Error::from)
    }

    /// Creates a named memory segment of `size` bytes, maps it for the guest
    /// at `addr`, and registers it in the map as ROM under `name`.
    pub(crate) fn add_rom(
        &mut self,
        name: String,
        addr: usize,
        size: usize,
    ) -> Result<()> {
        let (id, map_guest, map_seg) =
            self.seg_create_map(addr, size, Some(name.as_str()))?;

        let ent = MapEnt {
            name,
            kind: MapKind::Rom(MapSeg { id, map_guest, map_seg }),
        };
        self.map_mut().register(addr, size, ent).map_err(Error::from)
    }

    /// Registers `size` bytes at `addr` as reserved for MMIO.  No memory
    /// segment is created for such a region.
    pub(crate) fn add_mmio_reservation(
        &mut self,
        name: String,
        addr: usize,
        size: usize,
    ) -> Result<()> {
        let ent = MapEnt { name, kind: MapKind::MmioReserve };
        self.map_mut().register(addr, size, ent).map_err(Error::from)
    }

    /// Re-maps every ROM segment into the guest address space.
    ///
    /// Since VM_REINIT unmaps all non-sysmem segments from the address space
    /// of the VM, the ROM mapping(s) must be reestablished afterwards.  The
    /// first failure aborts the walk and is returned.
    pub(crate) fn post_reinit(&self) -> Result<()> {
        for (addr, len, ent) in self.map.iter() {
            let MapKind::Rom(seg) = &ent.kind else {
                continue;
            };
            let rom_prot = Prot::READ | Prot::EXEC;
            self.hdl.map_memseg(seg.id, addr, len, 0, rom_prot)?;
        }
        Ok(())
    }

    /// Lists every registered region as an `(address, length, type)` tuple,
    /// in the iteration order of the underlying map.
    pub fn mappings(&self) -> Vec<(usize, usize, MapType)> {
        let mut regions = Vec::new();
        for (addr, len, ent) in self.map.iter() {
            regions.push((addr, len, ent.map_type()));
        }
        regions
    }

    /// Shares the map with a new `MemCtx` and returns a `MemAccessor`
    /// wrapping it.
    ///
    /// # Panics
    /// Panics if the map is already shared, which is expected to mean that
    /// `finalize()` was called before.
    pub(crate) fn finalize(&mut self) -> MemAccessor {
        let sole_owner = Arc::strong_count(&self.map) == 1;
        assert!(sole_owner, "finalize should only be called once");

        let ctx = MemCtx { map: Arc::clone(&self.map) };
        MemAccessor::new(Arc::new(ctx))
    }

    /// Allocate a backing memseg, map it into the guest-physical space, and map
    /// both (the segment and guest mapping) into the process-virtual space.
    ///
    /// A segment with a `rom_name` is mapped read+exec for the guest; one
    /// without is mapped with all permissions.  Returns the segment id, the
    /// guest-side mapping and the segment-side mapping, in that order.
    fn seg_create_map(
        &mut self,
        addr: usize,
        size: usize,
        rom_name: Option<&str>,
    ) -> Result<(i32, Arc<Mapping>, Arc<Mapping>)> {
        let guest_prot = if rom_name.is_some() {
            Prot::READ | Prot::EXEC
        } else {
            Prot::ALL
        };

        let id = self.next_segid;
        self.hdl.create_memseg(id, size, rom_name)?;
        self.hdl.map_memseg(id, addr, size, 0, guest_prot)?;
        self.next_segid += 1;
        // TODO: if we somehow fail the later stages of this operation, the
        // memseg and its mapping established in the VMM will persist

        let seg_off = self.hdl.devmem_offset(id)?;
        let map_guest =
            Mapping::new(size, guest_prot, &self.hdl, addr as i64)?;
        let map_seg = Mapping::new(size, Prot::RW, &self.hdl, seg_off as i64)?;

        Ok((id, map_guest, map_seg))
    }

    pub(crate) fn destroy(&mut self) {
        let map = Arc::get_mut(&mut self.map).expect(
            "no refs should remain to Physmap contents when destroy() called",
        );
        map.clear();
    }
}

#[cfg(test)]
impl PhysMap {
    /// Create a `PhysMap` of `size` bytes on top of a tempfile-backed test
    /// `VmmHdl` of the same size.
    pub(crate) fn new_test(size: usize) -> Self {
        let hdl = Arc::new(
            VmmHdl::new_test(size).expect("create tempfile backed test hdl"),
        );
        Self::new(size, hdl)
    }

    /// Create "memory" region on an instance backed with a fake VmmHdl
    pub(crate) fn add_test_mem(
        &mut self,
        name: String,
        addr: usize,
        size: usize,
    ) -> Result<()> {
        let (map_guest, map_seg) = self.seg_test_map(addr, size, false)?;
        let ent = MapEnt {
            name,
            kind: MapKind::Dram(MapSeg { id: -1, map_guest, map_seg }),
        };
        self.map_mut().register(addr, size, ent).map_err(Error::from)
    }

    /// Create "ROM" region on an instance backed with a fake VmmHdl
    pub(crate) fn add_test_rom(
        &mut self,
        name: String,
        addr: usize,
        size: usize,
    ) -> Result<()> {
        let (map_guest, map_seg) = self.seg_test_map(addr, size, true)?;
        let ent = MapEnt {
            name,
            kind: MapKind::Rom(MapSeg { id: -1, map_guest, map_seg }),
        };
        self.map_mut().register(addr, size, ent).map_err(Error::from)
    }

    /// Map `size` bytes of the (fake) test handle at offset `addr` twice:
    /// once with the guest-visible protection (read-only for ROM, read/write
    /// otherwise) and once read/write as the segment mapping.
    fn seg_test_map(
        &mut self,
        addr: usize,
        size: usize,
        is_rom: bool,
    ) -> Result<(Arc<Mapping>, Arc<Mapping>)> {
        let guest_prot = if is_rom { Prot::READ } else { Prot::RW };
        let devoff = addr as i64;

        let map_guest = Mapping::new(size, guest_prot, &self.hdl, devoff)?;
        let map_seg = Mapping::new(size, Prot::RW, &self.hdl, devoff)?;

        Ok((map_guest, map_seg))
    }
}

// TODO: reword?
/// An owned region of mapped guest memory, accessible via [`SubMapping`].
///
/// When emulating hardware in service of a VM we are often working with raw
/// pointers into guest memory. `Mapping` and [`SubMapping`] together provide
/// safe (in the Rust sense) operators to read and write guest memory, with
/// escape hatches in some cases which `Mapping` cannot directly support.
///
/// In general, the guest into which this `Mapping` points is assumed to be
/// running and concurrently reading or writing all of its address space. For
/// example, when Propolis is performing a read in service of memory-mapped I/O,
/// we must assume the guest is concurrently writing to the address we read.
/// Even if the guest is paused, it is possible that guest address ranges have
/// been sent to hardware and are being accessed via DMA. This limits the
/// interfaces `Mapping` can provide, and adds some complexity to `Mapping`'s
/// implementation.
///
/// # Safety
///
/// Rust references of guest memory are inappropriate:
/// - if we had an immutable reference of guest memory, then guest vCPUs or
///   host hardware may concurrently write and violate that immutability.
/// - if we had a mutable reference of guest memory, then guest vCPUs or host
///   hardware may concurrently write or read and violate the exclusivity of a
///   mutable reference.
///
/// As a result, `(Sub)Mapping` takes care to not return a reference of guest
/// memory, and to never accidentally form a reference of guest memory - even as
/// a slice, `&[u8]`.
///
/// Guest pointers are subject to the same requirements as any other raw
/// pointer:
/// - The pointer must not be null
/// - The dereferenced pointer must be within bounds of a valid mapping
///
/// Guest pointers are trivially not null; a `Mapping` will have some non-null
/// base and does not wrap the address space. Even if a guest's provided
/// pointer is `0usize`, it is added to a non-null offset and will never be an
/// actual pointer to zero.
///
/// `Mapping` and `SubMapping` are primarily concerned with ensuring guest
/// accesses are within bounds of the guest mapping, and that the mapping is
/// valid for the access to be performed (writes are not made into read-only
/// mappings, for example).
///
/// Considering these requirements, this structure provides an interface which
/// upholds the following conditions:
/// - An accessed memory region is fully contained in the mapping.
/// - Reads to a memory region are only permitted if the mapping is readable.
/// - Writes to a memory region are only permitted if the mapping is writable.
/// - References to memory are neither made transiently nor exposed.

#[derive(Debug)]
pub(crate) struct Mapping {
    /// Base address of the `mmap()`-ed region.
    ptr: NonNull<u8>,
    /// Length of the region, in bytes.
    len: usize,
    /// Protection requested for the mapping.  May include `EXEC`, even though
    /// the region itself is never `mmap()`-ed with the EXEC bit set.
    prot: Prot,
}
impl Mapping {
    /// Creates a new memory mapping from a `VmmHdl`, with the requested
    /// permissions.  The `size` and `devoff` must be `PAGE_SIZE` aligned.
    ///
    /// The mapping records the full requested `prot`, but the memory itself
    /// is only ever mapped with (at most) read and write permission.
    ///
    /// # Errors
    /// Returns the OS error if `mmap()` fails.
    fn new(
        size: usize,
        prot: Prot,
        vmm: &VmmHdl,
        devoff: i64,
    ) -> Result<Arc<Self>> {
        let fd = vmm.fd();

        // We do not mmap() guest resources with the EXEC bit set
        let mmap_prot = prot & Prot::RW;

        // Safety:
        // With a NULL `addr`, the OS will pick a mapping location which does not
        // conflict with other resources.  While the VmmFile is not something
        // that should be truncated, it is the responsibility of the caller to
        // ensure that the underlying resources are not destroyed prior to
        // `Mapping`s which refer to them.
        let addr = unsafe {
            libc::mmap(
                core::ptr::null_mut(),
                size,
                mmap_prot.bits().into(),
                libc::MAP_SHARED,
                fd,
                devoff,
            )
        };
        if addr == libc::MAP_FAILED {
            return Err(Error::last_os_error());
        }

        let ptr = NonNull::new(addr.cast::<u8>())
            .expect("mmap() result should be non-NULL");
        Ok(Arc::new(Self { ptr, len: size, prot }))
    }
}
impl Drop for Mapping {
    /// Unmaps the region established in `Mapping::new()`.  Any error from
    /// `munmap()` is ignored.
    fn drop(&mut self) {
        let base = self.ptr.as_ptr().cast::<libc::c_void>();
        // Safety: `ptr` and `len` describe the region returned by the
        // `mmap()` call which created this `Mapping`.
        unsafe {
            libc::munmap(base, self.len);
        }
    }
}

// Safety: `Mapping`'s API does not provide raw access to the underlying
// pointer, nor any mechanism to create references to the underlying data.
// The pointer itself refers to an `mmap()`-ed region which is only released
// when the `Mapping` is dropped.
unsafe impl Send for Mapping {}
unsafe impl Sync for Mapping {}

/// The resource which a [`SubMapping`] borrows its memory region from.
#[derive(Debug)]
// Backing resources (Mapping or SubMapping) must remain held, even though we do
// not reference them directly as a field.
#[allow(dead_code)]
enum Backing<'a> {
    /// Borrowed directly from an owned `Mapping`.
    Base(&'a Mapping),
    /// Borrowed from another `SubMapping`.
    Sub(&'a SubMapping<'a>),
}

/// A borrowed region from a `Mapping` object.
///
/// Provides interfaces for acting on memory, but does not own the
/// underlying memory region.
///
/// As this is simply a borrow of a `Mapping`, `SubMapping` is subject to the
/// same safety requirements as `Mapping`; everything in the doc comment there
/// applies here as well.
#[derive(Debug)]
pub struct SubMapping<'a> {
    // The backing resource must remain held, even though we never reference it
    // directly as a field.
    #[allow(unused)]
    backing: Backing<'a>,

    /// Start of the region covered by this `SubMapping`.
    ptr: NonNull<u8>,
    /// Length of the covered region, in bytes.
    len: usize,
    /// Accesses permitted through this `SubMapping`; checked by the read and
    /// write operations before they touch memory.
    prot: Prot,
}

impl SubMapping<'_> {
    /// Create a `SubMapping` which borrows `base` and covers its entire
    /// region, with the same protection.
    fn new_base<'a>(base: &'a Mapping) -> SubMapping<'a> {
        let ptr = base.ptr;
        let len = base.len;
        let prot = base.prot;
        SubMapping { backing: Backing::Base(base), ptr, len, prot }
    }

    /// Create a `SubMapping` which borrows `self` and covers the same region,
    /// with the same protection.
    fn new_sub(&self) -> SubMapping<'_> {
        let ptr = self.ptr;
        let len = self.len;
        let prot = self.prot;
        SubMapping { backing: Backing::Sub(self), ptr, len, prot }
    }

    /// Test-only constructor: create a `SubMapping` covering the entirety of
    /// `base`.
    #[cfg(test)]
    fn new_base_test(base: &Mapping) -> SubMapping<'_> {
        SubMapping {
            backing: Backing::Base(base),

            ptr: base.ptr,
            len: base.len,
            prot: base.prot,
        }
    }

    /// Borrow a portion of the current mapping as a new `SubMapping`.
    ///
    /// - `offset` is relative to the current mapping.
    /// - `length` is the length of the new subregion.
    ///
    /// Returns `None` if the requested offset/length extends beyond the end of
    /// the mapping.
    pub fn subregion(
        &self,
        offset: usize,
        length: usize,
    ) -> Option<SubMapping<'_>> {
        match self.new_sub().constrain_region(offset, length) {
            Ok(sub) => Some(sub),
            Err(_) => None,
        }
    }

    /// Limit the access permissions of a SubMapping to those which are also
    /// present in `prot_limit`.  Permissions can only be removed this way,
    /// never added.
    pub fn constrain_access(mut self, prot_limit: Prot) -> Self {
        let limited = self.prot & prot_limit;
        self.prot = limited;
        self
    }

    /// Attempt to narrow the `SubMapping` to a subset of the memory which it
    /// currently covers.
    ///
    /// - `offset` is relative to the current mapping.
    /// - `length` is the length of the new subregion.
    ///
    /// Returns the constrained `SubMapping` when successful, or the unchanged
    /// `SubMapping` if invalid input resulted in an error (the requested
    /// range overflows or extends past the end of the mapping).
    pub fn constrain_region(
        mut self,
        offset: usize,
        length: usize,
    ) -> std::result::Result<Self, Self> {
        let Some(end) = offset.checked_add(length) else {
            return Err(self);
        };
        if end > self.len {
            return Err(self);
        }

        // Safety:
        // - Starting and resulting pointer must be within bounds or
        // one past the end of the same allocated object.
        // - The computed offset, in bytes, cannot overflow isize.
        // - The offset cannot rely on "wrapping around" the address
        // space.
        let narrowed = unsafe { self.ptr.as_ptr().add(offset) };
        self.ptr = NonNull::new(narrowed).unwrap();
        self.len = length;
        Ok(self)
    }

    /// Succeeds if the mapping allows writes, and emits a `PermissionDenied`
    /// error otherwise.
    fn check_write_access(&self) -> Result<()> {
        if self.prot.contains(Prot::WRITE) {
            return Ok(());
        }
        Err(Error::new(ErrorKind::PermissionDenied, "No write access"))
    }

    /// Succeeds if the mapping allows reads, and emits a `PermissionDenied`
    /// error otherwise.
    fn check_read_access(&self) -> Result<()> {
        if self.prot.contains(Prot::READ) {
            return Ok(());
        }
        Err(Error::new(ErrorKind::PermissionDenied, "No read access"))
    }

    /// Reads a `T` object from the mapping.
    pub fn read<T: Copy + FromBytes>(&self) -> Result<T> {
        self.check_read_access()?;
        let typed = self.ptr.as_ptr() as *const T;
        if self.len < std::mem::size_of::<T>() {
            return Err(Error::new(ErrorKind::InvalidData, "Buffer too small"));
        }

        // Safety:
        // - typed must be valid for reads: `check_read_access()` succeeded
        // - typed must point to a properly initialized value of T: always true
        //     because we require `T: FromBytes`. `zerocopy::FromBytes` happens
        //     to have the same concerns as us - that T is valid for all bit
        //     patterns.
        Ok(unsafe { typed.read_unaligned() })
    }

    /// Read the entire mapping into a newly allocated vector of `T` objects.
    /// The size of the mapping must be aligned to `size_of::<T>()`.
    ///
    /// # Errors
    /// - `PermissionDenied` if the mapping is not readable.
    /// - `InvalidInput` if the mapping length is not a multiple of the size
    ///   of `T`.
    pub fn read_many_owned<T: Copy + FromBytes>(&self) -> Result<Vec<T>> {
        self.check_read_access()?;
        let elem_size = size_of::<T>();
        if !self.len.is_multiple_of(elem_size) {
            return Err(Error::new(
                ErrorKind::InvalidInput,
                "Mapping size not aligned to value type",
            ));
        }
        let count = self.len / elem_size;
        let mut items: Vec<T> = Vec::with_capacity(count);

        // Fill the (still uninitialized) spare capacity in place.
        self.read_many(&mut items.spare_capacity_mut()[..count])?;
        // Safety: read_many() was successful and just initialized the first
        // `count` elements of the vector.
        unsafe {
            items.set_len(count);
        }
        Ok(items)
    }

    /// Fill `values` with data read from the start of the mapping.
    ///
    /// # Errors
    /// - `PermissionDenied` if the mapping is not readable.
    /// - `InvalidInput` if `values` spans more bytes than the mapping holds.
    pub fn read_many<T: Copy + FromBytes>(
        &self,
        values: &mut [T],
    ) -> Result<()> {
        self.check_read_access()?;
        let nbytes = size_of_val(values);
        if nbytes > self.len {
            return Err(Error::new(
                ErrorKind::InvalidInput,
                "Value larger than mapping",
            ));
        }

        // We know reinterpreting `*mut T` as `*mut u8` and writing to it cannot
        // result in invalid `T` because `T: FromBytes`
        let dst = values.as_mut_ptr().cast::<u8>();
        // We know that the `values` reference is properly aligned, but that is
        // not guaranteed for the source pointer.  Treating it as a u8 pointer
        // will appease those alignment concerns
        let src: *const u8 = self.ptr.as_ptr();

        // Safety
        // - `src` is valid for read for the `nbytes` as checked above
        // - `dst` is valid for writes for its entire length, since it is from a
        // valid mutable reference passed in to us
        // - both are aligned for a `u8` copy
        // - `dst` cannot be overlapped by `src`, since the former came from a
        //   valid reference, and references to guest mappings are not allowed
        unsafe {
            copy_nonoverlapping(src, dst, nbytes);
        }
        Ok(())
    }

    /// Copies bytes from the start of the mapping into `buf`.
    ///
    /// If `buf` is larger than the SubMapping, the read will be truncated to
    /// length of the SubMapping.
    ///
    /// Returns the number of bytes read.
    pub fn read_bytes(&self, buf: &mut [u8]) -> Result<usize> {
        let read_len = buf.len().min(self.len);
        let dst = &mut buf[..read_len];
        self.read_many(dst)?;
        Ok(read_len)
    }

    /// Copies bytes from the start of the mapping into an uninitialized
    /// region.
    ///
    /// If `buf` is larger than the SubMapping, the read will be truncated to
    /// length of the SubMapping.
    ///
    /// Returns the number of bytes read.
    pub fn read_bytes_uninit(
        &self,
        buf: &mut [MaybeUninit<u8>],
    ) -> Result<usize> {
        let read_len = buf.len().min(self.len);
        let dst = &mut buf[..read_len];
        self.read_many(dst)?;
        Ok(read_len)
    }

    /// Pread from `file` into the mapping.
    ///
    /// At most `length` bytes are requested, starting at `offset` in the
    /// file; the request is truncated to the length of the mapping.  Returns
    /// the number of bytes read.
    ///
    /// # Errors
    /// - `PermissionDenied` if the mapping is not writable.
    /// - The OS error if `pread()` fails.
    pub fn pread(
        &self,
        file: &impl AsRawFd,
        length: usize,
        offset: i64,
    ) -> Result<usize> {
        self.check_write_access()?;
        let nbytes = length.min(self.len);
        let dst = self.ptr.as_ptr() as *mut libc::c_void;
        // Safety: `nbytes` does not exceed the length of the mapping, and
        // write access to it was verified above.
        let res = unsafe { libc::pread(file.as_raw_fd(), dst, nbytes, offset) };
        if res == -1 {
            return Err(Error::last_os_error());
        }
        Ok(res as usize)
    }

    /// Writes `value` into the mapping.
    pub fn write<T: Copy + IntoBytes>(&self, value: &T) -> Result<()> {
        self.check_write_access()?;
        let typed = self.ptr.as_ptr() as *mut T;
        unsafe {
            typed.write_unaligned(*value);
        }
        Ok(())
    }

    /// Copies `values` to the start of the mapping.
    ///
    /// # Errors
    /// - `PermissionDenied` if the mapping is not writable.
    /// - `InvalidInput` if `values` spans more bytes than the mapping holds.
    pub fn write_many<T: Copy + IntoBytes>(&self, values: &[T]) -> Result<()> {
        self.check_write_access()?;
        let nbytes = size_of_val(values);
        if nbytes > self.len {
            return Err(Error::new(
                ErrorKind::InvalidInput,
                "Value larger than mapping",
            ));
        }

        let dst = self.ptr.as_ptr();
        // We know that the `values` reference is properly aligned, but that is
        // not guaranteed for the destination pointer.  Copying as u8 will
        // appease those alignment concerns
        let src = values.as_ptr().cast::<u8>();

        // Safety
        // - `src` is valid for reads for its entire length, since it is from a
        //   valid reference passed in to us
        // - `dst` is valid for writes for the `nbytes` as checked above
        // - both are aligned for a `u8` copy
        // - `dst` cannot be overlapped by `src`, since the latter came from a
        //   valid reference, and references to guest mappings are not allowed
        unsafe {
            copy_nonoverlapping(src, dst, nbytes);
        }
        Ok(())
    }

    /// Copies the bytes of `buf` to the start of the mapping.
    ///
    /// If `buf` is larger than the SubMapping, the write will be truncated to
    /// length of the SubMapping.
    ///
    /// Returns the number of bytes written.
    pub fn write_bytes(&self, buf: &[u8]) -> Result<usize> {
        let write_len = buf.len().min(self.len);
        let src = &buf[..write_len];
        self.write_many(src)?;
        Ok(write_len)
    }

    /// Fills the start of the mapping with `count` copies of the byte `val`.
    ///
    /// If `count` exceeds the length of the SubMapping, the fill is truncated
    /// to that length.  Returns the number of bytes written.
    pub fn write_byte(&self, val: u8, count: usize) -> Result<usize> {
        self.check_write_access()?;
        let fill_len = count.min(self.len);
        // Safety: `fill_len` does not exceed the length of the mapping, and
        // write access to it was verified above.
        unsafe {
            std::ptr::write_bytes(self.ptr.as_ptr(), val, fill_len);
        }
        Ok(fill_len)
    }

    /// Pwrite from the mapping to `file`.
    ///
    /// At most `length` bytes are written, at `offset` in the file; the
    /// request is truncated to the length of the mapping.  Returns the number
    /// of bytes written.
    ///
    /// # Errors
    /// - `PermissionDenied` if the mapping is not readable.
    /// - The OS error if `pwrite()` fails.
    pub fn pwrite(
        &self,
        file: &impl AsRawFd,
        length: usize,
        offset: i64,
    ) -> Result<usize> {
        self.check_read_access()?;
        let nbytes = length.min(self.len);
        let src = self.ptr.as_ptr() as *const libc::c_void;
        // Safety: `nbytes` does not exceed the length of the mapping, and
        // read access to it was verified above.
        let res =
            unsafe { libc::pwrite(file.as_raw_fd(), src, nbytes, offset) };
        if res == -1 {
            return Err(Error::last_os_error());
        }
        Ok(res as usize)
    }

    /// Returns the length of the mapping, in bytes.
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns true if the mapping is empty (covers zero bytes).
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Returns the access permissions of the mapping.
    pub fn prot(&self) -> Prot {
        self.prot
    }

    /// Returns a raw pointer for reading the underlying data, or `None` if
    /// the mapping is not readable.
    ///
    /// # Safety
    ///
    /// - The caller must never create a reference to the underlying
    /// memory region.
    /// - The returned pointer must not outlive the mapping.
    /// - The caller may only read up to `len()` bytes.
    pub unsafe fn raw_readable(&self) -> Option<*const u8> {
        let readable = self.prot.contains(Prot::READ);
        readable.then_some(self.ptr.as_ptr() as *const u8)
    }

    /// Returns a raw pointer for writing the underlying data, or `None` if
    /// the mapping is not writable.
    ///
    /// # Safety
    ///
    /// - The caller must never create a reference to the underlying
    /// memory region.
    /// - The returned pointer must not outlive the mapping.
    /// - The caller may only write up to `len()` bytes.
    pub unsafe fn raw_writable(&self) -> Option<*mut u8> {
        let writable = self.prot.contains(Prot::WRITE);
        writable.then_some(self.ptr.as_ptr())
    }
}

// Safety: `SubMapping`'s safe API does not provide raw access to the
// underlying pointer, nor any mechanism to create references to the underlying
// data.  The pointer is only handed out by the `unsafe` `raw_readable()` and
// `raw_writable()` methods, whose callers take on the obligations documented
// there.
unsafe impl Send for SubMapping<'_> {}
unsafe impl Sync for SubMapping<'_> {}

/// Vectored file I/O spanning a collection of mappings.
pub trait MappingExt {
    /// preadv from the file descriptor `fd`, starting at `offset`, into
    /// multiple mappings
    fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize>;

    /// pwritev from multiple mappings to the file descriptor `fd`, starting
    /// at `offset`
    fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize>;
}

// Largest total I/O size (in bytes) for which vectored I/O on mappings is
// bounced through a temporary heap buffer rather than issued against the
// mappings directly.
//
// Gross hack alert: since the mappings below are memory regions backed by
// segvmm_ops, `zvol_{read,write}` and similar will end up contending on
// `svmd->svmd_lock`. Instead, as long as the I/O is small enough we'll tolerate
// it, copy from guest memory to Propolis heap. The segment backing Propolis'
// heap has an `as_page{,un}lock` impl that avoids the more
// expensive/contentious `as_fault()` fallback.
//
// This is an optimization until stlouis#871 can get things sorted, at
// which point it should be strictly worse than directly using the
// requested mappings.
//
// 1 MiB is an arbitrary-ish choice: `propolis-server` and `propolis-standalone`
// set NVMe MDTS to "8", so the largest I/Os from NVMe will be
// `2**8 * 4096 == 1048576 bytes == 1 MiB`. Beyond this, fall back to using
// iovecs directly, potentially at increased OS overhead.
//
// The amount of memory used for temporary buffers is given by the number of
// worker threads for all file-backed disks, times this threshold. It works out
// to up to 8 MiB (8 worker threads) of buffers per disk by default as of
// writing.
const MAPPING_IO_LIMIT_BYTES: usize = crate::common::MB;

/// Sum the lengths of `mappings`, i.e. the number of bytes that would be
/// required to hold them sequentially.
///
/// Ranges covered by multiple mappings are counted repeatedly.
///
/// # Errors
/// Returns `InvalidInput` if the sum does not fit in a `usize`.
fn total_mapping_size(mappings: &[SubMapping<'_>]) -> Result<usize> {
    let mut total = 0usize;
    for mapping in mappings {
        total = match total.checked_add(mapping.len) {
            Some(sum) => sum,
            None => {
                return Err(Error::new(
                    ErrorKind::InvalidInput,
                    "Total mapping larger than a `usize`",
                ));
            }
        };
    }
    Ok(total)
}

impl<'a, T: AsRef<[SubMapping<'a>]>> MappingExt for T {
    fn preadv(&self, fd: RawFd, offset: i64) -> Result<usize> {
        if !self
            .as_ref()
            .iter()
            .all(|mapping| mapping.prot.contains(Prot::WRITE))
        {
            return Err(Error::new(
                ErrorKind::PermissionDenied,
                "No write access",
            ));
        }

        let total_capacity = total_mapping_size(self.as_ref())?;

        // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
        if total_capacity <= MAPPING_IO_LIMIT_BYTES {
            // If we're motivated to avoid the zero-fill via
            // `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
            // probably avoid this gross hack entirely (see comment on
            // MAPPING_IO_LIMIT_BYTES).
            let mut buf = vec![0; total_capacity];

            let iov = [iovec {
                iov_base: buf.as_mut_ptr() as *mut libc::c_void,
                iov_len: buf.len(),
            }];

            let res = unsafe {
                libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
            };
            if res == -1 {
                return Err(Error::last_os_error());
            }
            let read = res as usize;

            // copy `read` bytes back into the iovecs and return
            let mut remaining = &buf[..read];
            for mapping in self.as_ref().iter() {
                let to_copy = std::cmp::min(remaining.len(), mapping.len);

                let (curr_buf, rest) = remaining.split_at(to_copy);

                mapping.write_bytes(curr_buf)?;

                remaining = rest;

                if remaining.len() == 0 {
                    // Either we're at the last iov and we're finished copying
                    // back into the guest, or `preadv` did a short read.
                    break;
                }
            }

            // We should never read more than the guest mappings could hold.
            assert_eq!(remaining.len(), 0);

            Ok(read)
        } else {
            let iov = self
                .as_ref()
                .iter()
                .map(|mapping| iovec {
                    iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
                    iov_len: mapping.len,
                })
                .collect::<Vec<_>>();

            let read = unsafe {
                libc::preadv(fd, iov.as_ptr(), iov.len() as libc::c_int, offset)
            };
            if read == -1 {
                return Err(Error::last_os_error());
            }
            let read: usize = read.try_into().expect("read is positive");
            Ok(read)
        }
    }

    fn pwritev(&self, fd: RawFd, offset: i64) -> Result<usize> {
        if !self
            .as_ref()
            .iter()
            .all(|mapping| mapping.prot.contains(Prot::READ))
        {
            return Err(Error::new(
                ErrorKind::PermissionDenied,
                "No read access",
            ));
        }

        let total_capacity = total_mapping_size(self.as_ref())?;

        // Gross hack: see the comment on `MAPPING_IO_LIMIT_BYTES`.
        let written = if total_capacity <= MAPPING_IO_LIMIT_BYTES {
            // If we're motivated to avoid the zero-fill via
            // `Layout::with_size_align` + `GlobalAlloc::alloc`, we should
            // probably avoid this gross hack entirely (see comment on
            // MAPPING_IO_LIMIT_BYTES).
            let mut buf = vec![0; total_capacity];

            let mut remaining = buf.as_mut_slice();
            for mapping in self.as_ref().iter() {
                // The original `buf` is at least as large as all mappings
                // combined, so `remaining` is at least as large as this and all
                // remaining mappings, so we can slice up to `mapping.len`.
                let (curr_buf, rest) = remaining.split_at_mut(mapping.len);

                mapping.read_bytes(curr_buf)?;

                remaining = rest;
            }

            let iovs = [iovec {
                iov_base: buf.as_mut_ptr() as *mut libc::c_void,
                iov_len: buf.len(),
            }];

            unsafe {
                libc::pwritev(
                    fd,
                    iovs.as_ptr(),
                    iovs.len() as libc::c_int,
                    offset,
                )
            }
        } else {
            let iovs = self
                .as_ref()
                .iter()
                .map(|mapping| iovec {
                    iov_base: mapping.ptr.as_ptr() as *mut libc::c_void,
                    iov_len: mapping.len,
                })
                .collect::<Vec<_>>();

            unsafe {
                libc::pwritev(
                    fd,
                    iovs.as_ptr(),
                    iovs.len() as libc::c_int,
                    offset,
                )
            }
        };

        if written == -1 {
            return Err(Error::last_os_error());
        }

        Ok(written as usize)
    }
}

/// Wrapper around an address space for a VM.
pub struct MemCtx {
    // Address-space map of `MapEnt` entries; the region lookups on `MemCtx`
    // resolve guest addresses through it.
    map: Arc<ASpace<MapEnt>>,
}
impl MemCtx {
    /// Reads a value of type `T` from guest memory at `addr`.
    ///
    /// Returns `None` if no readable region covers the `size_of::<T>()` bytes
    /// starting at `addr`, or if the read itself fails.
    pub fn read<T: Copy + FromBytes>(
        &self,
        addr: GuestAddr,
    ) -> Option<GuestData<T>> {
        let mapping = self.region_covered(addr, size_of::<T>(), Prot::READ)?;
        mapping.read::<T>().ok().map(GuestData::from)
    }
    /// Copies bytes from guest memory at `addr` into `buf`.
    ///
    /// At most `len` bytes are requested, and never more than `buf.len()`.
    /// Returns `None` if no readable region covers that span or the read
    /// fails.
    pub fn read_into(
        &self,
        addr: GuestAddr,
        buf: &mut GuestData<&mut [u8]>,
        len: usize,
    ) -> Option<usize> {
        let len = buf.len().min(len);
        let mapping = self.region_covered(addr, len, Prot::READ)?;
        mapping.read_bytes(&mut buf[..len]).ok()
    }
    /// Copies bytes from guest memory at `addr` into `buf` by way of the
    /// direct mapping (see `direct_readable_region`).
    ///
    /// At most `len` bytes are requested, and never more than `buf.len()`.
    pub fn direct_read_into(
        &self,
        addr: GuestAddr,
        buf: &mut GuestData<&mut [u8]>,
        len: usize,
    ) -> Option<usize> {
        let len = buf.len().min(len);
        self.direct_readable_region(&GuestRegion(addr, len))?
            .read_bytes(&mut buf[..len])
            .ok()
    }

    /// Reads multiple objects from a guest address.
    ///
    /// The returned [`MemMany`] covers `count` consecutive values of `T`
    /// starting at `base`.
    ///
    /// Returns `None` if the total size (`size_of::<T>() * count`) does not
    /// fit in a `usize`, or if no readable region covers that span.
    pub fn read_many<T: Copy + FromBytes>(
        &self,
        base: GuestAddr,
        count: usize,
    ) -> Option<GuestData<MemMany<'_, T>>> {
        // Reject an overflowing size outright: a wrapped product would let a
        // too-small region be validated for a much larger `count`.
        let len = size_of::<T>().checked_mul(count)?;
        let mapping = self.region_covered(base, len, Prot::READ)?;
        Some(GuestData::from(MemMany {
            mapping,
            pos: 0,
            count,
            phantom: PhantomData,
        }))
    }
    /// Writes `val` to guest memory at `addr`.
    ///
    /// Returns `true` only if a writable region covers the value's bytes and
    /// the write succeeds.
    pub fn write<T: Copy + IntoBytes>(&self, addr: GuestAddr, val: &T) -> bool {
        match self.region_covered(addr, size_of::<T>(), Prot::WRITE) {
            Some(mapping) => mapping.write(val).is_ok(),
            None => false,
        }
    }
    /// Copies bytes from `buf` into guest memory at `addr`.
    ///
    /// At most `len` bytes are written, and never more than `buf.len()`.
    /// Returns `None` if no writable region covers that span or the write
    /// fails.
    pub fn write_from(
        &self,
        addr: GuestAddr,
        buf: &[u8],
        len: usize,
    ) -> Option<usize> {
        let len = buf.len().min(len);
        let mapping = self.region_covered(addr, len, Prot::WRITE)?;
        mapping.write_bytes(&buf[..len]).ok()
    }
    /// Writes the byte `val` across `count` bytes of guest memory starting at
    /// `addr` (delegating to `SubMapping::write_byte`).
    ///
    /// Returns `true` only if a writable region covers those `count` bytes and
    /// the write succeeds.
    pub fn write_byte(&self, addr: GuestAddr, val: u8, count: usize) -> bool {
        match self.region_covered(addr, count, Prot::WRITE) {
            Some(mapping) => mapping.write_byte(val, count).is_ok(),
            None => false,
        }
    }
    /// Writes every value in `val` to guest memory starting at `addr`.
    ///
    /// This fails (returning `false`) if the memory offset and total size of
    /// the values would result in the copy crossing vmm memory segments.
    pub fn write_many<T: Copy + IntoBytes>(
        &self,
        addr: GuestAddr,
        val: &[T],
    ) -> bool {
        let Some(mapping) =
            self.region_covered(addr, size_of_val(val), Prot::WRITE)
        else {
            return false;
        };
        mapping.write_many(val).is_ok()
    }

    /// Looks up `region` in the guest address space, requiring write access.
    pub fn writable_region(
        &self,
        region: &GuestRegion,
    ) -> Option<SubMapping<'_>> {
        self.region_covered(region.0, region.1, Prot::WRITE)
    }
    /// Looks up `region` in the guest address space, requiring read access.
    pub fn readable_region(
        &self,
        region: &GuestRegion,
    ) -> Option<SubMapping<'_>> {
        self.region_covered(region.0, region.1, Prot::READ)
    }
    /// Looks up `region` in the guest address space, requiring both read and
    /// write access.
    pub fn readwrite_region(
        &self,
        region: &GuestRegion,
    ) -> Option<SubMapping<'_>> {
        self.region_covered(region.0, region.1, Prot::RW)
    }

    /// Like `direct_writable_region`, but looks up the region by name.
    ///
    /// Only DRAM and ROM entries are considered. Fails with
    /// `ErrorKind::NotFound` if none of them is called `name`.
    pub fn direct_writable_region_by_name(
        &self,
        name: &str,
    ) -> Result<SubMapping<'_>> {
        let found = self.map.iter().find_map(|(_addr, _len, ent)| {
            if ent.name != name {
                return None;
            }
            match &ent.kind {
                MapKind::Dram(seg) | MapKind::Rom(seg) => Some(&seg.map_seg),
                MapKind::MmioReserve => None,
            }
        });

        match found {
            Some(map_seg) => {
                Ok(SubMapping::new_base(map_seg).constrain_access(Prot::WRITE))
            }
            None => Err(Error::new(
                ErrorKind::NotFound,
                format!("memory region {} not found", name),
            )),
        }
    }

    /// Like `writable_region`, but goes straight to the underlying memory
    /// segment, bypassing protection enforced to the guest and tracking of
    /// dirty pages in the guest-physical address space.
    pub fn direct_writable_region(
        &self,
        region: &GuestRegion,
    ) -> Option<SubMapping<'_>> {
        self.region_mappings(region.0, region.1)
            .map(|(_guest_map, seg_map)| seg_map.constrain_access(Prot::WRITE))
    }

    /// Like `readable_region`, but goes straight to the underlying memory
    /// segment, bypassing protection enforced to the guest and tracking of
    /// accessed pages in the guest-physical address space.
    pub fn direct_readable_region(
        &self,
        region: &GuestRegion,
    ) -> Option<SubMapping<'_>> {
        self.region_mappings(region.0, region.1)
            .map(|(_guest_map, seg_map)| seg_map.constrain_access(Prot::READ))
    }

    /// Look up a region in the guest's address space and return two mappings
    /// of it: one through the nested page tables, limited to the protection
    /// as perceived by the guest, and one directly onto the underlying memory
    /// segment.
    ///
    /// Returns `None` if the request wraps the address space, is not fully
    /// contained in a single registered region, or lands in an MMIO
    /// reservation.
    fn region_mappings(
        &self,
        addr: GuestAddr,
        len: usize,
    ) -> Option<(SubMapping<'_>, SubMapping<'_>)> {
        let start = addr.0 as usize;
        // The mappings in `self.map` do not wrap, so no mapping can match a
        // request that wraps at the end of the address space.
        let end = start.checked_add(len)?;

        let (base, region_len, ent) = self.map.region_at(start).ok()?;
        // Unlike `start + len` above, `base + region_len` cannot overflow: if
        // it would, ASpace::register would have rejected this region.
        if base + region_len < end {
            return None;
        }

        let (prot, seg) = match &ent.kind {
            MapKind::Dram(seg) => (Prot::RW, seg),
            MapKind::Rom(seg) => (Prot::READ, seg),
            MapKind::MmioReserve => return None,
        };

        let offset = start - base;
        let guest_map = SubMapping::new_base(&seg.map_guest)
            .constrain_access(prot)
            .constrain_region(offset, len)
            .expect("mapping offset should be valid");
        let seg_map = SubMapping::new_base(&seg.map_seg)
            .constrain_region(offset, len)
            .expect("mapping offset should be valid");

        Some((guest_map, seg_map))
    }

    /// Looks up a region of memory in the guest's address space, returning the
    /// guest-view mapping of it if that mapping allows `req_prot`.
    fn region_covered(
        &self,
        addr: GuestAddr,
        len: usize,
        req_prot: Prot,
    ) -> Option<SubMapping<'_>> {
        let (guest_map, _seg_map) = self.region_mappings(addr, len)?;
        // This check could be considered redundant with the permissions on
        // the mapping itself, but doing it here lets consumers handle the
        // error gracefully rather than taking a fault when attempting to
        // exceed the guest's apparent permissions.
        guest_map.prot().contains(req_prot).then_some(guest_map)
    }

    /// Returns the [lowest, highest] memory addresses in the space as an
    /// inclusive range, considering DRAM entries only.
    ///
    /// Returns `None` if either bound cannot be determined.
    pub fn mem_bounds(&self) -> Option<RangeInclusive<GuestAddr>> {
        let lo = self
            .map
            .lowest_addr(|entry| matches!(entry.kind, MapKind::Dram(_)))?;
        let hi = self
            .map
            .highest_addr(|entry| matches!(entry.kind, MapKind::Dram(_)))?;
        Some(GuestAddr(lo as u64)..=GuestAddr(hi as u64))
    }
}

/// Uninhabited marker type tying [`MemCtx`] into the accessor machinery: both
/// root and leaf are an `Arc<MemCtx>`, and the target is the `MemCtx` itself.
pub enum MemAccessed {}
impl crate::accessors::AccessedResource for MemAccessed {
    type Root = Arc<MemCtx>;
    type Leaf = Arc<MemCtx>;
    type Target = MemCtx;

    fn derive(root: &Self::Root) -> Self::Leaf {
        // A leaf is simply another handle on the same `MemCtx`.
        Arc::clone(root)
    }
    fn deref(leaf: &Self::Leaf) -> &Self::Target {
        &**leaf
    }
}

/// A contiguous region of memory containing generic objects.
pub struct MemMany<'a, T: Copy> {
    // Mapping covering the objects.
    mapping: SubMapping<'a>,
    // Number of `T` objects in the region; `get` rejects positions at or
    // beyond this.
    count: usize,
    // Cursor for the `Iterator` impl: position of the next object to yield.
    pos: usize,
    // Ties the element type `T` to this struct without storing a `T`.
    phantom: PhantomData<T>,
}
impl<T: Copy + FromBytes> GuestData<MemMany<'_, T>> {
    /// Gets the object at position `pos` within the memory region.
    ///
    /// Returns [`Option::None`] if `pos` is out of range, if the byte offset
    /// of `pos` cannot be represented as a `usize`, or if the read fails.
    pub fn get(&self, pos: usize) -> Option<GuestData<T>> {
        if pos >= self.count {
            return None;
        }
        let sz = std::mem::size_of::<T>();
        // Compute the byte offset without risking wrap-around; `subregion`
        // then bounds-checks it against the mapping.
        let offset = pos.checked_mul(sz)?;
        self.mapping
            .subregion(offset, sz)?
            .read::<T>()
            .ok()
            .map(GuestData::from)
    }
}
/// Iterates over the objects in order, starting from the current cursor.
impl<T: Copy + FromBytes> Iterator for GuestData<MemMany<'_, T>> {
    type Item = GuestData<T>;

    fn next(&mut self) -> Option<Self::Item> {
        // The cursor advances unconditionally, whether or not an object was
        // available at the old position.
        let idx = self.pos;
        self.pos += 1;
        self.get(idx)
    }
}

/// A 52-bit physical page number, i.e., the ordinal index of a 4 KiB page of
/// guest memory.
///
/// Values built with [`Pfn::new`] are guaranteed to be at most
/// `PAGE_MASK >> PAGE_SHIFT`.
#[derive(Clone, Copy, Ord, PartialOrd, Eq, PartialEq)]
pub(crate) struct Pfn(u64);

impl std::fmt::Debug for Pfn {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Pfn({:#x})", self.0)
    }
}

impl std::fmt::Display for Pfn {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        core::fmt::LowerHex::fmt(&self.0, f)
    }
}

impl From<Pfn> for u64 {
    fn from(value: Pfn) -> Self {
        value.0
    }
}

impl Pfn {
    /// Creates a new PFN wrapper for the supplied physical page number. Returns
    /// `None` if the page number cannot correctly be shifted to form the
    /// corresponding 64-bit address.
    pub(crate) fn new(pfn: u64) -> Option<Self> {
        if pfn > (PAGE_MASK as u64 >> PAGE_SHIFT) {
            None
        } else {
            Some(Self(pfn))
        }
    }

    /// Creates a new PFN wrapper without checking that the supplied PFN can
    /// be shifted to produce a corresponding 64-bit address.
    ///
    /// # Safety
    ///
    /// The supplied PFN must fit in 52 bits; otherwise [`Self::addr`] will
    /// return incorrect addresses for this PFN.
    //
    // This is currently only used by test code.
    #[cfg(test)]
    pub(crate) fn new_unchecked(pfn: u64) -> Self {
        Self(pfn)
    }

    /// Yields the 64-bit address corresponding to this PFN.
    pub(crate) fn addr(&self) -> GuestAddr {
        GuestAddr(self.0 << PAGE_SHIFT)
    }
}

// Unit tests for `SubMapping` bounds/protection handling and for region
// lookups through a `MemCtx`.
#[cfg(test)]
pub mod test {
    use super::*;

    const TEST_LEN: usize = 16 * 1024;

    // Creates a test VMM handle of `TEST_LEN` bytes and a base mapping of the
    // same length with the requested protection.
    fn test_setup(prot: Prot) -> (VmmHdl, Arc<Mapping>) {
        let hdl = VmmHdl::new_test(TEST_LEN)
            .expect("create tempfile backed test hdl");
        let base = Mapping::new(TEST_LEN, prot, &hdl, 0).unwrap();
        (hdl, base)
    }

    // The `Prot` bits must line up with the libc `PROT_*` constants.
    #[test]
    fn memory_protections_match_libc() {
        assert_eq!(i32::from(Prot::READ.bits()), libc::PROT_READ);
        assert_eq!(i32::from(Prot::WRITE.bits()), libc::PROT_WRITE);
        assert_eq!(i32::from(Prot::EXEC.bits()), libc::PROT_EXEC);
    }

    // A typed read larger than the mapping is an error.
    #[test]
    fn mapping_denies_read_beyond_end() {
        let (_hdl, base) = test_setup(Prot::READ);
        let mapping = SubMapping::new_base_test(&base);

        assert!(mapping.read::<[u8; TEST_LEN + 1]>().is_err());
    }

    // A byte read into an oversized buffer is shortened to the mapping length.
    #[test]
    fn mapping_shortens_read_bytes_beyond_end() {
        let (_hdl, base) = test_setup(Prot::READ);
        let mapping = SubMapping::new_base_test(&base);

        let mut buf: [u8; TEST_LEN + 1] = [0; TEST_LEN + 1];
        assert_eq!(TEST_LEN, mapping.read_bytes(&mut buf).unwrap());
    }

    // A byte write from an oversized buffer is shortened to the mapping
    // length.
    #[test]
    fn mapping_shortens_write_bytes_beyond_end() {
        let (_hdl, base) = test_setup(Prot::RW);
        let mapping = SubMapping::new_base_test(&base);

        let mut buf: [u8; TEST_LEN + 1] = [0; TEST_LEN + 1];
        assert_eq!(TEST_LEN, mapping.write_bytes(&mut buf).unwrap());
    }

    // Constraining to a zero-length region yields a valid, empty mapping.
    #[test]
    fn mapping_create_empty() {
        let (_hdl, base) = test_setup(Prot::READ);
        let mapping =
            SubMapping::new_base_test(&base).constrain_region(0, 0).unwrap();

        assert_eq!(0, mapping.len());
        assert!(mapping.is_empty());
    }

    // Subregions inside the mapping, including empty ones at either end, are
    // accepted.
    #[test]
    fn mapping_valid_subregions() {
        let (_hdl, base) = test_setup(Prot::READ);
        let mapping = SubMapping::new_base_test(&base);

        assert!(mapping.subregion(0, 0).is_some());
        assert!(mapping.subregion(0, TEST_LEN / 2).is_some());
        assert!(mapping.subregion(TEST_LEN, 0).is_some());
    }

    // Subregions extending past the mapping, or whose bounds overflow, are
    // rejected.
    #[test]
    fn mapping_invalid_subregions() {
        let (_hdl, base) = test_setup(Prot::READ);
        let mapping = SubMapping::new_base_test(&base);

        // Beyond the end of the mapping.
        assert!(mapping.subregion(TEST_LEN + 1, 0).is_none());
        assert!(mapping.subregion(TEST_LEN, 1).is_none());

        // Overflow.
        assert!(mapping.subregion(usize::MAX, 1).is_none());
        assert!(mapping.subregion(1, usize::MAX).is_none());
    }

    // `constrain_access` on a subregion restricts which operations succeed.
    #[test]
    fn subregion_protection() {
        let (_hdl, base) = test_setup(Prot::RW);
        let mapping = SubMapping::new_base_test(&base);

        // Main region has full access
        let mut buf = [0u8];
        assert!(mapping.write_bytes(&buf).is_ok());
        assert!(mapping.read_bytes(&mut buf).is_ok());

        // Restricted to reads
        let sub_read = mapping
            .subregion(0, TEST_LEN)
            .unwrap()
            .constrain_access(Prot::READ);
        assert!(sub_read.write_bytes(&buf).is_err());
        assert!(sub_read.read_bytes(&mut buf).is_ok());

        // Restricted to writes
        let sub_write = mapping
            .subregion(0, TEST_LEN)
            .unwrap()
            .constrain_access(Prot::WRITE);
        assert!(sub_write.write_bytes(&buf).is_ok());
        assert!(sub_write.read_bytes(&mut buf).is_err());
    }

    // Tests above cover lookups inside one mapping, but Propolis uses memory
    // through a `MemCtx` including an `ASpace<MapEnt>` covering all mappings
    // for the VM.
    #[test]
    fn region_lookup() {
        let hdl = VmmHdl::new_test(TEST_LEN)
            .expect("create tempfile backed test hdl");
        let hdl = Arc::new(hdl);

        let mut phys = PhysMap::new(TEST_LEN, hdl);
        phys.add_mem("test dram".to_string(), 0, PAGE_SIZE)
            .expect("can add test DRAM");
        let mem = phys.finalize();

        let acc_mem = mem.access().expect("can access memory");

        // We can get a readable region covering all added memory.
        let region =
            acc_mem.readable_region(&GuestRegion(GuestAddr(0), PAGE_SIZE));
        assert!(region.is_some());

        // But not a region extending past memory in the VM.
        let region =
            acc_mem.readable_region(&GuestRegion(GuestAddr(0), 2 * PAGE_SIZE));
        assert!(region.is_none());

        // And not a region that would wrap into VM memory.
        let region = acc_mem
            .readable_region(&GuestRegion(GuestAddr(u64::MAX), PAGE_SIZE));
        assert!(region.is_none());
    }
}


================================================
FILE: lib/propolis/src/vmm/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Representation of a VM's hardware and kernel structures.

pub mod hdl;
pub mod machine;
pub mod mem;
pub mod time;

pub use hdl::*;
pub use machine::*;
pub use mem::*;

/// Check that available vmm API matches expectations of propolis crate
pub(crate) fn check_api_version() -> Result<(), crate::api_version::Error> {
    let ctl = bhyve_api::VmmCtlFd::open()?;
    let vers = ctl.api_version()?;

    // propolis only requires the bits provided by V8, currently
    let want = bhyve_api::ApiVersion::V8 as u32;

    if vers < want {
        return Err(crate::api_version::Error::TooLow { have: vers, want });
    }

    Ok(())
}


================================================
FILE: lib/propolis/src/vmm/time.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use serde::{Deserialize, Serialize};
use std::time::Duration;
use thiserror::Error;

use super::VmmHdl;

/// Number of nanoseconds in one second.
pub const NS_PER_SEC: u64 = 1_000_000_000;
/// Number of seconds in one day.
pub const SEC_PER_DAY: u64 = 24 * 60 * 60;

/// Representation of guest time data state
///
/// This is serialized/deserialized as part of the migration protocol
#[derive(Clone, Copy, Debug, Default, Deserialize, Serialize)]
pub struct VmTimeData {
    /// guest TSC frequency (hz)
    pub guest_freq: u64,

    /// current guest TSC
    pub guest_tsc: u64,

    /// monotonic host clock (ns)
    pub hrtime: i64,

    /// host wall clock, whole seconds component (sec)
    pub hres_sec: u64,

    /// host wall clock, sub-second component (ns)
    pub hres_ns: u64,

    /// guest boot_hrtime (can be negative)
    pub boot_hrtime: i64,
}

/// A collection of data about adjustments made to the time data to enable
/// richer log messages
#[derive(Debug)]
pub struct VmTimeDataAdjustments {
    /// guest uptime (ns): source hrtime minus source boot_hrtime
    pub guest_uptime_ns: u64,
    /// target wall clock minus source wall clock, clamped to zero
    pub migrate_delta: Duration,
    /// whether the wall clock delta was negative and therefore clamped
    pub migrate_delta_negative: bool,
    /// amount added to the guest TSC to account for `migrate_delta`
    pub guest_tsc_delta: u64,
    /// guest uptime plus migrate delta (ns); subtracted from the target
    /// hrtime to form the new boot_hrtime
    pub boot_hrtime_delta: u64,
}

impl VmTimeData {
    /// Returns the host wall clock reading (`hres_sec` + `hres_ns`) as a
    /// single [`Duration`].
    ///
    /// This struct is deserialized from migration data, so the fields are not
    /// guaranteed to be well-formed. `hres_ns` is folded in without
    /// truncation even if it exceeds one second's worth of nanoseconds, and a
    /// sum that cannot be represented saturates at [`Duration::MAX`] instead
    /// of panicking.
    pub fn wall_clock(&self) -> Duration {
        Duration::from_secs(self.hres_sec)
            .saturating_add(Duration::from_nanos(self.hres_ns))
    }
}

/// Converts the raw bhyve time payload into [`VmTimeData`], field for field.
impl From<bhyve_api::vdi_time_info_v1> for VmTimeData {
    fn from(raw: bhyve_api::vdi_time_info_v1) -> Self {
        let bhyve_api::vdi_time_info_v1 {
            vt_guest_freq,
            vt_guest_tsc,
            vt_hrtime,
            vt_hres_sec,
            vt_hres_ns,
            vt_boot_hrtime,
        } = raw;

        Self {
            guest_freq: vt_guest_freq,
            guest_tsc: vt_guest_tsc,
            hrtime: vt_hrtime,
            hres_sec: vt_hres_sec,
            hres_ns: vt_hres_ns,
            boot_hrtime: vt_boot_hrtime,
        }
    }
}
impl From<VmTimeData> for bhyve_api::vdi_time_info_v1 {
    fn from(info: VmTimeData) -> Self {
        bhyve_api::vdi_time_info_v1 {
            vt_guest_freq: info.guest_freq,
            vt_guest_tsc: info.guest_tsc,
            vt_hrtime: info.hrtime,
            vt_hres_sec: info.hres_sec,
            vt_hres_ns: info.hres_ns,
            vt_boot_hrtime: info.boot_hrtime,
        }
    }
}

/// Writes `time_info` to the VMM through the `VDC_VMM_TIME` data operation.
pub fn import_time_data(
    hdl: &VmmHdl,
    time_info: VmTimeData,
) -> std::io::Result<()> {
    let raw: bhyve_api::vdi_time_info_v1 = time_info.into();
    // NOTE(review): the `1` presumably selects version 1 of the payload,
    // matching `vdi_time_info_v1` -- confirm against `VmmHdl::data_op`.
    hdl.data_op(bhyve_api::VDC_VMM_TIME, 1).write(&raw)?;
    Ok(())
}

/// Reads the time data from the VMM through the `VDC_VMM_TIME` data
/// operation and converts it to [`VmTimeData`].
pub fn export_time_data(hdl: &VmmHdl) -> std::io::Result<VmTimeData> {
    let raw: bhyve_api::vdi_time_info_v1 =
        hdl.data_op(bhyve_api::VDC_VMM_TIME, 1).read()?;
    Ok(raw.into())
}

/// Returns the current host hrtime and wall clock time
//
// Both readings are taken from the VMM time data interface, which exposes the
// current host hrtime and wall clock time.
//
// The kernel side of the interface disables interrupts while it takes the
// clock readings; in the absence of a function to translate between the two
// clock values, this is a best effort way to read the hrtime and wall clock
// times as close as possible to the same point in time. Thus fishing this data
// out of the VMM time data read payload is strictly better than calling
// clock_gettime(3c) twice from userspace.
pub fn host_time_snapshot(hdl: &VmmHdl) -> std::io::Result<(i64, Duration)> {
    let ti = export_time_data(hdl)?;
    Ok((ti.hrtime, ti.wall_clock()))
}

/// Produces an "adjusted" view of guest time data for import on a migration
/// target.
///
/// `src` is the guest time data as exported on the source host; `dst_hrt` and
/// `dst_wc` are the current hrtime and wall clock time of the target host. The
/// returned data can be imported to bhyve so that the guest's sense of time
/// (namely, the guest TSC and its device timers) functions properly on the
/// target. The second element of the returned tuple describes the adjustments
/// that were made, for logging.
//  See comments inline for more details about how the new guest TSC and
//  boot_hrtime are calculated.
pub fn adjust_time_data(
    src: VmTimeData,
    dst_hrt: i64,
    dst_wc: Duration,
) -> Result<(VmTimeData, VmTimeDataAdjustments), TimeAdjustError> {
    // Basic validation: a negative system hrtime makes no sense, and the
    // calculations below assume it is non-negative.
    if dst_hrt < 0 {
        return Err(TimeAdjustError::Hrtime { hrtime: dst_hrt });
    }

    // Compute how much time passed between export on the source and import on
    // the target, using wall clock time. This delta drives both the TSC and
    // the boot_hrtime adjustments.
    //
    // We expect to be operating on machines with well-synchronized wall
    // clocks, so wall clock time is a useful shorthand for how much time has
    // passed. If for some reason the delta comes out negative (see also:
    // #357), clamp it to 0.
    //
    // migrate_delta = target wall clock - source wall clock
    let src_wc = src.wall_clock();
    let elapsed = dst_wc.checked_sub(src_wc);
    let migrate_delta_negative = elapsed.is_none();
    let migrate_delta = elapsed.unwrap_or(Duration::ZERO);

    // The delta must fit in the (signed) hrtime calculations below.
    let migrate_delta_ns = i64::try_from(migrate_delta.as_nanos())
        .map_err(|_| TimeAdjustError::InvalidMigrateDelta { src_wc, dst_wc })?;
    assert!(migrate_delta_ns >= 0, "migrate delta cannot be negative");

    // Find a new boot_hrtime for the guest
    //
    // Device timers are scheduled against the host's hrtime: for example, a
    // timer that should fire after 1 second is scheduled as host hrtime + 1s.
    //
    // When devices are exported for migration, time values are normalized
    // against the guest's boot_hrtime on export, and de-normalized against
    // boot_hrtime on import. A guest's boot_hrtime is the host hrtime at which
    // the guest booted. Because the value is used on import to fix up timer
    // values, it must be set before importing device state so that existing
    // timers are de-normalized correctly. On the target, it should be the
    // host hrtime at which the guest would have booted, had it booted there.
    //
    // An example may be helpful. Consider a guest that has 5 days of uptime,
    // booted on a host with 30 days of uptime. Suppose that guest is migrated
    // with a device timer that should fire 1 second in the future.
    //
    // +=================================================================+
    // | hrtime (source) | guest hrtime | boot_hrtime  | timer value     |
    // +-----------------------------------------------------------------+
    // | 30 days         | 5 days       |(30 - 5) days | src hrtime + 1s |
    // |                 |              | 25 days      | 30 days + 1s    |
    // +=================================================================+
    //
    // Suppose the guest is then migrated to a host with 100 days of uptime.
    // On migration, the existing timer is normalized before export by
    // subtracting out boot_hrtime:
    //       normalized = timer - boot_hrtime
    //                  = (30 days + 1 sec) - 25 days
    //                  = 5 days + 1 sec
    //
    // When the timer is imported, it is denormalized by adding back in
    // the new boot_hrtime. The timer should still fire 1 second from the
    // current hrtime of the host. The target hrtime is 100 days, so the timer
    // should fire at 100 days + 1 sec.
    //
    // Working backwards to get the new boot_hrtime, we have:
    //
    //       denormalized = normalized + boot_hrtime
    //       boot_hrtime  = denormalized - normalized
    //       boot_hrtime  = (100 days + 1 sec) - (5 days + 1 sec)
    //       boot_hrtime  = 95 days
    //
    // And on the target, the timer should still fire 1 second into the future
    // as expected:
    //
    // +=====================================================================+
    // | hrtime (target) | guest hrtime | boot_hrtime   | timer value        |
    // +---------------------------------------------------------------------+
    // | 100 days        | 5 days       |(100 - 5) days |     5 days + 1 sec |
    // |                 |              | 95 days       |   + 95 days        |
    // |                 |              |               | = 100 days + 1 sec |
    // +=====================================================================+
    //
    // NB: boot_hrtime may legitimately be negative; that happens when a guest
    // has a longer uptime than its host (an expected common case for
    // migration). This is okay: hrtime is a signed value, and the
    // normalization maths still work with negative values.
    //

    // guest_uptime_ns = source hrtime - source boot_hrtime
    let guest_uptime_ns: i64 = match src.hrtime.checked_sub(src.boot_hrtime) {
        Some(uptime) if uptime >= 0 => uptime,
        Some(_) => {
            // A negative uptime can only arise if boot_hrtime was somehow in
            // the future on the source, which is an invalid state.
            return Err(TimeAdjustError::GuestUptimeOverflow {
                desc: "src_hrt < boot_hrtime",
                src_hrt: src.hrtime,
                boot_hrtime: src.boot_hrtime,
            });
        }
        None => {
            return Err(TimeAdjustError::GuestUptimeOverflow {
                desc: "src_hrt - boot_hrtime",
                src_hrt: src.hrtime,
                boot_hrtime: src.boot_hrtime,
            });
        }
    };

    // boot_hrtime_delta = guest_uptime_ns + migrate_delta_ns
    let Some(boot_hrtime_delta) = guest_uptime_ns.checked_add(migrate_delta_ns)
    else {
        return Err(TimeAdjustError::TimeDeltaOverflow {
            uptime_ns: guest_uptime_ns,
            migrate_delta,
        });
    };

    // boot_hrtime = target hrtime - boot_hrtime_delta
    let Some(new_boot_hrtime) = dst_hrt.checked_sub(boot_hrtime_delta) else {
        return Err(TimeAdjustError::BootHrtimeOverflow {
            total_delta: boot_hrtime_delta as u64,
            dst_hrtime: dst_hrt,
        });
    };

    // Compute the guest TSC adjustment and apply it to the old guest TSC
    //
    // The guest TSC is moved forward by the migrate delta, so that it reflects
    // the time that passed during migration.
    //
    // NB: wrapping the TSC here is fine: the guest is able to write to the
    // TSC, and if it did so it might expect it to overflow.
    let guest_tsc_delta = calc_tsc_delta(migrate_delta, src.guest_freq)?;
    let new_guest_tsc = src.guest_tsc.wrapping_add(guest_tsc_delta);

    let adjusted = VmTimeData {
        guest_freq: src.guest_freq,
        guest_tsc: new_guest_tsc,
        hrtime: dst_hrt,
        hres_sec: dst_wc.as_secs(),
        hres_ns: u64::from(dst_wc.subsec_nanos()),
        boot_hrtime: new_boot_hrtime,
    };
    let adjustments = VmTimeDataAdjustments {
        guest_uptime_ns: guest_uptime_ns as u64,
        migrate_delta,
        migrate_delta_negative,
        guest_tsc_delta,
        boot_hrtime_delta: boot_hrtime_delta as u64,
    };

    Ok((adjusted, adjustments))
}

/// Errors related to making timing adjustment calculations
///
/// Every variant carries the inputs (and, where available, the intermediate
/// values) of the calculation that failed, so the rendered message is enough
/// to reconstruct what went wrong.
#[derive(Clone, Debug, Error)]
pub enum TimeAdjustError {
    /// Negative system hrtime
    // NOTE(review): the message labels the value `src` while the field is
    // documented as the target host hrtime -- confirm which one is intended.
    #[error("invalid system hrtime: src={hrtime}")]
    Hrtime {
        /// target host hrtime
        hrtime: i64,
    },

    /// Error calculating migration time delta
    #[error("invalid migration delta: src={src_wc:?},dst={dst_wc:?}")]
    InvalidMigrateDelta {
        /// source host wall clock time
        src_wc: Duration,

        /// destination host wall clock time
        dst_wc: Duration,
    },

    /// Error calculating guest uptime
    #[error(
        "guest uptime cannot be represented: \
        desc={desc}, src_hrtime={src_hrt:?}, boot_hrtime={boot_hrtime}"
    )]
    GuestUptimeOverflow {
        /// error description
        desc: &'static str,

        /// source host hrtime
        src_hrt: i64,

        /// input guest boot_hrtime
        boot_hrtime: i64,
    },

    /// Invalid total delta for boot_hrtime calculations
    #[error(
        "could not calculate time delta: \
            guest uptime {uptime_ns} ns, migrate_delta={migrate_delta:?}"
    )]
    TimeDeltaOverflow {
        /// guest uptime
        uptime_ns: i64,

        /// migration time delta
        migrate_delta: Duration,
    },

    /// Invalid calculated boot_hrtime
    #[error(
        "guest boot_hrtime cannot be represented: \
            total_delta={total_delta:?}, dst_hrtime={dst_hrtime:?}"
    )]
    BootHrtimeOverflow {
        /// calculated total delta (uptime + migration delta)
        total_delta: u64,

        /// destination host hrtime
        dst_hrtime: i64,
    },

    /// Invalid guest TSC adjustment
    // Each source line of the message ends in `\` so that the rendered text
    // stays on a single line; `desc` is formatted with `Display` because the
    // surrounding quotes are already part of the message.
    #[error(
        "could not calculate TSC adjustment: \
            desc=\"{desc}\", migrate_delta={migrate_delta:?}, \
            guest_hz={guest_hz}, tsc_adjust={tsc_adjust}"
    )]
    TscAdjustOverflow {
        /// error description
        desc: &'static str,

        /// migration time delta
        migrate_delta: Duration,

        /// guest TSC frequency (hz)
        guest_hz: u64,

        /// calculated TSC adjustment
        tsc_adjust: u128,
    },
}

/// Calculate the adjustment needed for the guest TSC.
///
/// ticks = (migrate_delta ns * guest_hz hz) / `NS_PER_SEC`
///
/// The multiplication and division are carried out in 128 bits; the result
/// is only narrowed to 64 bits once it is known to fit.
///
/// # Errors
///
/// Returns [`TimeAdjustError::TscAdjustOverflow`] if the 128-bit product
/// overflows or if the resulting tick count does not fit in a `u64`.
///
/// # Panics
///
/// Panics if `guest_hz` is zero.
fn calc_tsc_delta(
    migrate_delta: Duration,
    guest_hz: u64,
) -> Result<u64, TimeAdjustError> {
    assert_ne!(guest_hz, 0);

    // All failures report the same inputs; only the description and the
    // adjustment computed so far (0 until the division succeeds) differ.
    let overflow = |desc: &'static str, tsc_adjust: u128| {
        TimeAdjustError::TscAdjustOverflow {
            desc,
            migrate_delta,
            guest_hz,
            tsc_adjust,
        }
    };

    let scaled =
        match migrate_delta.as_nanos().checked_mul(u128::from(guest_hz)) {
            Some(product) => product,
            None => return Err(overflow("migrate_delta * guest_hz", 0)),
        };

    let ticks = match scaled.checked_div(u128::from(NS_PER_SEC)) {
        Some(quotient) => quotient,
        None => return Err(overflow("upper / NS_PER_SEC", 0)),
    };

    if ticks <= u128::from(u64::MAX) {
        // The bound was just checked, so the narrowing cast is lossless.
        Ok(ticks as u64)
    } else {
        Err(overflow("tsc_adjust > 64-bits", ticks))
    }
}

#[cfg(test)]
mod test {
    use std::time::Duration;

    use crate::vmm::time::SEC_PER_DAY;

    use super::{
        adjust_time_data, calc_tsc_delta, TimeAdjustError, VmTimeData,
        NS_PER_SEC,
    };

    /// Builds the baseline `VmTimeData` used by the tests in this module.
    ///
    /// Every field is zero except `guest_freq`, so a test only needs to
    /// override (via struct-update syntax) the fields it exercises.
    fn base_time_data() -> VmTimeData {
        VmTimeData {
            // non-zero freq, so as not to blow any asserts
            guest_freq: 1,
            guest_tsc: 0,
            hrtime: 0,
            hres_sec: 0,
            hres_ns: 0,
            boot_hrtime: 0,
        }
    }

    #[test]
    fn test_invalid_hrtime() {
        // A negative system hrtime is never valid, so the adjustment has to
        // be rejected with the dedicated error variant.
        let outcome =
            adjust_time_data(base_time_data(), -1, Duration::from_nanos(0));
        assert!(
            matches!(outcome, Err(TimeAdjustError::Hrtime { .. })),
            "a negative hrtime must produce an Hrtime error"
        );
    }

    // migrate_delta = target wall clock - source wall clock
    //               = dst_wc - src.wall_clock()
    #[test]
    fn test_calc_migrate_delta() {
        // Runs an adjustment that is expected to succeed (target hrtime of 0)
        // and checks the reported migrate delta and its "negative" flag.
        let expect_delta = |src_td: VmTimeData,
                            dst_wc: Duration,
                            delta: Duration,
                            negative: bool| {
            let (_, adj) = adjust_time_data(src_td, 0, dst_wc)
                .expect("adjustment should succeed");
            assert_eq!(adj.migrate_delta, delta);
            assert_eq!(adj.migrate_delta_negative, negative);
        };

        // valid input

        // 1 sec - 0 sec
        expect_delta(
            base_time_data(),
            Duration::from_secs(1),
            Duration::from_secs(1),
            false,
        );

        // 1 ns - 0 sec
        expect_delta(
            base_time_data(),
            Duration::from_nanos(1),
            Duration::from_nanos(1),
            false,
        );

        // 0 sec - 0 sec
        expect_delta(
            base_time_data(),
            Duration::from_nanos(0),
            Duration::from_nanos(0),
            false,
        );

        // A negative migrate delta is clamped to 0 and flagged as negative.

        // 0 sec - 1 sec
        expect_delta(
            VmTimeData { hres_sec: 1, ..base_time_data() },
            Duration::from_nanos(0),
            Duration::from_nanos(0),
            true,
        );

        // 0 sec - 1 ns
        expect_delta(
            VmTimeData { hres_ns: 1, ..base_time_data() },
            Duration::from_nanos(0),
            Duration::from_nanos(0),
            true,
        );

        // error case: migrate delta overflows i64
        // (i64::MAX + 1) ns - 0 ns
        let too_large = Duration::from_nanos((i64::MAX as u64) + 1);
        let outcome = adjust_time_data(base_time_data(), 0, too_large);
        assert!(matches!(
            outcome,
            Err(TimeAdjustError::InvalidMigrateDelta { .. })
        ));
    }

    /// A guest-uptime case that must succeed: source hrtime, guest
    /// boot_hrtime, and the expected guest uptime in nanoseconds.
    struct Gutv {
        hrt: i64,
        bhrt: i64,
        res: u64,
    }
    const GUEST_UPTIME_TESTS_VALID: &[Gutv] = &[
        // edge case: boot_hrtime == 0
        Gutv { hrt: 1, bhrt: 0, res: 1 },
        // boot_hrtime > 0
        // guest was booted on this host, or was migrated to a host with higher
        // uptime than itself
        Gutv {
            hrt: 300_000_000_000,
            bhrt: 200_000_000_000,
            res: 100_000_000_000,
        },
        Gutv { hrt: i64::MAX, bhrt: i64::MAX - 1, res: 1 },
        // edge case: src_hrt == boot_hrtime
        Gutv { hrt: 0, bhrt: 0, res: 0 },
        Gutv { hrt: 300_000_000_000, bhrt: 300_000_000_000, res: 0 },
        Gutv { hrt: i64::MAX, bhrt: i64::MAX, res: 0 },
        // boot_hrtime < 0
        // guest came from a host with less uptime than itself
        Gutv { hrt: 1000, bhrt: -100, res: 1100 },
    ];
    /// A guest-uptime case that must be rejected: source hrtime and guest
    /// boot_hrtime.
    struct Guti {
        hrt: i64,
        bhrt: i64,
    }
    const GUEST_UPTIME_TESTS_INVALID: &[Guti] = &[
        // src_hrt - boot_hrtime underflows i64
        // (0 - i64::MAX)
        Guti { hrt: 0, bhrt: i64::MAX },
        // (src_hrt - boot_hrtime) overflows i64
        // (i64::MAX - -1)
        Guti { hrt: i64::MAX, bhrt: -1 },
        // src_hrt < boot_hrtime
        // (0 < 1)
        Guti { hrt: 0, bhrt: 1 },
    ];

    // guest_uptime_ns  = source hrtime - boot_hrtime
    #[test]
    fn test_calc_guest_uptime() {
        // valid cases
        for t in GUEST_UPTIME_TESTS_VALID {
            let msg = format!(
                "src_hrtime={}, boot_hrtime={}, expected={:?}",
                t.hrt, t.bhrt, t.res
            );

            let src_td = VmTimeData {
                hrtime: t.hrt,
                boot_hrtime: t.bhrt,
                ..base_time_data()
            };
            match adjust_time_data(src_td, 0, Duration::from_nanos(0)) {
                Ok(v) => {
                    assert_eq!(
                        v.1.guest_uptime_ns, t.res,
                        "got incorrect value: {}",
                        msg
                    );
                }
                Err(e) => panic!("got error {}: {}", e, msg),
            }
        }

        // error cases
        for t in GUEST_UPTIME_TESTS_INVALID {
            let msg = format!("src_hrtime={}, boot_hrtime={}", t.hrt, t.bhrt,);

            let src_td = VmTimeData {
                hrtime: t.hrt,
                boot_hrtime: t.bhrt,
                ..base_time_data()
            };
            match adjust_time_data(src_td, 0, Duration::from_nanos(0)) {
                Ok(v) => {
                    panic!("expected error but got value {:?}: {}", v, msg)
                }
                Err(TimeAdjustError::GuestUptimeOverflow { .. }) => {
                    // test passes
                }
                Err(e) => {
                    panic!("got incorrect error type {:?}: {}", e, msg)
                }
            }
        }
    }

    // boot_hrtime_delta = guest_uptime_ns + migrate_delta
    #[test]
    fn test_calc_boot_hrtime_delta() {
        // With the base boot_hrtime of 0 the source hrtime equals the guest
        // uptime, and with the base wall clock of 0 the target wall clock
        // equals the migrate delta. The target hrtime is always 0 here.
        let delta_for = |src_hrtime: i64, migrate: Duration| {
            let src_td = VmTimeData { hrtime: src_hrtime, ..base_time_data() };
            adjust_time_data(src_td, 0, migrate)
                .map(|(_, adj)| adj.boot_hrtime_delta)
        };

        // valid input
        //
        // 30 secs guest uptime + 10 sec migrate delta = 40 sec delta
        let delta = delta_for(30 * (NS_PER_SEC as i64), Duration::from_secs(10))
            .expect("30 sec uptime + 10 sec delta should succeed");
        assert_eq!(delta, 40 * NS_PER_SEC);

        // edge case: max guest uptime
        //
        // i64::MAX ns guest uptime + 0 sec migrate delta = i64::MAX ns delta
        let delta = delta_for(i64::MAX, Duration::from_secs(0))
            .expect("max uptime with no migrate delta should succeed");
        assert_eq!(delta, i64::MAX as u64);

        // error case: uptime + migrate_delta overflows i64
        //
        // i64::MAX ns guest uptime + 1 ns delta = overflow i64
        assert!(matches!(
            delta_for(i64::MAX, Duration::from_nanos(1)),
            Err(TimeAdjustError::TimeDeltaOverflow { .. })
        ));
    }

    // boot_hrtime = target hrtime - boot_hrtime_delta
    //
    // boot_hrtime = target hrtime - ((source_hrtime - boot_hrtime)
    //                                + (dst_wc - src.wall_clock()))
    #[test]
    fn test_calc_boot_hrtime() {
        // Handy units, all in nanoseconds.
        let day_ns = (SEC_PER_DAY * NS_PER_SEC) as i64;
        let min_ns = 60 * (NS_PER_SEC as i64);

        // Runs a successful adjustment and returns the new boot_hrtime.
        let new_boot_hrtime =
            |src_td: VmTimeData, dst_hrt: i64, dst_wc: Duration| -> i64 {
                let (adjusted, _) = adjust_time_data(src_td, dst_hrt, dst_wc)
                    .expect("adjustment should succeed");
                adjusted.boot_hrtime
            };

        // positive boot_hrtime: target hrtime > boot_hrtime_delta
        //
        // target_hrtime     = 4 days, 1 min, 500 ns
        // boot_hrtime_delta = 3 days, 300 ns
        //                     (3 days guest uptime + 300 ns migrate delta)
        // boot_hrtime       = 1 day, 1 min, 200 ns
        let src_td = VmTimeData {
            // 5 days of host uptime, guest booted at day 2
            // => 3 days of guest uptime
            hrtime: 5 * day_ns,
            boot_hrtime: 2 * day_ns,
            ..base_time_data()
        };
        let got = new_boot_hrtime(
            src_td,
            4 * day_ns + min_ns + 500,
            // migrate_delta: 300 ns dst_wc - 0 ns src_wc
            Duration::from_nanos(300),
        );
        assert_eq!(got, day_ns + min_ns + 200);

        // negative boot_hrtime result: target hrtime < boot_hrtime_delta
        // (guest has longer uptime than target host)
        //
        // target_hrtime     = 3 days, 300 ns
        // boot_hrtime_delta = 4 days, 1 min, 500 ns
        //                     (4 days guest uptime + 1 min 500 ns migrate
        //                     delta)
        // boot_hrtime       = -(1 day, 1 min, 200 ns)
        let src_td = VmTimeData {
            // 10 days of host uptime, guest booted at day 6
            // => 4 days of guest uptime
            hrtime: 10 * day_ns,
            boot_hrtime: 6 * day_ns,

            // src_wc = 5 sec
            hres_sec: 5,
            hres_ns: 0,
            ..base_time_data()
        };
        let got = new_boot_hrtime(
            src_td,
            3 * day_ns + 300,
            // migrate_delta = dst_wc - src_wc
            //  1 min 500 ns = dst_wc - 5 sec
            //        dst_wc = 65 sec 500 ns
            Duration::new(65, 500),
        );
        assert_eq!(got, -(day_ns + min_ns + 200));
    }

    #[test]
    fn test_calc_tsc_delta() {
        // valid input:
        // (migrate delta in seconds, guest frequency in Hz, expected ticks)
        let valid_cases: [(u64, u64, u64); 4] = [
            // 1 GHz, 1 second
            (1, 1_000_000_000, NS_PER_SEC),
            // 1 GHz, 20 seconds
            (20, 1_000_000_000, NS_PER_SEC * 20),
            // 2.5 GHz, 1 second
            (1, 2_500_000_000, 2_500_000_000),
            // 2.5 GHz, 20 seconds
            (20, 2_500_000_000, 50_000_000_000),
        ];
        for &(secs, guest_hz, expect) in valid_cases.iter() {
            let migrate_delta = Duration::from_nanos(NS_PER_SEC * secs);
            match calc_tsc_delta(migrate_delta, guest_hz) {
                Ok(ticks) => assert_eq!(ticks, expect),
                Err(e) => panic!("unexpected error: {:?}", e),
            }
        }

        // error cases, all with a migrate delta of u64::MAX seconds:
        // - u64::MAX Hz: delta * guest_hz overflows u128
        // - 1 GHz: (delta * guest_hz) / NS_PER_SEC overflows u64
        let overflowing_freqs: [u64; 2] = [u64::MAX, 1_000_000_000];
        for &guest_hz in overflowing_freqs.iter() {
            let res = calc_tsc_delta(Duration::from_secs(u64::MAX), guest_hz);
            assert!(matches!(
                res,
                Err(TimeAdjustError::TscAdjustOverflow { .. })
            ));
        }
    }

    #[test]
    fn test_calc_guest_tsc() {
        // Every case uses a migrate delta of 3 sec 3 ns: a target wall clock
        // of 3 sec 3 ns against the base source wall clock of zero.
        let delta = Duration::new(3, 3);
        let adjusted_tsc = |src_td: VmTimeData| -> u64 {
            let (adjusted, _) = adjust_time_data(src_td, 0, delta)
                .expect("adjustment should succeed");
            adjusted.guest_tsc
        };

        // valid input
        //
        // 1GHz: TSC + 3 sec 3 ns
        let one_ghz = VmTimeData { guest_freq: NS_PER_SEC, ..base_time_data() };
        assert_eq!(adjusted_tsc(one_ghz), 3 * NS_PER_SEC + 3);

        // 2GHz: TSC + 3 sec 3 ns, at two ticks per nanosecond
        let two_ghz =
            VmTimeData { guest_freq: 2 * NS_PER_SEC, ..base_time_data() };
        assert_eq!(adjusted_tsc(two_ghz), 2 * (3 * NS_PER_SEC + 3));

        // valid input: tsc overflows u64
        //
        // Starting from u64::MAX, adding 3 sec 3 ns of ticks at 1GHz wraps
        // the TSC around to 3 sec 2 ns.
        let about_to_wrap = VmTimeData {
            guest_freq: NS_PER_SEC,
            guest_tsc: u64::MAX,
            ..base_time_data()
        };
        assert_eq!(adjusted_tsc(about_to_wrap), 3 * NS_PER_SEC + 2);
    }
}


================================================
FILE: lib/propolis/src/vsock/buffer.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::num::NonZeroUsize;
use std::num::Wrapping;

/// Errors returned by [`VsockBuf`] operations.
#[derive(Debug, thiserror::Error)]
pub enum VsockBufError {
    /// A push was rejected because the data is larger than the free space.
    ///
    /// `pushed` is the length of the rejected data and `remaining` is the
    /// number of free bytes the buffer had at that moment.
    #[error(
        "VsockBuf has {remaining} bytes available but tried to push {pushed}"
    )]
    InsufficientSpace { pushed: usize, remaining: usize },
}

/// A ringbuffer used to store guest -> host data
///
/// `head` and `tail` are free-running (wrapping) byte counters rather than
/// indices: `head - tail` is the number of buffered bytes, and each counter
/// is reduced modulo the capacity to find its position inside `buf`.
pub struct VsockBuf {
    // Fixed-size backing storage; its length is the buffer's capacity.
    buf: Box<[u8]>,
    // Advanced by the number of bytes accepted on every successful push.
    head: Wrapping<usize>,
    // Advanced by the number of bytes a writer accepted in `write_to`.
    tail: Wrapping<usize>,
}

impl std::fmt::Debug for VsockBuf {
    /// Formats the buffer's bookkeeping -- capacity, the raw head/tail
    /// counters and the derived used/free byte counts -- without dumping the
    /// stored bytes.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let capacity = self.capacity();
        let in_use = self.len();
        let free = self.free();

        let mut out = f.debug_struct("VsockBuf");
        out.field("capacity", &capacity);
        out.field("head", &self.head);
        out.field("tail", &self.tail);
        out.field("in_use", &in_use);
        out.field("free", &free);
        out.finish()
    }
}

impl VsockBuf {
    /// Create a new `VsockBuf`
    ///
    /// The buffer is allocated up front, zero-filled, and never grows.
    pub fn new(capacity: NonZeroUsize) -> Self {
        let capacity = capacity.get();
        Self {
            buf: vec![0; capacity].into_boxed_slice(),
            head: Wrapping(0),
            tail: Wrapping(0),
        }
    }

    /// Total number of bytes the buffer can hold.
    pub fn capacity(&self) -> usize {
        self.buf.len()
    }

    /// Number of bytes currently buffered (pushed but not yet written out).
    pub fn len(&self) -> usize {
        (self.head - self.tail).0
    }

    /// Number of bytes that can still be pushed.
    fn free(&self) -> usize {
        self.capacity() - self.len()
    }

    /// Returns `true` when no bytes are buffered.
    pub fn is_empty(&self) -> bool {
        self.head == self.tail
    }

    /// Append `data` to the buffer.
    ///
    /// The push is all-or-nothing: if `data` does not fit in the free space,
    /// nothing is copied and [`VsockBufError::InsufficientSpace`] is returned.
    pub fn push(
        &mut self,
        data: impl AsRef<[u8]>,
    ) -> Result<(), VsockBufError> {
        let data = data.as_ref();

        if data.len() > self.free() {
            return Err(VsockBufError::InsufficientSpace {
                pushed: data.len(),
                remaining: self.free(),
            });
        }

        // NOTE(review): positions are derived with `%` from wrapping
        // counters, which is only continuous across a counter wraparound if
        // the capacity divides the counter range -- confirm this is
        // acceptable for the capacities in use.
        let head_offset = self.head.0 % self.buf.len();
        let available_len = self.buf.len() - head_offset;

        // If the data can fit in the remaining space of the ring buffer, copy
        // it in one go.
        if data.len() <= available_len {
            self.buf[head_offset..head_offset + data.len()]
                .copy_from_slice(data);
        // Otherwise, split it and write the remaining data to the front.
        } else {
            let (fits, wrapped) = data.split_at(available_len);
            self.buf[head_offset..].copy_from_slice(fits);
            self.buf[..wrapped.len()].copy_from_slice(wrapped);
        }

        self.head += Wrapping(data.len());
        Ok(())
    }

    /// Write buffered data to `writer`, consuming whatever it accepts.
    ///
    /// Returns the number of bytes the writer took, which may be less than
    /// [`VsockBuf::len`]. Up to two `write` calls are made when the buffered
    /// data wraps around the end of the backing storage.
    ///
    /// Bytes are removed from the buffer as soon as the writer accepts them.
    /// If the first of two writes succeeds and the second one fails, the
    /// bytes already written are reported as `Ok(n)` rather than discarded
    /// behind an error; the unwritten data stays buffered, so the failure
    /// shows up again on the next call if it persists.
    ///
    /// # Errors
    ///
    /// Returns the writer's error if the first `write` call fails; nothing
    /// is consumed in that case.
    pub fn write_to<W: std::io::Write>(
        &mut self,
        writer: &mut W,
    ) -> std::io::Result<usize> {
        // If we have no data to write, bail early
        if self.is_empty() {
            return Ok(0);
        }

        let tail_offset = self.tail.0 % self.buf.len();
        let head_offset = self.head.0 % self.buf.len();

        // If the data is contiguous, write it in one go
        if tail_offset < head_offset {
            let nwritten =
                writer.write(&self.buf[tail_offset..head_offset])?;
            self.tail += Wrapping(nwritten);
            return Ok(nwritten);
        }

        // Data wraps around (or fills the whole buffer), so write the
        // segment up to the end of the storage first.
        let first_len = self.buf.len() - tail_offset;
        let first = writer.write(&self.buf[tail_offset..])?;

        // Commit the first segment right away: these bytes are already in
        // the writer and must never be sent again, whatever happens next.
        self.tail += Wrapping(first);

        // Stop if the writer could not take the whole first segment, or if
        // there is no wrapped-around segment to follow up with.
        if first < first_len || head_offset == 0 {
            return Ok(first);
        }

        match writer.write(&self.buf[..head_offset]) {
            Ok(second) => {
                self.tail += Wrapping(second);
                Ok(first + second)
            }
            // Report the progress already made; the remaining bytes are
            // still buffered for the next attempt.
            Err(_) => Ok(first),
        }
    }
}

#[cfg(test)]
mod test {
    use std::{io::Cursor, num::NonZeroUsize};

    use crate::vsock::buffer::VsockBuf;

    // The sinks below start zero-filled and the pushed data is non-zero (or
    // otherwise distinguishable), so the content assertions actually prove
    // that bytes were copied out of the buffer.

    #[test]
    fn test_capacity_and_len() {
        let mut vb = VsockBuf::new(NonZeroUsize::new(10).unwrap());
        assert_eq!(vb.capacity(), 10);
        assert!(vb.is_empty());

        let data = vec![1; 8];
        let data_len = data.len();
        assert!(vb.push(data).is_ok());
        assert!(!vb.is_empty());
        assert_eq!(vb.capacity(), 10);
        assert_eq!(vb.len(), data_len);
    }

    #[test]
    fn test_push_less_than_capacity() {
        let mut vb = VsockBuf::new(NonZeroUsize::new(10).unwrap());
        let data = vec![1; 8];
        assert!(vb.push(data).is_ok());
    }

    #[test]
    fn test_push_more_than_capacity() {
        let mut vb = VsockBuf::new(NonZeroUsize::new(10).unwrap());
        let data = vec![1; 8];
        assert!(vb.push(data).is_ok());

        // A rejected push must leave the buffered data untouched.
        let data = vec![1; 8];
        assert!(vb.push(data).is_err());
        assert_eq!(vb.len(), 8);
    }

    #[test]
    fn test_write_to() {
        let mut vb = VsockBuf::new(NonZeroUsize::new(10).unwrap());
        let data = vec![1; 10];
        assert!(vb.push(data).is_ok());

        let mut some_socket = [0u8; 10];
        let mut cursor = Cursor::new(&mut some_socket[..]);
        assert!(vb.write_to(&mut cursor).is_ok_and(|n| n == 10));
        assert_eq!(some_socket, [1u8; 10]);
        assert!(vb.is_empty());
    }

    #[test]
    fn test_partial_write_to() {
        let mut vb = VsockBuf::new(NonZeroUsize::new(10).unwrap());
        let data: Vec<u8> = (0..10).collect();
        assert!(vb.push(data).is_ok());

        let mut some_socket = [0u8; 5];
        let mut cursor = Cursor::new(&mut some_socket[..]);
        assert!(vb.write_to(&mut cursor).is_ok_and(|n| n == 5));
        assert_eq!(vb.len(), 5, "5 bytes remain");
        assert_eq!(&cursor.get_ref()[..], &[0u8, 1, 2, 3, 4][..]);

        // reset the cursor and read another chunk
        cursor.set_position(0);
        assert!(vb.write_to(&mut cursor).is_ok_and(|n| n == 5));
        assert!(vb.is_empty());
        assert_eq!(some_socket, [5u8, 6, 7, 8, 9]);
    }

    #[test]
    fn test_wrap_around() {
        let mut vb = VsockBuf::new(NonZeroUsize::new(10).unwrap());
        let data = vec![1; 8];
        assert!(vb.push(data).is_ok());

        let mut some_socket = [0u8; 4];
        let mut cursor = Cursor::new(&mut some_socket[..]);
        assert!(vb.write_to(&mut cursor).is_ok_and(|n| n == 4));
        assert_eq!(some_socket, [1u8; 4]);

        // This push has to wrap around the end of the backing storage.
        let data = vec![2; 4];
        assert!(vb.push(data).is_ok());

        let mut some_socket = [0u8; 8];
        let mut cursor = Cursor::new(&mut some_socket[..]);
        assert!(vb.write_to(&mut cursor).is_ok_and(|n| n == 8));
        assert_eq!(some_socket, [1, 1, 1, 1, 2, 2, 2, 2]);
        assert!(vb.is_empty());
    }
}


================================================
FILE: lib/propolis/src/vsock/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

pub mod buffer;
pub mod packet;

#[cfg(target_os = "illumos")]
pub mod poller;

#[cfg(not(target_os = "illumos"))]
#[path = "poller_stub.rs"]
pub mod poller;

pub mod proxy;
pub use proxy::VsockProxy;

/// Well-known CID for the host
///
/// Packets originating from the host carry this value as their source CID.
pub(crate) const VSOCK_HOST_CID: u64 = 2;

#[derive(Debug, thiserror::Error)]
#[error("guest cid {0} contains reserved bits")]
pub struct InvalidGuestCid(u64);

/// A guest context ID that has passed validation.
///
/// The wrapped value is private, so outside this module a `GuestCid` can only
/// be obtained through the `TryFrom<u64>` conversion.
#[derive(Debug, Copy, Clone)]
pub struct GuestCid(u64);

impl GuestCid {
    /// Returns the raw numeric CID.
    pub const fn get(&self) -> u64 {
        let Self(raw) = *self;
        raw
    }
}

impl TryFrom<u64> for GuestCid {
    type Error = InvalidGuestCid;

    /// Validates a raw CID for use as a guest's context ID.
    ///
    /// # Errors
    ///
    /// Returns [`InvalidGuestCid`] (carrying the rejected value) when the
    /// value is one of the reserved CIDs -- 0, 1, 2 (the host) or
    /// `0xffff_ffff` -- or when any of the upper 32 bits are set.
    fn try_from(value: u64) -> Result<Self, Self::Error> {
        match value {
            // Within the virtio spec cid 0,1, and 2 have special meaning.
            0..=2 => Err(InvalidGuestCid(value)),
            // The virtio spec also reserves 0xffff_ffff, so it cannot be
            // used as a guest's context ID either.
            0xffff_ffff => Err(InvalidGuestCid(value)),
            // The upper 32 bits of the cid are reserved
            cid if cid >> 32 != 0 => Err(InvalidGuestCid(cid)),
            // This cid is valid
            cid => Ok(GuestCid(cid)),
        }
    }
}

/// Errors reported by a [`VsockBackend`].
#[derive(Debug, thiserror::Error)]
pub enum VsockError {
    /// Notifying the virtqueue with index `queue` failed.
    #[error("failed to send virt queue notification for queue {}", queue)]
    QueueNotify { queue: u16 },
}

/// Interface a vsock backend has to provide.
///
/// Implementations must be shareable across threads (`Send + Sync`) and own
/// all of their data (`'static`).
pub trait VsockBackend: Send + Sync + 'static {
    /// Handles a notification for the virtqueue identified by `queue_id`.
    ///
    /// NOTE(review): presumably called when the guest kicks that queue --
    /// confirm against the implementors.
    fn queue_notify(&self, queue_id: u16) -> Result<(), VsockError>;
}

// USDT probes for vsock traffic, registered under the "propolis" provider.
// Each probe receives the header of the packet being transferred; the probe
// bodies are intentionally empty because the `usdt::provider` macro supplies
// the implementation.
#[usdt::provider(provider = "propolis")]
mod probes {
    use crate::vsock::packet::VsockPacketHeader;

    /// Host->Guest
    fn vsock_pkt_rx(hdr: &VsockPacketHeader) {}
    /// Guest->Host
    fn vsock_pkt_tx(hdr: &VsockPacketHeader) {}
}


================================================
FILE: lib/propolis/src/vsock/packet.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use serde::{Serialize, Serializer};
use strum::FromRepr;
use zerocopy::byteorder::little_endian::{U16, U32, U64};
use zerocopy::{FromBytes, Immutable, IntoBytes};

use crate::vsock::proxy::CONN_TX_BUF_SIZE;
use crate::vsock::{GuestCid, VSOCK_HOST_CID};

bitflags! {
    /// Shutdown flags for VIRTIO_VSOCK_OP_SHUTDOWN
    ///
    /// Stored in the `flags` field of the packet header. The type is
    /// `repr(transparent)` over the raw `u32` bit pattern.
    #[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize)]
    #[repr(transparent)]
    pub struct VsockPacketFlags: u32 {
        // Bit 0: the "receive" direction is being shut down.
        const VIRTIO_VSOCK_SHUTDOWN_F_RECEIVE = 1 << 0;
        // Bit 1: the "send" direction is being shut down.
        const VIRTIO_VSOCK_SHUTDOWN_F_SEND = 1 << 1;
    }
}

/// Socket type carried in the header's `socket_type` field ("type" in the
/// spec).
///
/// The discriminants are the on-the-wire `u16` values; values without a
/// matching variant cannot be converted with `from_repr`.
#[derive(Debug, Clone, Copy, FromRepr, PartialEq, Eq, Serialize)]
#[repr(u16)]
pub enum VsockSocketType {
    Stream = 1,
    SeqPacket = 2,
    // Test-only variant; judging by its name it stands in for an invalid
    // on-the-wire value in tests.
    #[cfg(test)]
    InvalidTestValue = 0x1de,
}

/// Errors describing why a vsock packet could not be accepted.
#[derive(thiserror::Error, Debug)]
pub enum VsockPacketError {
    /// The packet header could not be read from the descriptor chain.
    #[error("failed to read packet header from descriptor chain")]
    ChainHeaderRead,

    /// The length reported by the header (`hdr_len`) disagrees with what the
    /// descriptor chain contains (`chain_len`).
    #[error(
        "vsock packet header reported {hdr_len} bytes but the descriptor \
        chain contains {chain_len}"
    )]
    InvalidPacketLen { hdr_len: usize, chain_len: usize },

    /// The descriptor chain yielded fewer bytes than expected.
    #[error(
        "descriptor chain only yielded {remaining} bytes out of {expected} \
        bytes"
    )]
    InsufficientBytes { expected: usize, remaining: usize },

    /// The packet's source CID is not acceptable.
    #[error("src_cid {src_cid} contains reserved bits")]
    InvalidSrcCid { src_cid: u64 },

    /// The packet's destination CID is not acceptable.
    #[error("dst_cid {dst_cid} contains reserved bits")]
    InvalidDstCid { dst_cid: u64 },
}

/// Operation carried in the header's `op` field.
///
/// The discriminants are the on-the-wire `u16` values; values without a
/// matching variant cannot be converted with `from_repr`.
#[derive(Clone, Copy, Debug, FromRepr, Eq, PartialEq, Serialize)]
#[repr(u16)]
pub enum VsockPacketOp {
    Request = 1,
    Response = 2,
    Reset = 3,
    // Shutdown packets carry `VsockPacketFlags` in the header's flags field.
    Shutdown = 4,
    // The only operation built here with a data payload.
    ReadWrite = 5,
    CreditUpdate = 6,
    CreditRequest = 7,
}

/// The addressing fields needed to send a packet to a guest.
///
/// The source CID is not part of this struct: packets built from it always
/// use the host's well-known CID as their source.
pub struct VsockGuestAddr {
    /// Guest context ID
    pub guest_cid: GuestCid,
    /// Host port
    pub src_port: u32,
    /// Guest port
    pub dst_port: u32,
}

/// The fixed-size header that precedes every vsock packet.
///
/// The layout is `repr(C, packed)` and every field is a little-endian
/// integer, so the struct is 44 bytes with no padding and can be converted
/// to and from raw bytes via the `zerocopy` derives. Fields are private; use
/// the accessor and `set_*` methods.
///
/// Note that the derived `Default` zeroes every field, which differs from
/// [`VsockPacketHeader::new`].
#[repr(C, packed)]
#[derive(Copy, Clone, Default, Debug, FromBytes, IntoBytes, Immutable)]
pub struct VsockPacketHeader {
    // Context ID of the sender.
    src_cid: U64,
    // Context ID of the receiver.
    dst_cid: U64,
    // Port on the sender's side.
    src_port: U32,
    // Port on the receiver's side.
    dst_port: U32,
    // Length of the data payload that follows the header.
    len: U32,
    // Note this is "type" in the spec
    socket_type: U16,
    // Raw operation code; see `VsockPacketOp`.
    op: U16,
    // Raw flag bits; see `VsockPacketFlags`.
    flags: U32,
    // Credit fields: receive buffer size advertised by the sender, and
    // NOTE(review): presumably the count of bytes it has consumed so far --
    // confirm against the virtio vsock flow-control rules.
    buf_alloc: U32,
    fwd_cnt: U32,
}

// NB: This implementation is here to support dtrace usdt probes.
//
// It is written by hand so that the decoded views of `socket_type`, `op` and
// `flags` are emitted (via the accessor methods) and the remaining fields
// are emitted as plain integers instead of the little-endian wrappers.
impl Serialize for VsockPacketHeader {
    fn serialize<S: Serializer>(
        &self,
        serializer: S,
    ) -> Result<S::Ok, S::Error> {
        use serde::ser::SerializeStruct;
        // The length passed here must match the number of fields serialized
        // below.
        let mut s = serializer.serialize_struct("VsockPacketHeader", 10)?;
        s.serialize_field("src_cid", &self.src_cid.get())?;
        s.serialize_field("dst_cid", &self.dst_cid.get())?;
        s.serialize_field("src_port", &self.src_port.get())?;
        s.serialize_field("dst_port", &self.dst_port.get())?;
        s.serialize_field("len", &self.len.get())?;
        // These three go through the accessors, so unknown socket types and
        // ops are serialized as `None`.
        s.serialize_field("socket_type", &self.socket_type())?;
        s.serialize_field("op", &self.op())?;
        s.serialize_field("flags", &self.flags())?;
        s.serialize_field("buf_alloc", &self.buf_alloc.get())?;
        s.serialize_field("fwd_cnt", &self.fwd_cnt.get())?;
        s.end()
    }
}

// Accessors return native-endian values decoded from the little-endian
// fields; the `set_*` methods return `&mut Self` so calls can be chained.
impl VsockPacketHeader {
    /// Source context ID.
    pub fn src_cid(&self) -> u64 {
        self.src_cid.get()
    }

    /// Destination context ID.
    pub fn dst_cid(&self) -> u64 {
        self.dst_cid.get()
    }

    /// Source port.
    pub fn src_port(&self) -> u32 {
        self.src_port.get()
    }

    /// Destination port.
    pub fn dst_port(&self) -> u32 {
        self.dst_port.get()
    }

    /// Payload length recorded in the header.
    pub fn len(&self) -> u32 {
        self.len.get()
    }

    /// Socket type, or `None` if the raw value is not a known
    /// [`VsockSocketType`].
    pub fn socket_type(&self) -> Option<VsockSocketType> {
        VsockSocketType::from_repr(self.socket_type.get())
    }

    /// Operation, or `None` if the raw value is not a known
    /// [`VsockPacketOp`].
    pub fn op(&self) -> Option<VsockPacketOp> {
        VsockPacketOp::from_repr(self.op.get())
    }

    /// Flags, with any unknown bits retained rather than dropped.
    pub fn flags(&self) -> VsockPacketFlags {
        VsockPacketFlags::from_bits_retain(self.flags.get())
    }

    /// Raw `buf_alloc` credit field.
    pub fn buf_alloc(&self) -> u32 {
        self.buf_alloc.get()
    }

    /// Raw `fwd_cnt` credit field.
    pub fn fwd_cnt(&self) -> u32 {
        self.fwd_cnt.get()
    }

    /// Creates a header with the defaults used for host-built packets.
    ///
    /// Everything is zero except the socket type, which is `Stream`, and
    /// `buf_alloc`, which is set to `CONN_TX_BUF_SIZE`. The operation is left
    /// at 0, which is not a valid [`VsockPacketOp`], so callers are expected
    /// to set it.
    pub const fn new() -> Self {
        Self {
            src_cid: U64::new(0),
            dst_cid: U64::new(0),
            src_port: U32::new(0),
            dst_port: U32::new(0),
            len: U32::new(0),
            socket_type: U16::new(VsockSocketType::Stream as u16),
            op: U16::new(0),
            flags: U32::new(0),
            buf_alloc: U32::new(CONN_TX_BUF_SIZE as u32),
            fwd_cnt: U32::new(0),
        }
    }

    /// Sets the source CID to the host's well-known CID.
    pub const fn set_host_src_cid(&mut self) -> &mut Self {
        self.src_cid = U64::new(VSOCK_HOST_CID);
        self
    }

    /// Test-only: sets the source CID to a guest CID.
    #[cfg(test)]
    pub const fn set_src_cid(&mut self, cid: GuestCid) -> &mut Self {
        self.src_cid = U64::new(cid.get());
        self
    }

    /// Test-only: sets the destination CID to an arbitrary raw value,
    /// bypassing `GuestCid` validation.
    #[cfg(test)]
    pub const fn set_dst_cid_raw(&mut self, cid: u64) -> &mut Self {
        self.dst_cid = U64::new(cid);
        self
    }

    /// Sets the destination CID to a validated guest CID.
    pub const fn set_dst_cid(&mut self, cid: GuestCid) -> &mut Self {
        self.dst_cid = U64::new(cid.get());
        self
    }

    /// Sets the source port.
    pub const fn set_src_port(&mut self, port: u32) -> &mut Self {
        self.src_port = U32::new(port);
        self
    }

    /// Sets the destination port.
    pub const fn set_dst_port(&mut self, port: u32) -> &mut Self {
        self.dst_port = U32::new(port);
        self
    }

    /// Sets the payload length recorded in the header.
    pub const fn set_len(&mut self, len: u32) -> &mut Self {
        self.len = U32::new(len);
        self
    }

    /// Sets the socket type.
    pub const fn set_socket_type(
        &mut self,
        socket_type: VsockSocketType,
    ) -> &mut Self {
        self.socket_type = U16::new(socket_type as u16);
        self
    }

    /// Sets the operation.
    pub const fn set_op(&mut self, op: VsockPacketOp) -> &mut Self {
        self.op = U16::new(op as u16);
        self
    }

    /// Sets the flags field to the raw bits of `flags`.
    pub const fn set_flags(&mut self, flags: VsockPacketFlags) -> &mut Self {
        self.flags = U32::new(flags.bits());
        self
    }

    /// Sets the `buf_alloc` credit field.
    pub const fn set_buf_alloc(&mut self, buf_alloc: u32) -> &mut Self {
        self.buf_alloc = U32::new(buf_alloc);
        self
    }

    /// Sets the `fwd_cnt` credit field.
    pub const fn set_fwd_cnt(&mut self, fwd_cnt: u32) -> &mut Self {
        self.fwd_cnt = U32::new(fwd_cnt);
        self
    }
}

/// A vsock packet: the fixed header plus its (possibly empty) data payload.
#[derive(Default)]
pub struct VsockPacket {
    // Header describing the packet.
    pub(crate) header: VsockPacketHeader,
    // Payload bytes; empty for packets that carry no data.
    pub(crate) data: Box<[u8]>,
}

/// Debug formatting that prints the header in full but only the length of the
/// payload, so that packet contents are never dumped into logs.
impl std::fmt::Debug for VsockPacket {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let data_len = self.data.len();

        let mut out = f.debug_struct("VsockPacket");
        out.field("header", &self.header);
        out.field("data_len", &data_len);
        out.finish()
    }
}

impl VsockPacket {
    fn new(addr: VsockGuestAddr, op: VsockPacketOp) -> Self {
        let mut header = VsockPacketHeader::new();
        header
            .set_host_src_cid()
            .set_dst_cid(addr.guest_cid)
            .set_src_port(addr.src_port)
            .set_dst_port(addr.dst_port)
            .set_op(op);

        Self { header, data: [].into() }
    }

    pub fn new_reset(addr: VsockGuestAddr) -> Self {
        Self::new(addr, VsockPacketOp::Reset)
    }

    pub fn new_response(addr: VsockGuestAddr) -> Self {
        let packet = Self::new(addr, VsockPacketOp::Response);
        packet
    }

    /// Create a new RW packet that sets the len field to the size of the data.
    ///
    /// Panics if the supplied data value is greater than `u32::MAX`, as
    /// anything larger would not fit within the peer's `buf_alloc`, which is
    /// defined as `u32`.
    pub fn new_rw(
        addr: VsockGuestAddr,
        fwd_cnt: u32,
        data: impl Into<Box<[u8]>>,
    ) -> Self {
        let data = data.into();
        let len = data.len();
        assert!(
            len < u32::MAX as usize,
            "vsock packets should not exceed u32::MAX"
        );
        let mut packet = Self::new(addr, VsockPacketOp::ReadWrite);
        packet.header.set_len(len as u32);
        packet.header.set_fwd_cnt(fwd_cnt);
        packet.data = data;
        packet
    }

    pub fn new_credit_update(addr: VsockGuestAddr, fwd_cnt: u32) -> Self {
        let mut packet = Self::new(addr, VsockPacketOp::CreditUpdate);
        packet.header.set_fwd_cnt(fwd_cnt);
        packet
    }

    pub fn new_shutdown(
        addr: VsockGuestAddr,
        flags: VsockPacketFlags,
        fwd_cnt: u32,
    ) -> Self {
        let mut packet = Self::new(addr, VsockPacketOp::Shutdown);
        packet.header.set_fwd_cnt(fwd_cnt);
        packet.header.set_flags(flags);
        packet
    }
}


================================================
FILE: lib/propolis/src/vsock/poller.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::collections::VecDeque;
use std::ffi::c_void;
use std::io::ErrorKind;
use std::io::Read;
use std::os::fd::{AsFd, AsRawFd, BorrowedFd, FromRawFd, OwnedFd, RawFd};
use std::sync::mpsc;
use std::sync::Arc;
use std::sync::Barrier;
use std::sync::Condvar;
use std::sync::Mutex;
use std::thread::JoinHandle;
use std::time::Duration;
use std::time::Instant;

use iddqd::IdHashMap;
use nix::poll::PollFlags;
use slog::{error, info, warn, Logger};

use crate::hw::virtio::vsock::VsockVq;
use crate::hw::virtio::vsock::VSOCK_RX_QUEUE;
use crate::hw::virtio::vsock::VSOCK_TX_QUEUE;
use crate::vsock::packet::VsockPacket;
use crate::vsock::packet::VsockPacketFlags;
use crate::vsock::packet::VsockSocketType;
use crate::vsock::probes;
use crate::vsock::proxy::ConnKey;
use crate::vsock::proxy::VsockPortMapping;
use crate::vsock::proxy::VsockProxyConn;
use crate::vsock::GuestCid;
use crate::vsock::VSOCK_HOST_CID;

use super::packet::VsockGuestAddr;
use super::packet::VsockPacketOp;

/// How long we will wait to receive a RST from a guest when closing a
/// connection, and how long we will wait when a guest closes its connection
/// for the host to drain its vbuf to the underlying socket.
///
/// The event loop also uses this value as its `port_getn` timeout so that it
/// wakes up often enough to reap connections on the quiesce queue.
const DEFAULT_QUIESCE_TIMEOUT: Duration = Duration::from_secs(2);

/// User events delivered to the poller through its event port.
///
/// The discriminant is the value that travels as the user payload of the
/// event, so it must stay in sync with the `TryFrom<usize>` impl below.
#[repr(usize)]
enum VsockEvent {
    /// The guest's tx virtqueue needs servicing.
    TxQueue = 0,
    /// The guest's rx virtqueue needs servicing.
    RxQueue = 1,
    /// The event loop has been asked to pause.
    Pause = 2,
}

/// Decode the user payload of an event port event back into a `VsockEvent`,
/// handing the raw value back as the error when it is not recognised.
impl TryFrom<usize> for VsockEvent {
    type Error = usize;

    fn try_from(value: usize) -> Result<Self, Self::Error> {
        const TX_QUEUE: usize = VsockEvent::TxQueue as usize;
        const RX_QUEUE: usize = VsockEvent::RxQueue as usize;
        const PAUSE: usize = VsockEvent::Pause as usize;

        let event = match value {
            TX_QUEUE => Self::TxQueue,
            RX_QUEUE => Self::RxQueue,
            PAUSE => Self::Pause,
            _ => return Err(value),
        };
        Ok(event)
    }
}

/// Shared "is the event loop running" flag, paired with a condition variable
/// so that other threads can block until the loop has stopped.
struct PollerState {
    /// Signalled whenever `running` transitions from `true` to `false`.
    cv: Condvar,
    /// `true` while the event loop considers itself running.
    running: Mutex<bool>,
}

impl PollerState {
    /// Create a new state in the stopped (not running) condition.
    fn new() -> Self {
        let running = Mutex::new(false);
        let cv = Condvar::new();
        Self { cv, running }
    }

    /// Block the calling thread until the running flag is `false`.
    ///
    /// Returns immediately if the flag is already `false`.
    fn wait_stopped(&self) {
        let mut running = self.running.lock().unwrap();
        while *running {
            running = self.cv.wait(running).unwrap();
        }
    }

    /// Clear the running flag, waking every waiter if it was previously set.
    fn set_stopped(&self) {
        let mut running = self.running.lock().unwrap();
        let was_running = std::mem::replace(&mut *running, false);
        if was_running {
            self.cv.notify_all();
        }
    }

    /// Set the running flag. Waiters are not notified, since they only ever
    /// wait for the flag to become `false`.
    fn set_running(&self) {
        *self.running.lock().unwrap() = true;
    }
}

/// Commands that can be sent to a paused `poller_loop`.
///
/// These travel over the `pause_tx`/`pause_rx` channel rather than through the
/// event port.
enum PausedCmd {
    /// Continue execution
    Resume,
    /// Cleanup all connection state and queued packets
    ///
    /// The sender blocks on the receiving half of `oneshot` after issuing this
    /// command (see `VsockPollerNotify::reset`); presumably the poller signals
    /// it once the reset has completed — confirm against `poller_loop`.
    Reset { oneshot: mpsc::SyncSender<()> },
    /// Shutdown the event-loop
    Halt,
}

/// Handle used by other threads to notify and control a [`VsockPoller`].
///
/// Obtained from [`VsockPoller::notify_handle`]; it shares the poller's event
/// port, lifecycle state and pause-command channel.
pub struct VsockPollerNotify {
    /// The poller's event port fd, used to post user events to it.
    port_fd: Arc<OwnedFd>,
    /// The poller's shared running/stopped state.
    state: Arc<PollerState>,
    /// Sending half of the channel carrying [`PausedCmd`]s to the poller.
    pause_tx: mpsc::SyncSender<PausedCmd>,
}

impl VsockPollerNotify {
    fn port_fd(&self) -> BorrowedFd<'_> {
        self.port_fd.as_fd()
    }

    fn port_send(&self, event: VsockEvent) -> std::io::Result<()> {
        let ret = unsafe {
            libc::port_send(self.port_fd().as_raw_fd(), 0, event as usize as _)
        };

        if ret == 0 {
            Ok(())
        } else {
            Err(std::io::Error::last_os_error())
        }
    }

    pub fn queue_notify(&self, id: u16) -> std::io::Result<()> {
        match id {
            VSOCK_RX_QUEUE => self.port_send(VsockEvent::RxQueue),
            VSOCK_TX_QUEUE => self.port_send(VsockEvent::TxQueue),
            _ => Ok(()),
        }
    }

    pub fn pause(&self) -> std::io::Result<()> {
        self.port_send(VsockEvent::Pause)
    }

    pub fn resume(&self) {
        self.pause_tx.send(PausedCmd::Resume).unwrap();
    }

    pub fn reset(&self) {
        let (tx, rx) = mpsc::sync_channel(0);
        self.pause_tx.send(PausedCmd::Reset { oneshot: tx }).unwrap();
        rx.recv().unwrap();
    }

    pub fn halt(&self) {
        self.pause_tx.send(PausedCmd::Halt).unwrap();
    }

    pub fn wait_stopped(&self) {
        self.state.wait_stopped();
    }
}

/// Returns `true` if `flags` contains any of the bits we treat as a readable
/// event: `POLLIN`, `POLLHUP`, `POLLERR` or `POLLPRI`.
const fn is_readable(flags: PollFlags) -> bool {
    const READABLE: PollFlags = PollFlags::POLLIN
        .union(PollFlags::POLLHUP)
        .union(PollFlags::POLLERR)
        .union(PollFlags::POLLPRI);
    flags.intersects(READABLE)
}

/// Returns `true` if `flags` contains any of the bits we treat as a writable
/// event: `POLLOUT`, `POLLHUP` or `POLLERR`.
const fn is_writable(flags: PollFlags) -> bool {
    const WRITABLE: PollFlags = PollFlags::POLLOUT
        .union(PollFlags::POLLHUP)
        .union(PollFlags::POLLERR);
    flags.intersects(WRITABLE)
}

/// A control packet queued for delivery to the guest.
///
/// Events are queued on `VsockPoller::rx` and turned into packets by
/// `process_pending_rx` once an rx virtqueue permit is available.
#[derive(Debug)]
enum RxEvent {
    /// Vsock RST packet
    Reset(ConnKey),
    /// Vsock RESPONSE packet
    NewConnection(ConnKey),
    /// Vsock CREDIT_UPDATE packet
    CreditUpdate(ConnKey),
}

/// An entry on the poller's quiesce queue: a connection that is closing and
/// will be reaped once `DEFAULT_QUIESCE_TIMEOUT` has elapsed.
struct ClosingConn {
    /// The connection being closed.
    key: ConnKey,
    /// When the connection was placed on the quiesce queue.
    started: Instant,
}

/// Event loop state for proxying vsock connections between a guest and host
/// sockets.
pub struct VsockPoller {
    /// Logger used for all poller diagnostics
    log: Logger,
    /// Barrier supplied by the creator of the poller; presumably used to
    /// synchronize the start of the event loop — confirm in `poller_loop`.
    start_barrier: Arc<Barrier>,
    /// The guest context id
    guest_cid: GuestCid,
    /// Port mappings we are proxying packets to and from
    port_mappings: IdHashMap<VsockPortMapping>,
    /// The event port fd.
    port_fd: Arc<OwnedFd>,
    /// The virtqueues associated with the vsock device
    queues: VsockVq,
    /// Handle used to manage the [`Lifecycle`] of the event-loop
    state: Arc<PollerState>,
    /// Receiving half of the channel carrying [`PausedCmd`]s
    pause_rx: mpsc::Receiver<PausedCmd>,
    /// Sending half of the [`PausedCmd`] channel, cloned into every
    /// [`VsockPollerNotify`] handle
    pause_tx: mpsc::SyncSender<PausedCmd>,
    /// The connection map of guest connected streams
    connections: HashMap<ConnKey, VsockProxyConn>,
    /// Queue of vsock packets that need to be sent to the guest
    rx: VecDeque<RxEvent>,
    /// Connections blocked waiting for rx queue descriptors
    rx_blocked: Vec<ConnKey>,
    /// Connections waiting to be reaped
    quiescing: VecDeque<ClosingConn>,
}

impl VsockPoller {
    /// Create a new `VsockPoller`.
    ///
    /// This poller is responsible for driving virtio-socket connections between
    /// the guest VM and host sockets.
    pub fn new(
        log: Logger,
        start_barrier: Arc<Barrier>,
        cid: GuestCid,
        queues: VsockVq,
        port_mappings: IdHashMap<VsockPortMapping>,
    ) -> std::io::Result<Self> {
        let port_fd = unsafe {
            let fd = match libc::port_create() {
                -1 => return Err(std::io::Error::last_os_error()),
                fd => fd,
            };

            // Set CLOEXEC on the event port fd
            if libc::fcntl(
                fd,
                libc::F_SETFD,
                libc::fcntl(fd, libc::F_GETFD) | libc::FD_CLOEXEC,
            ) < 0
            {
                return Err(std::io::Error::last_os_error());
            };

            fd
        };

        let (pause_tx, pause_rx) = mpsc::sync_channel(1);

        info!(
            &log,
            "vsock poller configured with";
            "mappings" => ?port_mappings,
        );

        Ok(Self {
            log,
            start_barrier,
            guest_cid: cid,
            port_mappings,
            port_fd: Arc::new(unsafe { OwnedFd::from_raw_fd(port_fd) }),
            queues,
            state: Arc::new(PollerState::new()),
            pause_rx,
            pause_tx,
            connections: Default::default(),
            rx: Default::default(),
            rx_blocked: Default::default(),
            quiescing: Default::default(),
        })
    }

    /// Create a [`VsockPollerNotify`] that shares this poller's event port,
    /// lifecycle state and pause-command channel.
    pub fn notify_handle(&self) -> VsockPollerNotify {
        let port_fd = Arc::clone(&self.port_fd);
        let state = Arc::clone(&self.state);
        let pause_tx = self.pause_tx.clone();

        VsockPollerNotify { port_fd, state, pause_tx }
    }

    /// Consume the poller and run its event loop on a dedicated thread named
    /// `vsock-event-loop`, returning the thread's join handle.
    ///
    /// Panics if the thread cannot be spawned.
    pub fn run(mut self) -> JoinHandle<()> {
        let builder =
            std::thread::Builder::new().name(String::from("vsock-event-loop"));
        let spawned = builder.spawn(move || self.poller_loop());
        spawned.expect("failed to spawn vsock event loop")
    }

    /// Handle the guest's VIRTIO_VSOCK_OP_REQUEST packet.
    ///
    /// - A request for a key we already track is answered with a RST.
    /// - A request for a port without a mapping is dropped silently.
    /// - Otherwise a proxy connection to the mapped address is created, seeded
    ///   with the guest's credit info, and a RESPONSE is queued. If creating
    ///   the connection fails, a RST is queued instead.
    fn handle_connection_request(&mut self, key: ConnKey, packet: VsockPacket) {
        // Connection already exists
        if self.connections.contains_key(&key) {
            self.send_conn_rst(key);
            return;
        }

        let dst_port = packet.header.dst_port();
        let mapping = match self.port_mappings.get(&dst_port) {
            Some(mapping) => mapping,
            None => {
                // Drop the unknown connection so that it times out in the
                // guest.
                warn!(
                    &self.log,
                    "dropping connect request to unknown mapping";
                    "packet" => ?packet,
                );
                return;
            }
        };

        let mut conn = match VsockProxyConn::new(mapping.addr()) {
            Ok(conn) => conn,
            Err(e) => {
                self.send_conn_rst(key);
                error!(self.log, "{e}");
                return;
            }
        };

        conn.update_peer_credit(&packet.header);
        self.connections.insert(key, conn);
        self.rx.push_back(RxEvent::NewConnection(key));
    }

    /// Handle the guest's VIRTIO_VSOCK_OP_SHUTDOWN packet.
    ///
    /// Applies the receive and/or send half-shutdown indicated by `flags` to
    /// the connection identified by `key`. If either state transition fails
    /// the connection is dropped and a RST is queued. Once the connection
    /// reports that it should close, it is either torn down immediately (no
    /// buffered data) or placed on the quiesce queue so that buffered data
    /// gets a chance to drain first.
    ///
    /// Does nothing if `key` is not a tracked connection.
    fn handle_shutdown(&mut self, key: ConnKey, flags: VsockPacketFlags) {
        if let Entry::Occupied(mut entry) = self.connections.entry(key) {
            let conn = entry.get_mut();

            // Guest won't receive more data
            if flags.contains(VsockPacketFlags::VIRTIO_VSOCK_SHUTDOWN_F_RECEIVE)
            {
                if let Err(e) = conn.shutdown_guest_read() {
                    error!(
                        &self.log,
                        "cannot transition vsock connection state: {e}";
                        "conn" => ?conn,
                    );
                    entry.remove();
                    self.send_conn_rst(key);
                    return;
                };
            }
            // Guest won't send more data
            if flags.contains(VsockPacketFlags::VIRTIO_VSOCK_SHUTDOWN_F_SEND) {
                if let Err(e) = conn.shutdown_guest_write() {
                    error!(
                        &self.log,
                        "cannot transition vsock connection state: {e}";
                        "conn" => ?conn,
                    );
                    entry.remove();
                    self.send_conn_rst(key);
                    return;
                };
            }

            if conn.should_close() {
                if !conn.has_buffered_data() {
                    // Nothing left to flush to the host socket, so the
                    // connection can be dropped right away.
                    self.connections.remove(&key);
                    // virtio spec states:
                    //
                    // Clean disconnect is achieved by one or more
                    // VIRTIO_VSOCK_OP_SHUTDOWN packets that indicate no
                    // more data will be sent and received, followed by a
                    // VIRTIO_VSOCK_OP_RST response from the peer.
                    self.send_conn_rst(key);
                } else {
                    // Data is still buffered: keep the connection around and
                    // let `quiesce_connections` reap it after the timeout if
                    // it has not been removed by then.
                    self.quiescing.push_back(ClosingConn {
                        key,
                        started: Instant::now(),
                    });
                }
            }
        }
    }

    /// Handle the guest's VIRTIO_VSOCK_OP_RW packet.
    ///
    /// Hands the packet to the connection identified by `key`. If the
    /// connection cannot accept it, the connection is dropped and a RST is
    /// queued; otherwise the connection's socket is (re)associated with the
    /// event port for whatever it is currently interested in.
    ///
    /// Does nothing if `key` is not a tracked connection.
    fn handle_rw_packet(&mut self, key: ConnKey, packet: VsockPacket) {
        let Entry::Occupied(mut entry) = self.connections.entry(key) else {
            return;
        };
        let conn = entry.get_mut();

        // We have a valid connection, so attempt to consume the guest's
        // packet.
        if let Err(e) = conn.recv_packet(packet) {
            error!(
                &self.log,
                "failed to push vsock packet data into the conn vbuf: {e}";
                "conn" => ?conn,
            );

            entry.remove();
            self.send_conn_rst(key);
            return;
        }

        let Some(interests) = conn.poll_interests() else {
            return;
        };
        let fd = conn.get_fd();
        self.associate_fd(key, fd, interests);
    }

    /// Handle the guest's tx virtqueue.
    ///
    /// Drains every packet currently available on the guest's tx queue. Each
    /// packet is validated (destination CID, source CID, socket type and op
    /// code) and then dispatched depending on whether it belongs to a
    /// connection we are already tracking. Packets that fail validation are
    /// logged and skipped; they never stop the remaining packets from being
    /// processed.
    fn handle_tx_queue_event(&mut self) {
        loop {
            let packet = match self.queues.recv_packet().transpose() {
                Ok(Some(packet)) => packet,
                // No more packets on the guest's tx queue
                Ok(None) => break,
                Err(e) => {
                    warn!(&self.log, "dropping invalid vsock packet: {e}");
                    continue;
                }
            };

            probes::vsock_pkt_tx!(|| &packet.header);
            // If the packet is not destined for the host drop it.
            if packet.header.dst_cid() != VSOCK_HOST_CID {
                warn!(
                    &self.log,
                    "droppping vsock packet not destined for the host";
                    "packet" => ?packet,
                );
                continue;
            }

            // If the packet is not coming from our guest drop it.
            if packet.header.src_cid() != self.guest_cid.get() {
                // Note that we could send a RST here but technically we should
                // not know how to address this guest cid as it's not the one
                // we assigned to our guest.
                warn!(
                    &self.log,
                    "droppping vsock packet not arriving from our guest cid";
                    "packet" => ?packet,
                );
                continue;
            }

            let key = ConnKey {
                host_port: packet.header.dst_port(),
                guest_port: packet.header.src_port(),
            };

            // We only support stream connections
            let Some(VsockSocketType::Stream) = packet.header.socket_type()
            else {
                self.send_conn_rst(key);
                warn!(&self.log,
                    "received invalid vsock packet type";
                    "packet" => ?packet,
                );
                continue;
            };

            let Some(packet_op) = packet.header.op() else {
                warn!(
                    &self.log,
                    "received vsock packet with unknown op code";
                    "packet" => ?packet,
                );
                // Skip just this packet. Returning here would strand every
                // packet still sitting on the tx queue until the guest happens
                // to notify us again.
                continue;
            };

            if let Some(conn) = self.connections.get_mut(&key) {
                // Regardless of the vsock operation, we need to record the
                // peers credit info
                conn.update_peer_credit(&packet.header);
                match packet_op {
                    VsockPacketOp::Reset => {
                        self.connections.remove(&key);
                    }
                    VsockPacketOp::Shutdown => {
                        self.handle_shutdown(key, packet.header.flags());
                    }
                    // The credit info was already recorded above.
                    VsockPacketOp::CreditUpdate => continue,
                    VsockPacketOp::CreditRequest => {
                        self.rx.push_back(RxEvent::CreditUpdate(key));
                    }
                    VsockPacketOp::ReadWrite => {
                        self.handle_rw_packet(key, packet);
                    }
                    // We are operating on an existing connection either of
                    // these should not be received
                    //
                    // XXX: send a RST, but what about our orignal connection?
                    op @ (VsockPacketOp::Request | VsockPacketOp::Response) => {
                        warn!(
                            &self.log,
                            "received vsock packet with op code \
                            {op:?} while operating on an exiting connection"
                        );
                    }
                }
            } else {
                match packet_op {
                    VsockPacketOp::Request => {
                        self.handle_connection_request(key, packet)
                    }
                    // A RST for a connection we do not track needs no action.
                    VsockPacketOp::Reset => {}
                    _ => {
                        warn!(
                            &self.log,
                            "received a vsock packet for an unknown connection \
                            that was not a REQUEST or RST";
                            "packet" => ?packet,
                        );
                    }
                }
            }
        }
    }

    /// Process the rx virtqueue (host -> guest).
    ///
    /// First drains queued control packets now that descriptors may be
    /// available, then re-associates the sockets of every connection that was
    /// parked waiting for rx queue space.
    fn handle_rx_queue_event(&mut self) {
        // More descriptors may have become available for sending vsock
        // packets, so attempt to drain pending packets first.
        self.process_pending_rx();

        // Re-register connections that were blocked waiting for rx queue space.
        // It would be nice if we had a hint of how many descriptors became
        // available but that's not the case today.
        let blocked = std::mem::take(&mut self.rx_blocked);
        for key in blocked {
            // The guest may have sent a RST for this connection while we were
            // blocked, in which case we no longer track its `ConnKey`.
            let Some(conn) = self.connections.get(&key) else {
                continue;
            };

            // By the time we are ready to send the guest data again it may
            // have sent us a SHUTDOWN with the
            // `VIRTIO_VSOCK_SHUTDOWN_F_RECEIVE` flag, leaving the connection
            // in the process of shutting down with nothing to poll for.
            let Some(interests) = conn.poll_interests() else {
                continue;
            };

            let fd = conn.get_fd();
            self.associate_fd(key, fd, interests);
        }
    }

    /// Attempt to send any queued rx packets destined for the guest.
    ///
    /// Runs for as long as an rx permit can be obtained and there is a queued
    /// [`RxEvent`]. Each event is turned into the matching packet and written
    /// through the permit.
    fn process_pending_rx(&mut self) {
        while let Some(permit) = self.queues.try_rx_permit() {
            // A permit was obtained but there is nothing left to send.
            let Some(rx_event) = self.rx.pop_front() else {
                break;
            };

            match rx_event {
                RxEvent::Reset(key) => {
                    let packet = VsockPacket::new_reset(
                        VsockGuestAddr::from_conn_key(self.guest_cid, key),
                    );
                    permit.write(&packet.header, &packet.data);
                }
                RxEvent::NewConnection(key) => {
                    let packet = VsockPacket::new_response(
                        VsockGuestAddr::from_conn_key(self.guest_cid, key),
                    );
                    permit.write(&packet.header, &packet.data);

                    // With the RESPONSE written, mark the connection as
                    // established and start polling its socket.
                    if let Entry::Occupied(mut entry) =
                        self.connections.entry(key)
                    {
                        let conn = entry.get_mut();
                        if let Err(e) = conn.set_established() {
                            error!(
                                &self.log,
                                "cannot transition vsock connection state: {e}";
                                "conn" => ?conn,
                            );
                            entry.remove();
                            self.send_conn_rst(key);
                            continue;
                        };

                        if let Some(interests) = conn.poll_interests() {
                            let fd = conn.get_fd();
                            self.associate_fd(key, fd, interests);
                        }
                    }
                }
                RxEvent::CreditUpdate(key) => {
                    // NOTE(review): if the connection is gone the permit is
                    // dropped without a write — confirm that dropping an
                    // unused permit is harmless.
                    if let Some(conn) = self.connections.get_mut(&key) {
                        let packet = VsockPacket::new_credit_update(
                            VsockGuestAddr::from_conn_key(self.guest_cid, key),
                            conn.fwd_cnt(),
                        );
                        permit.write(&packet.header, &packet.data);
                        conn.mark_credit_sent();
                    }
                }
            }
        }
    }

    /// Dispatch a user event posted to the event port.
    ///
    /// Returns `true` only for [`VsockEvent::Pause`], signalling that the
    /// event loop should pause.
    fn handle_user_event(&mut self, event: VsockEvent) -> bool {
        match event {
            VsockEvent::TxQueue => {
                self.handle_tx_queue_event();
                false
            }
            VsockEvent::RxQueue => {
                self.handle_rx_queue_event();
                false
            }
            VsockEvent::Pause => true,
        }
    }

    /// Handle an event on a connection's socket fd.
    ///
    /// The connection is identified by the event's user payload. A writable
    /// event flushes buffered guest data to the socket; a readable event reads
    /// from the socket into `read_buf` and forwards the data to the guest as
    /// `VIRTIO_VSOCK_OP_RW` packets. Writable handling always runs first.
    fn handle_fd_event(&mut self, event: PortEvent, read_buf: &mut [u8]) {
        let key = ConnKey::from_portev_user(event.user);
        let flags = PollFlags::from_bits_retain(event.events as i16);

        let writable = is_writable(flags);
        let readable = is_readable(flags);

        if writable {
            self.handle_writable_fd(key);
        }
        if readable {
            self.handle_readable_fd(key, read_buf);
        }
    }

    /// When an fd is writable, drain buffered guest data to the host socket.
    ///
    /// Flushes repeatedly until nothing more is written, the socket would
    /// block, or an error occurs. Every successful flush advances the
    /// connection's `fwd_cnt` and may queue a CREDIT_UPDATE for the guest.
    /// Afterwards the connection is either torn down (closing and fully
    /// drained) or re-associated with the event port.
    ///
    /// Does nothing if `key` is not a tracked connection.
    fn handle_writable_fd(&mut self, key: ConnKey) {
        let Some(conn) = self.connections.get_mut(&key) else {
            return;
        };

        loop {
            match conn.flush() {
                // Nothing was written; stop flushing.
                Ok(0) => break,
                Ok(nbytes) => {
                    conn.update_fwd_cnt(nbytes as u32);
                    if conn.needs_credit_update() {
                        self.rx.push_back(RxEvent::CreditUpdate(key));
                    }
                }
                Err(e) if e.kind() == ErrorKind::WouldBlock => break,
                Err(e) => {
                    // NOTE(review): the connection is kept after a hard write
                    // error and may be re-associated below — confirm this is
                    // intended.
                    error!(&self.log, "error writing to socket: {e}");
                    break;
                }
            }
        }

        // We have finished draining our buffered data to the host, so check if
        // we should remove ourselves from the active connections.
        if conn.should_close() && !conn.has_buffered_data() {
            self.connections.remove(&key);
            self.send_conn_rst(key);
            return;
        }

        if let Some(interests) = conn.poll_interests() {
            let fd = conn.get_fd();
            self.associate_fd(key, fd, interests);
        }
    }

    /// When an fd is readable, read from host socket and send to guest.
    ///
    /// Reads into `read_buf` and forwards each chunk to the guest as an RW
    /// packet, for as long as an rx permit is available, the guest has credit
    /// left, and the socket has data. Each read is capped by the size of
    /// `read_buf`, the space in the permit's descriptor chain, and the guest's
    /// remaining credit.
    ///
    /// - EOF sends a SHUTDOWN (both directions) to the guest and places the
    ///   connection on the quiesce queue.
    /// - A read error other than `WouldBlock` removes the connection and sends
    ///   a RST to the guest.
    ///
    /// Does nothing if `key` is not a tracked connection or the guest can no
    /// longer read.
    fn handle_readable_fd(&mut self, key: ConnKey, read_buf: &mut [u8]) {
        // Borrow the fields individually so that the connection and the
        // virtqueues can be used at the same time.
        let VsockPoller { queues, connections, guest_cid, rx_blocked, .. } =
            self;

        let Some(conn) = connections.get_mut(&key) else {
            return;
        };

        // The guest is no longer expecting any data
        if !conn.guest_can_read() {
            return;
        }
        loop {
            // No rx descriptors available: remember the connection so that it
            // is revisited by `handle_rx_queue_event`.
            let Some(permit) = queues.try_rx_permit() else {
                rx_blocked.push(key);
                break;
            };

            let credit = conn.peer_credit();
            if credit == 0 {
                // TODO: when this happens under sufficient load there's the
                // possibility we wake up the event loop repeatedly and we
                // should defer associating this fd again until there's enough
                // credit. This is similar to the `rx_blocked` queue but
                // slightly different.
                break;
            }

            let max_read = read_buf
                .len()
                // limited by how many bytes the desc chain has
                .min(permit.available_data_space())
                // limited by how many bytes the guest can handle
                .min(credit as usize);

            match conn.socket.read(&mut read_buf[..max_read]) {
                Ok(0) => {
                    // TODO (propolis#1102):
                    // This is an overly aggressive shutdown. Typically EOF
                    // signals that the connection has been closed, however one
                    // can intentionally shutdown read|write halves
                    // independently.
                    let packet = VsockPacket::new_shutdown(
                        VsockGuestAddr::from_conn_key(*guest_cid, key),
                        VsockPacketFlags::VIRTIO_VSOCK_SHUTDOWN_F_SEND
                            | VsockPacketFlags::VIRTIO_VSOCK_SHUTDOWN_F_RECEIVE,
                        conn.fwd_cnt(),
                    );
                    permit.write(&packet.header, &packet.data);
                    self.quiescing.push_back(ClosingConn {
                        key,
                        started: Instant::now(),
                    });
                    return;
                }
                Ok(nbytes) => {
                    let read_u32: u32 = nbytes
                        .try_into()
                        .expect("max_read is <=u32::MAX by min() above");
                    conn.update_tx_cnt(read_u32);
                    let VsockPacket { header, data } = VsockPacket::new_rw(
                        VsockGuestAddr::from_conn_key(*guest_cid, key),
                        conn.fwd_cnt(),
                        &read_buf[..nbytes],
                    );
                    permit.write(&header, &data);
                }
                Err(e) if e.kind() == ErrorKind::WouldBlock => break,
                Err(e) => {
                    error!(
                        &self.log,
                        "vsock backend socket read failed: {e}";
                        "key" => ?key,
                        "conn" => ?conn,
                    );

                    // The RST is written directly through the permit we
                    // already hold rather than being queued on `rx`.
                    connections.remove(&key);
                    let packet = VsockPacket::new_reset(
                        VsockGuestAddr::from_conn_key(*guest_cid, key),
                    );
                    permit.write(&packet.header, &packet.data);
                    return;
                }
            }
        }

        if let Some(interests) = conn.poll_interests() {
            let fd = conn.get_fd();
            self.associate_fd(key, fd, interests);
        }
    }

    /// Associate a connection's underlying socket fd with our port fd.
    ///
    /// The connection key is encoded into the event's user payload so that
    /// `handle_fd_event` can recover it. If the association fails and the
    /// connection is still tracked, it is removed and a RST is queued.
    fn associate_fd(&mut self, key: ConnKey, fd: RawFd, interests: PollFlags) {
        let port = self.port_fd.as_raw_fd();
        let user = key.to_portev_user() as *mut c_void;

        // SAFETY: `port` is the event port owned by `self`, and `user` is an
        // integer payload that is never dereferenced as a pointer by this
        // code.
        let ret = unsafe {
            libc::port_associate(
                port,
                libc::PORT_SOURCE_FD,
                fd as usize,
                interests.bits() as i32,
                user,
            )
        };
        if ret >= 0 {
            return;
        }

        // Capture errno before anything else has a chance to clobber it.
        let err = std::io::Error::last_os_error();
        let Some(conn) = self.connections.remove(&key) else {
            return;
        };
        error!(
            &self.log,
            "vsock port_assocaite failed: {err}";
            "key" => ?key,
            "conn" => ?conn,
        );
        self.send_conn_rst(key);
    }

    /// Queue a RST for the connection identified by `key`.
    ///
    /// The packet itself is built and written later by `process_pending_rx`.
    fn send_conn_rst(&mut self, key: ConnKey) {
        let reset = RxEvent::Reset(key);
        self.rx.push_back(reset);
    }

    /// Reap connections that have been on the quiesce queue for longer than
    /// `DEFAULT_QUIESCE_TIMEOUT`.
    ///
    /// Entries are popped from the front of the queue for as long as the
    /// front entry has expired. Each expired connection that is still tracked
    /// is removed and a RST is queued for it.
    fn quiesce_connections(&mut self) {
        // NOTE: this intentionally collides with the method name in Rust 1.93,
        // we plan to remove the extension trait below once propolis gets a Rust
        // version bump.
        #[allow(unstable_name_collisions)]
        // NB: We are a single threaded event-loop, therefore any connection
        // that gets put on the quiesce queue should not expire before previous
        // entries have.
        while let Some(conn) = self.quiescing.pop_front_if(|conn| {
            conn.started.elapsed() > DEFAULT_QUIESCE_TIMEOUT
        }) {
            // It's possible that the guest sent us a RST for the connection,
            // since we put it on the quiesce queue.
            if self.connections.remove(&conn.key).is_some() {
                // If we have a connection, make sure we send a RST, so the
                // guest knows we are done with it.
                self.send_conn_rst(conn.key);
            }
        }
    }

    /// Services the event port until a pause is requested or the port fd
    /// becomes unusable.
    ///
    /// Every iteration of the loop:
    /// 1. blocks in `port_getn(3C)` for at most `DEFAULT_QUIESCE_TIMEOUT`,
    /// 2. dispatches each returned event (user events go to
    ///    `handle_user_event`, fd events to `handle_fd_event`),
    /// 3. reaps connections that finished quiescing and processes any
    ///    pending rx work.
    ///
    /// Returns once any user event in a batch asked for a pause, or when
    /// `port_getn` reports `EBADF`/`EBADFD`.
    fn handle_events(&mut self) {
        const MAX_EVENTS: u32 = 32;

        // SAFETY: NOTE(review): this relies on the all-zero bit pattern being
        // a valid `libc::port_event` (plain C struct); the entries are only
        // read after `port_getn` has filled them in.
        let mut events = [const { unsafe { std::mem::zeroed::<libc::port_event>() } };
            MAX_EVENTS as usize];
        let mut read_buf: Box<[u8]> = vec![0u8; 1024 * 64].into();

        loop {
            let mut ts = libc::timespec {
                // We use the quiesce timeout so that we don't wait
                // unnecessarily long to cleanup connections.
                tv_sec: DEFAULT_QUIESCE_TIMEOUT.as_secs() as i64,
                tv_nsec: 0,
            };
            // On input this is the number of events we are willing to block
            // for; `port_getn` overwrites it with the number of events it
            // actually stored in `events`.
            let mut nget = 1;

            // SAFETY: `events` holds `MAX_EVENTS` entries and, like `nget`
            // and `ts`, outlives the call.
            let ret = unsafe {
                libc::port_getn(
                    self.port_fd.as_raw_fd(),
                    events.as_mut_ptr(),
                    MAX_EVENTS,
                    &mut nget,
                    // Bounded wait: even when nothing wakes us up we still
                    // need to come back periodically to service the quiesce
                    // queue and any pending rx work.
                    &mut ts,
                )
            };

            if ret < 0 {
                let err = std::io::Error::last_os_error();
                match err.raw_os_error().expect(
                    "`raw_os_error` is documented to always return `Some` \
                    when obtained via `last_os_error`",
                ) {
                    // A signal was caught so process the loop again
                    libc::EINTR => continue,
                    libc::EBADF | libc::EBADFD => {
                        // This means our event loop is effectively no
                        // longer serviceable and the vsock device is useless.
                        error!(
                            &self.log,
                            "vsock port fd is no longer valid: {err}"
                        );
                        return;
                    }
                    libc::ETIME => {
                        // Fall through
                        //
                        // We hit our timeout:
                        // - nget should be zero
                        // - we may have pending_rx
                        // - we may have connections to quiesce
                    }
                    _ => {
                        error!(&self.log, "vsock port_getn returned: {err}");
                        continue;
                    }
                }
            }

            assert!(
                nget as usize <= events.len(),
                "event port returned what we asked it for"
            );
            // Only the first `nget` entries were populated by this call.
            let ready = &events[..nget as usize];

            let mut should_pause = false;
            for event in ready {
                let event = PortEvent::from_raw(*event);

                match event.source {
                    EventSource::User => {
                        // Accumulate instead of assigning: a pause request
                        // must not be forgotten just because another user
                        // event (or an unknown one) follows it in the same
                        // batch.
                        should_pause |= match VsockEvent::try_from(event.user)
                        {
                            Ok(event) => self.handle_user_event(event),
                            Err(unknown_event) => {
                                error!(
                                    &self.log,
                                    "unknown event port user event {unknown_event}"
                                );
                                false
                            }
                        };
                    }
                    EventSource::Fd => {
                        self.handle_fd_event(event, &mut read_buf);
                    }
                    // Sources we never associate with the port are ignored.
                    EventSource::Unknown(_) => {}
                };
            }

            // Cleanup any connection waiting to be reaped
            self.quiesce_connections();

            // Process any pending rx events
            self.process_pending_rx();

            // `[Lifecycle::pause]` has been requested so we hang out waiting
            // for further instruction on our pause channel.
            if should_pause {
                return;
            }
        }
    }

    // This is the general flow of the single-threaded processing event-loop:
    //
    //                 ┌─────────virtio-socket─event-loop───────┐
    //                 │                                        │
    //                 │         ┌─────────────────────┐        │
    //                 │         │ start (set_running) ◄────┐   │
    //                 │         └──────────┬──────────┘    │   │
    // ┌───────────┐   │                    │               │   │
    // │vq rx event│   │                    │               │   │
    // │vq tx event│   │         ┌──────────▼──────────┐    │   │
    // │fd pollset ├───┼─────────►   handle_events()   │    │   │
    // │pause event│   │         └──────────┬──────────┘    │   │
    // └───────────┘   │                    │               │   │
    //                 │                    │               │   │
    //                 │         ┌──────────▼──────────┐    │   │
    //                 │         │ paused (set_stopped)│    │   │
    //                 │         └──────────┬──────────┘    │   │
    //                 │                    │               │   │
    //                 │                    │               │   │
    //                 │             ┌──────▼───────┐       │   │
    //                 │             │   pause_rx   │       │   │
    //                 │             │              │       │   │
    //                 │             │  - resume────┼───────┘   │
    //                 │             │  - reset─────┼─►cleanup  │
    //                 │             │  - halt───┐  │   state   │
    //                 │             │           │  │           │
    //                 │             └───────────┼──┘           │
    //                 │                         │              │
    //                 └─────────────────────────┼──────────────┘
    //                                           ▼
    //                                          Exit
    //
    // The event-loop is executing in a dedicated thread and therefore must
    // be able to handle triggers from propolis as it manages the device
    // lifecycle as documented in the `Lifecycle` trait. Propolis guarantees
    // that the device will transition to the `Lifecycle::paused` state
    // before it attempts to resume, reset, or halt the device. We rely on a
    // `port_send(3C)` to inject a `VsockEvent::Pause` user event so that we may
    // break out of the processing loop and await further instruction. When we
    // are in this paused state we rely on the internal pause mpsc channel to
    // deliver resume, reset, and exit events. We went with this design because
    // it removes the extra complexity of calling `port_dissociate(3C)` on
    // every tracked fd and later re-associating them, this allows us to yield
    // execution until one of the desired next state events is delivered.
    fn poller_loop(&mut self) {
        // Block until propolis tells us to start the virtio-socket device.
        // Never being told to start is fine: in that case the device never
        // needs to go through running->paused->halt either.
        //
        // TODO: If propolis-server is started but the VM is never transitioned
        // to running, this leaves a useless thread around that is only
        // cleaned up when propolis-server exits. It also means that sending a
        // new instance spec to a pre-existing propolis-server accumulates
        // these "dead" threads; however, we never do this in the actual
        // product.
        self.start_barrier.wait();

        'running: loop {
            // Running: service events until a pause is requested.
            self.state.set_running();
            self.handle_events();

            // Stopped: park on the pause channel until told what to do next.
            self.state.set_stopped();

            loop {
                let Ok(cmd) = self.pause_rx.recv() else {
                    error!(
                        &self.log,
                        "all VsockPoller pause_tx senders have been dropped"
                    );
                    return;
                };

                match cmd {
                    // Back to the top: mark ourselves running again.
                    PausedCmd::Resume => continue 'running,
                    // Drop all state, acknowledge, and keep waiting.
                    PausedCmd::Reset { oneshot } => {
                        self.reset();
                        oneshot.send(()).unwrap();
                    }
                    // Drop all state and let the thread exit.
                    PausedCmd::Halt => {
                        self.reset();
                        return;
                    }
                }
            }
        }
    }

    /// Discards the poller's per-connection state.
    ///
    /// Empties the `connections`, `rx`, `rx_blocked` and `quiescing`
    /// collections and calls `clear_rx_chain` on the queues. `poller_loop`
    /// invokes this while paused, when it receives `PausedCmd::Reset` or
    /// `PausedCmd::Halt`.
    fn reset(&mut self) {
        self.connections.clear();
        self.rx.clear();
        self.rx_blocked.clear();
        self.quiescing.clear();
        self.queues.clear_rx_chain();
    }
}

/// The source of a port event.
///
/// Decoded from the raw `portev_source` field of a `libc::port_event` by
/// [`EventSource::from_raw`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EventSource {
    /// User event i.e. `port_send(3C)`
    User,
    /// File descriptor event
    Fd,
    /// Unknown source for the vsock backend; carries the raw source value
    /// unchanged.
    Unknown(u16),
}

impl EventSource {
    /// Classifies a raw `portev_source` value.
    ///
    /// Anything other than `PORT_SOURCE_USER` or `PORT_SOURCE_FD` is
    /// preserved verbatim inside [`EventSource::Unknown`].
    fn from_raw(source: u16) -> Self {
        let raw = i32::from(source);
        if raw == libc::PORT_SOURCE_USER {
            Self::User
        } else if raw == libc::PORT_SOURCE_FD {
            Self::Fd
        } else {
            Self::Unknown(source)
        }
    }
}

/// A port event retrieved from an event port.
///
/// This represents an event from one of the various event sources (file
/// descriptors, timers, user events, etc.). It is built from a raw
/// `libc::port_event` by [`PortEvent::from_raw`].
#[derive(Debug, Clone)]
struct PortEvent {
    /// The events that occurred (source-specific); copied from
    /// `portev_events`.
    events: i32,
    /// The source of the event
    source: EventSource,
    /// The object associated with the event (interpretation depends on source)
    // NOTE(review): the lint allowance suggests nothing reads this field
    // directly at the moment; it is still captured from `portev_object`.
    #[allow(dead_code)]
    object: usize,
    /// User-defined data provided during association; for user events this
    /// is what gets converted into a `VsockEvent`.
    user: usize,
}

impl PortEvent {
    /// Converts a raw `libc::port_event` into its typed representation,
    /// decoding the source and widening the user pointer to a `usize`.
    fn from_raw(event: libc::port_event) -> Self {
        let source = EventSource::from_raw(event.portev_source);
        let user = event.portev_user as usize;

        Self {
            events: event.portev_events,
            source,
            object: event.portev_object,
            user,
        }
    }
}

impl VsockGuestAddr {
    /// Builds a [`VsockGuestAddr`] from a guest context ID and a
    /// [`ConnKey`].
    ///
    /// The key's host port becomes the source port and its guest port the
    /// destination port.
    fn from_conn_key(guest_cid: GuestCid, key: ConnKey) -> Self {
        let src_port = key.host_port;
        let dst_port = key.guest_port;
        Self { guest_cid, src_port, dst_port }
    }
}

// TODO: once we update to Rust 1.93 this shim can be replaced by the standard
// library's `VecDeque::pop_front_if`; until then the implementation is
// shamelessly borrowed.
trait VecDequeExt<T> {
    /// Removes and returns the front element if `predicate` returns `true`
    /// for it.
    ///
    /// Returns `None` without touching the queue's length when the queue is
    /// empty or the predicate rejects the front element. The predicate
    /// receives a mutable reference, so any change it makes to the element is
    /// kept either way.
    fn pop_front_if(
        &mut self,
        predicate: impl FnOnce(&mut T) -> bool,
    ) -> Option<T>;
}

impl<T> VecDequeExt<T> for VecDeque<T> {
    fn pop_front_if(
        &mut self,
        predicate: impl FnOnce(&mut T) -> bool,
    ) -> Option<T> {
        // `None` here means the queue was empty and the predicate never ran.
        match self.front_mut().map(predicate) {
            Some(true) => self.pop_front(),
            Some(false) | None => None,
        }
    }
}

#[cfg(test)]
mod test {
    use std::io::{Read, Write};
    use std::net::TcpListener;
    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::sync::Arc;
    use std::sync::Barrier;
    use std::time::Duration;

    use iddqd::IdHashMap;

    use zerocopy::{FromBytes, IntoBytes};

    use crate::hw::virtio::testutil::{QueueWriter, TestVirtQueues, VqSize};
    use crate::hw::virtio::vsock::{VsockVq, VSOCK_RX_QUEUE, VSOCK_TX_QUEUE};
    use crate::vsock::packet::{
        VsockPacketFlags, VsockPacketHeader, VsockPacketOp, VsockSocketType,
    };
    use crate::vsock::proxy::{VsockPortMapping, CONN_TX_BUF_SIZE};
    use crate::vsock::{GuestCid, VSOCK_HOST_CID};

    use super::VsockPoller;

    /// Builds a logger for the tests: terminal-formatted output on stderr,
    /// wrapped in an async drain and tagged with
    /// `component = "vsock-test"`.
    fn test_logger() -> slog::Logger {
        use slog::Drain;

        let term_drain = slog_term::FullFormat::new(
            slog_term::TermDecorator::new().stderr().build(),
        )
        .build()
        .fuse();
        let async_drain = slog_async::Async::new(term_drain).build().fuse();

        slog::Logger::root(async_drain, slog::o!("component" => "vsock-test"))
    }

    /// Size passed to `VqSize::new` for both the RX and the TX test
    /// virtqueue.
    const QUEUE_SIZE: u16 = 64;
    /// Unit used to place the TX writer's data region (`PAGE_SIZE * 16`)
    /// apart from the RX writer's region, which starts at offset 0.
    const PAGE_SIZE: u64 = 0x1000;

    /// Binds a loopback TCP listener on an OS-assigned port and returns it
    /// together with an `IdHashMap<VsockPortMapping>` whose single entry maps
    /// `vsock_port` to the address the listener actually got.
    ///
    /// Panics if binding or querying the local address fails.
    fn bind_test_backend(
        vsock_port: u32,
    ) -> (TcpListener, IdHashMap<VsockPortMapping>) {
        let listener = TcpListener::bind("127.0.0.1:0").unwrap();
        let mapping =
            VsockPortMapping::new(vsock_port, listener.local_addr().unwrap());

        let mut backends = IdHashMap::new();
        backends.insert_overwrite(mapping);

        (listener, backends)
    }

    /// Test harness for vsock poller tests using shared testutil infrastructure.
    ///
    /// Plays the guest's side of the virtqueues: it posts buffers on the RX
    /// and TX queues and reads back what ends up in the used rings.
    struct VsockTestHarness {
        /// The test virtqueues (RX, TX and event, in that order) together
        /// with the memory accessor used for every queue operation.
        tvqs: TestVirtQueues,
        /// Writer for the RX queue; its data region starts at offset 0.
        rx_writer: QueueWriter,
        /// Writer for the TX queue; its data region starts at
        /// `PAGE_SIZE * 16` so it does not overlap the RX region.
        tx_writer: QueueWriter,
    }

    impl VsockTestHarness {
        /// Creates the three test virtqueues and a writer each for RX and TX.
        fn new() -> Self {
            let sizes = [
                VqSize::new(QUEUE_SIZE), // RX
                VqSize::new(QUEUE_SIZE), // TX
                VqSize::new(1),          // Event
            ];
            let tvqs = TestVirtQueues::new(&sizes);

            // Keep the RX and TX data regions apart from each other.
            let tx_region = PAGE_SIZE * 16;
            let rx_writer = tvqs.writer(VSOCK_RX_QUEUE as usize, 0);
            let tx_writer = tvqs.writer(VSOCK_TX_QUEUE as usize, tx_region);

            Self { tvqs, rx_writer, tx_writer }
        }

        /// Builds the `VsockVq` handed to the poller from clones of the test
        /// queues and a child memory accessor named "vsock-vq".
        fn make_vsock_vq(&self) -> VsockVq {
            let acc =
                self.tvqs.mem_acc().child(Some(String::from("vsock-vq")));
            let queues: Vec<_> = self.tvqs.queues().iter().cloned().collect();
            VsockVq::new(queues, acc)
        }

        /// Posts a writable buffer of `len` bytes on the RX queue, makes it
        /// available, and returns its descriptor id.
        fn add_rx_writable(&mut self, len: u32) -> u16 {
            let mem = self.tvqs.mem_acc();
            let desc = self.rx_writer.add_writable(mem, len);
            self.rx_writer.publish_avail(mem, desc);
            desc
        }

        /// Places `data` in a readable TX descriptor (not yet published) and
        /// returns its id.
        fn add_tx_readable(&mut self, data: &[u8]) -> u16 {
            let mem = self.tvqs.mem_acc();
            self.tx_writer.add_readable(mem, data)
        }

        /// Makes the TX descriptor (chain) starting at `head` available.
        fn publish_tx(&mut self, head: u16) {
            let mem = self.tvqs.mem_acc();
            self.tx_writer.publish_avail(mem, head);
        }

        /// Links TX descriptor `from` to TX descriptor `to`.
        fn chain_tx(&mut self, from: u16, to: u16) {
            let mem = self.tvqs.mem_acc();
            self.tx_writer.chain(mem, from, to);
        }

        /// Rewinds the TX writer's cursors so descriptor slots can be reused.
        fn reset_tx_cursors(&mut self) {
            self.tx_writer.reset_cursors();
        }

        /// Rewinds the RX writer's cursors so descriptor slots can be reused.
        fn reset_rx_cursors(&mut self) {
            self.rx_writer.reset_cursors();
        }

        /// Reads the packet referenced by RX used-ring entry `used_index` and
        /// splits it into its header and the payload bytes that follow.
        fn read_vsock_packet(
            &self,
            used_index: u16,
        ) -> (VsockPacketHeader, Vec<u8>) {
            let mem = self.tvqs.mem_acc();
            let used = self.rx_writer.read_used_elem(mem, used_index);

            // The used element names the descriptor and how many bytes the
            // device wrote into it (header + data).
            let raw = self.rx_writer.read_desc_data(
                mem,
                used.id as u16,
                used.len as usize,
            );

            let (hdr_bytes, payload) =
                raw.split_at(std::mem::size_of::<VsockPacketHeader>());
            let hdr = VsockPacketHeader::read_from_bytes(hdr_bytes)
                .expect("buffer should contain valid header");

            (hdr, payload.to_vec())
        }

        /// Current used index of the RX queue.
        fn rx_used_idx(&self) -> u16 {
            let mem = self.tvqs.mem_acc();
            self.rx_writer.used_idx(mem)
        }

        /// Current used index of the TX queue.
        fn tx_used_idx(&self) -> u16 {
            let mem = self.tvqs.mem_acc();
            self.tx_writer.used_idx(mem)
        }
    }

    /// Helper: serialize a VsockPacketHeader to bytes.
    fn hdr_as_bytes(hdr: &VsockPacketHeader) -> &[u8] {
        hdr.as_bytes()
    }

    /// Polls `f` roughly once per millisecond until it returns `true`.
    ///
    /// Panics with "timed out waiting for condition" if `f` is still false
    /// once more than `timeout_ms` milliseconds have elapsed.
    fn wait_for_condition<F>(mut f: F, timeout_ms: u64)
    where
        F: FnMut() -> bool,
    {
        let began = std::time::Instant::now();
        let budget = Duration::from_millis(timeout_ms);

        loop {
            if f() {
                return;
            }
            assert!(
                began.elapsed() <= budget,
                "timed out waiting for condition"
            );
            std::thread::sleep(Duration::from_millis(1));
        }
    }

    /// A REQUEST for a mapped port must be answered with a RESPONSE that
    /// mirrors the request's addressing.
    #[test]
    fn request_receives_response() {
        let guest_cid = GuestCid::try_from(50).unwrap();
        let vsock_port = 3000;
        let guest_port = 1234;
        // The listener is never accepted on, but must stay alive so the
        // backend address remains bound.
        let (_listener, backends) = bind_test_backend(vsock_port);

        let mut harness = VsockTestHarness::new();
        let start_barrier = Arc::new(Barrier::new(2));
        let poller = VsockPoller::new(
            test_logger(),
            Arc::clone(&start_barrier),
            guest_cid,
            harness.make_vsock_vq(),
            backends,
        )
        .unwrap();

        // A single RX buffer for the RESPONSE.
        harness.add_rx_writable(256);

        let ctl = poller.notify_handle();
        let poller_thread = poller.run();
        start_barrier.wait();

        let mut request = VsockPacketHeader::new();
        request
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Request)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let req_desc = harness.add_tx_readable(hdr_as_bytes(&request));
        harness.publish_tx(req_desc);
        ctl.queue_notify(VSOCK_TX_QUEUE).unwrap();

        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        let (response, _payload) = harness.read_vsock_packet(0);
        assert_eq!(response.op(), Some(VsockPacketOp::Response));
        assert_eq!(response.src_cid(), VSOCK_HOST_CID);
        assert_eq!(response.dst_cid(), guest_cid.get());
        assert_eq!(response.src_port(), vsock_port);
        assert_eq!(response.dst_port(), guest_port);
        assert_eq!(response.socket_type(), Some(VsockSocketType::Stream));

        ctl.pause().unwrap();
        ctl.halt();
        poller_thread.join().unwrap();
    }

    /// A RW packet carrying an invalid socket type must be answered with a
    /// RST addressed back to the sender.
    #[test]
    fn rw_with_invalid_socket_type_receives_rst() {
        let guest_cid = GuestCid::try_from(50).unwrap();

        let mut harness = VsockTestHarness::new();
        let start_barrier = Arc::new(Barrier::new(2));
        // No backends are needed: the packet is rejected before any lookup
        // could succeed.
        let poller = VsockPoller::new(
            test_logger(),
            Arc::clone(&start_barrier),
            guest_cid,
            harness.make_vsock_vq(),
            IdHashMap::new(),
        )
        .unwrap();

        // A single RX buffer for the RST.
        harness.add_rx_writable(256);

        let ctl = poller.notify_handle();
        let poller_thread = poller.run();
        start_barrier.wait();

        let mut bogus = VsockPacketHeader::new();
        bogus
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(5555)
            .set_dst_port(8080)
            .set_len(0)
            .set_socket_type(VsockSocketType::InvalidTestValue)
            .set_op(VsockPacketOp::ReadWrite)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let bogus_desc = harness.add_tx_readable(hdr_as_bytes(&bogus));
        harness.publish_tx(bogus_desc);
        ctl.queue_notify(VSOCK_TX_QUEUE).unwrap();

        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        // The reply swaps source and destination relative to the request.
        let (reply, _payload) = harness.read_vsock_packet(0);
        assert_eq!(reply.op(), Some(VsockPacketOp::Reset));
        assert_eq!(reply.src_cid(), VSOCK_HOST_CID);
        assert_eq!(reply.dst_cid(), guest_cid.get());
        assert_eq!(reply.src_port(), 8080);
        assert_eq!(reply.dst_port(), 5555);

        ctl.pause().unwrap();
        ctl.halt();
        poller_thread.join().unwrap();
    }

    /// After a REQUEST establishes the connection, the payload of a RW
    /// packet must show up on the backend TCP stream.
    #[test]
    fn request_then_rw_delivers_data() {
        let guest_cid = GuestCid::try_from(50).unwrap();
        let vsock_port = 3000;
        let guest_port = 1234;
        let (listener, backends) = bind_test_backend(vsock_port);
        listener.set_nonblocking(false).unwrap();

        let mut harness = VsockTestHarness::new();
        let start_barrier = Arc::new(Barrier::new(2));
        let poller = VsockPoller::new(
            test_logger(),
            Arc::clone(&start_barrier),
            guest_cid,
            harness.make_vsock_vq(),
            backends,
        )
        .unwrap();

        (0..4).for_each(|_| {
            harness.add_rx_writable(4096);
        });

        let ctl = poller.notify_handle();
        let poller_thread = poller.run();
        start_barrier.wait();

        // Step 1: open the connection with a REQUEST.
        let mut request = VsockPacketHeader::new();
        request
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Request)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let req_desc = harness.add_tx_readable(hdr_as_bytes(&request));
        harness.publish_tx(req_desc);
        ctl.queue_notify(VSOCK_TX_QUEUE).unwrap();

        // Step 2: take the backend side of the connection, then wait for the
        // RESPONSE to land on the RX queue.
        let (mut backend, _peer) = listener.accept().unwrap();
        backend.set_nonblocking(false).unwrap();
        backend.set_read_timeout(Some(Duration::from_secs(5))).unwrap();
        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        // Step 3: send a RW packet as a header descriptor chained to a body
        // descriptor.
        let message = b"hello from guest via vsock!";
        let mut data_hdr = VsockPacketHeader::new();
        data_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(message.len() as u32)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::ReadWrite)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let hdr_desc = harness.add_tx_readable(hdr_as_bytes(&data_hdr));
        let body_desc = harness.add_tx_readable(message);
        harness.chain_tx(hdr_desc, body_desc);
        harness.publish_tx(hdr_desc);
        ctl.queue_notify(VSOCK_TX_QUEUE).unwrap();

        // Step 4: the exact bytes must arrive on the TCP stream.
        let mut received = vec![0u8; message.len()];
        backend.read_exact(&mut received).unwrap();
        assert_eq!(&received, message);

        ctl.pause().unwrap();
        ctl.halt();
        poller_thread.join().unwrap();
    }

    /// Once the guest has pushed more than half of the connection's TX
    /// buffer through to the backend, a CREDIT_UPDATE must appear on the RX
    /// queue.
    #[test]
    fn credit_update_sent_after_flushing_half_buffer() {
        let guest_cid = GuestCid::try_from(50).unwrap();
        let vsock_port = 4000;
        let guest_port = 2000;
        let (listener, backends) = bind_test_backend(vsock_port);
        listener.set_nonblocking(false).unwrap();

        let mut harness = VsockTestHarness::new();
        let start_barrier = Arc::new(Barrier::new(2));
        let poller = VsockPoller::new(
            test_logger(),
            Arc::clone(&start_barrier),
            guest_cid,
            harness.make_vsock_vq(),
            backends,
        )
        .unwrap();

        // Plenty of RX buffers: one for the RESPONSE, the rest for credit
        // updates.
        (0..16).for_each(|_| {
            harness.add_rx_writable(4096);
        });

        let ctl = poller.notify_handle();
        let poller_thread = poller.run();
        start_barrier.wait();

        // Open the connection.
        let mut request = VsockPacketHeader::new();
        request
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Request)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let req_desc = harness.add_tx_readable(hdr_as_bytes(&request));
        harness.publish_tx(req_desc);
        ctl.queue_notify(VSOCK_TX_QUEUE).unwrap();

        let (mut backend, _peer) = listener.accept().unwrap();
        backend.set_nonblocking(false).unwrap();
        backend.set_read_timeout(Some(Duration::from_secs(5))).unwrap();
        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        // Send one chunk more than fits in half of the connection buffer.
        let chunk_size = 8192;
        let num_chunks = (CONN_TX_BUF_SIZE / 2) / chunk_size + 1;
        let total_sent = num_chunks * chunk_size;
        let chunk = vec![0xAB_u8; chunk_size];

        // Every chunk goes out under the same header, so build it once.
        let mut data_hdr = VsockPacketHeader::new();
        data_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(chunk.len() as u32)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::ReadWrite)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        // The REQUEST already accounts for one consumed TX entry.
        let mut expected_tx_used = 1u16;
        for _ in 0..num_chunks {
            // Reuse the same descriptor slots for every chunk.
            harness.reset_tx_cursors();

            let hdr_desc = harness.add_tx_readable(hdr_as_bytes(&data_hdr));
            let body_desc = harness.add_tx_readable(&chunk);
            harness.chain_tx(hdr_desc, body_desc);
            harness.publish_tx(hdr_desc);
            ctl.queue_notify(VSOCK_TX_QUEUE).unwrap();

            // Do not queue the next chunk before this one was consumed.
            expected_tx_used += 1;
            wait_for_condition(
                || harness.tx_used_idx() >= expected_tx_used,
                5000,
            );
        }

        // All bytes must have reached the backend unchanged.
        let mut received = vec![0u8; total_sent];
        backend.read_exact(&mut received).unwrap();
        assert!(received.iter().all(|&b| b == 0xAB));

        // Entry 0 is the RESPONSE; search everything after it for the first
        // CREDIT_UPDATE.
        let rx_used = harness.rx_used_idx();
        assert!(rx_used >= 2, "expected at least RESPONSE + CREDIT_UPDATE");

        let credit_update = (1..rx_used)
            .map(|i| harness.read_vsock_packet(i).0)
            .find(|hdr| hdr.op() == Some(VsockPacketOp::CreditUpdate));
        let Some(update) = credit_update else {
            panic!("expected a CREDIT_UPDATE on RX queue");
        };
        assert_eq!(update.src_cid(), VSOCK_HOST_CID);
        assert_eq!(update.dst_cid(), guest_cid.get());
        assert_eq!(update.src_port(), vsock_port);
        assert_eq!(update.dst_port(), guest_port);
        assert_eq!(update.buf_alloc(), CONN_TX_BUF_SIZE as u32);

        ctl.pause().unwrap();
        ctl.halt();
        poller_thread.join().unwrap();
    }

    /// A guest RST on an established connection must tear down the
    /// host-side TCP stream.
    ///
    /// The test opens a connection with a REQUEST, sends a RST for it, and
    /// then checks that the backend end of the TCP stream observes the
    /// connection going away.
    #[test]
    fn rst_removes_established_connection() {
        let vsock_port = 5000;
        let guest_port = 3000;
        let guest_cid = GuestCid::try_from(50).unwrap();
        let (listener, backends) = bind_test_backend(vsock_port);
        listener.set_nonblocking(false).unwrap();

        let mut harness = VsockTestHarness::new();
        let vq = harness.make_vsock_vq();
        let log = test_logger();
        let start_barrier = Arc::new(Barrier::new(2));
        let poller = VsockPoller::new(
            log,
            start_barrier.clone(),
            guest_cid,
            vq,
            backends,
        )
        .unwrap();

        for _ in 0..4 {
            harness.add_rx_writable(4096);
        }

        let notify = poller.notify_handle();
        let handle = poller.run();
        start_barrier.wait();

        // Send REQUEST
        let mut req_hdr = VsockPacketHeader::new();
        req_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Request)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let d_tx = harness.add_tx_readable(hdr_as_bytes(&req_hdr));
        harness.publish_tx(d_tx);
        notify.queue_notify(VSOCK_TX_QUEUE).unwrap();

        let mut accepted = listener.accept().unwrap().0;
        accepted.set_nonblocking(false).unwrap();
        accepted.set_read_timeout(Some(Duration::from_secs(5))).unwrap();
        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        // Send RST
        let mut rst_hdr = VsockPacketHeader::new();
        rst_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Reset)
            .set_buf_alloc(0)
            .set_fwd_cnt(0);

        let d_rst = harness.add_tx_readable(hdr_as_bytes(&rst_hdr));
        harness.publish_tx(d_rst);
        notify.queue_notify(VSOCK_TX_QUEUE).unwrap();

        // Wait for the RST to be consumed
        wait_for_condition(|| harness.tx_used_idx() >= 2, 5000);

        // Verify the TCP connection was closed by reading from the
        // accepted stream. EOF or a hard socket error (e.g. a reset) both
        // mean the poller dropped its end. A read *timeout* means the
        // connection is still open, so it has to fail the test instead of
        // being accepted like any other error.
        let mut buf = [0u8; 1];
        match accepted.read(&mut buf) {
            Ok(0) => {}
            Ok(n) => panic!("expected EOF or error, got {n} bytes"),
            Err(e)
                if matches!(
                    e.kind(),
                    std::io::ErrorKind::WouldBlock
                        | std::io::ErrorKind::TimedOut
                ) =>
            {
                panic!("connection still open after RST: read timed out ({e})")
            }
            Err(_) => {}
        }

        notify.pause().unwrap();
        notify.halt();
        handle.join().unwrap();
    }

    /// Full round trip: guest data reaches the backend TCP stream, and data
    /// written by the backend comes back to the guest as a RW packet.
    #[test]
    fn end_to_end_guest_to_host() {
        let guest_cid = GuestCid::try_from(50).unwrap();
        let vsock_port = 7000;
        let guest_port = 5000;
        let (listener, backends) = bind_test_backend(vsock_port);
        listener.set_nonblocking(false).unwrap();

        let mut harness = VsockTestHarness::new();
        let start_barrier = Arc::new(Barrier::new(2));
        let poller = VsockPoller::new(
            test_logger(),
            Arc::clone(&start_barrier),
            guest_cid,
            harness.make_vsock_vq(),
            backends,
        )
        .unwrap();

        // RX buffers for the RESPONSE and the host->guest data.
        (0..8).for_each(|_| {
            harness.add_rx_writable(4096);
        });

        let ctl = poller.notify_handle();
        let poller_thread = poller.run();
        start_barrier.wait();

        // Open the connection with a REQUEST on the TX queue.
        let mut request = VsockPacketHeader::new();
        request
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Request)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let req_desc = harness.add_tx_readable(hdr_as_bytes(&request));
        harness.publish_tx(req_desc);
        ctl.queue_notify(VSOCK_TX_QUEUE).unwrap();

        // `accept` blocks until the poller has connected to the backend.
        let (mut backend, _peer) = listener.accept().unwrap();
        backend.set_nonblocking(false).unwrap();
        backend.set_read_timeout(Some(Duration::from_secs(5))).unwrap();

        // The RESPONSE occupies RX used entry 0.
        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        // Guest -> host: RW header chained to the payload.
        let guest_msg = b"hello from guest via vsock end-to-end!";
        let mut data_hdr = VsockPacketHeader::new();
        data_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(guest_msg.len() as u32)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::ReadWrite)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let hdr_desc = harness.add_tx_readable(hdr_as_bytes(&data_hdr));
        let body_desc = harness.add_tx_readable(guest_msg);
        harness.chain_tx(hdr_desc, body_desc);
        harness.publish_tx(hdr_desc);
        ctl.queue_notify(VSOCK_TX_QUEUE).unwrap();

        let mut received = vec![0u8; guest_msg.len()];
        backend.read_exact(&mut received).unwrap();
        assert_eq!(&received, guest_msg, "guest->host data mismatch");

        // Host -> guest: write on the TCP stream and expect a second RX used
        // entry carrying the data.
        let host_msg = b"reply from host via vsock!";
        backend.write_all(host_msg).unwrap();
        backend.flush().unwrap();

        wait_for_condition(|| harness.rx_used_idx() >= 2, 5000);

        let (reply_hdr, reply_data) = harness.read_vsock_packet(1);

        assert_eq!(reply_hdr.op(), Some(VsockPacketOp::ReadWrite));
        assert_eq!(reply_hdr.src_port(), vsock_port);
        assert_eq!(reply_hdr.dst_port(), guest_port);
        assert_eq!(&reply_data, host_msg, "host->guest data mismatch");

        ctl.pause().unwrap();
        ctl.halt();
        poller_thread.join().unwrap();
    }

    /// Host->guest data that arrives while the guest has no free RX
    /// descriptors must not produce a used entry. Once the guest posts a new
    /// descriptor and notifies the RX queue, the data is expected to show up
    /// as a ReadWrite packet.
    #[test]
    fn rx_blocked_resumes_when_descriptors_available() {
        let vsock_port = 6000;
        let guest_port = 4000;
        let guest_cid = GuestCid::try_from(50).unwrap();
        let (listener, backends) = bind_test_backend(vsock_port);
        listener.set_nonblocking(false).unwrap();

        let mut harness = VsockTestHarness::new();
        let vq = harness.make_vsock_vq();
        let log = test_logger();
        let start_barrier = Arc::new(Barrier::new(2));
        let poller = VsockPoller::new(
            log,
            start_barrier.clone(),
            guest_cid,
            vq,
            backends,
        )
        .unwrap();

        // Provide only one RX descriptor, just enough for the RESPONSE.
        harness.add_rx_writable(4096);

        let notify = poller.notify_handle();
        let handle = poller.run();
        // Rendezvous with the poller thread (the barrier has two parties).
        start_barrier.wait();

        // Send REQUEST
        let mut req_hdr = VsockPacketHeader::new();
        req_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Request)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let d_tx = harness.add_tx_readable(hdr_as_bytes(&req_hdr));
        harness.publish_tx(d_tx);
        notify.queue_notify(VSOCK_TX_QUEUE).unwrap();

        // Accept the backend connection, then wait for the RESPONSE to land
        // in the RX used ring.
        let mut accepted = listener.accept().unwrap().0;
        accepted.set_nonblocking(false).unwrap();
        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        // The RESPONSE consumed the only RX descriptor. Write data from
        // the host side.
        let host_data = b"data from the host side";
        accepted.write_all(host_data).unwrap();
        accepted.flush().unwrap();

        // Give the poller time to attempt delivery (and get blocked)
        std::thread::sleep(Duration::from_millis(100));

        // Verify no new used entries appeared (still just the RESPONSE)
        assert_eq!(harness.rx_used_idx(), 1);

        // Add new RX descriptors and notify
        // NOTE(review): reset_rx_cursors() presumably rewinds the harness's
        // RX descriptor/data cursors so the slot can be reused; confirm
        // against VsockTestHarness.
        harness.reset_rx_cursors();
        harness.add_rx_writable(4096);
        notify.queue_notify(VSOCK_RX_QUEUE).unwrap();

        // Wait for the data to be delivered
        wait_for_condition(|| harness.rx_used_idx() >= 2, 5000);

        // Used entry 0 was the RESPONSE; entry 1 should carry the host data.
        let (rw_hdr, payload) = harness.read_vsock_packet(1);
        assert_eq!(rw_hdr.op(), Some(VsockPacketOp::ReadWrite));
        assert_eq!(rw_hdr.src_port(), vsock_port);
        assert_eq!(rw_hdr.dst_port(), guest_port);
        assert_eq!(&payload, host_data);

        // Shut the poller down: pause, halt, then join its thread.
        notify.pause().unwrap();
        notify.halt();
        handle.join().unwrap();
    }

    /// End-to-end test with large data transfers in both directions,
    /// exercising rx_blocked, credit updates, and descriptor replenishment
    /// across many batches of reused descriptor slots.
    ///
    /// Phase 1 pushes `total_bytes` from the guest to the host and checks
    /// what arrives on the TCP socket. Phase 2 pushes `total_bytes` from the
    /// host to the guest and checks the payloads of the ReadWrite packets
    /// found in the RX used ring.
    #[test]
    fn end_to_end_large_data() {
        let total_bytes: usize = 10 * 1024 * 1024;

        let vsock_port = 8000;
        let guest_port = 6000;
        let guest_cid = GuestCid::try_from(50).unwrap();
        let (listener, backends) = bind_test_backend(vsock_port);
        listener.set_nonblocking(false).unwrap();

        let mut harness = VsockTestHarness::new();
        let vq = harness.make_vsock_vq();
        let log = test_logger();
        let start_barrier = Arc::new(Barrier::new(2));
        let poller = VsockPoller::new(
            log,
            start_barrier.clone(),
            guest_cid,
            vq,
            backends,
        )
        .unwrap();

        // Provide initial RX descriptors for RESPONSE + credit updates
        for _ in 0..8 {
            harness.add_rx_writable(4096);
        }

        let notify = poller.notify_handle();
        let handle = poller.run();
        start_barrier.wait();

        // Establish connection
        // Use a large buf_alloc so host->guest credit doesn't run out
        // before we've transferred all the data.
        // (2 * 10 MiB = 20 MiB, which fits in a u32.)
        let buf_alloc = total_bytes as u32 * 2;

        let mut req_hdr = VsockPacketHeader::new();
        req_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Request)
            .set_buf_alloc(buf_alloc)
            .set_fwd_cnt(0);

        let d_tx = harness.add_tx_readable(hdr_as_bytes(&req_hdr));
        harness.publish_tx(d_tx);
        notify.queue_notify(VSOCK_TX_QUEUE).unwrap();

        let accepted = listener.accept().unwrap().0;
        accepted.set_nonblocking(false).unwrap();

        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        // Phase 1 (guest -> host).
        // A reader thread drains the TCP socket while the main thread
        // injects RW packets in batches, reusing descriptor slots and
        // guest memory between batches.
        let guest_data: Vec<u8> =
            (0..total_bytes).map(|i| (i % 251) as u8).collect();

        // Track how many bytes the reader has consumed so we can apply
        // backpressure and avoid overflowing the poller's VsockBuf.
        let bytes_read = Arc::new(AtomicUsize::new(0));
        let tcp_reader = {
            let mut stream = accepted.try_clone().unwrap();
            let len = total_bytes;
            let progress = Arc::clone(&bytes_read);
            std::thread::spawn(move || {
                let mut result = Vec::with_capacity(len);
                let mut chunk = vec![0u8; 65536];
                let mut total = 0;
                while total < len {
                    let n = stream.read(&mut chunk).unwrap();
                    assert!(n > 0, "unexpected EOF after {total}/{len}");
                    result.extend_from_slice(&chunk[..n]);
                    total += n;
                    // Publish progress for the sender's backpressure loop.
                    progress.store(total, Ordering::Release);
                }
                result
            })
        };

        let chunk_size = 4096;
        let batch_packets = 8; // 8 packets × 2 descs = 16 descs per batch
        let mut guest_sent = 0usize;
        // TX used_idx starts at 1 (the REQUEST was consumed)
        let mut tx_consumed = 1u16;

        while guest_sent < total_bytes {
            let remaining = (total_bytes - guest_sent).div_ceil(chunk_size);
            let this_batch = std::cmp::min(batch_packets, remaining);
            // Backpressure: don't let in-flight data exceed VsockBuf
            // capacity. The poller buffers TX data in VsockBuf (128KB)
            // and flushes via POLLOUT. If we push faster than the
            // flush rate, the buffer overflows and panics.
            let after_send = guest_sent + this_batch * chunk_size;
            loop {
                let read = bytes_read.load(Ordering::Acquire);
                if after_send <= read + CONN_TX_BUF_SIZE {
                    break;
                }
                std::thread::sleep(Duration::from_millis(1));
            }

            // Reuse the same descriptor slots and data region each batch.
            // Safe because we wait for the previous batch to be fully
            // consumed before overwriting.
            harness.reset_tx_cursors();

            for i in 0..this_batch {
                let offset = guest_sent + i * chunk_size;
                // Clamp so a final short chunk would not run past the data.
                let end = std::cmp::min(offset + chunk_size, total_bytes);
                let payload = &guest_data[offset..end];

                let mut rw_hdr = VsockPacketHeader::new();
                rw_hdr
                    .set_src_cid(guest_cid)
                    .set_dst_cid_raw(VSOCK_HOST_CID)
                    .set_src_port(guest_port)
                    .set_dst_port(vsock_port)
                    .set_len(payload.len() as u32)
                    .set_socket_type(VsockSocketType::Stream)
                    .set_op(VsockPacketOp::ReadWrite)
                    .set_buf_alloc(buf_alloc)
                    .set_fwd_cnt(0);

                // Each packet is a two-descriptor chain: header, then body.
                let d_hdr = harness.add_tx_readable(hdr_as_bytes(&rw_hdr));
                let d_body = harness.add_tx_readable(payload);
                harness.chain_tx(d_hdr, d_body);
                harness.publish_tx(d_hdr);
            }

            notify.queue_notify(VSOCK_TX_QUEUE).unwrap();

            // Wait for the poller to consume this entire batch before
            // we overwrite the descriptor slots in the next iteration.
            // (this_batch is at most batch_packets, so the cast to u16
            // cannot truncate.)
            tx_consumed += this_batch as u16;
            wait_for_condition(|| harness.tx_used_idx() >= tx_consumed, 10000);

            guest_sent += this_batch * chunk_size;
            if guest_sent > total_bytes {
                guest_sent = total_bytes;
            }
        }

        let received = tcp_reader.join().unwrap();
        assert_eq!(received.len(), total_bytes);
        // Compare with `==` inside assert! rather than assert_eq!.
        // NOTE(review): presumably to avoid dumping two 10 MiB buffers on
        // failure; confirm intent.
        assert!(received == guest_data, "guest->host data mismatch");

        // Phase 2 (host -> guest).
        // A writer thread pushes data into the TCP socket while the
        // main thread replenishes RX descriptors in batches, reads
        // completed used entries, and reuses descriptor slots once
        // the entire batch has been consumed.
        let host_data: Vec<u8> =
            (0..total_bytes).map(|i| ((i + 7) % 251) as u8).collect();

        let tcp_writer = {
            let mut stream = accepted.try_clone().unwrap();
            let data = host_data.clone();
            std::thread::spawn(move || {
                stream.write_all(&data).unwrap();
            })
        };

        let mut host_to_guest = Vec::with_capacity(total_bytes);

        // Skip all used entries produced before this phase (RESPONSE +
        // any credit updates from Phase 1).
        let mut rx_next_used = harness.rx_used_idx();
        let rx_batch = 16u16;
        let mut descs_outstanding = 0u16;

        while host_to_guest.len() < total_bytes {
            // When all outstanding descriptors have been consumed we can
            // safely reuse the descriptor slots and data region.
            if descs_outstanding == 0 {
                harness.reset_rx_cursors();

                for _ in 0..rx_batch {
                    harness.add_rx_writable(4096);
                    descs_outstanding += 1;
                }
                notify.queue_notify(VSOCK_RX_QUEUE).unwrap();
            }

            // Wait for at least one new used entry.
            wait_for_condition(|| harness.rx_used_idx() > rx_next_used, 10000);

            // Drain all currently available used entries.
            let current_used = harness.rx_used_idx();
            while rx_next_used < current_used {
                let (hdr, data) = harness.read_vsock_packet(rx_next_used);
                rx_next_used += 1;
                // NOTE(review): this assumes every used entry seen here
                // belongs to the current batch; otherwise the subtraction
                // would underflow. Confirm against the harness.
                descs_outstanding -= 1;

                if hdr.op() == Some(VsockPacketOp::ReadWrite) {
                    host_to_guest.extend_from_slice(&data);
                }
                // Credit updates and other control packets are
                // silently consumed — they're expected here.
            }
        }

        tcp_writer.join().unwrap();
        assert_eq!(host_to_guest.len(), total_bytes);
        assert!(host_to_guest == host_data, "host->guest data mismatch");

        // Shut the poller down: pause, halt, then join its thread.
        notify.pause().unwrap();
        notify.halt();
        handle.join().unwrap();
    }

    /// Closing the host-side TCP socket should cause the poller to send
    /// a VIRTIO_VSOCK_OP_SHUTDOWN packet to the guest with both
    /// VIRTIO_VSOCK_SHUTDOWN_F_SEND and VIRTIO_VSOCK_SHUTDOWN_F_RECEIVE set.
    /// Because the guest never answers with a RST, the test then expects the
    /// host side to send a RST of its own.
    #[test]
    fn host_socket_eof_sends_shutdown() {
        let vsock_port = 9000;
        let guest_port = 7000;
        let guest_cid = GuestCid::try_from(50).unwrap();
        let (listener, backends) = bind_test_backend(vsock_port);
        listener.set_nonblocking(false).unwrap();

        let mut harness = VsockTestHarness::new();
        let vq = harness.make_vsock_vq();
        let log = test_logger();
        let start_barrier = Arc::new(Barrier::new(2));
        let poller = VsockPoller::new(
            log,
            start_barrier.clone(),
            guest_cid,
            vq,
            backends,
        )
        .unwrap();

        // Provide RX descriptors for RESPONSE + SHUTDOWN (+ the later RST)
        for _ in 0..4 {
            harness.add_rx_writable(4096);
        }

        let notify = poller.notify_handle();
        let handle = poller.run();
        start_barrier.wait();

        // Establish connection
        let mut req_hdr = VsockPacketHeader::new();
        req_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Request)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let d_tx = harness.add_tx_readable(hdr_as_bytes(&req_hdr));
        harness.publish_tx(d_tx);
        notify.queue_notify(VSOCK_TX_QUEUE).unwrap();

        // Accept the connection, wait for RESPONSE
        let accepted = listener.accept().unwrap().0;
        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        // Close the host-side socket to produce EOF
        drop(accepted);

        // The poller should detect EOF on the next POLLIN and send
        // a SHUTDOWN packet to the guest.
        wait_for_condition(|| harness.rx_used_idx() >= 2, 5000);

        // Read back the packet from RX used ring entry 1
        let (hdr, _data) = harness.read_vsock_packet(1);

        assert_eq!(hdr.op(), Some(VsockPacketOp::Shutdown));
        assert_eq!(hdr.src_cid(), VSOCK_HOST_CID);
        assert_eq!(hdr.dst_cid(), guest_cid.get());
        assert_eq!(hdr.src_port(), vsock_port);
        assert_eq!(hdr.dst_port(), guest_port);
        // Both shutdown directions are expected to be flagged.
        assert_eq!(
            hdr.flags(),
            VsockPacketFlags::VIRTIO_VSOCK_SHUTDOWN_F_SEND
                | VsockPacketFlags::VIRTIO_VSOCK_SHUTDOWN_F_RECEIVE
        );

        // Since we don't send a RST from the guest-side, the host-side should
        // send us one after `DEFAULT_QUIESCE_TIMEOUT`.
        wait_for_condition(|| harness.rx_used_idx() >= 3, 5000);

        // Read back the packet from RX used ring entry 2
        let (hdr, _data) = harness.read_vsock_packet(2);

        assert_eq!(hdr.op(), Some(VsockPacketOp::Reset));
        assert_eq!(hdr.src_cid(), VSOCK_HOST_CID);
        assert_eq!(hdr.dst_cid(), guest_cid.get());
        assert_eq!(hdr.src_port(), vsock_port);
        assert_eq!(hdr.dst_port(), guest_port);

        // Shut the poller down: pause, halt, then join its thread.
        notify.pause().unwrap();
        notify.halt();
        handle.join().unwrap();
    }

    /// The guest sends data followed by a SHUTDOWN with both the send and
    /// receive flags set, while the host never reads from its socket. The
    /// test expects the host side to answer with a RST as the second RX used
    /// entry (after the RESPONSE).
    #[test]
    fn end_to_end_guest_to_host_closes_half_open() {
        let vsock_port = 9300;
        let guest_port = 7300;
        let guest_cid = GuestCid::try_from(50).unwrap();
        let (listener, backends) = bind_test_backend(vsock_port);
        listener.set_nonblocking(false).unwrap();

        let mut harness = VsockTestHarness::new();
        let vq = harness.make_vsock_vq();
        let log = test_logger();
        let start_barrier = Arc::new(Barrier::new(2));
        let poller = VsockPoller::new(
            log,
            start_barrier.clone(),
            guest_cid,
            vq,
            backends,
        )
        .unwrap();

        // Pre-populate RX queue with writable descriptors for RESPONSE + data
        for _ in 0..8 {
            harness.add_rx_writable(4096);
        }

        let notify = poller.notify_handle();
        let handle = poller.run();
        start_barrier.wait();

        // Write REQUEST packet into TX queue
        let mut req_hdr = VsockPacketHeader::new();
        req_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Request)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let d_tx = harness.add_tx_readable(hdr_as_bytes(&req_hdr));
        harness.publish_tx(d_tx);
        notify.queue_notify(VSOCK_TX_QUEUE).unwrap();

        // Accept the TCP connection (blocks until poller connects)
        // The binding is kept (rather than `_`) so the accepted socket stays
        // open until the end of the test.
        let _accepted = listener.accept().unwrap().0;

        // Wait for RESPONSE on RX queue
        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        // Guest->Host: send RW packet with payload
        let payload = b"hello from guest via vsock end-to-end!";
        let mut rw_hdr = VsockPacketHeader::new();
        rw_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(payload.len() as u32)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::ReadWrite)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let d_hdr = harness.add_tx_readable(hdr_as_bytes(&rw_hdr));
        let d_body = harness.add_tx_readable(payload);
        harness.chain_tx(d_hdr, d_body);
        harness.publish_tx(d_hdr);
        notify.queue_notify(VSOCK_TX_QUEUE).unwrap();

        // Send a Guest->Host SHUTDOWN packet with both flags set,
        // indicating the guest will no longer send or receive data.
        let mut shutdown_hdr = VsockPacketHeader::new();
        shutdown_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Shutdown)
            .set_flags(
                VsockPacketFlags::VIRTIO_VSOCK_SHUTDOWN_F_SEND
                    | VsockPacketFlags::VIRTIO_VSOCK_SHUTDOWN_F_RECEIVE,
            )
            .set_buf_alloc(0)
            .set_fwd_cnt(0);

        let d_sd = harness.add_tx_readable(hdr_as_bytes(&shutdown_hdr));
        harness.publish_tx(d_sd);
        notify.queue_notify(VSOCK_TX_QUEUE).unwrap();

        // Don't read any data from the underlying host socket.

        // The connection should be moved into the quiesce state and since
        // we didn't drain the internal vbuf in a timely manner we should
        // receive a RST closing the connection.
        wait_for_condition(|| harness.rx_used_idx() >= 2, 5000);

        // Used entry 0 was the RESPONSE; entry 1 should be the RST.
        let (rst_hdr, _) = harness.read_vsock_packet(1);
        assert_eq!(rst_hdr.op(), Some(VsockPacketOp::Reset));
        assert_eq!(rst_hdr.src_cid(), VSOCK_HOST_CID);
        assert_eq!(rst_hdr.dst_cid(), guest_cid.get());
        assert_eq!(rst_hdr.src_port(), vsock_port);
        assert_eq!(rst_hdr.dst_port(), guest_port);

        // Shut the poller down: pause, halt, then join its thread.
        notify.pause().unwrap();
        notify.halt();
        handle.join().unwrap();
    }

    /// Sending a [`PausedCmd::Reset`] event while the event-loop is paused
    /// should drop all active connections and clear cached state so that no
    /// stale [`GuestAddr`]s or TCP sockets survive into the next guest session.
    ///
    /// The observable check is on the host-side TCP socket: after the reset,
    /// a read on it must report EOF or a connection error. A read that merely
    /// times out means the socket is still open and fails the test.
    #[test]
    fn reset_clears_connections() {
        let vsock_port = 9400;
        let guest_port = 7400;
        let guest_cid = GuestCid::try_from(50).unwrap();
        let (listener, backends) = bind_test_backend(vsock_port);
        listener.set_nonblocking(false).unwrap();

        let mut harness = VsockTestHarness::new();
        let vq = harness.make_vsock_vq();
        let log = test_logger();
        let start_barrier = Arc::new(Barrier::new(2));
        let poller = VsockPoller::new(
            log,
            start_barrier.clone(),
            guest_cid,
            vq,
            backends,
        )
        .unwrap();

        for _ in 0..4 {
            harness.add_rx_writable(4096);
        }

        let notify = poller.notify_handle();
        let handle = poller.run();
        start_barrier.wait();

        // Establish a connection
        let mut req_hdr = VsockPacketHeader::new();
        req_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Request)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let d_tx = harness.add_tx_readable(hdr_as_bytes(&req_hdr));
        harness.publish_tx(d_tx);
        notify.queue_notify(VSOCK_TX_QUEUE).unwrap();

        let mut accepted = listener.accept().unwrap().0;
        accepted.set_nonblocking(false).unwrap();
        // Bound the read below so a socket that was never closed shows up as
        // a timeout instead of hanging the test.
        accepted.set_read_timeout(Some(Duration::from_secs(5))).unwrap();
        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        // Pause then reset.
        notify.pause().unwrap();
        notify.wait_stopped();
        notify.reset();

        // NOTE: We don't have a way to actually validate that the reset call
        // above has not left behind a `GuestAddr` as a part of the `VsockVq`.

        // The host-side TCP socket should now be closed because the
        // poller dropped all connections during reset.
        let mut buf = [0u8; 1];
        match accepted.read(&mut buf) {
            // Orderly close by the peer.
            Ok(0) => {}
            Ok(n) => panic!("expected EOF or error, got {n} bytes"),
            // The read timeout expired: nothing closed the connection, which
            // is exactly the failure this test exists to catch. Which of the
            // two kinds is reported for a timeout is platform-specific.
            Err(e)
                if matches!(
                    e.kind(),
                    std::io::ErrorKind::WouldBlock
                        | std::io::ErrorKind::TimedOut
                ) =>
            {
                panic!("host socket still open after reset: {e}")
            }
            // Any other error (e.g. a connection reset) also means the
            // connection is gone.
            Err(_) => {}
        }

        // Shut the poller down: pause, halt, then join its thread.
        notify.pause().unwrap();
        notify.halt();
        handle.join().unwrap();
    }

    /// Pausing freezes the event loop but preserves all connection
    /// state.  After resume, host->guest data written on a connection that
    /// was established before the pause is still delivered to the guest.
    #[test]
    fn pause_resume_preserves_connections() {
        let vsock_port = 9500;
        let guest_port = 7500;
        let guest_cid = GuestCid::try_from(50).unwrap();
        let (listener, backends) = bind_test_backend(vsock_port);
        listener.set_nonblocking(false).unwrap();

        let mut harness = VsockTestHarness::new();
        let vq = harness.make_vsock_vq();
        let log = test_logger();
        let start_barrier = Arc::new(Barrier::new(2));
        let poller = VsockPoller::new(
            log,
            start_barrier.clone(),
            guest_cid,
            vq,
            backends,
        )
        .unwrap();

        // RX descriptors for the RESPONSE and the data sent after resume.
        for _ in 0..8 {
            harness.add_rx_writable(4096);
        }

        let notify = poller.notify_handle();
        let handle = poller.run();
        start_barrier.wait();

        // Establish a connection
        let mut req_hdr = VsockPacketHeader::new();
        req_hdr
            .set_src_cid(guest_cid)
            .set_dst_cid_raw(VSOCK_HOST_CID)
            .set_src_port(guest_port)
            .set_dst_port(vsock_port)
            .set_len(0)
            .set_socket_type(VsockSocketType::Stream)
            .set_op(VsockPacketOp::Request)
            .set_buf_alloc(65536)
            .set_fwd_cnt(0);

        let d_tx = harness.add_tx_readable(hdr_as_bytes(&req_hdr));
        harness.publish_tx(d_tx);
        notify.queue_notify(VSOCK_TX_QUEUE).unwrap();

        let mut accepted = listener.accept().unwrap().0;
        accepted.set_nonblocking(false).unwrap();
        accepted.set_read_timeout(Some(Duration::from_secs(5))).unwrap();
        wait_for_condition(|| harness.rx_used_idx() >= 1, 5000);

        // Pause and resume — connection state should survive.
        notify.pause().unwrap();
        notify.wait_stopped();
        notify.resume();

        // Write data from the host side through the still-open TCP
        // socket. The poller should deliver it to the guest.
        let host_payload = b"data after resume";
        accepted.write_all(host_payload).unwrap();
        accepted.flush().unwrap();

        wait_for_condition(|| harness.rx_used_idx() >= 2, 5000);

        // Used entry 0 was the RESPONSE; entry 1 should carry the payload.
        let (rw_hdr, data) = harness.read_vsock_packet(1);
        assert_eq!(rw_hdr.op(), Some(VsockPacketOp::ReadWrite));
        assert_eq!(&data, host_payload);

        // Shut the poller down: pause, halt, then join its thread.
        notify.pause().unwrap();
        notify.halt();
        handle.join().unwrap();
    }

    /// A poller with no port mappings and no traffic can be paused and then
    /// halted, and its thread joins without panicking.
    #[test]
    fn halt_from_paused() {
        let guest_cid = GuestCid::try_from(50).unwrap();

        let harness = VsockTestHarness::new();
        let vq = harness.make_vsock_vq();
        let log = test_logger();
        let start_barrier = Arc::new(Barrier::new(2));
        // No backends are registered for this test.
        let poller = VsockPoller::new(
            log,
            start_barrier.clone(),
            guest_cid,
            vq,
            IdHashMap::new(),
        )
        .unwrap();

        let notify = poller.notify_handle();
        let handle = poller.run();
        start_barrier.wait();

        // Halt is issued from the paused state; join() surfaces any panic
        // on the poller thread.
        notify.pause().unwrap();
        notify.halt();
        handle.join().unwrap();
    }
}


================================================
FILE: lib/propolis/src/vsock/poller_stub.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;
use std::sync::Barrier;
use std::thread::JoinHandle;

use iddqd::IdHashMap;
use slog::Logger;

use crate::hw::virtio::vsock::VsockVq;
use crate::vsock::proxy::VsockPortMapping;
use crate::vsock::GuestCid;

bitflags! {
    /// Poll event bits exposed by the stub poller module, with values taken
    /// from libc's `POLLIN` / `POLLOUT`.
    pub struct PollEvents: i32 {
        /// Mirrors `libc::POLLIN`.
        const IN = libc::POLLIN as i32;
        /// Mirrors `libc::POLLOUT`.
        const OUT = libc::POLLOUT as i32;
    }
}

/// Stub notification handle for the vsock poller on non-illumos systems.
///
/// Queue notifications always fail; the lifecycle calls (`pause`, `resume`,
/// `reset`, `halt`, `wait_stopped`) do nothing.
pub struct VsockPollerNotify;

impl VsockPollerNotify {
    /// Always returns an error, since there is no poller to notify.
    pub fn queue_notify(&self, _id: u16) -> std::io::Result<()> {
        Err(std::io::Error::other("not available on non-illumos systems"))
    }

    /// No-op; always succeeds.
    pub fn pause(&self) -> std::io::Result<()> {
        Ok(())
    }

    /// No-op.
    pub fn resume(&self) {}

    /// No-op.
    pub fn reset(&self) {}

    /// No-op.
    pub fn halt(&self) {}

    /// No-op; returns immediately.
    pub fn wait_stopped(&self) {}
}

pub struct VsockPoller;

impl VsockPoller {
    pub fn new(
        _log: Logger,
        _start_barrier: Arc<Barrier>,
        _cid: GuestCid,
        _queues: VsockVq,
        _port_mappings: IdHashMap<VsockPortMapping>,
    ) -> std::io::Result<Self> {
        return Err(std::io::Error::other(
            "VsockPoller is not available on non-illumos systems",
        ));
    }

    pub fn notify_handle(&self) -> VsockPollerNotify {
        VsockPollerNotify {}
    }

    pub fn run(self) -> JoinHandle<()> {
        std::thread::Builder::new()
            .name("vsock-event-loop".to_string())
            .spawn(move || {})
            .expect("failed to spawn vsock event loop")
    }
}


================================================
FILE: lib/propolis/src/vsock/proxy.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::net::SocketAddr;
use std::net::TcpStream;
use std::num::NonZeroUsize;
use std::num::Wrapping;
use std::os::fd::AsRawFd;
use std::os::fd::RawFd;
use std::sync::Arc;
use std::sync::Barrier;
use std::thread::JoinHandle;
use std::time::Duration;

use iddqd::IdHashItem;
use iddqd::IdHashMap;
use nix::poll::PollFlags;
use serde::Deserialize;
use slog::error;
use slog::Logger;

use crate::hw::virtio::vsock::VsockVq;
use crate::vsock::buffer::VsockBuf;
use crate::vsock::buffer::VsockBufError;
use crate::vsock::packet::VsockPacket;
use crate::vsock::packet::VsockPacketHeader;
use crate::vsock::poller::VsockPoller;
use crate::vsock::poller::VsockPollerNotify;
use crate::vsock::GuestCid;
use crate::vsock::VsockBackend;
use crate::vsock::VsockError;

/// Default buffer size for guest->host data: 128 KiB.
///
/// This is the capacity given to each connection's `VsockBuf` ring buffer
/// when a `VsockProxyConn` is created.
pub const CONN_TX_BUF_SIZE: usize = 1024 * 128;

/// Connection lifecycle state for a vsock connection.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConnState {
    /// The guest has sent us a VIRTIO_VSOCK_OP_REQUEST
    Init,
    /// We have sent VIRTIO_VSOCK_OP_RESPONSE - connection can send/recv data
    Established,
    /// The connection is in the process of closing - read and write halves are
    /// tracked separately.
    /// NB: This is tracking what the Guest has told us about itself.
    Closing {
        read: bool,
        write: bool,
    },
}

/// Key identifying a connection by its (host port, guest port) pair.
#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq)]
pub struct ConnKey {
    /// The port the guest is transmitting to.
    pub(crate) host_port: u32,
    /// The port the guest is transmitting from.
    pub(crate) guest_port: u32,
}

// This impl allows us to convert to and from a portev_user object (see
// port_associate3C). The conversion to and from a usize allows us to encode
// the key in the pointer value itself rather than allocating memory.
//
// NB: This object is defined as a `*mut c_void` and therefore will not be
// 64bits on all platforms, but we currently only support x86_64 hardware,
// therefore we are leaving a static assertion behind as a future hint to
// ourselves.
impl ConnKey {
    /// Pack both ports into a single `usize`: `host_port` occupies the upper
    /// 32 bits and `guest_port` the lower 32 bits.
    pub fn to_portev_user(self) -> usize {
        static_assertions::assert_eq_size!(u64, usize);
        let Self { host_port, guest_port } = self;
        let upper = (host_port as usize) << 32;
        let lower = guest_port as usize;
        upper | lower
    }

    /// Rebuild a key from a value produced by [`ConnKey::to_portev_user`].
    pub fn from_portev_user(val: usize) -> Self {
        let host_port = (val >> 32) as u32;
        // The truncating cast deliberately keeps only the low 32 bits.
        let guest_port = val as u32;
        Self { host_port, guest_port }
    }
}

/// Errors produced while creating or driving a [`VsockProxyConn`].
#[derive(Debug, thiserror::Error)]
pub enum ProxyConnError {
    /// Connecting the TCP socket to the backend address failed.
    #[error("Failed to connect to vsock backend {backend}: {source}")]
    Socket {
        /// Address of the backend we tried to reach.
        backend: SocketAddr,
        /// The underlying I/O error.
        #[source]
        source: std::io::Error,
    },
    /// Switching the connected socket to non-blocking mode failed.
    #[error("Failed to put socket into nonblocking mode: {0}")]
    NonBlocking(#[source] std::io::Error),
    /// A state change was requested that is not valid from the current state.
    #[error("Cannot transition connection from {from:?} to {to:?}")]
    InvalidStateTransition { from: ConnState, to: ConnState },
}

/// An established guest<=>host connection.
///
/// Note that the internal state of the proxy connection uses `Wrapping<u32>`
/// because the virtio spec uses the following calculation to determine how much
/// buffer space a guest has:
///
/// ```text
/// /* tx_cnt is the sender's free-running bytes transmitted counter */
/// u32 peer_free = peer_buf_alloc - (tx_cnt - peer_fwd_cnt);
/// ```
///
/// The lifetime of a connection can exceed u32::MAX bytes transmitted, so we
/// rely on wrapping semantics to determine the difference.
#[derive(Debug)]
pub struct VsockProxyConn {
    /// TCP connection to the host-side backend.
    pub(crate) socket: TcpStream,
    /// Current connection state.
    state: ConnState,
    /// Ring buffer used to receive packets from the guest tx virt queue.
    vbuf: VsockBuf,
    /// Bytes we've consumed from vbuf (forwarded to socket).
    fwd_cnt: Wrapping<u32>,
    /// The fwd_cnt value we last sent to the guest in a credit update.
    last_fwd_cnt_sent: Wrapping<u32>,
    /// Bytes we've sent to the guest from the socket.
    tx_cnt: Wrapping<u32>,
    /// Guest's buffer allocation.
    peer_buf_alloc: u32,
    /// Bytes the guest has consumed from their buffer.
    peer_fwd_cnt: Wrapping<u32>,
}

impl VsockProxyConn {
    /// Create a new `VsockProxyConn` connected to an underlying host socket.
    pub fn new(addr: &SocketAddr) -> Result<Self, ProxyConnError> {
        let socket =
            TcpStream::connect_timeout(addr, Duration::from_millis(100))
                .map_err(|e| ProxyConnError::Socket {
                    backend: *addr,
                    source: e,
                })?;
        socket.set_nonblocking(true).map_err(ProxyConnError::NonBlocking)?;

        Ok(Self {
            socket,
            state: ConnState::Init,
            vbuf: VsockBuf::new(NonZeroUsize::new(CONN_TX_BUF_SIZE).unwrap()),
            fwd_cnt: Wrapping(0),
            last_fwd_cnt_sent: Wrapping(0),
            tx_cnt: Wrapping(0),
            peer_buf_alloc: 0,
            peer_fwd_cnt: Wrapping(0),
        })
    }

    /// Set of `PollEvents` that this connection is interested in.
    pub fn poll_interests(&self) -> Option<PollFlags> {
        let mut interests = PollFlags::empty();
        interests.set(PollFlags::POLLOUT, self.has_buffered_data());
        interests.set(PollFlags::POLLIN, self.guest_can_read());

        match interests.is_empty() {
            true => None,
            false => Some(interests),
        }
    }

    /// Returns `true` if the connection has data pending in its ring buffer
    /// that needs to be flushed to the underlying socket.
    pub fn has_buffered_data(&self) -> bool {
        !self.vbuf.is_empty()
    }

    /// Set the connection to established.
    pub fn set_established(&mut self) -> Result<(), ProxyConnError> {
        match self.state {
            ConnState::Init => self.state = ConnState::Established,
            current => {
                return Err(ProxyConnError::InvalidStateTransition {
                    from: current,
                    to: ConnState::Established,
                })
            }
        }

        Ok(())
    }

    /// Check if the connection can read from the host socket.
    pub fn guest_can_read(&self) -> bool {
        matches!(
            self.state,
            ConnState::Established | ConnState::Closing { read: false, .. }
        )
    }

    pub fn shutdown_guest_read(&mut self) -> Result<(), ProxyConnError> {
        self.state = match self.state {
            ConnState::Established => {
                ConnState::Closing { read: true, write: false }
            }
            ConnState::Closing { write, .. } => {
                ConnState::Closing { read: true, write: write }
            }
            current => {
                return Err(ProxyConnError::InvalidStateTransition {
                    from: current,
                    to: ConnState::Closing { read: true, write: false },
                })
            }
        };

        Ok(())
    }

    pub fn shutdown_guest_write(&mut self) -> Result<(), ProxyConnError> {
        self.state = match self.state {
            ConnState::Established => {
                ConnState::Closing { read: false, write: true }
            }
            ConnState::Closing { read, .. } => {
                ConnState::Closing { read, write: true }
            }
            current => {
                return Err(ProxyConnError::InvalidStateTransition {
                    from: current,
                    to: ConnState::Closing { read: true, write: false },
                })
            }
        };

        Ok(())
    }

    /// Check if the connection should be removed.
    pub fn should_close(&self) -> bool {
        matches!(self.state, ConnState::Closing { read: true, write: true })
    }

    /// Update peer credit info from a packet header.
    pub fn update_peer_credit(&mut self, header: &VsockPacketHeader) {
        self.peer_buf_alloc = header.buf_alloc();
        self.peer_fwd_cnt = Wrapping(header.fwd_cnt());
    }

    /// Process a packet received from the guest tx queue.
    pub fn recv_packet(
        &mut self,
        packet: VsockPacket,
    ) -> Result<(), VsockBufError> {
        self.vbuf.push(packet.data)
    }

    pub fn flush(&mut self) -> std::io::Result<usize> {
        self.vbuf.write_to(&mut self.socket)
    }

    /// Calculate how much data we can send to the guest based on their credit.
    pub fn peer_credit(&self) -> u32 {
        let in_flight = (self.tx_cnt - self.peer_fwd_cnt).0;
        self.peer_buf_alloc.saturating_sub(in_flight)
    }

    /// Update fwd_cnt after consuming data from vbuf.
    pub fn update_fwd_cnt(&mut self, bytes: u32) {
        self.fwd_cnt += Wrapping(bytes);
    }

    /// Update tx_cnt after sending data to guest.
    pub fn update_tx_cnt(&mut self, bytes: u32) {
        self.tx_cnt += Wrapping(bytes);
    }

    /// Get our current fwd_cnt to report to the guest.
    pub fn fwd_cnt(&self) -> u32 {
        self.fwd_cnt.0
    }

    /// Get our buffer allocation to report to the guest.
    pub fn buf_alloc(&self) -> u32 {
        self.vbuf.capacity() as u32
    }

    /// Check if we should send a credit update to the guest.
    ///
    /// Returns true if we've consumed more than half of our buffer capacity
    /// since the last credit update was sent.
    pub fn needs_credit_update(&self) -> bool {
        let bytes_consumed_since_update =
            (self.fwd_cnt - self.last_fwd_cnt_sent).0;
        bytes_consumed_since_update > (self.vbuf.capacity() / 2) as u32
    }

    /// Mark that we've sent a credit update with the current fwd_cnt.
    pub fn mark_credit_sent(&mut self) {
        self.last_fwd_cnt_sent = self.fwd_cnt;
    }

    pub fn get_fd(&self) -> RawFd {
        self.socket.as_raw_fd()
    }
}

/// Associates a vsock port number with the socket address of a backend.
#[derive(Deserialize, Debug, Clone, Copy)]
pub struct VsockPortMapping {
    /// Port number this mapping applies to; also used as the map key.
    port: u32,
    // TODO this could be extended to support Unix sockets as well.
    /// Socket address associated with `port`.
    addr: SocketAddr,
}

impl VsockPortMapping {
    pub fn new(port: u32, addr: SocketAddr) -> Self {
        Self { port, addr }
    }

    pub fn addr(&self) -> &SocketAddr {
        &self.addr
    }
}

// Lets `VsockPortMapping` be stored in an `IdHashMap`, keyed by its port.
impl IdHashItem for VsockPortMapping {
    type Key<'a> = u32;

    /// The port number is the unique key for a mapping.
    fn key(&self) -> Self::Key<'_> {
        self.port
    }

    // Helper macro supplied by `iddqd` for `IdHashItem` impls.
    iddqd::id_upcast!();
}

/// virtio-socket backend that proxies between a guest and host sockets.
///
/// NOTE(review): this comment previously said "host UDS", but the proxy
/// connections in this module are `TcpStream`s addressed by `SocketAddr`, and
/// Unix socket support is only mentioned as a TODO on `VsockPortMapping`.
pub struct VsockProxy {
    /// Logger used when reporting queue notification failures.
    log: Logger,
    /// Barrier shared with the poller thread; released by [`Self::start`].
    start_barrier: Arc<Barrier>,
    /// Handle used to send notifications and control requests to the poller.
    poller: VsockPollerNotify,
    /// Join handle for the poller's event loop thread; held but never joined
    /// in this block.
    _evloop_handle: JoinHandle<()>,
}

impl VsockProxy {
    /// Create the proxy and spawn its `VsockPoller` event loop.
    ///
    /// The event loop is handed a barrier shared with this proxy; see the
    /// comment below for how it gates the poller until [`Self::start`].
    ///
    /// # Panics
    ///
    /// Panics if `VsockPoller::new` returns an error.
    pub fn new(
        log: Logger,
        cid: GuestCid,
        queues: VsockVq,
        port_mappings: IdHashMap<VsockPortMapping>,
    ) -> Self {
        // We use a `Barrier` to gate when the spawned `VsockPoller` thread is
        // allowed to transition to the running state and is allowed to start
        // using guest memory. Forward progress will be made when the
        // virtio-socket device calls `Lifecycle::start`.
        let start_barrier = Arc::new(Barrier::new(2));
        let evloop = VsockPoller::new(
            log.clone(),
            start_barrier.clone(),
            cid,
            queues,
            port_mappings,
        )
        .unwrap();
        // Grab the notification handle before `run` consumes the poller.
        let poller = evloop.notify_handle();
        let jh = evloop.run();

        Self { log, start_barrier, poller, _evloop_handle: jh }
    }

    /// Notification from the vsock device that one of the queues has had an
    /// event.
    fn queue_notify(&self, vq_id: u16) -> std::io::Result<()> {
        self.poller.queue_notify(vq_id)
    }

    /// Release the poller thread by waiting on the shared start barrier.
    pub fn start(&self) {
        // TODO this *should* only be called once given the guarantees propolis
        // gives us. We might want to start using the `Indicator` type in the
        // future.
        self.start_barrier.wait();
    }

    /// Forward a pause request to the poller.
    pub fn pause(&self) -> std::io::Result<()> {
        self.poller.pause()
    }

    /// Block until the poller reports that it has stopped.
    pub fn wait_stopped(&self) {
        self.poller.wait_stopped();
    }

    /// Forward a resume request to the poller.
    pub fn resume(&self) {
        self.poller.resume();
    }

    /// Forward a reset request to the poller.
    pub fn reset(&self) {
        self.poller.reset();
    }

    /// Ask the poller to halt, then wait for it to stop.
    pub fn halt(&self) {
        self.poller.halt();
        self.poller.wait_stopped();
    }
}

impl VsockBackend for VsockProxy {
    /// Forward a virtqueue notification to the poller, mapping any I/O
    /// failure to `VsockError::QueueNotify` for the affected queue.
    fn queue_notify(&self, queue_id: u16) -> Result<(), VsockError> {
        self.queue_notify(queue_id)
            // Log the raw error in addition to returning the top level
            // `VsockError`
            .inspect_err(|e| {
                error!(&self.log,
                    "failed to send virtqueue notification: {e}";
                    "queue" => %queue_id,
                )
            })
            .map_err(|_| VsockError::QueueNotify { queue: queue_id })
    }
}


================================================
FILE: lib/propolis-client/Cargo.toml
================================================
[package]
name = "propolis-client"
version = "0.1.0"
edition = "2021"

[lib]
doctest = false

[dependencies]
async-trait.workspace = true
base64.workspace = true
crucible-client-types.workspace = true
futures.workspace = true
progenitor.workspace = true
progenitor-client.workspace = true
propolis-api-types-versions.workspace = true
rand.workspace = true
reqwest = { workspace = true, features = ["json", "rustls"] }
schemars = { workspace = true, features = ["uuid1"] }
serde = { workspace = true, features = ["derive"] }
serde_json.workspace = true
slog.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["net"] }
tokio-tungstenite.workspace = true
uuid = { workspace = true, features = ["serde", "v4"] }

[dev-dependencies]
tokio = { workspace = true, features = ["test-util", "macros"] }


================================================
FILE: lib/propolis-client/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! A client for the Propolis hypervisor frontend's server API.

/// Re-exports of types related to instance specs.
///
/// These types are re-exported for the convenience of components like
/// sled-agent that may wish to expose instance specs in their own APIs.
/// Defining the sled-agent API in terms of these "native" types allows
/// sled-agent to reuse their trait implementations (and in particular use
/// "manual" impls of things that Progenitor would otherwise derive).
///
/// In the generated client, the native "top-level" instance spec and component
/// types ([`crate::instance_spec::VersionedInstanceSpec`],
/// [`crate::instance_spec::InstanceSpecV0`], and
/// [`crate::instance_spec::ReplacementComponent`]) replace their generated
/// counterparts. This obviates the need to maintain `From` impls to convert
/// between native and generated types.
pub mod instance_spec {
    // Component definitions (backends, board, and devices) from the latest
    // API version.
    pub use propolis_api_types_versions::latest::components::{
        backends::*, board::*, devices::*,
    };
    // Instance-level types from the latest API version.
    pub use propolis_api_types_versions::latest::instance::{
        InstanceMetadata, InstanceProperties, ReplacementComponent,
    };
    // Everything in the latest API version's `instance_spec` module.
    pub use propolis_api_types_versions::latest::instance_spec::*;
    // Re-export v1 types with V0 suffix for backward compatibility with
    // progenitor-generated clients.
    pub use propolis_api_types_versions::v1::instance_spec::{
        Component as ComponentV0, InstanceSpec as InstanceSpecV0,
        InstanceSpecGetResponse as InstanceSpecGetResponseV0,
        InstanceSpecStatus as InstanceSpecStatusV0, VersionedInstanceSpec,
    };
}

// Re-export Crucible client types that appear in their serialized forms in
// instance specs. This allows clients to ensure they serialize/deserialize
// these types using the same versions as the Propolis client associated with
// the server they want to talk to.
pub use crucible_client_types::{CrucibleOpts, VolumeConstructionRequest};

// Generate the client from the latest propolis-server OpenAPI document,
// substituting the native `propolis_api_types_versions` types listed under
// `replace` for their generated counterparts.
progenitor::generate_api!(
    spec = "../../openapi/propolis-server/propolis-server-latest.json",
    interface = Builder,
    tags = Separate,
    replace = {
        PciPath = propolis_api_types_versions::latest::instance_spec::PciPath,
        ReplacementComponent = propolis_api_types_versions::latest::instance::ReplacementComponent,
        InstanceSpec = propolis_api_types_versions::latest::instance_spec::InstanceSpec,
        InstanceSpecStatus = propolis_api_types_versions::latest::instance_spec::InstanceSpecStatus,
        InstanceProperties = propolis_api_types_versions::latest::instance::InstanceProperties,
        InstanceMetadata = propolis_api_types_versions::latest::instance::InstanceMetadata,
        InstanceSpecGetResponse = propolis_api_types_versions::latest::instance_spec::InstanceSpecGetResponse,
        SmbiosType1Input = propolis_api_types_versions::latest::instance_spec::SmbiosType1Input,
        VersionedInstanceSpec = propolis_api_types_versions::latest::instance_spec::VersionedInstanceSpec,
        CpuidEntry = propolis_api_types_versions::latest::components::board::CpuidEntry,
    },
    // Automatically derive JsonSchema for instance spec-related types so that
    // they can be reused in sled-agent's API. This can't be done with a
    // `derives = [schemars::JsonSchema]` directive because the `SpecKey` type
    // needs to implement that trait manually (see below).
    //
    // NOTE(review): none of the patch entries below adds a `JsonSchema`
    // derive, and no manual `SpecKey` impl follows in this file; this comment
    // may be stale. Confirm before relying on it.
    patch = {
        BootSettings = { derives = [ Default ] },
        CpuidEntry = { derives = [ PartialEq, Eq, Copy ] },
        InstanceMetadata = { derives = [ PartialEq ] },
        SpecKey = { derives = [ PartialEq, Eq, Ord, PartialOrd, Hash ] },
    }
);

pub mod support;


================================================
FILE: lib/propolis-client/src/support.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::HashMap;
use std::net::SocketAddr;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::time::Duration;

use self::tungstenite::http;
use futures::{Sink, SinkExt, StreamExt};
use serde::{Deserialize, Serialize};
use slog::Logger;
use tokio::io::{AsyncRead, AsyncWrite};
use tokio_tungstenite::tungstenite::protocol::Role;
use tokio_tungstenite::tungstenite::{Error as WSError, Message as WSMessage};
// re-export as an escape hatch for crate-version-matching problems
pub use tokio_tungstenite::{tungstenite, WebSocketStream};

use crate::types::{Chipset, I440Fx};
use crate::Client as PropolisClient;

impl Default for Chipset {
    fn default() -> Self {
        Self::I440Fx(I440Fx { enable_pcie: false })
    }
}

/// Builds a 20-byte NVMe device serial number from the bytes of `s`.
///
/// At most the first 20 bytes of `s` are used. If `s` is shorter than 20
/// bytes, the remaining bytes of the serial number are set to `pad`.
///
/// NOTE: Version 1.2.1 of the NVMe specification (June 5, 2016) specifies in
/// section 1.5 that ASCII data fields, including serial numbers, must be
/// left-justified and must use 0x20 bytes (spaces) as the padding value. This
/// function allows callers to choose a non-0x20 padding value to preserve the
/// serial numbers for existing disks, which serial numbers may have been used
/// previously and persisted into a VM's nonvolatile EFI variables (such as its
/// boot order variables).
//
// TODO(#790): Ideally, this routine would have no `pad` parameter at all and
// would always pad with spaces, but whether this is ultimately possible depends
// on whether Omicron can start space-padding serial numbers for disks that were
// attached to a Propolis VM that zero-padded their serial numbers.
pub fn nvme_serial_from_str(s: &str, pad: u8) -> [u8; 20] {
    // Start fully padded, then overwrite the prefix with the string's bytes.
    let mut serial = [pad; 20];
    let src = s.as_bytes();
    let copied = src.len().min(serial.len());
    serial[..copied].copy_from_slice(&src[..copied]);
    serial
}

/// Clone of `InstanceSerialConsoleControlMessage` type defined in
/// `propolis_api_types`, with which this must be kept in sync.
///
/// Until Dropshot grows the ability to add arbitrary type definitions to the
/// OpenAPI document, the types related to websocket activity will need to be
/// manually duplicated for use.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub enum InstanceSerialConsoleControlMessage {
    /// The instance is migrating. The client should reconnect to
    /// `destination`, requesting console data starting at offset
    /// `from_start`.
    Migrating { destination: SocketAddr, from_start: u64 },
}

/// A trait representing a console stream.
///
/// This is a marker trait: it adds no methods of its own and simply bundles
/// the `AsyncRead + AsyncWrite + Unpin + Send` bounds under one name.
pub trait SerialConsoleStream: AsyncRead + AsyncWrite + Unpin + Send {}
// Blanket impl: every type meeting the bounds is a `SerialConsoleStream`.
impl<T: AsyncRead + AsyncWrite + Unpin + Send> SerialConsoleStream for T {}

/// Represents a way to build a serial console stream.
#[async_trait::async_trait]
pub(crate) trait SerialConsoleStreamBuilder: Send {
    /// Build a console stream for the server at `address`, positioned
    /// according to `offset`.
    ///
    /// Returns a `WSError` if the stream cannot be built.
    async fn build(
        &mut self,
        address: SocketAddr,
        offset: WSClientOffset,
    ) -> Result<Box<dyn SerialConsoleStream>, WSError>;
}

/// A serial console builder that opens its streams through a Propolis
/// client. The builder itself carries no state.
#[derive(Debug)]
struct PropolisSerialBuilder {}

impl PropolisSerialBuilder {
    /// Returns a fresh `PropolisSerialBuilder`.
    pub fn new() -> Self {
        PropolisSerialBuilder {}
    }
}

#[async_trait::async_trait]
impl SerialConsoleStreamBuilder for PropolisSerialBuilder {
    /// Opens the serial console endpoint of the server at `address` over
    /// plain HTTP and returns the upgraded connection as the console stream.
    ///
    /// Any client error is converted to its string form and wrapped in
    /// `WSError::Http`.
    async fn build(
        &mut self,
        address: SocketAddr,
        offset: WSClientOffset,
    ) -> Result<Box<dyn SerialConsoleStream>, WSError> {
        let base_url = format!("http://{address}");
        let client = PropolisClient::new(&base_url);

        // Attach whichever offset parameter the caller asked for.
        let request = match offset {
            WSClientOffset::FromStart(bytes) => {
                client.instance_serial().from_start(bytes)
            }
            WSClientOffset::MostRecent(bytes) => {
                client.instance_serial().most_recent(bytes)
            }
        };

        let response = request.send().await.map_err(|e| {
            WSError::Http(http::Response::new(Some(
                e.to_string().into_bytes(),
            )))
        })?;

        Ok(Box::new(response.into_inner()))
    }
}

/// A serial console builder for tests.
///
/// This works by mapping `SocketAddr`s to streams, inserting an optional
/// delay in the middle.
///
/// Primarily intended for testing.
pub struct TestSerialBuilder<St> {
    /// For each address, the delay to wait before handing out the stream and
    /// the stream itself. An entry is removed once it has been built.
    client_conns_and_delays: HashMap<SocketAddr, (Duration, St)>,
}

impl<St: SerialConsoleStream> TestSerialBuilder<St> {
    fn new(
        client_conns_and_delays: impl IntoIterator<
            Item = (SocketAddr, Duration, St),
        >,
    ) -> Self {
        Self {
            client_conns_and_delays: client_conns_and_delays
                .into_iter()
                .map(|(address, delay, stream)| (address, (delay, stream)))
                .collect(),
        }
    }
}

#[async_trait::async_trait]
impl<St: SerialConsoleStream + 'static> SerialConsoleStreamBuilder
    for TestSerialBuilder<St>
{
    /// Hands out the stream registered for `address` after sleeping for its
    /// configured delay. Each stream can be taken only once; an unknown (or
    /// already consumed) address yields a `WSError::Http`.
    async fn build(
        &mut self,
        address: SocketAddr,
        // offset is currently unused by this builder. Worth testing in
        // the future.
        _offset: WSClientOffset,
    ) -> Result<Box<dyn SerialConsoleStream>, WSError> {
        match self.client_conns_and_delays.remove(&address) {
            Some((delay, stream)) => {
                tokio::time::sleep(delay).await;
                Ok(Box::new(stream))
            }
            None => {
                let reason = format!(
                    "no duplex connection found for address {address}"
                );
                Err(WSError::Http(http::Response::new(Some(
                    reason.into_bytes(),
                ))))
            }
        }
    }
}

/// Where in the serial console output a new connection should begin.
///
/// The contained value is passed unchanged as the request's `from_start` or
/// `most_recent` parameter; see the server API for the exact semantics.
pub enum WSClientOffset {
    /// Sent as the `from_start` request parameter (presumably a byte offset
    /// from the start of the console output; confirm against the server API).
    FromStart(u64),
    /// Sent as the `most_recent` request parameter (presumably a byte count
    /// back from the most recent output; confirm against the server API).
    MostRecent(u64),
}

/// This is a trivial abstraction wrapping the websocket connection returned
/// by [crate::Client::instance_serial], providing the additional
/// functionality of connecting to the new propolis-server when an instance
/// is migrated (thus providing the illusion of the connection being
/// seamlessly maintained through migration)
///
/// # `Sink` implementation
///
/// `InstanceSerialConsoleHelper` implements [`Sink`]`<`[`WSMessage`]`>` to
/// write data over the websocket connection. To send character inputs for
/// the console, use [`WSMessage::Binary`].
pub struct InstanceSerialConsoleHelper {
    /// Builder used for the initial connection and again to reconnect when a
    /// migration message is processed.
    stream_builder: Box<dyn SerialConsoleStreamBuilder>,
    /// The current websocket connection; replaced on migration.
    ws_stream: WebSocketStream<Box<dyn SerialConsoleStream>>,
    /// Optional logger, used to warn about unrecognized control messages.
    log: Option<Logger>,
}

impl InstanceSerialConsoleHelper {
    /// Connects to the Propolis server at `address` with a Propolis client
    /// and wraps the resulting websocket, starting at the given `offset`.
    ///
    /// Returns an error if the connection to `address` cannot be made.
    pub async fn new(
        address: SocketAddr,
        offset: WSClientOffset,
        log: Option<Logger>,
    ) -> Result<Self, WSError> {
        Self::new_with_builder(
            PropolisSerialBuilder::new(),
            address,
            offset,
            log,
        )
        .await
    }

    /// Creates a serial console helper backed by in-memory test streams.
    ///
    /// `connections` maps addresses to streams. The `SocketAddr` values are
    /// arbitrary and serve only as map keys.
    pub async fn new_test<St: SerialConsoleStream + 'static>(
        connections: impl IntoIterator<Item = (SocketAddr, St)>,
        address: SocketAddr,
        offset: WSClientOffset,
        log: Option<Logger>,
    ) -> Result<Self, WSError> {
        // Every connection is made available immediately (zero delay).
        let undelayed = connections
            .into_iter()
            .map(|(addr, stream)| (addr, Duration::ZERO, stream));
        let stream_builder = TestSerialBuilder::new(undelayed);
        Self::new_with_builder(stream_builder, address, offset, log).await
    }

    /// Creates a serial console helper for testing, with an artificial delay
    /// before each connection is handed out.
    ///
    /// Otherwise identical to [`Self::new_test`].
    ///
    /// Primarily intended for advanced testing scenarios.
    pub async fn new_test_with_delays<St: SerialConsoleStream + 'static>(
        connections: impl IntoIterator<Item = (SocketAddr, Duration, St)>,
        address: SocketAddr,
        offset: WSClientOffset,
        log: Option<Logger>,
    ) -> Result<Self, WSError> {
        Self::new_with_builder(
            TestSerialBuilder::new(connections),
            address,
            offset,
            log,
        )
        .await
    }

    // Currently used for testing, and not exposed to clients.
    //
    // Builds the initial stream with `stream_builder`, wraps it as a
    // client-role websocket, and keeps the builder for later reconnects.
    pub(crate) async fn new_with_builder(
        mut stream_builder: impl SerialConsoleStreamBuilder + 'static,
        address: SocketAddr,
        offset: WSClientOffset,
        log: Option<Logger>,
    ) -> Result<Self, WSError> {
        let raw_stream = stream_builder.build(address, offset).await?;
        let ws_stream =
            WebSocketStream::from_raw_socket(raw_stream, Role::Client, None)
                .await;
        let stream_builder = Box::new(stream_builder);
        Ok(Self { stream_builder, ws_stream, log })
    }

    /// Receives the next [WSMessage] from the server, holding it in
    /// abeyance until it is processed.
    ///
    /// Returns [Option::None] if the connection has been terminated.
    ///
    /// # Cancel safety
    ///
    /// This method is cancel-safe and can be used in a `select!` loop
    /// without causing any messages to be dropped. However,
    /// [InstanceSerialConsoleMessage::process] must be awaited to retrieve
    /// the inner [WSMessage], and that portion is not cancel-safe.
    pub async fn recv(
        &mut self,
    ) -> Option<Result<InstanceSerialConsoleMessage<'_>, WSError>> {
        // Note that ws_stream.next() eventually calls tungstenite's
        // read_message. From manual inspection, it looks like read_message
        // is written in a cancel-safe fashion so pending packets are
        // buffered before being written out.
        //
        // We currently assume and don't test that ws_stream.next() is
        // cancel-safe. That would be a good test to add in the future but
        // will require some testing infrastructure to insert delays in the
        // I/O stream manually.
        let next = self.ws_stream.next().await?;
        let message = match next {
            Ok(message) => message,
            Err(error) => return Some(Err(error)),
        };
        Some(Ok(InstanceSerialConsoleMessage { helper: self, message }))
    }
}

// Every `Sink` operation is forwarded unchanged to the current websocket
// stream.
impl Sink<WSMessage> for InstanceSerialConsoleHelper {
    type Error = WSError;

    fn poll_ready(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Result<(), Self::Error>> {
        Pin::new(&mut self.ws_stream).poll_ready(cx)
    }

    fn start_send(
        mut self: Pin<&mut Self>,
        item: WSMessage,
    ) -> Result<(), Self::Error> {
        Pin::new(&mut self.ws_stream).start_send(item)
    }

    fn poll_flush(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Result<(), Self::Error>> {
        Pin::new(&mut self.ws_stream).poll_flush(cx)
    }

    fn poll_close(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Result<(), Self::Error>> {
        Pin::new(&mut self.ws_stream).poll_close(cx)
    }
}

/// A [`WSMessage`] that has been received but not processed yet.
pub struct InstanceSerialConsoleMessage<'a> {
    /// The helper that received the message; its websocket stream is
    /// replaced if the message announces a migration.
    helper: &'a mut InstanceSerialConsoleHelper,
    /// The received message, returned to the caller by `process`.
    message: WSMessage,
}

impl InstanceSerialConsoleMessage<'_> {
    /// Processes this [WSMessage].
    ///
    /// - [WSMessage::Binary] are character output from the serial console.
    /// - [WSMessage::Close] is a close frame.
    /// - [WSMessage::Text] contain metadata, i.e. about a migration, which
    ///   this function still returns after connecting to the new server in
    ///   case the application needs to take further action (e.g. log an
    ///   event, or show a UI indicator that a migration has occurred).
    ///
    /// # Errors
    ///
    /// Returns an error if the message announces a migration and connecting
    /// to the destination server fails.
    ///
    /// # Cancel safety
    ///
    /// This method is *not* cancel-safe and should *not* be called directly
    /// in a `select!` loop. If this future is not awaited to completion,
    /// then not only will messages be dropped, but any pending migrations
    /// will also not complete.
    ///
    /// Like other non-cancel-safe futures, it is OK to create this future
    /// *once*, then call it in a `select!` loop by pinning it and selecting
    /// over a `&mut` reference to it. An example is shown in [Resuming an
    /// async
    /// operation](https://tokio.rs/tokio/tutorial/select#resuming-an-async-operation).
    ///
    /// # Why this approach?
    ///
    /// There are two general approaches we can take here to deal with
    /// cancel safety:
    ///
    /// 1. Break apart processing into cancel-safe
    ///    [`InstanceSerialConsoleHelper::recv`] and non-cancel-safe (this
    ///    method) sections. This is the approach chosen here.
    /// 2. Make all of [`InstanceSerialConsoleHelper::recv`] cancel-safe.
    ///    This approach was prototyped in [this propolis
    ///    PR](https://github.com/oxidecomputer/propolis/pull/438), but was
    ///    not chosen.
    ///
    /// Why was approach 1 chosen over 2? It comes down to three reasons:
    ///
    /// 1. This approach is significantly simpler to understand and involves
    ///    less state fiddling.
    /// 2. Once we've received a `Migrating` message, the migration is
    ///    actually *done*. From there onwards, connecting to the new server
    ///    should be very quick and it's OK to block on that.
    /// 3. Once we've received a `Migrating` message, we shouldn't be
    ///    sending further messages to the old websocket stream. With
    ///    approach 2, we'd have to do extra work to buffer up those old
    ///    messages, then send them after migration is complete. That isn't
    ///    an issue with approach 1.
    ///
    /// The current implementation does have an issue where if a migration
    /// is happening and we haven't received the `Migrating` message yet,
    /// we'll send messages over the old websocket stream. This can be
    /// addressed in several ways:
    ///
    /// - Maintain a sequence number and a local bounded buffer for
    ///   messages, and include the sequence number in the `Migrating`
    ///   message. Replay messages starting from the sequence number
    ///   afterwards.
    /// - Buffer messages received during migration on the server rather
    ///   than the client.
    pub async fn process(self) -> Result<WSMessage, WSError> {
        // Only text frames can carry control messages; everything else is
        // returned to the caller untouched.
        if let WSMessage::Text(json) = &self.message {
            match serde_json::from_str(json) {
                Ok(InstanceSerialConsoleControlMessage::Migrating {
                    destination,
                    from_start,
                }) => {
                    // Connect to the migration destination, resuming at the
                    // offset the old server told us about.
                    let stream = self
                        .helper
                        .stream_builder
                        .build(
                            destination,
                            WSClientOffset::FromStart(from_start),
                        )
                        .await?;
                    // Swap the helper over to the new connection.
                    self.helper.ws_stream = WebSocketStream::from_raw_socket(
                        stream,
                        Role::Client,
                        None,
                    )
                    .await;
                }
                Err(e) => {
                    if let Some(log) = &self.helper.log {
                        slog::warn!(
                            log,
                            "Unsupported control message {:?}: {:?}",
                            json,
                            e
                        );
                    }
                    // Don't return an error: this might be a future addition
                    // that the consumer understands.
                }
            }
        }

        Ok(self.message)
    }
}

#[allow(dead_code)]
fn assert_send<T: Send>() {}

/// Fails to compile if either serial console type stops being `Send`.
///
/// The leading underscore marks this as intentionally unused; its value is
/// in being type-checked, not in being called.
fn _assert_impls() {
    assert_send::<InstanceSerialConsoleMessage>();
    assert_send::<InstanceSerialConsoleHelper>();
}

#[cfg(test)]
mod test {
    use super::InstanceSerialConsoleControlMessage;
    use super::InstanceSerialConsoleHelper;
    use super::Role;
    use super::WSClientOffset;
    use super::WSError;
    use super::WSMessage;
    use super::WebSocketStream;
    use futures::{SinkExt, StreamExt};
    use std::net::IpAddr;
    use std::net::Ipv6Addr;
    use std::net::SocketAddr;
    use std::time::Duration;
    use tokio::io::AsyncRead;
    use tokio::io::AsyncWrite;
    use tokio::time::Instant;

    /// Round-trips binary messages in both directions through a helper
    /// backed by an in-memory duplex pipe, then checks that a `Migrating`
    /// control message makes the helper attempt a new connection.
    #[tokio::test]
    async fn test_connection_helper() {
        let server_addr =
            SocketAddr::new(IpAddr::V6(Ipv6Addr::LOCALHOST), 12000);
        let (client_io, server_io) = tokio::io::duplex(1024);

        let mut client = InstanceSerialConsoleHelper::new_test(
            [(server_addr, client_io)],
            server_addr,
            WSClientOffset::FromStart(0),
            None,
        )
        .await
        .unwrap();
        let mut server = make_ws_server(server_io).await;

        // Client -> server.
        let outbound = WSMessage::Binary(vec![1, 3, 3, 7]);
        client.send(outbound.clone()).await.unwrap();
        let seen_by_server = server.next().await.unwrap().unwrap();
        assert_eq!(outbound, seen_by_server);

        // Server -> client, passing through `process()`.
        let inbound = WSMessage::Binary(vec![2, 4, 6, 8]);
        server.send(inbound.clone()).await.unwrap();
        let pending = client.recv().await.unwrap().unwrap();
        let seen_by_client = pending.process().await.unwrap();
        assert_eq!(inbound, seen_by_client);

        // The destination below was never registered with the test helper,
        // so this only verifies that a connection is *attempted*: the
        // attempt is expected to surface as an HTTP error.
        let migrating = InstanceSerialConsoleControlMessage::Migrating {
            destination: SocketAddr::V4("0.0.0.0:0".parse().unwrap()),
            from_start: 0,
        };
        let payload = serde_json::to_string(&migrating).unwrap();
        server.send(WSMessage::Text(payload)).await.unwrap();
        let pending = client.recv().await.unwrap().unwrap();
        let failure = pending.process().await.unwrap_err();
        assert!(matches!(failure, WSError::Http(_)));
    }

    // start_paused = true means that the durations passed in are used to
    // just provide a total ordering for awaits -- we don't actually wait
    // that long.
    #[tokio::test(start_paused = true)]
    async fn test_recv_cancel_safety() {
        let address_1 = SocketAddr::new(IpAddr::V6(Ipv6Addr::LOCALHOST), 12000);
        let address_2 = SocketAddr::new(IpAddr::V6(Ipv6Addr::LOCALHOST), 14000);

        let (client_conn_1, server_conn_1) = tokio::io::duplex(1024);
        let (client_conn_2, server_conn_2) = tokio::io::duplex(1024);

        let mut client = InstanceSerialConsoleHelper::new_test_with_delays(
            [
                (address_1, Duration::ZERO, client_conn_1),
                // Add a delay before connecting to client 2 to test cancel safety.
                (address_2, Duration::from_secs(1), client_conn_2),
            ],
            address_1,
            WSClientOffset::FromStart(0),
            None,
        )
        .await
        .unwrap();

        let mut server_1 = make_ws_server(server_conn_1).await;
        let mut server_2 = make_ws_server(server_conn_2).await;

        let payload = serde_json::to_string(
            &InstanceSerialConsoleControlMessage::Migrating {
                destination: address_2,
                from_start: 0,
            },
        )
        .unwrap();
        let migration_message = WSMessage::Text(payload);

        let expected = vec![
            migration_message.clone(),
            WSMessage::Binary([5, 6, 7, 8].into()),
            WSMessage::Close(None),
        ];

        // Spawn a separate task that feeds values into all the servers with
        // a delay. This means that the recv() future is sometimes cancelled
        // in the select! loop below, so we can test cancel safety.
        tokio::spawn(async move {
            tokio::time::sleep(Duration::from_secs(1)).await;
            server_1.send(migration_message).await.unwrap();

            // This message sent on server 1 is *ignored* because it is sent
            // after the "migrating" message.
            let sent = WSMessage::Binary([1, 2, 3, 4].into());
            server_1.send(sent).await.unwrap();

            tokio::time::sleep(Duration::from_secs(1)).await;
            let sent = WSMessage::Binary([5, 6, 7, 8].into());
            server_2.send(sent).await.unwrap();

            server_2.close(None).await.unwrap();
        });

        let mut received = Vec::new();

        // This sends periodic messages which causes client.recv() to be
        // canceled sometimes.
        let start = Instant::now();
        let mut interval = tokio::time::interval(Duration::from_millis(250));
        loop {
            tokio::select! {
                message = client.recv() => {
                    // XXX At the end of client.recv() we should receive
                    // None, but in reality we receive a BrokenPipe message,
                    // why?
                    let message = message.expect("we terminate this loop before receiving None");
                    let message = message
                        .expect("received a message")
                        .process()
                        .await
                        .expect("no migration error occurred");

                    println!("received message: {message:?}");
                    received.push(message.clone());

                    if let WSMessage::Close(_) = message {
                        break;
                    }
                }
                _ = interval.tick() => {
                    println!("interval tick, {:?} elapsed", start.elapsed());
                }
            }
        }

        assert_eq!(received, expected);
    }

    /// Test helper: wraps `conn` in a `WebSocketStream` with
    /// `Role::Server`, passing `None` for the optional third argument of
    /// `from_raw_socket`.
    async fn make_ws_server<S: AsyncRead + AsyncWrite + Unpin>(
        conn: S,
    ) -> WebSocketStream<S> {
        let server_end =
            WebSocketStream::from_raw_socket(conn, Role::Server, None);
        server_end.await
    }

    /// Table-driven check of `nvme_serial_from_str`: inputs shorter than 20
    /// bytes are expected to be extended with the pad byte, and longer
    /// inputs are expected to be cut off at 20 bytes.
    #[test]
    fn test_nvme_serial_from_str() {
        use super::nvme_serial_from_str;

        // (input, pad byte, expected serial)
        let cases: [(&str, u8, &[u8; 20]); 4] = [
            ("hello world", b' ', b"hello world         "),
            ("enthusiasm", b'!', b"enthusiasm!!!!!!!!!!"),
            ("very_long_disk_name_goes_here", b'?', b"very_long_disk_name_"),
            ("nonvolatile EFI", 0, b"nonvolatile EFI\0\0\0\0\0"),
        ];

        for &(input, pad, expected) in &cases {
            assert_eq!(nvme_serial_from_str(input, pad), *expected);
        }
    }
}


================================================
FILE: openapi/propolis-server/propolis-server-1.0.0-833484.json.gitstub
================================================
8e9252917993e36d43dce96b4409ef151b7d4442:openapi/propolis-server/propolis-server-1.0.0-833484.json


================================================
FILE: openapi/propolis-server/propolis-server-2.0.0-d68a9f.json.gitstub
================================================
fd3636877061da7e951cb1fbce365f7cbf40933c:openapi/propolis-server/propolis-server-2.0.0-d68a9f.json


================================================
FILE: openapi/propolis-server/propolis-server-3.0.0-10da2b.json.gitstub
================================================
368a2225b79328514ce0ea9181d8f874019edaa2:openapi/propolis-server/propolis-server-3.0.0-10da2b.json


================================================
FILE: openapi/propolis-server/propolis-server-4.0.0-5ce09a.json.gitstub
================================================
d9e5dc0d2bb8a185f008a43be9d8131fb9ca3a06:openapi/propolis-server/propolis-server-4.0.0-5ce09a.json


================================================
FILE: openapi/propolis-server/propolis-server-5.0.0-0c6dd9.json
================================================
{
  "openapi": "3.0.3",
  "info": {
    "title": "Oxide Propolis Server API",
    "description": "API for interacting with the Propolis hypervisor frontend.",
    "contact": {
      "url": "https://oxide.computer",
      "email": "api@oxide.computer"
    },
    "version": "5.0.0"
  },
  "paths": {
    "/instance": {
      "get": {
        "operationId": "instance_get",
        "responses": {
          "200": {
            "description": "successful operation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/InstanceGetResponse"
                }
              }
            }
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        }
      },
      "put": {
        "operationId": "instance_ensure",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/InstanceEnsureRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "201": {
            "description": "successful creation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/InstanceEnsureResponse"
                }
              }
            }
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        }
      }
    },
    "/instance/disk/{id}/snapshot/{snapshot_id}": {
      "post": {
        "summary": "Issues a snapshot request to a crucible backend.",
        "operationId": "instance_issue_crucible_snapshot_request",
        "parameters": [
          {
            "in": "path",
            "name": "id",
            "required": true,
            "schema": {
              "type": "string"
            }
          },
          {
            "in": "path",
            "name": "snapshot_id",
            "required": true,
            "schema": {
              "type": "string",
              "format": "uuid"
            }
          }
        ],
        "responses": {
          "200": {
            "description": "successful operation",
            "content": {
              "application/json": {
                "schema": {
                  "title": "Null",
                  "type": "string",
                  "enum": [
                    null
                  ]
                }
              }
            }
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        }
      }
    },
    "/instance/disk/{id}/status": {
      "get": {
        "summary": "Gets the status of a Crucible volume backing a disk",
        "operationId": "disk_volume_status",
        "parameters": [
          {
            "in": "path",
            "name": "id",
            "required": true,
            "schema": {
              "type": "string"
            }
          }
        ],
        "responses": {
          "200": {
            "description": "successful operation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/VolumeStatus"
                }
              }
            }
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        }
      }
    },
    "/instance/disk/{id}/vcr": {
      "put": {
        "summary": "Issues a volume_construction_request replace to a crucible backend.",
        "operationId": "instance_issue_crucible_vcr_request",
        "parameters": [
          {
            "in": "path",
            "name": "id",
            "required": true,
            "schema": {
              "type": "string"
            }
          }
        ],
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/InstanceVCRReplace"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "successful operation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ReplaceResult"
                }
              }
            }
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        }
      }
    },
    "/instance/migrate/{migration_id}/start": {
      "get": {
        "summary": "DO NOT USE THIS IF YOU'RE NOT PROPOLIS-SERVER.",
        "description": "Internal API called during a migration from a destination instance to the source instance as part of the HTTP connection upgrade used to establish the migration link. This API is exported via OpenAPI purely to verify that its shape hasn't changed.",
        "operationId": "instance_migrate_start",
        "parameters": [
          {
            "in": "path",
            "name": "migration_id",
            "required": true,
            "schema": {
              "type": "string",
              "format": "uuid"
            }
          }
        ],
        "responses": {
          "101": {
            "description": "Negotiating protocol upgrade from HTTP/1.1 to WebSocket"
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        },
        "x-dropshot-websocket": {}
      }
    },
    "/instance/migration-status": {
      "get": {
        "operationId": "instance_migrate_status",
        "responses": {
          "200": {
            "description": "successful operation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/InstanceMigrateStatusResponse"
                }
              }
            }
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        }
      }
    },
    "/instance/nmi": {
      "post": {
        "summary": "Issues an NMI to the instance.",
        "operationId": "instance_issue_nmi",
        "responses": {
          "200": {
            "description": "successful operation",
            "content": {
              "application/json": {
                "schema": {
                  "title": "Null",
                  "type": "string",
                  "enum": [
                    null
                  ]
                }
              }
            }
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        }
      }
    },
    "/instance/serial": {
      "get": {
        "operationId": "instance_serial",
        "parameters": [
          {
            "in": "query",
            "name": "from_start",
            "description": "Character index in the serial buffer from which to read, counting the bytes output since instance start. If this is provided, `most_recent` must *not* be provided.",
            "schema": {
              "nullable": true,
              "type": "integer",
              "format": "uint64",
              "minimum": 0
            }
          },
          {
            "in": "query",
            "name": "most_recent",
            "description": "Character index in the serial buffer from which to read, counting *backward* from the most recently buffered data retrieved from the instance. (See note on `from_start` about mutual exclusivity)",
            "schema": {
              "nullable": true,
              "type": "integer",
              "format": "uint64",
              "minimum": 0
            }
          }
        ],
        "responses": {
          "101": {
            "description": "Negotiating protocol upgrade from HTTP/1.1 to WebSocket"
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        },
        "x-dropshot-websocket": {}
      }
    },
    "/instance/serial/history": {
      "get": {
        "operationId": "instance_serial_history_get",
        "parameters": [
          {
            "in": "query",
            "name": "from_start",
            "description": "Character index in the serial buffer from which to read, counting the bytes output since instance start. If this is not provided, `most_recent` must be provided, and if this *is* provided, `most_recent` must *not* be provided.",
            "schema": {
              "nullable": true,
              "type": "integer",
              "format": "uint64",
              "minimum": 0
            }
          },
          {
            "in": "query",
            "name": "max_bytes",
            "description": "Maximum number of bytes of buffered serial console contents to return. If the requested range runs to the end of the available buffer, the data returned will be shorter than `max_bytes`.",
            "schema": {
              "nullable": true,
              "type": "integer",
              "format": "uint64",
              "minimum": 0
            }
          },
          {
            "in": "query",
            "name": "most_recent",
            "description": "Character index in the serial buffer from which to read, counting *backward* from the most recently buffered data retrieved from the instance. (See note on `from_start` about mutual exclusivity)",
            "schema": {
              "nullable": true,
              "type": "integer",
              "format": "uint64",
              "minimum": 0
            }
          }
        ],
        "responses": {
          "200": {
            "description": "successful operation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/InstanceSerialConsoleHistoryResponse"
                }
              }
            }
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        }
      }
    },
    "/instance/spec": {
      "get": {
        "operationId": "instance_spec_get",
        "responses": {
          "200": {
            "description": "successful operation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/InstanceSpecGetResponse"
                }
              }
            }
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        }
      }
    },
    "/instance/state": {
      "put": {
        "operationId": "instance_state_put",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/InstanceStateRequested"
              }
            }
          },
          "required": true
        },
        "responses": {
          "204": {
            "description": "resource updated"
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        }
      }
    },
    "/instance/state-monitor": {
      "get": {
        "operationId": "instance_state_monitor",
        "requestBody": {
          "content": {
            "application/json": {
              "schema": {
                "$ref": "#/components/schemas/InstanceStateMonitorRequest"
              }
            }
          },
          "required": true
        },
        "responses": {
          "200": {
            "description": "successful operation",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/InstanceStateMonitorResponse"
                }
              }
            }
          },
          "4XX": {
            "$ref": "#/components/responses/Error"
          },
          "5XX": {
            "$ref": "#/components/responses/Error"
          }
        }
      }
    }
  },
  "components": {
    "schemas": {
      "BlobStorageBackend": {
        "description": "A storage backend for a disk whose initial contents are given explicitly by the specification.",
        "type": "object",
        "properties": {
          "base64": {
            "description": "The disk's initial contents, encoded as a base64 string.",
            "type": "string"
          },
          "readonly": {
            "description": "Indicates whether the storage is read-only.",
            "type": "boolean"
          }
        },
        "required": [
          "base64",
          "readonly"
        ],
        "additionalProperties": false
      },
      "Board": {
        "description": "A VM's mainboard.",
        "type": "object",
        "properties": {
          "chipset": {
            "description": "The chipset to expose to guest software.",
            "allOf": [
              {
                "$ref": "#/components/schemas/Chipset"
              }
            ]
          },
          "cpuid": {
            "nullable": true,
            "description": "The CPUID values to expose to the guest. If `None`, bhyve will derive default values from the host's CPUID values.",
            "allOf": [
              {
                "$ref": "#/components/schemas/Cpuid"
              }
            ]
          },
          "cpus": {
            "description": "The number of virtual logical processors attached to this VM.",
            "type": "integer",
            "format": "uint8",
            "minimum": 0
          },
          "guest_hv_interface": {
            "description": "The hypervisor platform to expose to the guest. The default is a bhyve-compatible interface with no additional features.\n\nFor compatibility with older versions of Propolis, this field is only serialized if it specifies a non-default interface.",
            "allOf": [
              {
                "$ref": "#/components/schemas/GuestHypervisorInterface"
              }
            ]
          },
          "memory_mb": {
            "description": "The amount of guest RAM attached to this VM.",
            "type": "integer",
            "format": "uint64",
            "minimum": 0
          }
        },
        "required": [
          "chipset",
          "cpus",
          "memory_mb"
        ],
        "additionalProperties": false
      },
      "BootOrderEntry": {
        "description": "An entry in the boot order stored in a [`BootSettings`] component.",
        "type": "object",
        "properties": {
          "id": {
            "description": "The ID of another component in the spec that Propolis should try to boot from.\n\nCurrently, only disk device components are supported.",
            "allOf": [
              {
                "$ref": "#/components/schemas/SpecKey"
              }
            ]
          }
        },
        "required": [
          "id"
        ]
      },
      "BootSettings": {
        "description": "Settings supplied to the guest's firmware image that specify the order in which it should consider its options when selecting a device to try to boot from.",
        "type": "object",
        "properties": {
          "order": {
            "description": "An ordered list of components to attempt to boot from.",
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/BootOrderEntry"
            }
          }
        },
        "required": [
          "order"
        ],
        "additionalProperties": false
      },
      "Chipset": {
        "description": "A kind of virtual chipset.",
        "oneOf": [
          {
            "description": "An Intel 440FX-compatible chipset.",
            "type": "object",
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "i440_fx"
                ]
              },
              "value": {
                "$ref": "#/components/schemas/I440Fx"
              }
            },
            "required": [
              "type",
              "value"
            ],
            "additionalProperties": false
          }
        ]
      },
      "Component": {
        "oneOf": [
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/VirtioDisk"
              },
              "type": {
                "type": "string",
                "enum": [
                  "virtio_disk"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/NvmeDisk"
              },
              "type": {
                "type": "string",
                "enum": [
                  "nvme_disk"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/VirtioNic"
              },
              "type": {
                "type": "string",
                "enum": [
                  "virtio_nic"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/SerialPort"
              },
              "type": {
                "type": "string",
                "enum": [
                  "serial_port"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/PciPciBridge"
              },
              "type": {
                "type": "string",
                "enum": [
                  "pci_pci_bridge"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/QemuPvpanic"
              },
              "type": {
                "type": "string",
                "enum": [
                  "qemu_pvpanic"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/BootSettings"
              },
              "type": {
                "type": "string",
                "enum": [
                  "boot_settings"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/VirtioSocket"
              },
              "type": {
                "type": "string",
                "enum": [
                  "virtio_socket"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/SoftNpuPciPort"
              },
              "type": {
                "type": "string",
                "enum": [
                  "soft_npu_pci_port"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/SoftNpuPort"
              },
              "type": {
                "type": "string",
                "enum": [
                  "soft_npu_port"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/SoftNpuP9"
              },
              "type": {
                "type": "string",
                "enum": [
                  "soft_npu_p9"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/P9fs"
              },
              "type": {
                "type": "string",
                "enum": [
                  "p9fs"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/MigrationFailureInjector"
              },
              "type": {
                "type": "string",
                "enum": [
                  "migration_failure_injector"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/CrucibleStorageBackend"
              },
              "type": {
                "type": "string",
                "enum": [
                  "crucible_storage_backend"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/FileStorageBackend"
              },
              "type": {
                "type": "string",
                "enum": [
                  "file_storage_backend"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/BlobStorageBackend"
              },
              "type": {
                "type": "string",
                "enum": [
                  "blob_storage_backend"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/VirtioNetworkBackend"
              },
              "type": {
                "type": "string",
                "enum": [
                  "virtio_network_backend"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "$ref": "#/components/schemas/DlpiNetworkBackend"
              },
              "type": {
                "type": "string",
                "enum": [
                  "dlpi_network_backend"
                ]
              }
            },
            "required": [
              "component",
              "type"
            ],
            "additionalProperties": false
          }
        ]
      },
      "Cpuid": {
        "description": "A set of CPUID values to expose to a guest.",
        "type": "object",
        "properties": {
          "entries": {
            "description": "A list of CPUID leaves/subleaves and their associated values.\n\nPropolis servers require that each entry's `leaf` be unique and that it falls in either the \"standard\" (0 to 0xFFFF) or \"extended\" (0x8000_0000 to 0x8000_FFFF) function ranges, since these are the only valid input ranges currently defined by Intel and AMD. See the Intel 64 and IA-32 Architectures Software Developer's Manual (June 2024) Table 3-17 and the AMD64 Architecture Programmer's Manual (March 2024) Volume 3's documentation of the CPUID instruction.",
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/CpuidEntry"
            }
          },
          "vendor": {
            "description": "The CPU vendor to emulate.\n\nCPUID leaves in the extended range (0x8000_0000 to 0x8000_FFFF) have vendor-defined semantics. Propolis uses this value to determine these semantics when deciding whether it needs to specialize the supplied template values for these leaves.",
            "allOf": [
              {
                "$ref": "#/components/schemas/CpuidVendor"
              }
            ]
          }
        },
        "required": [
          "entries",
          "vendor"
        ],
        "additionalProperties": false
      },
      "CpuidEntry": {
        "description": "A full description of a CPUID leaf/subleaf and the values it produces.",
        "type": "object",
        "properties": {
          "eax": {
            "description": "The value to return in eax.",
            "type": "integer",
            "format": "uint32",
            "minimum": 0
          },
          "ebx": {
            "description": "The value to return in ebx.",
            "type": "integer",
            "format": "uint32",
            "minimum": 0
          },
          "ecx": {
            "description": "The value to return in ecx.",
            "type": "integer",
            "format": "uint32",
            "minimum": 0
          },
          "edx": {
            "description": "The value to return in edx.",
            "type": "integer",
            "format": "uint32",
            "minimum": 0
          },
          "leaf": {
            "description": "The leaf (function) number for this entry.",
            "type": "integer",
            "format": "uint32",
            "minimum": 0
          },
          "subleaf": {
            "nullable": true,
            "description": "The subleaf (index) number for this entry, if it uses subleaves.",
            "type": "integer",
            "format": "uint32",
            "minimum": 0
          }
        },
        "required": [
          "eax",
          "ebx",
          "ecx",
          "edx",
          "leaf"
        ],
        "additionalProperties": false
      },
      "CpuidVendor": {
        "description": "A CPU vendor to use when interpreting the meanings of CPUID leaves in the extended ID range (0x8000_0000 to 0x8000_FFFF).",
        "type": "string",
        "enum": [
          "amd",
          "intel"
        ]
      },
      "CrucibleStorageBackend": {
        "description": "A Crucible storage backend.",
        "type": "object",
        "properties": {
          "readonly": {
            "description": "Indicates whether the storage is read-only.",
            "type": "boolean"
          },
          "request_json": {
            "description": "A serialized `crucible_client_types::VolumeConstructionRequest`. This is stored in serialized form so that breaking changes to the definition of a `VolumeConstructionRequest` do not inadvertently break instance spec deserialization.\n\nWhen using a spec to initialize a new instance, the spec author must ensure this request is well-formed and can be deserialized by the version of `crucible_client_types` used by the target Propolis.",
            "type": "string"
          }
        },
        "required": [
          "readonly",
          "request_json"
        ],
        "additionalProperties": false
      },
      "DlpiNetworkBackend": {
        "description": "A network backend associated with a DLPI VNIC on the host.",
        "type": "object",
        "properties": {
          "vnic_name": {
            "description": "The name of the VNIC to use as a backend.",
            "type": "string"
          }
        },
        "required": [
          "vnic_name"
        ],
        "additionalProperties": false
      },
      "DownstairsInfo": {
        "type": "object",
        "properties": {
          "region_id": {
            "nullable": true,
            "type": "string",
            "format": "uuid"
          },
          "repair_addr": {
            "nullable": true,
            "type": "string"
          },
          "state": {
            "$ref": "#/components/schemas/DownstairsInfoStatus"
          },
          "target_addr": {
            "nullable": true,
            "type": "string"
          }
        },
        "required": [
          "state"
        ]
      },
      "DownstairsInfoConnectionMode": {
        "type": "string",
        "enum": [
          "new",
          "offline",
          "faulted",
          "replaced"
        ]
      },
      "DownstairsInfoNegotiationStatus": {
        "type": "string",
        "enum": [
          "wait_connect",
          "negotiating",
          "wait_quorum",
          "reconcile",
          "live_repair_ready"
        ]
      },
      "DownstairsInfoStatus": {
        "oneOf": [
          {
            "type": "object",
            "properties": {
              "mode": {
                "$ref": "#/components/schemas/DownstairsInfoConnectionMode"
              },
              "state": {
                "$ref": "#/components/schemas/DownstairsInfoNegotiationStatus"
              },
              "type": {
                "type": "string",
                "enum": [
                  "connecting"
                ]
              }
            },
            "required": [
              "mode",
              "state",
              "type"
            ]
          },
          {
            "type": "object",
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "active"
                ]
              }
            },
            "required": [
              "type"
            ]
          },
          {
            "type": "object",
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "live_repair"
                ]
              }
            },
            "required": [
              "type"
            ]
          },
          {
            "type": "object",
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "stopping"
                ]
              }
            },
            "required": [
              "type"
            ]
          }
        ]
      },
      "Error": {
        "description": "Error information from a response.",
        "type": "object",
        "properties": {
          "error_code": {
            "type": "string"
          },
          "message": {
            "type": "string"
          },
          "request_id": {
            "type": "string"
          }
        },
        "required": [
          "message",
          "request_id"
        ]
      },
      "FileStorageBackend": {
        "description": "A storage backend backed by a file in the host system's file system.",
        "type": "object",
        "properties": {
          "block_size": {
            "description": "Block size of the backend.",
            "type": "integer",
            "format": "uint32",
            "minimum": 0
          },
          "path": {
            "description": "A path to a file that backs a disk.",
            "type": "string"
          },
          "readonly": {
            "description": "Indicates whether the storage is read-only.",
            "type": "boolean"
          },
          "workers": {
            "nullable": true,
            "description": "Optional worker threads for the file backend, exposed for testing only.",
            "type": "integer",
            "format": "uint",
            "minimum": 1
          }
        },
        "required": [
          "block_size",
          "path",
          "readonly"
        ],
        "additionalProperties": false
      },
      "GuestHypervisorInterface": {
        "description": "A hypervisor interface to expose to the guest.",
        "oneOf": [
          {
            "description": "Expose a bhyve-like interface (\"bhyve bhyve \" as the hypervisor ID in leaf 0x4000_0000 and no additional leaves or features).",
            "type": "object",
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "bhyve"
                ]
              }
            },
            "required": [
              "type"
            ],
            "additionalProperties": false
          },
          {
            "description": "Expose a Hyper-V-compatible hypervisor interface with the supplied features enabled.",
            "type": "object",
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "hyper_v"
                ]
              },
              "value": {
                "type": "object",
                "properties": {
                  "features": {
                    "type": "array",
                    "items": {
                      "$ref": "#/components/schemas/HyperVFeatureFlag"
                    },
                    "uniqueItems": true
                  }
                },
                "required": [
                  "features"
                ],
                "additionalProperties": false
              }
            },
            "required": [
              "type",
              "value"
            ],
            "additionalProperties": false
          }
        ]
      },
      "HyperVFeatureFlag": {
        "description": "Flags that enable \"simple\" Hyper-V enlightenments that require no feature-specific configuration.",
        "type": "string",
        "enum": [
          "reference_tsc"
        ]
      },
      "I440Fx": {
        "description": "An Intel 440FX-compatible chipset.",
        "type": "object",
        "properties": {
          "enable_pcie": {
            "description": "Specifies whether the chipset should allow PCI configuration space to be accessed through the PCIe extended configuration mechanism.",
            "type": "boolean"
          }
        },
        "required": [
          "enable_pcie"
        ],
        "additionalProperties": false
      },
      "Instance": {
        "type": "object",
        "properties": {
          "properties": {
            "$ref": "#/components/schemas/InstanceProperties"
          },
          "state": {
            "$ref": "#/components/schemas/InstanceState"
          }
        },
        "required": [
          "properties",
          "state"
        ]
      },
      "InstanceEnsureRequest": {
        "type": "object",
        "properties": {
          "init": {
            "$ref": "#/components/schemas/InstanceInitializationMethod"
          },
          "properties": {
            "$ref": "#/components/schemas/InstanceProperties"
          }
        },
        "required": [
          "init",
          "properties"
        ]
      },
      "InstanceEnsureResponse": {
        "type": "object",
        "properties": {
          "migrate": {
            "nullable": true,
            "allOf": [
              {
                "$ref": "#/components/schemas/InstanceMigrateInitiateResponse"
              }
            ]
          }
        }
      },
      "InstanceGetResponse": {
        "type": "object",
        "properties": {
          "instance": {
            "$ref": "#/components/schemas/Instance"
          }
        },
        "required": [
          "instance"
        ]
      },
      "InstanceInitializationMethod": {
        "oneOf": [
          {
            "type": "object",
            "properties": {
              "method": {
                "type": "string",
                "enum": [
                  "Spec"
                ]
              },
              "value": {
                "type": "object",
                "properties": {
                  "spec": {
                    "$ref": "#/components/schemas/InstanceSpec"
                  }
                },
                "required": [
                  "spec"
                ]
              }
            },
            "required": [
              "method",
              "value"
            ]
          },
          {
            "type": "object",
            "properties": {
              "method": {
                "type": "string",
                "enum": [
                  "MigrationTarget"
                ]
              },
              "value": {
                "type": "object",
                "properties": {
                  "migration_id": {
                    "type": "string",
                    "format": "uuid"
                  },
                  "replace_components": {
                    "type": "object",
                    "additionalProperties": {
                      "$ref": "#/components/schemas/ReplacementComponent"
                    }
                  },
                  "src_addr": {
                    "type": "string"
                  }
                },
                "required": [
                  "migration_id",
                  "replace_components",
                  "src_addr"
                ]
              }
            },
            "required": [
              "method",
              "value"
            ]
          }
        ]
      },
      "InstanceMetadata": {
        "type": "object",
        "properties": {
          "project_id": {
            "type": "string",
            "format": "uuid"
          },
          "silo_id": {
            "type": "string",
            "format": "uuid"
          },
          "sled_id": {
            "type": "string",
            "format": "uuid"
          },
          "sled_model": {
            "type": "string"
          },
          "sled_revision": {
            "type": "integer",
            "format": "uint32",
            "minimum": 0
          },
          "sled_serial": {
            "type": "string"
          }
        },
        "required": [
          "project_id",
          "silo_id",
          "sled_id",
          "sled_model",
          "sled_revision",
          "sled_serial"
        ]
      },
      "InstanceMigrateInitiateResponse": {
        "type": "object",
        "properties": {
          "migration_id": {
            "type": "string",
            "format": "uuid"
          }
        },
        "required": [
          "migration_id"
        ]
      },
      "InstanceMigrateStatusResponse": {
        "description": "The statuses of the most recent attempts to live migrate into and out of this Propolis.\n\nIf a VM is initialized by migration in and then begins to migrate out, this structure will contain statuses for both migrations. This ensures that clients can always obtain the status of a successful migration in even after a migration out begins.\n\nThis structure only reports the status of the most recent migration in a single direction. That is, if a migration in or out fails, and a new migration attempt begins, the new migration's status replaces the old's.",
        "type": "object",
        "properties": {
          "migration_in": {
            "nullable": true,
            "description": "The status of the most recent attempt to initialize the current instance via migration in, or `None` if the instance has never been a migration target.",
            "allOf": [
              {
                "$ref": "#/components/schemas/InstanceMigrationStatus"
              }
            ]
          },
          "migration_out": {
            "nullable": true,
            "description": "The status of the most recent attempt to migrate out of the current instance, or `None` if the instance has never been a migration source.",
            "allOf": [
              {
                "$ref": "#/components/schemas/InstanceMigrationStatus"
              }
            ]
          }
        }
      },
      "InstanceMigrationStatus": {
        "description": "The status of an individual live migration.",
        "type": "object",
        "properties": {
          "id": {
            "description": "The ID of this migration, supplied either by the external migration requester (for targets) or the other side of the migration (for sources).",
            "type": "string",
            "format": "uuid"
          },
          "state": {
            "description": "The current phase the migration is in.",
            "allOf": [
              {
                "$ref": "#/components/schemas/MigrationState"
              }
            ]
          }
        },
        "required": [
          "id",
          "state"
        ]
      },
      "InstanceProperties": {
        "type": "object",
        "properties": {
          "description": {
            "description": "Free-form text description of an Instance.",
            "type": "string"
          },
          "id": {
            "description": "Unique identifier for this Instance.",
            "type": "string",
            "format": "uuid"
          },
          "metadata": {
            "description": "Metadata used to track statistics for this Instance.",
            "allOf": [
              {
                "$ref": "#/components/schemas/InstanceMetadata"
              }
            ]
          },
          "name": {
            "description": "Human-readable name of the Instance.",
            "type": "string"
          }
        },
        "required": [
          "description",
          "id",
          "metadata",
          "name"
        ]
      },
      "InstanceSerialConsoleHistoryResponse": {
        "description": "Contents of an Instance's serial console buffer.",
        "type": "object",
        "properties": {
          "data": {
            "description": "The bytes starting from the requested offset up to either the end of the buffer or the request's `max_bytes`. Provided as a u8 array rather than a string, as it may not be UTF-8.",
            "type": "array",
            "items": {
              "type": "integer",
              "format": "uint8",
              "minimum": 0
            }
          },
          "last_byte_offset": {
            "description": "The absolute offset since boot (suitable for use as `byte_offset` in a subsequent request) of the last byte returned in `data`.",
            "type": "integer",
            "format": "uint64",
            "minimum": 0
          }
        },
        "required": [
          "data",
          "last_byte_offset"
        ]
      },
      "InstanceSpec": {
        "type": "object",
        "properties": {
          "board": {
            "$ref": "#/components/schemas/Board"
          },
          "components": {
            "type": "object",
            "additionalProperties": {
              "$ref": "#/components/schemas/Component"
            }
          },
          "smbios": {
            "nullable": true,
            "allOf": [
              {
                "$ref": "#/components/schemas/SmbiosType1Input"
              }
            ]
          }
        },
        "required": [
          "board",
          "components"
        ]
      },
      "InstanceSpecGetResponse": {
        "type": "object",
        "properties": {
          "properties": {
            "$ref": "#/components/schemas/InstanceProperties"
          },
          "spec": {
            "$ref": "#/components/schemas/InstanceSpecStatus"
          },
          "state": {
            "$ref": "#/components/schemas/InstanceState"
          }
        },
        "required": [
          "properties",
          "spec",
          "state"
        ]
      },
      "InstanceSpecStatus": {
        "oneOf": [
          {
            "type": "object",
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "WaitingForMigrationSource"
                ]
              }
            },
            "required": [
              "type"
            ]
          },
          {
            "type": "object",
            "properties": {
              "type": {
                "type": "string",
                "enum": [
                  "Present"
                ]
              },
              "value": {
                "$ref": "#/components/schemas/InstanceSpec"
              }
            },
            "required": [
              "type",
              "value"
            ]
          }
        ]
      },
      "InstanceState": {
        "description": "Current state of an Instance.",
        "type": "string",
        "enum": [
          "Creating",
          "Starting",
          "Running",
          "Stopping",
          "Stopped",
          "Rebooting",
          "Migrating",
          "Repairing",
          "Failed",
          "Destroyed"
        ]
      },
      "InstanceStateMonitorRequest": {
        "type": "object",
        "properties": {
          "gen": {
            "type": "integer",
            "format": "uint64",
            "minimum": 0
          }
        },
        "required": [
          "gen"
        ]
      },
      "InstanceStateMonitorResponse": {
        "type": "object",
        "properties": {
          "gen": {
            "type": "integer",
            "format": "uint64",
            "minimum": 0
          },
          "migration": {
            "$ref": "#/components/schemas/InstanceMigrateStatusResponse"
          },
          "state": {
            "$ref": "#/components/schemas/InstanceState"
          }
        },
        "required": [
          "gen",
          "migration",
          "state"
        ]
      },
      "InstanceStateRequested": {
        "type": "string",
        "enum": [
          "Run",
          "Stop",
          "Reboot"
        ]
      },
      "InstanceVCRReplace": {
        "type": "object",
        "properties": {
          "vcr_json": {
            "type": "string"
          }
        },
        "required": [
          "vcr_json"
        ]
      },
      "MigrationFailureInjector": {
        "description": "Describes a synthetic device that registers for VM lifecycle notifications and returns errors during attempts to migrate.\n\nThis is only supported by Propolis servers compiled with the `failure-injection` feature.",
        "type": "object",
        "properties": {
          "fail_exports": {
            "description": "The number of times this device should fail requests to export state.",
            "type": "integer",
            "format": "uint32",
            "minimum": 0
          },
          "fail_imports": {
            "description": "The number of times this device should fail requests to import state.",
            "type": "integer",
            "format": "uint32",
            "minimum": 0
          }
        },
        "required": [
          "fail_exports",
          "fail_imports"
        ],
        "additionalProperties": false
      },
      "MigrationState": {
        "type": "string",
        "enum": [
          "Sync",
          "RamPush",
          "Pause",
          "RamPushDirty",
          "Device",
          "Resume",
          "RamPull",
          "Server",
          "Finish",
          "Error"
        ]
      },
      "NvmeDisk": {
        "description": "A disk that presents an NVMe interface to the guest.",
        "type": "object",
        "properties": {
          "backend_id": {
            "description": "The name of the disk's backend component.",
            "allOf": [
              {
                "$ref": "#/components/schemas/SpecKey"
              }
            ]
          },
          "pci_path": {
            "description": "The PCI bus/device/function at which this disk should be attached.",
            "allOf": [
              {
                "$ref": "#/components/schemas/PciPath"
              }
            ]
          },
          "serial_number": {
            "description": "The serial number to return in response to an NVMe Identify Controller command.",
            "type": "array",
            "items": {
              "type": "integer",
              "format": "uint8",
              "minimum": 0
            },
            "minItems": 20,
            "maxItems": 20
          }
        },
        "required": [
          "backend_id",
          "pci_path",
          "serial_number"
        ],
        "additionalProperties": false
      },
      "P9fs": {
        "description": "Describes a filesystem to expose through a P9 device.\n\nThis is only supported by Propolis servers compiled with the `falcon` feature.",
        "type": "object",
        "properties": {
          "chunk_size": {
            "description": "The chunk size to use in the 9P protocol. Vanilla Helios images should use 8192. Falcon Helios base images and Linux can use up to 65536.",
            "type": "integer",
            "format": "uint32",
            "minimum": 0
          },
          "pci_path": {
            "description": "The PCI path at which to attach the guest to this P9 filesystem.",
            "allOf": [
              {
                "$ref": "#/components/schemas/PciPath"
              }
            ]
          },
          "source": {
            "description": "The host source path to mount into the guest.",
            "type": "string"
          },
          "target": {
            "description": "The 9P target filesystem tag.",
            "type": "string"
          }
        },
        "required": [
          "chunk_size",
          "pci_path",
          "source",
          "target"
        ],
        "additionalProperties": false
      },
      "PciPath": {
        "description": "A PCI bus/device/function tuple.",
        "type": "object",
        "properties": {
          "bus": {
            "type": "integer",
            "format": "uint8",
            "minimum": 0
          },
          "device": {
            "type": "integer",
            "format": "uint8",
            "minimum": 0
          },
          "function": {
            "type": "integer",
            "format": "uint8",
            "minimum": 0
          }
        },
        "required": [
          "bus",
          "device",
          "function"
        ]
      },
      "PciPciBridge": {
        "description": "A PCI-PCI bridge.",
        "type": "object",
        "properties": {
          "downstream_bus": {
            "description": "The logical bus number of this bridge's downstream bus. Other devices may use this bus number in their PCI paths to indicate they should be attached to this bridge's bus.",
            "type": "integer",
            "format": "uint8",
            "minimum": 0
          },
          "pci_path": {
            "description": "The PCI path at which to attach this bridge.",
            "allOf": [
              {
                "$ref": "#/components/schemas/PciPath"
              }
            ]
          }
        },
        "required": [
          "downstream_bus",
          "pci_path"
        ],
        "additionalProperties": false
      },
      "QemuPvpanic": {
        "type": "object",
        "properties": {
          "enable_isa": {
            "description": "Enable the QEMU PVPANIC ISA bus device (I/O port 0x505).",
            "type": "boolean"
          }
        },
        "required": [
          "enable_isa"
        ],
        "additionalProperties": false
      },
      "ReplaceResult": {
        "type": "string",
        "enum": [
          "started",
          "started_already",
          "completed_already",
          "missing",
          "vcr_matches"
        ]
      },
      "ReplacementComponent": {
        "description": "An instance spec component that should be replaced during a live migration.",
        "oneOf": [
          {
            "type": "object",
            "properties": {
              "component": {
                "type": "string",
                "enum": [
                  "MigrationFailureInjector"
                ]
              },
              "spec": {
                "$ref": "#/components/schemas/MigrationFailureInjector"
              }
            },
            "required": [
              "component",
              "spec"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "type": "string",
                "enum": [
                  "CrucibleStorageBackend"
                ]
              },
              "spec": {
                "$ref": "#/components/schemas/CrucibleStorageBackend"
              }
            },
            "required": [
              "component",
              "spec"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "component": {
                "type": "string",
                "enum": [
                  "VirtioNetworkBackend"
                ]
              },
              "spec": {
                "$ref": "#/components/schemas/VirtioNetworkBackend"
              }
            },
            "required": [
              "component",
              "spec"
            ],
            "additionalProperties": false
          }
        ]
      },
      "SerialPort": {
        "description": "A serial port device.",
        "type": "object",
        "properties": {
          "num": {
            "description": "The serial port number for this port.",
            "allOf": [
              {
                "$ref": "#/components/schemas/SerialPortNumber"
              }
            ]
          }
        },
        "required": [
          "num"
        ],
        "additionalProperties": false
      },
      "SerialPortNumber": {
        "description": "A serial port identifier, which determines what I/O ports a guest can use to access a port.",
        "type": "string",
        "enum": [
          "com1",
          "com2",
          "com3",
          "com4"
        ]
      },
      "SmbiosType1Input": {
        "type": "object",
        "properties": {
          "manufacturer": {
            "type": "string"
          },
          "product_name": {
            "type": "string"
          },
          "serial_number": {
            "type": "string"
          },
          "version": {
            "type": "integer",
            "format": "uint64",
            "minimum": 0
          }
        },
        "required": [
          "manufacturer",
          "product_name",
          "serial_number",
          "version"
        ],
        "additionalProperties": false
      },
      "SoftNpuP9": {
        "description": "Describes a PCI device that shares host files with the guest using the P9 protocol.\n\nThis is only supported by Propolis servers compiled with the `falcon` feature.",
        "type": "object",
        "properties": {
          "pci_path": {
            "description": "The PCI path at which to attach the guest to this port.",
            "allOf": [
              {
                "$ref": "#/components/schemas/PciPath"
              }
            ]
          }
        },
        "required": [
          "pci_path"
        ],
        "additionalProperties": false
      },
      "SoftNpuPciPort": {
        "description": "Describes a SoftNPU PCI device.\n\nThis is only supported by Propolis servers compiled with the `falcon` feature.",
        "type": "object",
        "properties": {
          "pci_path": {
            "description": "The PCI path at which to attach the guest to this port.",
            "allOf": [
              {
                "$ref": "#/components/schemas/PciPath"
              }
            ]
          }
        },
        "required": [
          "pci_path"
        ],
        "additionalProperties": false
      },
      "SoftNpuPort": {
        "description": "Describes a port in a SoftNPU emulated ASIC.\n\nThis is only supported by Propolis servers compiled with the `falcon` feature.",
        "type": "object",
        "properties": {
          "backend_id": {
            "description": "The name of the port's associated DLPI backend.",
            "allOf": [
              {
                "$ref": "#/components/schemas/SpecKey"
              }
            ]
          },
          "link_name": {
            "description": "The data link name for this port.",
            "type": "string"
          }
        },
        "required": [
          "backend_id",
          "link_name"
        ],
        "additionalProperties": false
      },
      "SpecKey": {
        "description": "A key identifying a component in an instance spec.",
        "oneOf": [
          {
            "title": "uuid",
            "allOf": [
              {
                "type": "string",
                "format": "uuid"
              }
            ]
          },
          {
            "title": "name",
            "allOf": [
              {
                "type": "string"
              }
            ]
          }
        ]
      },
      "UpstairsInfoStatus": {
        "type": "string",
        "enum": [
          "initializing",
          "go_active",
          "active",
          "deactivating",
          "disabled"
        ]
      },
      "VirtioDisk": {
        "description": "A disk that presents a virtio-block interface to the guest.",
        "type": "object",
        "properties": {
          "backend_id": {
            "description": "The name of the disk's backend component.",
            "allOf": [
              {
                "$ref": "#/components/schemas/SpecKey"
              }
            ]
          },
          "pci_path": {
            "description": "The PCI bus/device/function at which this disk should be attached.",
            "allOf": [
              {
                "$ref": "#/components/schemas/PciPath"
              }
            ]
          }
        },
        "required": [
          "backend_id",
          "pci_path"
        ],
        "additionalProperties": false
      },
      "VirtioNetworkBackend": {
        "description": "A network backend associated with a virtio-net (viona) VNIC on the host.",
        "type": "object",
        "properties": {
          "vnic_name": {
            "description": "The name of the viona VNIC to use as a backend.",
            "type": "string"
          }
        },
        "required": [
          "vnic_name"
        ],
        "additionalProperties": false
      },
      "VirtioNic": {
        "description": "A network card that presents a virtio-net interface to the guest.",
        "type": "object",
        "properties": {
          "backend_id": {
            "description": "The name of the device's backend.",
            "allOf": [
              {
                "$ref": "#/components/schemas/SpecKey"
              }
            ]
          },
          "interface_id": {
            "description": "A caller-defined correlation identifier for this interface. If Propolis is configured to collect network interface kstats in its Oximeter metrics, the metric series for this interface will be associated with this identifier.",
            "type": "string",
            "format": "uuid"
          },
          "pci_path": {
            "description": "The PCI path at which to attach this device.",
            "allOf": [
              {
                "$ref": "#/components/schemas/PciPath"
              }
            ]
          }
        },
        "required": [
          "backend_id",
          "interface_id",
          "pci_path"
        ],
        "additionalProperties": false
      },
      "VirtioSocket": {
        "description": "A socket device that presents a virtio-socket interface to the guest.",
        "type": "object",
        "properties": {
          "guest_cid": {
            "description": "The guest's Context ID.",
            "type": "integer",
            "format": "uint64",
            "minimum": 0
          },
          "pci_path": {
            "description": "The PCI path at which to attach this device.",
            "allOf": [
              {
                "$ref": "#/components/schemas/PciPath"
              }
            ]
          }
        },
        "required": [
          "guest_cid",
          "pci_path"
        ],
        "additionalProperties": false
      },
      "VolumeInfo": {
        "description": "A tree representation of the info and status of all parts of a Volume.",
        "oneOf": [
          {
            "type": "object",
            "properties": {
              "volume": {
                "type": "object",
                "properties": {
                  "read_only_parent": {
                    "nullable": true,
                    "allOf": [
                      {
                        "$ref": "#/components/schemas/VolumeInfo"
                      }
                    ]
                  },
                  "sub_volumes": {
                    "type": "array",
                    "items": {
                      "$ref": "#/components/schemas/VolumeInfo"
                    }
                  }
                },
                "required": [
                  "sub_volumes"
                ]
              }
            },
            "required": [
              "volume"
            ],
            "additionalProperties": false
          },
          {
            "type": "object",
            "properties": {
              "upstairs": {
                "type": "object",
                "properties": {
                  "block_size": {
                    "nullable": true,
                    "type": "integer",
                    "format": "uint64",
                    "minimum": 0
                  },
                  "encrypted": {
                    "type": "boolean"
                  },
                  "generation": {
                    "type": "integer",
                    "format": "uint64",
                    "minimum": 0
                  },
                  "live_repair_in_progress": {
                    "type": "boolean"
                  },
                  "read_only": {
                    "type": "boolean"
                  },
                  "reconcile_in_progress": {
                    "type": "boolean"
                  },
                  "session_id": {
                    "type": "string",
                    "format": "uuid"
                  },
                  "state": {
                    "$ref": "#/components/schemas/UpstairsInfoStatus"
                  },
                  "targets": {
                    "type": "array",
                    "items": {
                      "$ref": "#/components/schemas/DownstairsInfo"
                    }
                  },
                  "upstairs_id": {
                    "type": "string",
                    "format": "uuid"
                  }
                },
                "required": [
                  "encrypted",
                  "generation",
                  "live_repair_in_progress",
                  "read_only",
                  "reconcile_in_progress",
                  "session_id",
                  "state",
                  "targets",
                  "upstairs_id"
                ]
              }
            },
            "required": [
              "upstairs"
            ],
            "additionalProperties": false
          }
        ]
      },
      "VolumeStatus": {
        "type": "object",
        "properties": {
          "volume_info": {
            "$ref": "#/components/schemas/VolumeInfo"
          }
        },
        "required": [
          "volume_info"
        ]
      }
    },
    "responses": {
      "Error": {
        "description": "Error",
        "content": {
          "application/json": {
            "schema": {
              "$ref": "#/components/schemas/Error"
            }
          }
        }
      }
    }
  }
}


================================================
FILE: packaging/package-manifest.toml
================================================
[package.propolis-server]
service_name = "propolis-server"
source.type = "local"
source.rust.binary_names = ["propolis-server"]
source.rust.release = true
source.paths = [
    { from = "packaging/smf/propolis-server", to = "/var/svc/manifest/site/propolis-server" },
    { from = "packaging/smf/method_script.sh", to = "/opt/oxide/lib/svc/manifest/propolis/propolis.sh" },
]
output.type = "zone"

# N.B. Should be kept in sync with phd-tests/artifacts.toml.
[[package.propolis-server.source.buildomat_blobs]]
repo = "edk2"
series = "image_debug"
commit = "bf64f45b1a58e69d126a3c6ca1e4512c88668132"
artifact = "OVMF_CODE.fd"
sha256 = "740187046a7267d0de72d3455070333547dbc0ea023531471fb2b2a61effa448"


================================================
FILE: packaging/propolis-package/Cargo.toml
================================================
[package]
name = "propolis-package"
version = "0.1.0"
edition = "2021"

[[bin]]
name = "propolis-package"
test = false
doctest = false

[dependencies]
anyhow.workspace = true
camino.workspace = true
omicron-zone-package.workspace = true
slog.workspace = true
slog-term.workspace = true
tokio.workspace = true


================================================
FILE: packaging/propolis-package/README.md
================================================
# Propolis Zone

This binary can be used to produce an Omicron-branded Zone image,
which consists of the Propolis Server binary (along
with some auxiliary files) in a specially-formatted tarball.

A manifest describing this Zone image exists in `package-manifest.toml`,
and the resulting image is created as `out/propolis-server.tar.gz`.

To create the Zone image:

```console
$ cargo build --release
$ cargo run -p propolis-package
```


================================================
FILE: packaging/propolis-package/src/main.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use anyhow::{anyhow, Context, Result};
use camino::Utf8Path;
use omicron_zone_package::config::{self, PackageName};
use omicron_zone_package::package::BuildConfig;
use omicron_zone_package::progress;
use slog::{o, Drain, Logger};
use std::fs::create_dir_all;

/// Name of the package entry that `main` looks up in the parsed package
/// manifest and passes to the package build.
const PKG_NAME: PackageName = PackageName::new_const("propolis-server");

/// Progress reporter that emits package build progress through a `slog`
/// logger.
struct PrintProgress(slog::Logger);
impl Default for PrintProgress {
    /// Creates a reporter whose logger writes compact, undecorated records to
    /// stdout.
    fn default() -> Self {
        let decorator = slog_term::PlainDecorator::new(std::io::stdout());
        let formatter = slog_term::CompactFormat::new(decorator).build();
        // The formatter is wrapped in a mutex and fused before becoming the
        // root drain.
        let root_drain = std::sync::Mutex::new(formatter).fuse();
        let logger = Logger::root(root_drain, o!());
        Self(logger)
    }
}
impl progress::Progress for PrintProgress {
    /// Returns the logger wrapped by this reporter.
    fn get_log(&self) -> &Logger {
        let Self(logger) = self;
        logger
    }

    /// Logs `msg` at info level under the `msg` key of a "package progress"
    /// record.
    fn set_message(&self, msg: std::borrow::Cow<'static, str>) {
        let Self(logger) = self;
        let text: &str = msg.as_ref();
        slog::info!(logger, "package progress"; "msg" => text);
    }
}

/// Builds the `propolis-server` package described by
/// `packaging/package-manifest.toml`, placing the output under `out/`.
///
/// Fails if the manifest cannot be parsed, the output directory cannot be
/// created, the manifest has no `propolis-server` entry, or the package build
/// itself fails.
#[tokio::main]
async fn main() -> Result<()> {
    let manifest = config::parse("packaging/package-manifest.toml")?;

    let out_dir = Utf8Path::new("out");
    create_dir_all(out_dir)?;

    // Only a single package is expected, so look it up directly by name.
    let package = manifest
        .packages
        .get(&PKG_NAME)
        .with_context(|| anyhow!("missing propolis-server package"))?;

    let reporter = PrintProgress::default();
    println!("Building {PKG_NAME} package...");

    // Everything except the progress reporter keeps its default setting.
    let build_config =
        BuildConfig { progress: &reporter, ..Default::default() };
    package.create(&PKG_NAME, out_dir, &build_config).await?;

    println!("Done!");
    Ok(())
}


================================================
FILE: packaging/smf/method_script.sh
================================================
#!/bin/bash
#
# SMF start method for propolis-server. Reads the service's `config`
# properties, configures IP networking on the configured datalink, and then
# launches the propolis-server binary.

# Abort on the first failing command or pipeline, and trace every command.
set -o errexit
set -o pipefail
set -o xtrace

# Provides the SMF_EXIT_* exit codes used below.
. /lib/svc/share/smf_include.sh

# Pull the service configuration out of the `config` property group.
DATALINK="$(svcprop -c -p config/datalink "${SMF_FMRI}")"
GATEWAY="$(svcprop -c -p config/gateway "${SMF_FMRI}")"
LISTEN_ADDR="$(svcprop -c -p config/listen_addr "${SMF_FMRI}")"
LISTEN_PORT="$(svcprop -c -p config/listen_port "${SMF_FMRI}")"
METRIC_ADDR="$(svcprop -c -p config/metric_addr "${SMF_FMRI}")"

# `unknown` is the placeholder default for these two properties in the service
# manifest; networking cannot be set up without real values, so report a
# configuration error. The message ends with a newline so it does not run into
# whatever is logged next.
if [[ $DATALINK == unknown ]] || [[ $GATEWAY == unknown ]]; then
    printf 'ERROR: missing datalink or gateway\n' >&2
    exit "$SMF_EXIT_ERR_CONFIG"
fi

# TODO remove when https://github.com/oxidecomputer/stlouis/issues/435 is addressed
# Deleting the interface is best-effort (it may not exist yet); it is then
# recreated as a temporary interface.
ipadm delete-if "$DATALINK" || true
ipadm create-if -t "$DATALINK"

# Use a 9000-byte MTU for both IPv4 and IPv6 on the datalink.
ipadm set-ifprop -t -p mtu=9000 -m ipv4 "$DATALINK"
ipadm set-ifprop -t -p mtu=9000 -m ipv6 "$DATALINK"

# Create the link-local address, the static listen address, and the default
# IPv6 route only when the corresponding lookup fails (i.e. they are missing).
ipadm show-addr "$DATALINK/ll" || ipadm create-addr -t -T addrconf "$DATALINK/ll"
ipadm show-addr "$DATALINK/omicron6"  || ipadm create-addr -t -T static -a "$LISTEN_ADDR" "$DATALINK/omicron6"
route get -inet6 default -inet6 "$GATEWAY" || route add -inet6 default -inet6 "$GATEWAY"

# Arguments for `propolis-server run`: the bootrom blob, the listen socket
# address (the listen address is bracketed, i.e. treated as IPv6), and the
# metric address.
args=(
  'run'
  '/opt/oxide/propolis-server/blob/OVMF_CODE.fd'
  "[$LISTEN_ADDR]:$LISTEN_PORT"
  '--metric-addr' "$METRIC_ADDR"
)

# Start the server in the background under ctrun so this method can return.
# NOTE(review): the `-l child -o noorphan,regent` options presumably place the
# server in its own process contract whose members are killed when the contract
# is abandoned; confirm against ctrun(1).
ctrun -l child -o noorphan,regent /opt/oxide/propolis-server/bin/propolis-server "${args[@]}" &


================================================
FILE: packaging/smf/propolis-server/manifest.xml
================================================
<?xml version="1.0"?>
<!DOCTYPE service_bundle SYSTEM "/usr/share/lib/xml/dtd/service_bundle.dtd.1">

<!-- SMF manifest for the propolis-server service. -->
<service_bundle type='manifest' name='propolis-server'>

<service name='system/illumos/propolis-server' type='service' version='1'>
  <create_default_instance enabled='true' />
  <!-- The service requires both the network and multi-user milestones. -->
  <dependency name='network' grouping='require_all' restart_on='none'
    type='service'>
  <service_fmri value='svc:/milestone/network:default' />
  </dependency>
  <dependency name='multi_user' grouping='require_all' restart_on='none'
    type='service'>
  <service_fmri value='svc:/milestone/multi-user:default' />
  </dependency>

  <!-- NOTE(review): LD_LIBRARY_PATH points at a PostgreSQL 13 library
       directory; confirm that propolis-server actually needs it. -->
  <method_context>
    <method_environment>
      <envvar name="LD_LIBRARY_PATH" value="/opt/ooce/pgsql-13/lib/amd64" />
    </method_environment>
  </method_context>
  <!-- The start method is the propolis.sh method script; stop uses the
       built-in :kill method.
       NOTE(review): timeout_seconds='0' presumably disables the method
       timeout; confirm against smf_method documentation. -->
  <exec_method type='method' name='start'
    exec='/opt/oxide/lib/svc/manifest/propolis/propolis.sh'
    timeout_seconds='0' />
  <exec_method type='method' name='stop' exec=':kill' timeout_seconds='0' />

  <!-- Properties read by the start method via svcprop. The 'unknown' values
       for datalink and gateway are placeholders: the start method exits with
       a configuration error while either one is still 'unknown'. -->
  <property_group name='config' type='application'>
    <propval name='datalink' type='astring' value='unknown' />
    <propval name='gateway' type='astring' value='unknown' />
    <propval name='listen_addr' type='astring' value='127.0.0.1' />
    <propval name='listen_port' type='astring' value='12400' />
    <propval name='metric_addr' type='astring' value='127.0.0.1' />
  </property_group>

  <!-- NOTE(review): 'contract' duration presumably makes the restarter track
       the processes left running by the start method; confirm against
       svc.startd documentation. -->
  <property_group name='startd' type='framework'>
    <propval name='duration' type='astring' value='contract' />
  </property_group>

  <stability value='Unstable' />

  <template>
    <common_name>
      <loctext xml:lang='C'>Oxide Propolis Server</loctext>
    </common_name>
    <description>
      <loctext xml:lang='C'>Hypervisor</loctext>
    </description>
  </template>
</service>

</service_bundle>


================================================
FILE: phd-tests/.gitignore
================================================
/target
debug.out
core
out/


================================================
FILE: phd-tests/README.md
================================================
# Pheidippides (PHD): the Propolis test runner

[Pheidippides](https://en.wikipedia.org/wiki/Pheidippides), or "PHD" for short,
is a freestanding test framework and runner for testing Propolis.

PHD's test framework aims to make it easy to test that Propolis provides correct
abstractions to guest software. PHD's helpers let test authors concisely launch
VMs and interact with the guest OS via the guest serial console.

PHD is very much a work in progress. PHD issues in this repo bear the `testing`
label.

## Requirements

PHD launches "real" Propolis server instances in freestanding processes on the
system that hosts the PHD runner. The runner machine needs to have a prebuilt
Propolis server binary and needs to meet all the system requirements described
in the main Propolis repo (e.g. the runner must run on a Helios system that's
sufficiently modern to run the Propolis server of interest).

## Quickstart

The simplest way to run PHD tests is to use the [`cargo xtask
phd`](#cargo-xtask) Cargo [xtask](https://github.com/matklad/cargo-xtask). To
get started, run the following command:

```shell
pfexec cargo xtask phd run
```

That's it! The xtask will automatically build `propolis-server` and `phd-runner`
binaries, and PHD will obtain a guest OS image and a bootrom and run its tests
against them.

See [here](#cargo-xtask) for more details on using `cargo xtask phd`.

## Building & executing the runner

To build:

`cargo build -p phd-runner`

PHD requires the unwinding of stacks in order to properly catch assertions in
test cases, so building with a profile which sets `panic = "abort"` is not
supported.  This precludes the use of the `release` or `dev-abort` profiles.

To run:

`pfexec cargo run -p phd-runner -- [OPTIONS]`

Running under pfexec is required to allow PHD to ensure the host system is
correctly configured to run live migration tests.

## Runtime options

The runner takes one of two subcommands:

- `run` actually runs a set of tests.
- `list` lists the tests that would be run given the values of other parameters.

In `run` mode, the following sub-arguments are required:

- `--propolis-server-cmd $PROPOLIS_PATH` supplies the path to the
  `propolis-server` binary to execute.
- `--tmp-directory $TMPDIR` supplies a temporary directory to which the runner
  writes propolis-server log files and other temporary files generated during
  tests.
- `--artifact-toml-path` supplies the path to a TOML file defining the set of
  artifacts--guest OS and firmware images--to use for the current run. This
  file's format is described below. The `artifacts.toml` file in the repo
  contains the list of artifacts that are currently used in regular PHD test
  runs and that are meant to be compatible with PHD's guest adapters.

Other options are described in the runner's help text (`cargo run -p phd-runner
-- --help`).

### Specifying artifacts

The runner requires a TOML file that specifies the guest OS and firmware images
that are available for a test run to use. It has the following format:

```toml
# An array of URIs from which to try to fetch artifacts with the "remote_server"
# source type. The runner appends "/filename" to each of these URIs to generate
# a download URI for each such artifact.
remote_server_uris = ["http://foo.com", "http://bar.net"]

# Every artifact has a named entry in the "artifacts" table. The runner's
# `default_guest_artifact` and `default_bootrom_artifact` parameters name the
# guest OS and bootrom artifacts that will be used for a given test run.
#
# Every artifact has a kind, which is one of `guest_os`, `bootrom`, or
# `propolis_server`.
#
# Every artifact also has a source, which is one of `remote_server`,
# `local_path`, or `buildomat`.
#
# The following entry specifies a guest OS named "alpine" that searches the
# remote URI list for files named "alpine.iso":
[artifacts.alpine]
filename = "alpine.iso"

# Bootrom and Propolis server artifacts can put a `kind = "foo"` entry inline,
# but guest OSes need to use the structured data syntax to specify the guest OS
# adapter to use when booting a guest from this artifact.
[artifacts.alpine.kind]
guest_os = "alpine"

# Remote artifacts are required to specify an expected SHA256 digest as a
# string.
[artifacts.alpine.source.remote_server]
sha256 = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"

# The following entry specifies a debug bootrom pulled from Buildomat. Buildomat
# outputs are associated with a single repo and a commit therein; the jobs that
# create them also specify a 'series' that identifies the task that created the
# collateral.
[artifacts.bootrom]
filename = "OVMF_CODE.fd"
kind = "bootrom"

[artifacts.bootrom.source.buildomat]
repo = "oxidecomputer/edk2"
series = "image_debug"
commit = "commit_sha"
sha256 = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"

# This entry specifies a local directory in which an artifact can be found.
# SHA256 digests are optional for local artifacts. This allows you to create
# an entry for a local artifact that changes frequently (e.g. a Propolis build)
# without having to edit the digest every time it changes.
[artifacts.propolis]
filename = "propolis-server"
kind = "propolis_server"

[artifacts.propolis.source.local_path]
path = "/home/oxide/propolis/target/debug"
# sha256 = "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"
```

## Cargo `xtask`

A [Cargo `xtask` command](https://github.com/matklad/cargo-xtask), [`cargo xtask
phd`](../xtask/src/task_phd.rs) is provided to make running PHD tests locally in
development as simple as possible.

Using `cargo xtask phd` provides the following additional features compared to
running `phd-runner` directly:

- Automatically rebuilding the `propolis-server` binary whenever the source code
  changes, to prevent accidentally running tests against a stale build.
- Automatically [managing a PHD artifact store and test temporary
  directories](#artifact-store-and-temporary-directory-management) in
  `target/phd`.
- Providing [reasonable defaults](#default-arguments) (which may be overridden)
  for [arguments required by the `phd-runner` CLI](#runtime-options).

### Artifact store and temporary directory management

`cargo xtask phd` will automatically create and manage directories
for the PHD artifact store and test temporary directories.

By default, the artifact store directory used by `cargo xtask phd` is
`target/phd/artifacts`[^1]. If the `--artifact-directory <PATH>` command-line
argument is present, `cargo xtask phd` will use the provided path for the
artifact store, instead. In both cases, if the artifact store directory or its
parent directories do not exist, `cargo xtask phd` will create them.

Every time `cargo xtask phd` is invoked, a new test temporary directory will be
created in `target/phd/tmp/{UNIX_TIMESTAMP_IN_SECONDS}`. By creating a new
temporary
directory for each test run, the logs and other output emitted by previous runs
are not overwritten, so that two test runs can be compared.

To limit the amount of disk space used for storing output from old test runs,
`cargo xtask phd run` will automatically delete any test temporary directories
that are older than one day. This behavior can be suppressed by setting the
`PHD_NOTIDY` environment variable to a value (e.g. `PHD_NOTIDY=1 cargo xtask phd
run`).

### Default arguments

`cargo xtask phd run` will set reasonable default values for
[`phd-runner`'s CLI arguments](#runtime-options), if those arguments
are not present when `cargo xtask phd run` is invoked. The following arguments
are given default values when using `cargo xtask phd run`:

- `--propolis-server-cmd`: The path to a `propolis-server` binary built by
  `cargo xtask phd`
- `--base-propolis-branch`: `master`
- `--crucible-downstairs-commit`: `auto`
- `--artifact-toml-path`:
  [`{WORKSPACE_ROOT}/phd-tests/artifacts.toml`](artifacts.toml)
- `--artifact-directory`:
  [`target/phd/artifacts`](#artifact-store-and-temporary-directory-management)

Any additional command-line arguments for which `cargo xtask phd` does not
provide a default are passed directly to the `phd-runner` binary.

## Guest OS support

Different guest OS images may have different feature sets and login
requirements. The PHD framework abstracts these differences out into guest OS
adapters that implement the `GuestOs` trait, whose methods supply PHD with
guest-specific information like the sequence of commands needed to log on or the
expected guest command prompt.

The full list of supported OSes is defined in the framework's
[guest OS module](framework/src/guest_os/mod.rs). Each guest OS artifact in the
artifact TOML (see above) must have a `kind` that corresponds to a variant of
the `GuestOsKind` enum in this module.

Some guest OSes are presumed to use password-based login credentials. These are
encoded into the logon sequences for each adapter and reproduced below:

| Guest adapter       | Username        | Password     |
|---------------------|-----------------|--------------|
| Alpine Linux        | `root`          |              |
| Debian 11 (nocloud) | `root`          |              |
| Ubuntu 20.04        | `ubuntu`        | `1!Passw0rd` |
| Windows Server 2019 | `Administrator` | `0xide#1Fan` |
| Windows Server 2022 | `Administrator` | `0xide#1Fan` |

If you add a custom image to your artifact file, you must make sure either to
configure the image to accept the credentials its adapter supplies or to change
the adapter to provide the correct credentials.

## Authoring tests

PHD's test cases live in the `tests` crate. To write a new test, add a function
of the form `fn my_test(ctx: &TestCtx)` and tag it with the
`#[phd_testcase]` attribute macro. The framework will automatically register the
test into the crate's test inventory for the runner to discover.

### Test outcomes

`#[phd_testcase]` wraps the function body in an immediately-executed closure
that returns an `anyhow::Result<()>` and that returns `Ok(())` if the end of the
function body is reached. This means that

- Tests that reach the end of the function body will automatically pass.
- Tests that want to pass early can return `Ok(())` immediately.
- Tests can propagate errors with the `?` operator. This causes the test to
  fail. Setting `RUST_BACKTRACE=1` will enable backtraces for eligible errors of
  this kind (e.g. errors raised by framework functions).
- Tests can also use the `panic!` or `assert!` macros; the runner will catch
  panics and convert them into test failures.

### Test context

Every test gets a `phd_testcase::Framework` that contains helper methods for
constructing VMs and their execution environments. See the module documentation
for more information.

The tests in `tests/src/smoke.rs` provide some simple examples of using the
factory to customize and launch a new VM.

### Test VMs

The VM factory yields objects of type `TestVm` that provide routines that change
a VM's state and interact with its serial console. The `run_shell_command`
routine attempts to run a command in the guest's serial console and returns the
command's output as a string. See the module documentation for more information.

## Source layout

PHD is arranged into the following crates:

- `framework` contains the bulk of the test framework and has the following
  modules:
  - `artifacts` implements the artifact store, which processes the artifact TOML
    and provides other modules a way to convert from artifact keys to paths on
    disk.
  - `guest_os` defines the `GuestOsKind` enumeration and the `GuestOs` trait.
    Each supported guest OS implements this trait to provide guest-specific
    adapters for high-level test VM operations to use.
  - `serial` provides a task that connects to a guest's serial console, sends
    commands to it, and processes the characters and terminal control sequences
    the guest sends back. It also provides routines that allow VMs to wait for a
    specific string to arrive in the console's back buffer.
  - `test_vm` implements the `TestVm` struct and the `VmFactory` that tests use
    to configure new VMs.
- `testcase_macro` and `testcase` contain the `TestContext` type, the definition
  of the `#[phd_testcase]` macro, and the support code for the test inventory.
  `testcase` re-exports `testcase_macro`, so consumers of these modules only
  need to import `testcase`.
- `tests` contains individual test cases.
- `runner` implements the test runner, its command-line configuration, and its
  test fixtures.

[^1]: Technically, this uses the value of
    [`$CARGO_TARGET_DIR`](https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-reads),
    so if that's overridden, `cargo xtask phd` will use whatever the Cargo
    target dir is.


================================================
FILE: phd-tests/artifacts.toml
================================================
remote_server_uris = ["https://oxide-omicron-build.s3.amazonaws.com"]

[artifacts.alpine]
filename = "alpine.iso"
[artifacts.alpine.kind]
guest_os = "alpine"
[artifacts.alpine.source.remote_server]
sha256 = "ba8007f74f9b54fbae3b2520da577831b4834778a498d732f091260c61aa7ca1"

[artifacts.ovmf]
filename = "OVMF_CODE.fd"
kind = "bootrom"
[artifacts.ovmf.source.buildomat]
repo = "oxidecomputer/edk2"
series = "image_debug"
commit = "bf64f45b1a58e69d126a3c6ca1e4512c88668132"
sha256 = "740187046a7267d0de72d3455070333547dbc0ea023531471fb2b2a61effa448"


================================================
FILE: phd-tests/framework/Cargo.toml
================================================
[package]
name = "phd-framework"
version = "0.1.0"
license = "MPL-2.0"
edition = "2021"

[lib]
doctest = false

[dependencies]
anyhow.workspace = true
backoff = { workspace = true, features = ["tokio"] }
base64.workspace = true
bytes.workspace = true
bhyve_api.workspace = true
camino = { workspace = true, features = ["serde1"] }
cfg-if.workspace = true
cpuid_utils.workspace = true
dropshot.workspace = true
errno.workspace = true
fatfs.workspace = true
futures.workspace = true
flate2.workspace = true
hex.workspace = true
libc.workspace = true
newtype_derive.workspace = true
omicron-common.workspace = true
oximeter.workspace = true
propolis-client.workspace = true
reqwest = { workspace = true, features = ["blocking"] }
ring.workspace = true
serde = { workspace = true, features = ["derive"] }
serde_derive.workspace = true
serde_json.workspace = true
slog.workspace = true
slog-async.workspace = true
slog-bunyan.workspace = true
slog-term.workspace = true
tar.workspace = true
termwiz.workspace = true
thiserror.workspace = true
tokio = { workspace = true, features = ["full"] }
tokio-tungstenite.workspace = true
toml.workspace = true
tracing.workspace = true
uuid.workspace = true
rand.workspace = true


================================================
FILE: phd-tests/framework/src/artifacts/buildomat.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
use super::DownloadConfig;
use anyhow::Context;
use camino::Utf8Path;
use serde::{Deserialize, Serialize};
use std::{borrow::Cow, fmt, str::FromStr, time::Duration};
use tracing::{debug, warn};

/// The name of a repository whose build outputs are published by Buildomat,
/// in the form it appears in Buildomat URIs (for example,
/// `oxidecomputer/propolis`).
///
/// Serialized transparently, i.e. as a bare string.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(transparent)]
pub(super) struct Repo(Cow<'static, str>);

/// A full Git commit hash identifying the commit a Buildomat artifact was
/// built from.
///
/// Values created through [`FromStr`] or deserialization are validated to be
/// exactly 40 hexadecimal characters. Serialized transparently, i.e. as a bare
/// string.
#[derive(Clone, Debug, Serialize, Eq, PartialEq)]
#[serde(transparent)]
pub struct Commit(String);

/// The name of a Buildomat series (for example, `phd_build`). This is the URI
/// path component that sits between the repository and the commit hash in
/// Buildomat file URIs.
///
/// Serialized transparently, i.e. as a bare string.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(transparent)]
pub(super) struct Series(Cow<'static, str>);

/// Describes where an artifact lives in Buildomat and what its contents are
/// expected to hash to.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct BuildomatArtifact {
    /// The repository whose build published the artifact.
    pub(super) repo: Repo,
    /// The Buildomat series the artifact was published under.
    pub(super) series: Series,
    /// The Git commit the artifact was built from.
    pub(super) commit: Commit,
    /// The expected SHA-256 digest of the artifact file, as a hex string.
    pub(super) sha256: String,
}

/// Root of Buildomat's public HTTP interface. The `branch/...` and `file/...`
/// URIs built in this module are all relative to this prefix.
const BASE_URI: &str = "https://buildomat.eng.oxide.computer/public";

impl Repo {
    /// Creates a `Repo` from a string with `'static` lifetime without
    /// allocating. Being a `const fn`, this can be used to initialize
    /// constants.
    pub(super) const fn from_static(s: &'static str) -> Self {
        Self(Cow::Borrowed(s))
    }

    /// Builds the [`BuildomatArtifact`] description for the file `filename`
    /// published by this repository's `series` at `commit`.
    ///
    /// The artifact's expected SHA-256 digest is not supplied by the caller;
    /// it is fetched from Buildomat using `downloader`, so this call performs
    /// network I/O and fails if the digest cannot be obtained.
    pub(super) async fn artifact_for_commit(
        self,
        series: Series,
        commit: Commit,
        filename: impl AsRef<Utf8Path>,
        downloader: &DownloadConfig,
    ) -> anyhow::Result<BuildomatArtifact> {
        let digest = self
            .get_sha256(&series, &commit, filename.as_ref(), downloader)
            .await?;

        Ok(BuildomatArtifact { repo: self, series, commit, sha256: digest })
    }

    /// Asks Buildomat which commit is at the head of `branch` in this
    /// repository.
    ///
    /// The response body is expected to contain a full Git commit hash; it is
    /// validated by parsing it as a [`Commit`]. The request uses a fixed
    /// five-second timeout and is attempted only once.
    ///
    /// # Errors
    ///
    /// Fails if the HTTP client cannot be built, the request fails, Buildomat
    /// answers with a non-success status, or the body is not a valid commit
    /// hash. Every failure is wrapped with context naming the repository and
    /// branch.
    pub(super) async fn get_branch_head(
        &self,
        branch: &str,
    ) -> anyhow::Result<Commit> {
        // The work is wrapped in an inner async block so that a single
        // `with_context` call below can annotate any of its failures.
        async {
            let uri = format!("{BASE_URI}/branch/{self}/{branch}");
            let client = reqwest::ClientBuilder::new()
                .timeout(Duration::from_secs(5))
                .build()?;
            let req = client.get(uri).build()?;
            let rsp = client.execute(req).await?;
            let status = rsp.status();
            anyhow::ensure!(status.is_success(), "HTTP status: {status}");
            let bytes = rsp.bytes().await?;
            str_from_bytes(&bytes)?.parse::<Commit>()
        }
        .await
        .with_context(|| {
            format!("Failed to determine HEAD commit for {self}@{branch}")
        })
    }

    /// Fetches the SHA-256 digest that Buildomat published alongside the
    /// artifact `filename` for this repository's `series` at `commit`.
    ///
    /// The digest is read from `<prefix>.sha256.txt`, where `<prefix>` is the
    /// final component of `filename` with *all* of its extensions removed
    /// (so `foo.tar.gz` maps to `foo.sha256.txt`). The returned string is the
    /// whitespace-trimmed body of that file.
    ///
    /// # Errors
    ///
    /// Fails if `filename` has no final component, if that component has an
    /// empty prefix (for example, it starts with a `.`), if the download fails
    /// after all retries, or if the response is not valid UTF-8. Every failure
    /// is wrapped with context naming the repository, commit, series and file.
    async fn get_sha256(
        &self,
        series: &Series,
        commit: &Commit,
        filename: &Utf8Path,
        downloader: &DownloadConfig,
    ) -> anyhow::Result<String> {
        async {
            let prefix = filename
                .file_name()
                .ok_or_else(|| {
                    anyhow::anyhow!(
                        "Buildomat filename has no filename: {filename:?}"
                    )
                })?
                // Strip the file extension, if any.
                //
                // Note: we use `Utf8PathBuf::file_name` and then split on '.'s
                // rather than using `Utf8PathBuf::file_stem`, because the latter
                // only strips off the rightmost file extension, rather than all
                // extensions. So, "foo.tar.gz" has a `file_stem()` of "foo.tar",
                // rather than "foo".
                //
                // TODO(eliza): `std::path::Path` has an unstable `file_prefix()`
                // method, which does exactly what we would want here (see
                // https://github.com/rust-lang/rust/issues/86319). If this is
                // stabilized, and `camino` adds a `file_prefix()` method wrapping
                // it, this code can be replaced with just `filename.file_prefix()`.
                .split('.')
                .next()
                // `split` always yields at least one item, but that item is
                // empty when the name begins with a '.'. Reject it here rather
                // than requesting a nonsensical "/.sha256.txt" URI and retrying
                // it until the backoff window expires.
                .filter(|prefix| !prefix.is_empty())
                .ok_or_else(|| {
                    anyhow::anyhow!(
                        "Buildomat filename has no filename prefix: {filename:?}"
                    )
                })?;
            let uri = format!("{BASE_URI}/file/{self}/{series}/{commit}/{prefix}.sha256.txt");
            let bytes = downloader.download_buildomat_uri(&uri).await?;
            str_from_bytes(&bytes).map(String::from)
        }
        .await
        .with_context(|| {
            format!("Failed to get SHA256 for {self}@{commit}, series: {series}, file: {filename}")
        })
    }
}

impl fmt::Display for Repo {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.0.fmt(f)
    }
}

impl FromStr for Commit {
    type Err = anyhow::Error;

    /// Parses a Git commit hash, ignoring surrounding whitespace.
    ///
    /// # Errors
    ///
    /// Fails unless the trimmed input is exactly 40 characters long and
    /// consists solely of ASCII hexadecimal digits.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let hash = s.trim();

        // Buildomat URIs are keyed on full hashes; abbreviated ones won't
        // resolve, so reject them up front.
        if hash.len() != 40 {
            anyhow::bail!(
                "Buildomat requires full (40-character) Git commit hashes"
            );
        }

        // Report the first offending character, if there is one.
        if let Some(c) = hash.chars().find(|c| !c.is_ascii_hexdigit()) {
            anyhow::bail!(
                "'{c}' is not a valid hexadecimal digit; Git \
                commit hashes should consist of the characters \
                [0-9, a-f, A-F]"
            );
        }

        Ok(Self(hash.to_owned()))
    }
}

impl fmt::Display for Commit {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.0)
    }
}

impl<'de> Deserialize<'de> for Commit {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let s = String::deserialize(deserializer)?;
        FromStr::from_str(&s).map_err(serde::de::Error::custom)
    }
}

impl Series {
    /// Creates a `Series` from a string with `'static` lifetime without
    /// allocating. Being a `const fn`, this can be used to initialize
    /// constants.
    pub(super) const fn from_static(s: &'static str) -> Self {
        Self(Cow::Borrowed(s))
    }
}

impl fmt::Display for Series {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.0.fmt(f)
    }
}

impl BuildomatArtifact {
    /// Returns the Buildomat URI from which the file named `filename` can be
    /// downloaded for this artifact's repository, series and commit.
    pub(super) fn uri(&self, filename: impl AsRef<Utf8Path>) -> String {
        format!(
            "{BASE_URI}/file/{}/{}/{}/{}",
            self.repo,
            self.series,
            self.commit,
            filename.as_ref(),
        )
    }
}

impl fmt::Display for BuildomatArtifact {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            repo: Repo(ref repo),
            series: Series(ref series),
            commit: Commit(ref commit),
            ..
        } = self;
        write!(f, "Buildomat {repo}/{series}@{commit}")
    }
}

impl super::DownloadConfig {
    /// Download a file from the provided Buildomat URI.
    ///
    /// This method will retry the download if Buildomat returns an error that
    /// indicates a file does not yet exist, for up to the configurable maximum
    /// retry duration. This retry logic serves as a mechanism for PHD to wait
    /// for an artifact we expect to exist to be published, when the build that
    /// publishes that artifact is still in progress.
    ///
    /// On success, returns the complete response body. No digest verification
    /// is performed here; that is left to the caller.
    ///
    /// # Errors
    ///
    /// Fails if the HTTP client cannot be built, if the request still fails
    /// once the configured backoff policy gives up, or if reading the response
    /// body fails. Note that a failure while reading the body happens after
    /// the retry loop has finished and is therefore not retried.
    pub(super) async fn download_buildomat_uri(
        &self,
        uri: &str,
    ) -> anyhow::Result<bytes::Bytes> {
        debug!(
            timeout = ?self.timeout,
            %uri,
            "Downloading file from Buildomat...",
        );
        // A single client, configured with the per-request timeout, is shared
        // by every attempt.
        let client =
            reqwest::ClientBuilder::new().timeout(self.timeout).build()?;
        // One download attempt. Errors are classified as permanent (give up
        // immediately) or transient (retry according to the backoff policy).
        let try_download = || async {
            let request = client
                .get(uri)
                .build()
                // failing to build the request is a permanent (non-retriable)
                // error, because any retries will use the same URI and request
                // configuration, so they'd fail as well.
                .map_err(|e| backoff::Error::permanent(e.into()))?;

            // Failures to execute the request (connection errors, timeouts,
            // and so on) are treated as retriable.
            let response = client
                .execute(request)
                .await
                .map_err(|e| backoff::Error::transient(e.into()))?;

            if !response.status().is_success() {
                // when downloading a file from buildomat, we currently retry
                // all errors, since buildomat returns 500s when an artifact
                // doesn't exist. hopefully, this will be fixed upstream soon:
                // https://github.com/oxidecomputer/buildomat/pull/48
                let err = anyhow::anyhow!(
                    "Buildomat returned HTTP error {}",
                    response.status()
                );
                return Err(backoff::Error::transient(err));
            }
            Ok(response)
        };

        // Invoked before each retry with the error that triggered it and the
        // delay until the next attempt.
        let log_retry = |error, wait| {
            warn!(
                %error,
                %uri,
                "Buildomat download failed, trying again in {wait:?}..."
            );
        };

        // The backoff policy is cloned so that every download starts from a
        // fresh copy of the configured policy.
        let bytes = backoff::future::retry_notify(
            self.buildomat_backoff.clone(),
            try_download,
            log_retry,
        )
        .await
        .with_context(|| format!("Failed to download '{uri}' from Buildomat"))?
        .bytes()
        .await?;

        Ok(bytes)
    }
}

/// Interprets `bytes` as UTF-8 text and returns it with leading and trailing
/// whitespace removed. Fails if `bytes` is not valid UTF-8.
fn str_from_bytes(bytes: &bytes::Bytes) -> anyhow::Result<&str> {
    let text = std::str::from_utf8(bytes.as_ref())?;
    Ok(text.trim())
}


================================================
FILE: phd-tests/framework/src/artifacts/manifest.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use anyhow::Result;
use serde::Deserialize;
use std::collections::BTreeMap;

/// The deserialized contents of an artifact manifest TOML file.
#[derive(Clone, Debug, Deserialize)]
pub(super) struct Manifest {
    /// Base URIs of the remote servers from which artifacts with a
    /// `remote_server` source may be obtained.
    pub(super) remote_server_uris: Vec<String>,
    /// The artifacts described by the manifest, keyed by artifact name.
    pub(super) artifacts: BTreeMap<String, super::Artifact>,
}

impl Manifest {
    pub(super) fn from_toml_path(toml_path: &camino::Utf8Path) -> Result<Self> {
        let contents = std::fs::read(toml_path.as_str())?;
        let toml_contents = String::from_utf8_lossy(&contents);
        Ok(toml::from_str(&toml_contents)?)
    }
}


================================================
FILE: phd-tests/framework/src/artifacts/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Support for working with files consumed by PHD test runs.

use serde::{Deserialize, Serialize};
use std::time::Duration;

pub mod buildomat;
mod manifest;
mod store;

pub use store::Store as ArtifactStore;

/// Reserved artifact-store key under which a Propolis server supplied as a
/// local command is registered.
pub const DEFAULT_PROPOLIS_ARTIFACT: &str = "__DEFAULT_PROPOLIS";
/// Reserved artifact-store key under which the Crucible downstairs binary is
/// registered and looked up.
pub const CRUCIBLE_DOWNSTAIRS_ARTIFACT: &str = "__DEFAULT_CRUCIBLE_DOWNSTAIRS";
/// Reserved artifact-store key under which the "base" Propolis server
/// (obtained from Buildomat or from a local command) is registered.
pub const BASE_PROPOLIS_ARTIFACT: &str = "__BASE_PROPOLIS";

/// What an artifact is used for. The artifact store checks this when an
/// artifact is requested, so that (for example) a bootrom cannot be used where
/// a guest OS image is expected.
///
/// Variant names are (de)serialized in `snake_case`, e.g. `bootrom` or
/// `guest_os`.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
#[serde(rename_all = "snake_case")]
enum ArtifactKind {
    /// A guest OS disk image, along with the kind of guest OS it contains.
    GuestOs(crate::guest_os::GuestOsKind),
    /// A guest bootrom image.
    Bootrom,
    /// A Propolis server binary.
    PropolisServer,
    /// A Crucible downstairs binary.
    CrucibleDownstairs,
}

/// Where an artifact can be obtained from.
///
/// Variant names are (de)serialized in `snake_case`, e.g. `remote_server`.
#[derive(Clone, Debug, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
enum ArtifactSource {
    /// Get the artifact from Buildomat. This downloads from
    /// https://buildomat.eng.oxide.computer/public/file/REPO/SERIES/COMMIT.
    Buildomat(buildomat::BuildomatArtifact),

    /// Get the artifact from the manifest's list of remote artifact servers.
    /// `sha256` is the expected hex-encoded SHA-256 digest of the file.
    RemoteServer { sha256: String },

    /// Get the artifact from the local file system. `path` is the directory
    /// containing the artifact file; `sha256`, if present, is the expected
    /// hex-encoded SHA-256 digest of the file. When it is absent, the file's
    /// contents are not verified.
    LocalPath { path: camino::Utf8PathBuf, sha256: Option<String> },
}

/// An individual artifact: a named file of a particular kind, together with
/// instructions for obtaining it.
#[derive(Clone, Debug, Serialize, Deserialize)]
struct Artifact {
    /// The artifact file's name. When reacquiring an artifact from its source,
    /// this filename is appended to the URI generated from that source.
    filename: camino::Utf8PathBuf,

    /// The kind of artifact this is.
    kind: ArtifactKind,

    /// The source to use to obtain this artifact if it's not present on the
    /// host system.
    source: ArtifactSource,

    /// If present, this artifact is a tarball, and this is the path *within
    /// the tarball* of the file that should be extracted from it. The
    /// extracted file, rather than the tarball, is what the artifact store
    /// hands out.
    untar: Option<camino::Utf8PathBuf>,
}

/// Settings that govern how artifacts are downloaded.
#[derive(Debug)]
struct DownloadConfig {
    /// Timeout applied to the HTTP client used for Buildomat downloads.
    // NOTE(review): presumably also applies to remote-server downloads;
    // confirm against the remote download implementation.
    timeout: Duration,
    /// Retry backoff settings used when downloading files from Buildomat.
    ///
    /// Retries for Buildomat artifact sources are configured separately from
    /// retries for remote URI artifact sources (which we don't currently retry;
    /// but probably should). This is because we use a very long maximum
    /// duration for retries for Buildomat artifacts, as a way of waiting for an
    /// in-progress build to complete (20 minutes by default). On the other
    /// hand, we probably don't want to retry a download from S3 for 20 minutes.
    buildomat_backoff: backoff::ExponentialBackoff,
    /// The remote artifact server URIs listed in the artifact manifest.
    remote_server_uris: Vec<String>,
}


================================================
FILE: phd-tests/framework/src/artifacts/store.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use crate::{
    artifacts::{
        buildomat, manifest::Manifest, ArtifactKind, ArtifactSource,
        DownloadConfig, BASE_PROPOLIS_ARTIFACT, CRUCIBLE_DOWNSTAIRS_ARTIFACT,
        DEFAULT_PROPOLIS_ARTIFACT,
    },
    guest_os::GuestOsKind,
    BasePropolisSource,
};

use anyhow::{bail, Context};
use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
use ring::digest::{Digest, SHA256};
use std::collections::BTreeMap;
use std::fs::File;
use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write};
use std::time::Duration;
use tokio::sync::Mutex;
use tracing::{debug, info, warn};

/// An artifact tracked by the artifact store, together with the location of
/// its verified local copy once one has been established.
#[derive(Debug)]
struct StoredArtifact {
    /// The artifact's description (file name, kind, source, and so on).
    description: super::Artifact,
    /// Path to the usable local copy of the artifact. `None` until the
    /// artifact has been located (or downloaded) and accepted, after which
    /// the path is returned directly without re-checking.
    cached_path: Option<Utf8PathBuf>,
}

impl StoredArtifact {
    /// Wraps `description` in a `StoredArtifact` with no cached local path.
    /// Nothing is located or downloaded until `ensure` is called.
    fn new(description: super::Artifact) -> Self {
        Self { description, cached_path: None }
    }

    /// Makes sure a usable copy of this artifact exists on the local file
    /// system and returns the path to it.
    ///
    /// The steps, in order:
    ///
    /// 1. If a path was cached by an earlier call, return it immediately.
    /// 2. For locally-sourced artifacts, check that the file exists (and
    ///    matches its digest, if one was given) and use it in place.
    /// 3. Otherwise, look for `<local_dir>/<digest>/<filename>`. If it exists
    ///    with the expected digest, use it; if it exists with the wrong
    ///    digest, delete it.
    /// 4. Download the artifact from its source, write it to that location
    ///    and mark it read-only.
    ///
    /// For artifacts obtained via steps 3 or 4 that are tarballs, the returned
    /// path is that of the file extracted from the tarball.
    ///
    /// # Errors
    ///
    /// Fails if a locally-sourced file is missing or has the wrong digest, if
    /// the storage location exists but is not a regular file, if the download
    /// fails, or on any file system error.
    async fn ensure(
        &mut self,
        local_dir: &Utf8Path,
        downloader: &DownloadConfig,
    ) -> anyhow::Result<Utf8PathBuf> {
        // If the artifact already exists and has been verified, return the path
        // to it straightaway.
        if let Some(path) = &self.cached_path {
            debug!(%path, "Verified artifact already exists");
            return Ok(path.clone());
        }

        // If the manifest says to look for a local copy of the file, see if it
        // exists in the expected location and use it if it is.
        if let ArtifactSource::LocalPath { path, sha256 } =
            &self.description.source
        {
            let mut path = path.clone();
            path.push(self.description.filename.as_str());
            debug!(%path, ?sha256, "Examining locally-sourced artifact");

            // Local files can have a digest but aren't required to have one.
            // This facilitates the use of local build outputs whose hashes
            // frequently change. If a digest was passed, make sure it matches.
            if let Some(digest) = sha256 {
                file_hash_equals(&path, digest)?;
            } else if !path.is_file() {
                anyhow::bail!("artifact path {path} is not a file");
            }

            // The file is in the right place and has the right hash (if that
            // was checked), so mark it as cached and return the cached path.
            debug!(%path, "Locally-sourced artifact is valid, caching its path");
            self.cached_path = Some(path.clone());
            return Ok(path.clone());
        }

        // Every remaining source kind carries a mandatory digest.
        let expected_digest = match &self.description.source {
            ArtifactSource::Buildomat(ref artifact) => &artifact.sha256,
            ArtifactSource::RemoteServer { sha256 } => sha256,
            ArtifactSource::LocalPath { .. } => {
                unreachable!("local path case handled above")
            }
        };

        // See if the artifact already exists in the expected location in the
        // local artifact storage directory. If it does and it has the correct
        // digest, mark the artifact as present.
        let mut maybe_path = local_dir.to_path_buf();
        maybe_path
            .push(format!("{}/{}", expected_digest, self.description.filename));

        debug!(%maybe_path, "checking for existing copy of artifact");
        if maybe_path.is_file() {
            // Any failure to hash the file (including I/O errors) is treated
            // the same as a digest mismatch.
            if file_hash_equals(&maybe_path, expected_digest).is_ok() {
                debug!(%maybe_path,
                      "Valid artifact already exists, caching its path");
                return self.cache_path(maybe_path);
            } else {
                warn!(%maybe_path, "Existing artifact is invalid, deleting it");
                std::fs::remove_file(&maybe_path)?;
            }
        } else if maybe_path.exists() {
            anyhow::bail!(
                "artifact path {maybe_path} already exists but isn't a file"
            );
        }

        // The artifact is not in the expected place or has the wrong digest, so
        // reacquire it.
        let bytes = match &self.description.source {
            ArtifactSource::Buildomat(source) => {
                downloader
                    .download_buildomat_artifact(
                        source,
                        &self.description.filename,
                        expected_digest,
                    )
                    .await?
            }
            ArtifactSource::RemoteServer { .. } => {
                downloader
                    .download_remote_artifact(
                        &self.description.filename,
                        expected_digest,
                    )
                    .await?
            }
            ArtifactSource::LocalPath { .. } => {
                unreachable!("local path case handled above")
            }
        };

        // The artifact has been downloaded. Create the directory that will
        // hold it (if necessary) and write the file out.
        std::fs::create_dir_all(maybe_path.parent().unwrap())?;
        let mut new_file = std::fs::File::create(&maybe_path)?;
        new_file.write_all(&bytes)?;

        // Make the newly-downloaded artifact read-only to try to
        // ensure tests won't change it. Disks created from an
        // artifact can be edited to be writable.
        let mut permissions = new_file.metadata()?.permissions();
        permissions.set_readonly(true);
        new_file.set_permissions(permissions)?;

        self.cache_path(maybe_path)
    }

    fn cache_path(
        &mut self,
        mut path: Utf8PathBuf,
    ) -> anyhow::Result<Utf8PathBuf> {
        if let Some(ref untar_path) = self.description.untar {
            // This artifact is a tarball, and a file must be extracted from it.
            let filename = untar_path.file_name().ok_or_else(|| {
                anyhow::anyhow!(
                    "untar path '{untar_path}' has no file name component"
                )
            })?;
            let extracted_path = path.with_file_name(filename);

            path = if !extracted_path.exists() {
                debug!(%extracted_path, %untar_path, "Extracting artifact from tarball");

                extract_tar_gz(&path, untar_path)?
            } else {
                debug!(%extracted_path, "Artifact already extracted from tarball");
                extracted_path
            }
        };

        self.cached_path = Some(path.clone());
        Ok(path)
    }
}

/// A collection of named artifacts that can be located, downloaded and
/// verified on demand.
#[derive(Debug)]
pub struct Store {
    /// Directory under which downloaded artifacts are stored, in
    /// subdirectories named after their expected digests.
    local_dir: Utf8PathBuf,
    /// The known artifacts, keyed by name. Each entry has its own async
    /// mutex, which is held while the artifact is being located or
    /// downloaded.
    artifacts: BTreeMap<String, Mutex<StoredArtifact>>,
    /// Settings used when downloading artifacts.
    downloader: DownloadConfig,
}

impl Store {
    /// Creates an artifact store from the manifest file at `toml_path`.
    ///
    /// Artifacts that need downloading are stored under `local_dir`.
    /// `max_buildomat_wait` bounds how long downloads from Buildomat keep
    /// being retried.
    ///
    /// # Errors
    ///
    /// Fails if the manifest cannot be read or parsed.
    pub fn from_toml_path(
        local_dir: Utf8PathBuf,
        toml_path: &Utf8Path,
        max_buildomat_wait: Duration,
    ) -> anyhow::Result<Self> {
        let manifest = Manifest::from_toml_path(toml_path)?;
        Ok(Self::from_manifest(local_dir, manifest, max_buildomat_wait))
    }

    fn from_manifest(
        local_dir: Utf8PathBuf,
        manifest: Manifest,
        max_buildomat_wait: Duration,
    ) -> Self {
        let Manifest { artifacts, remote_server_uris } = manifest;
        let artifacts = artifacts
            .into_iter()
            .map(|(k, v)| (k, Mutex::new(StoredArtifact::new(v))))
            .collect();

        let buildomat_backoff = backoff::ExponentialBackoffBuilder::new()
            .with_max_elapsed_time(Some(max_buildomat_wait))
            .with_initial_interval(Duration::from_secs(1))
            .build();

        let store = Self {
            local_dir,
            artifacts,
            downloader: DownloadConfig {
                timeout: Duration::from_secs(600),
                buildomat_backoff,
                remote_server_uris,
            },
        };
        debug!(?store, "Created new artifact store from manifest");
        store
    }

    /// Registers the Propolis server binary at `propolis_server_cmd` in the
    /// store under the `DEFAULT_PROPOLIS_ARTIFACT` key.
    ///
    /// # Errors
    ///
    /// Fails if that key is already present in the store or if the path
    /// cannot be canonicalized.
    pub fn add_propolis_from_local_cmd(
        &mut self,
        propolis_server_cmd: &Utf8Path,
    ) -> anyhow::Result<()> {
        info!(%propolis_server_cmd, "Adding Propolis server from local command");
        self.add_local_artifact(
            propolis_server_cmd,
            DEFAULT_PROPOLIS_ARTIFACT,
            ArtifactKind::PropolisServer,
        )
    }

    /// Registers the Propolis server described by `source` in the store under
    /// the `BASE_PROPOLIS_ARTIFACT` key.
    ///
    /// For a local source, the given command is registered as-is. For a
    /// Buildomat branch, the branch's head commit is looked up first; for a
    /// Buildomat commit, that commit is used directly. In both Buildomat
    /// cases the artifact's expected digest is fetched now, but the artifact
    /// itself is only downloaded when it is first requested.
    ///
    /// # Errors
    ///
    /// Fails if the key is already present in the store, if the branch head
    /// or the artifact digest cannot be obtained from Buildomat, or if a
    /// local command's path cannot be canonicalized.
    pub async fn add_current_propolis(
        &mut self,
        source: crate::BasePropolisSource<'_>,
    ) -> anyhow::Result<()> {
        anyhow::ensure!(
            !self.artifacts.contains_key(BASE_PROPOLIS_ARTIFACT),
            "artifact store already contains key {BASE_PROPOLIS_ARTIFACT}",
        );

        const REPO: buildomat::Repo =
            buildomat::Repo::from_static("oxidecomputer/propolis");
        // Resolve the source to a commit; the local case registers the
        // artifact and returns early instead.
        let commit = match source {
            BasePropolisSource::BuildomatBranch(branch) => {
                info!("Adding 'current' Propolis server from Buildomat Git branch '{branch}'");
                REPO.get_branch_head(branch).await?
            }
            BasePropolisSource::BuildomatGitRev(commit) => {
                info!("Adding 'current' Propolis server from Buildomat Git commit '{commit}'");
                commit.clone()
            }
            BasePropolisSource::Local(cmd) => {
                info!("Adding 'current' Propolis server from local command '{cmd}'");
                return self.add_local_artifact(
                    cmd,
                    BASE_PROPOLIS_ARTIFACT,
                    ArtifactKind::PropolisServer,
                );
            }
        };

        // fetch the `phd_build` series, rather than the release `image` series,
        // to get a debug executable. the `phd_build` executable:
        // - contains debug assertions
        // - doesn't try to use the VMM kernel memory reservoir, which may not
        //   work nicely in the test environment
        let series = buildomat::Series::from_static("phd_build");
        let filename = Utf8PathBuf::from("propolis-server.tar.gz");
        let source = REPO
            .artifact_for_commit(series, commit, &filename, &self.downloader)
            .await?;
        // The published file is a tarball; the server binary is extracted
        // from it when the artifact is first used.
        let artifact = super::Artifact {
            filename,
            kind: ArtifactKind::PropolisServer,
            source: ArtifactSource::Buildomat(source),
            untar: Some("propolis-server".into()),
        };

        let _old = self.artifacts.insert(
            BASE_PROPOLIS_ARTIFACT.to_string(),
            Mutex::new(StoredArtifact::new(artifact)),
        );
        // The key's absence was checked at the top of this function.
        assert!(_old.is_none());
        Ok(())
    }

    /// Registers the Crucible downstairs binary described by `source` in the
    /// store under the `CRUCIBLE_DOWNSTAIRS_ARTIFACT` key.
    ///
    /// For a local source, the given command is registered as-is. For a
    /// Buildomat commit, the artifact's expected digest is fetched now, but
    /// the artifact itself is only downloaded when it is first requested.
    ///
    /// # Errors
    ///
    /// Fails if the key is already present in the store, if the artifact
    /// digest cannot be obtained from Buildomat, or if a local command's path
    /// cannot be canonicalized.
    pub async fn add_crucible_downstairs(
        &mut self,
        source: &crate::CrucibleDownstairsSource,
    ) -> anyhow::Result<()> {
        anyhow::ensure!(
            !self.artifacts.contains_key(CRUCIBLE_DOWNSTAIRS_ARTIFACT),
            "artifact store already contains key {CRUCIBLE_DOWNSTAIRS_ARTIFACT}",
        );

        match source {
            crate::CrucibleDownstairsSource::Local(
                ref crucible_downstairs_cmd,
            ) => {
                info!(%crucible_downstairs_cmd, "Adding crucible-downstairs from local command");
                self.add_local_artifact(
                    crucible_downstairs_cmd,
                    CRUCIBLE_DOWNSTAIRS_ARTIFACT,
                    ArtifactKind::CrucibleDownstairs,
                )
            }
            crate::CrucibleDownstairsSource::BuildomatGitRev(ref commit) => {
                info!(%commit, "Adding crucible-downstairs from Buildomat Git revision");
                let repo =
                    buildomat::Repo::from_static("oxidecomputer/crucible");
                let series = buildomat::Series::from_static("nightly-image");
                let filename = Utf8PathBuf::from("crucible-nightly.tar.gz");
                let artifact = repo
                    .artifact_for_commit(
                        series,
                        commit.clone(),
                        &filename,
                        &self.downloader,
                    )
                    .await?;

                // The published file is a tarball; the downstairs binary at
                // `target/release/crucible-downstairs` is extracted from it
                // when the artifact is first used.
                let artifact = super::Artifact {
                    filename,
                    kind: ArtifactKind::CrucibleDownstairs,
                    source: ArtifactSource::Buildomat(artifact),
                    untar: Some(
                        ["target", "release", "crucible-downstairs"]
                            .iter()
                            .collect::<Utf8PathBuf>(),
                    ),
                };

                let _old = self.artifacts.insert(
                    CRUCIBLE_DOWNSTAIRS_ARTIFACT.to_string(),
                    Mutex::new(StoredArtifact::new(artifact)),
                );
                // The key's absence was checked at the top of this function.
                assert!(_old.is_none());
                Ok(())
            }
        }
    }

    /// Returns the local path of the guest OS image registered as
    /// `artifact_name`, along with the kind of guest OS it contains,
    /// obtaining the image first if necessary.
    ///
    /// The artifact's lock is held for the duration of the call.
    ///
    /// # Errors
    ///
    /// Fails if no such artifact exists, if it is not a guest OS image, or if
    /// it cannot be obtained.
    pub async fn get_guest_os_image(
        &self,
        artifact_name: &str,
    ) -> anyhow::Result<(Utf8PathBuf, GuestOsKind)> {
        let mut guard = self.get_artifact(artifact_name)?.lock().await;

        // Reject artifacts of the wrong kind before doing any I/O.
        let kind = match guard.description.kind {
            super::ArtifactKind::GuestOs(kind) => kind,
            _ => {
                return Err(anyhow::anyhow!(
                    "artifact {artifact_name} is not a guest OS image"
                ));
            }
        };

        let path = guard.ensure(&self.local_dir, &self.downloader).await?;
        Ok((path, kind))
    }

    /// Returns the local path of the bootrom registered as `artifact_name`,
    /// obtaining it first if necessary.
    ///
    /// The artifact's lock is held for the duration of the call.
    ///
    /// # Errors
    ///
    /// Fails if no such artifact exists, if it is not a bootrom, or if it
    /// cannot be obtained.
    pub async fn get_bootrom(
        &self,
        artifact_name: &str,
    ) -> anyhow::Result<Utf8PathBuf> {
        let mut guard = self.get_artifact(artifact_name)?.lock().await;

        // Reject artifacts of the wrong kind before doing any I/O.
        if !matches!(guard.description.kind, super::ArtifactKind::Bootrom) {
            return Err(anyhow::anyhow!(
                "artifact {artifact_name} is not a bootrom"
            ));
        }

        guard.ensure(&self.local_dir, &self.downloader).await
    }

    /// Returns the local path of the Propolis server binary registered as
    /// `artifact_name`, obtaining it first if necessary.
    ///
    /// The artifact's lock is held for the duration of the call.
    ///
    /// # Errors
    ///
    /// Fails if no such artifact exists, if it is not a Propolis server, or
    /// if it cannot be obtained.
    pub async fn get_propolis_server(
        &self,
        artifact_name: &str,
    ) -> anyhow::Result<Utf8PathBuf> {
        let mut guard = self.get_artifact(artifact_name)?.lock().await;

        // Reject artifacts of the wrong kind before doing any I/O.
        if !matches!(
            guard.description.kind,
            super::ArtifactKind::PropolisServer
        ) {
            return Err(anyhow::anyhow!(
                "artifact {artifact_name} is not a Propolis server"
            ));
        }

        guard.ensure(&self.local_dir, &self.downloader).await
    }

    /// Returns the local path of the Crucible downstairs binary registered
    /// under `CRUCIBLE_DOWNSTAIRS_ARTIFACT`, obtaining it first if necessary.
    ///
    /// The artifact's lock is held for the duration of the call.
    ///
    /// # Errors
    ///
    /// Fails if no such artifact has been registered, if it is not a Crucible
    /// downstairs binary, or if it cannot be obtained.
    pub async fn get_crucible_downstairs(&self) -> anyhow::Result<Utf8PathBuf> {
        let mut guard =
            self.get_artifact(CRUCIBLE_DOWNSTAIRS_ARTIFACT)?.lock().await;

        // Reject artifacts of the wrong kind before doing any I/O.
        if !matches!(
            guard.description.kind,
            super::ArtifactKind::CrucibleDownstairs
        ) {
            return Err(anyhow::anyhow!(
                "artifact {CRUCIBLE_DOWNSTAIRS_ARTIFACT} is not a Crucible downstairs binary",
            ));
        }

        guard.ensure(&self.local_dir, &self.downloader).await
    }

    /// Looks up the store entry registered as `name`, failing with a
    /// descriptive error if there is none.
    fn get_artifact(
        &self,
        name: &str,
    ) -> anyhow::Result<&Mutex<StoredArtifact>> {
        match self.artifacts.get(name) {
            Some(entry) => Ok(entry),
            None => Err(anyhow::anyhow!("artifact {name} not found in store")),
        }
    }

    /// Registers the file at `cmd` as a locally-sourced artifact of the given
    /// `kind` under the key `artifact_name`.
    ///
    /// The path is canonicalized and split into a directory and a file name.
    /// No digest is recorded, so the file's contents are not verified when
    /// the artifact is later used.
    ///
    /// # Errors
    ///
    /// Fails if `artifact_name` is already present in the store, if `cmd`
    /// cannot be canonicalized (for example, because it does not exist), or
    /// if the canonical path lacks a file name or a parent directory.
    fn add_local_artifact(
        &mut self,
        cmd: &Utf8Path,
        artifact_name: &str,
        kind: super::ArtifactKind,
    ) -> anyhow::Result<()> {
        if self.artifacts.contains_key(artifact_name) {
            anyhow::bail!(
                "artifact store already contains key {artifact_name:?}"
            );
        }

        let full_path = cmd.canonicalize_utf8()?;
        let filename = full_path.file_name().ok_or_else(|| {
            anyhow::anyhow!(
                "local artifact command '{cmd}' contains no file component"
            )
        })?;
        let dir = full_path.parent().ok_or_else(|| {
            anyhow::anyhow!(
                "canonicalized local artifact path '{full_path}' has no directory component"
            )
        })?;

        // A local-path source stores the containing directory and the file
        // name separately; they are joined again when the artifact is used.
        let artifact = super::Artifact {
            filename: Utf8PathBuf::from(filename),
            kind,
            source: super::ArtifactSource::LocalPath {
                path: dir.to_path_buf(),
                sha256: None,
            },
            untar: None,
        };

        let _old: Option<Mutex<StoredArtifact>> = self.artifacts.insert(
            artifact_name.to_string(),
            Mutex::new(StoredArtifact::new(artifact)),
        );
        // The key's absence was checked at the top of this function.
        assert!(_old.is_none());

        Ok(())
    }
}

/// Computes the SHA-256 digest of everything that can be read from `reader`,
/// starting at its current position and reading until end of input.
///
/// The input is consumed in fixed-size chunks, so it never has to be held in
/// memory all at once. Any read error is returned to the caller.
fn sha256_digest(mut reader: impl std::io::Read) -> anyhow::Result<Digest> {
    let mut hasher = ring::digest::Context::new(&SHA256);
    let mut chunk = [0u8; 1024];

    loop {
        match reader.read(&mut chunk)? {
            // A zero-length read signals end of input.
            0 => return Ok(hasher.finish()),
            len => hasher.update(&chunk[..len]),
        }
    }
}

fn file_hash_equals(
    path: impl AsRef<std::path::Path>,
    expected_digest: &str,
) -> anyhow::Result<()> {
    let file = File::open(&path).with_context(|| {
        format!("checking hash for file {}", path.as_ref().display())
    })?;

    let mut reader = BufReader::new(file);
    hash_equals(&mut reader, expected_digest)
}

/// Checks that the SHA-256 digest of the full contents of `bytes` matches
/// `expected_digest`, a hex-encoded digest.
///
/// The reader is rewound to its start before hashing, so its position on
/// entry does not matter. The comparison ignores ASCII case: hex digests
/// written in uppercase or lowercase denote the same value, and the digest
/// computed here is always lowercase.
///
/// # Errors
///
/// Fails if seeking or reading fails, or if the digests differ.
fn hash_equals<R: Read + Seek>(
    mut bytes: R,
    expected_digest: &str,
) -> anyhow::Result<()> {
    bytes.seek(SeekFrom::Start(0))?;
    let digest = hex::encode(sha256_digest(&mut bytes)?.as_ref());
    if !digest.eq_ignore_ascii_case(expected_digest) {
        bail!("Digest was {digest}, expected {expected_digest}");
    }

    Ok(())
}

/// Extracts the file at `bin_path` (a path inside the archive) from the
/// tarball at `tarball_path`, placing it in the tarball's own directory, and
/// returns the path of the extracted file.
///
/// Despite the name, both gzip-compressed tarballs (extension `gz`) and
/// uncompressed tarballs (extension `tar`) are accepted; the format is chosen
/// purely from the file extension.
///
/// # Errors
///
/// Fails if `tarball_path` has no parent directory, has an unrecognized
/// extension, cannot be opened, or does not contain `bin_path`. Every failure
/// is wrapped with context naming both paths.
fn extract_tar_gz(
    tarball_path: &Utf8Path,
    bin_path: &Utf8Path,
) -> anyhow::Result<Utf8PathBuf> {
    let extract = || -> anyhow::Result<Utf8PathBuf> {
        let dir_path =
            tarball_path.parent().context("Tarball path missing parent")?;

        match tarball_path.extension() {
            Some("gz") => {
                tracing::debug!("Extracting gzipped tarball...");
                let gz =
                    flate2::read::GzDecoder::new(File::open(tarball_path)?);
                extract_tarball(bin_path, dir_path, gz)
            }
            Some("tar") => {
                tracing::debug!("Extracting tarball...");
                extract_tarball(bin_path, dir_path, File::open(tarball_path)?)
            }
            _ => bail!("File '{tarball_path}' is (probably) not a tarball?"),
        }
    };

    extract().with_context(|| {
        format!(
            "Failed to extract file '{bin_path}' from tarball '{tarball_path}'"
        )
    })
}

/// Scans the tar archive read from `file` for an entry whose path equals
/// `bin_path` and unpacks it into `dir_path`, keeping only the final
/// component of `bin_path` as the output file name.
///
/// Entries that cannot be read are logged and skipped.
///
/// # Errors
///
/// Returns an error if the archive's entries cannot be iterated, if an
/// entry's path cannot be obtained, if `bin_path` has no file name component,
/// if unpacking the matching entry fails, or if no entry matches `bin_path`.
fn extract_tarball(
    bin_path: &Utf8Path,
    dir_path: &Utf8Path,
    file: impl std::io::Read,
) -> anyhow::Result<Utf8PathBuf> {
    let mut archive = tar::Archive::new(file);

    let entries =
        archive.entries().context("Failed to iterate over tarball entries")?;
    for entry in entries {
        let mut entry = match entry {
            Ok(e) => e,
            Err(error) => {
                warn!(%error, "skipping bad tarball entry");
                continue;
            }
        };
        let path = entry.path().context("Tarball entry path was not UTF-8")?;
        if path == bin_path {
            // `bin_path` comes from configuration, so a path without a file
            // name is reported as an error rather than a panic.
            let filename = bin_path.file_name().with_context(|| {
                format!(
                    "binary path '{bin_path}' in tarball must include a \
                    filename"
                )
            })?;
            let out_path = dir_path.join(filename);
            entry.unpack(&out_path).with_context(|| {
                format!(
                    "Failed to unpack '{bin_path}' from tarball to {out_path}"
                )
            })?;
            return Ok(out_path);
        }
    }

    Err(anyhow::anyhow!("No file named '{bin_path}' found in tarball"))
}

impl DownloadConfig {
    /// Download the artifact named `filename` from the provided Buildomat
    /// `source`.
    ///
    /// This method will retry the download if Buildomat returns an error that
    /// indicates a file does not yet exist, for up to the configurable maximum
    /// retry duration. This retry logic serves as a mechanism for PHD to wait
    /// for an artifact we expect to exist to be published, when the build that
    /// publishes that artifact is still in progress.
    async fn download_buildomat_artifact(
        &self,
        source: &buildomat::BuildomatArtifact,
        filename: &Utf8Path,
        expected_digest: &str,
    ) -> anyhow::Result<bytes::Bytes> {
        let bytes = self.download_buildomat_uri(&source.uri(filename)).await?;
        hash_equals(Cursor::new(bytes.as_ref()), expected_digest)?;
        Ok(bytes)
    }

    /// Download the artifact named `filename` from one of the configured
    /// `remote_server_uris`.
    ///
    /// If downloading from one remote URI fails (such as due to a network
    /// error, the server returning a non-200 HTTP status code, or a hash
    /// mismatch), this method will try the next remote URI in the list until
    /// the file has been downloaded successfully or all remote server URIs have
    /// been tried.
    async fn download_remote_artifact(
        &self,
        filename: &Utf8Path,
        expected_digest: &str,
    ) -> anyhow::Result<Bytes> {
        anyhow::ensure!(
            !self.remote_server_uris.is_empty(),
            "can't acquire artifact {filename} from remote server with no \
            remote URIs"
        );

        let client =
            reqwest::ClientBuilder::new().timeout(self.timeout).build()?;

        for remote in &self.remote_server_uris {
            let uri = format!("{remote}/{filename}");
            debug!(timeout = ?self.timeout, "Downloading {filename} from {uri}");

            let request = client.get(&uri).build()?;
            let response = match client.execute(request).await {
                Ok(resp) => resp,
                Err(e) => {
                    warn!(?e, uri, "Error obtaining artifact from source");
                    continue;
                }
            };

            if !response.status().is_success() {
                warn!(status = %response.status(),
                      ?response,
                      "HTTP error downloading {filename} from {uri}");
                continue;
            }

            let bytes = response.bytes().await?;
            if let Err(error) =
                hash_equals(Cursor::new(bytes.as_ref()), expected_digest)
            {
                warn!(%error, "Hash mismatch downloading {filename} from {uri}");
                continue;
            } else {
                return Ok(bytes);
            }
        }

        Err(anyhow::anyhow!(
            "Failed to download {filename} from any remote URI",
        ))
    }
}


================================================
FILE: phd-tests/framework/src/disk/crucible.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Abstractions for Crucible-backed disks.

use std::{
    net::{Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6},
    path::{Path, PathBuf},
    process::Stdio,
    sync::Mutex,
};

use anyhow::Context;
use propolis_client::{
    instance_spec::{Component, CrucibleStorageBackend},
    CrucibleOpts, VolumeConstructionRequest,
};
use rand::{rngs::StdRng, RngCore, SeedableRng};
use tracing::{error, info};
use uuid::Uuid;

use super::BlockSize;
use crate::{disk::DeviceName, guest_os::GuestOsKind, log_config::LogConfig};

/// An RAII wrapper around a directory containing Crucible data files. Deletes
/// the directory and its contents when dropped.
#[derive(Debug)]
struct DataDirectory {
    /// The directory that is recursively removed when this wrapper is
    /// dropped.
    path: PathBuf,
}

impl Drop for DataDirectory {
    /// Recursively removes the wrapped directory. A failure to remove it is
    /// logged and otherwise ignored.
    fn drop(&mut self) {
        info!(?self.path, "Deleting Crucible downstairs data directory");

        let Err(e) = std::fs::remove_dir_all(&self.path) else {
            return;
        };

        error!(?e, ?self.path, "Failed to delete Crucible downstairs data");
    }
}

/// An RAII wrapper around a Crucible downstairs process. Stops the process and
/// deletes the downstairs' data when dropped.
// NOTE(review): `dead_code` is presumably allowed because `data_dir` is never
// read and is held only so that its `Drop` impl runs -- confirm.
#[allow(dead_code)]
#[derive(Debug)]
struct Downstairs {
    /// Handle to the running downstairs process; killed and reaped when this
    /// wrapper is dropped.
    process_handle: std::process::Child,

    /// The address on which this downstairs is serving its API.
    address: SocketAddr,

    /// The address to insert as a connection target when constructing a VCR
    /// that refers to this downstairs. If `None`, the downstairs's API address
    /// is used instead.
    vcr_address_override: Option<SocketAddr>,

    /// The directory holding this downstairs' region data. Its own `Drop`
    /// impl removes the directory.
    data_dir: DataDirectory,
}

impl Downstairs {
    /// Yields the address that VCRs should use to reach this downstairs: the
    /// override if one is set, otherwise the address the downstairs is
    /// actually serving on.
    fn vcr_address(&self) -> SocketAddr {
        let Self { address, vcr_address_override, .. } = self;
        vcr_address_override.unwrap_or(*address)
    }
}

impl Drop for Downstairs {
    /// Kills the downstairs process and then waits for it to exit.
    fn drop(&mut self) {
        info!(?self, "Stopping Crucible downstairs process");

        // Both steps are best-effort: their results are deliberately
        // discarded.
        let child = &mut self.process_handle;
        let _ = child.kill();
        let _ = child.wait();
    }
}

/// An RAII wrapper around a Crucible disk.
#[derive(Debug)]
pub struct CrucibleDisk {
    /// The name reported for this disk through `DiskConfig::device_name`.
    device_name: DeviceName,

    /// A random ID generated when the disk is constructed; used as the ID in
    /// the volume construction requests produced for this disk.
    disk_id: Uuid,

    /// The guest OS kind reported through `DiskConfig::guest_os`, if any.
    guest_os: Option<GuestOsKind>,

    /// The disk's mutable state, kept behind a mutex so that it can be
    /// changed through a shared reference.
    inner: Mutex<Inner>,
}

impl CrucibleDisk {
    /// Creates a new Crucible disk with a freshly generated disk ID. All of
    /// the region and downstairs setup is delegated to `Inner::new`, and any
    /// error it returns is passed through.
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        device_name: DeviceName,
        min_disk_size_gib: u64,
        block_size: BlockSize,
        downstairs_binary_path: &impl AsRef<std::ffi::OsStr>,
        downstairs_ports: &[u16],
        data_dir_root: &impl AsRef<Path>,
        read_only_parent: Option<&impl AsRef<Path>>,
        guest_os: Option<GuestOsKind>,
        log_config: LogConfig,
        output_dir: &impl AsRef<Path>,
    ) -> anyhow::Result<Self> {
        let disk_id = Uuid::new_v4();
        let inner = Inner::new(
            min_disk_size_gib,
            block_size,
            downstairs_binary_path,
            downstairs_ports,
            data_dir_root,
            read_only_parent,
            log_config,
            output_dir,
        )?;

        Ok(Self { device_name, disk_id, guest_os, inner: Mutex::new(inner) })
    }

    /// Obtains the current volume construction request for this disk.
    pub fn vcr(&self) -> VolumeConstructionRequest {
        let inner = self.inner.lock().unwrap();
        inner.vcr(self.disk_id)
    }

    /// Sets the generation number to use in subsequent calls to create a
    /// backend spec for this disk.
    pub fn set_generation(&self, generation: u64) {
        let mut inner = self.inner.lock().unwrap();
        inner.generation = generation;
    }

    /// Changes this disk's downstairs configuration so that the returned IP
    /// address of the first downstairs is an IPv6 black hole instead of its
    /// actual address. This will prevent VMs from activating this disk until
    /// the VCR is replaced with one bearing the correct IP address.
    pub fn enable_vcr_black_hole(&self) {
        info!(disk = self.device_name.as_str(), "enabling vcr black hole");

        // 100::/64 is the IPv6 discard prefix (per RFC 6666).
        let discard_ip = Ipv6Addr::new(0x100, 0, 0, 0, 0, 0, 0, 0);
        let black_hole = SocketAddr::from(SocketAddrV6::new(discard_ip, 9000, 0, 0));

        // Crucible rejects VCR replacement requests that change more than one
        // downstairs address, so just invalidate the first downstairs address.
        // This ensures that if the black hole is disabled, subsequent VCRs are
        // valid replacements for the ones produced while the black hole was
        // enabled.
        let mut inner = self.inner.lock().unwrap();
        let first_downstairs = &mut inner.downstairs_instances[0];
        first_downstairs.vcr_address_override = Some(black_hole);
    }

    /// Ensures that this disk's downstairs configuration will return the
    /// correct addresses for all its downstairs instances.
    pub fn disable_vcr_black_hole(&self) {
        info!(disk = self.device_name.as_str(), "disabling vcr black hole");

        // Only the first downstairs is ever given an override, so clearing it
        // restores every real address.
        let mut inner = self.inner.lock().unwrap();
        let first_downstairs = &mut inner.downstairs_instances[0];
        first_downstairs.vcr_address_override = None;
    }
}

impl super::DiskConfig for CrucibleDisk {
    /// Yields the device name this disk was created with.
    fn device_name(&self) -> &DeviceName {
        &self.device_name
    }

    /// Builds a backend spec from the disk's current state, using this disk's
    /// ID.
    fn backend_spec(&self) -> Component {
        let inner = self.inner.lock().unwrap();
        inner.backend_spec(self.disk_id)
    }

    /// Yields the guest OS kind this disk was created with, if any.
    fn guest_os(&self) -> Option<GuestOsKind> {
        self.guest_os
    }

    /// Always yields this disk.
    fn as_crucible(&self) -> Option<&CrucibleDisk> {
        Some(self)
    }
}

/// The state of a [`CrucibleDisk`] that is kept behind the disk's mutex.
#[derive(Debug)]
struct Inner {
    /// The disk's block size.
    block_size: BlockSize,

    /// The number of blocks in this disk's region's extents.
    blocks_per_extent: u64,

    /// The number of extents in each of the disk's regions.
    extent_count: u32,

    /// The collection of downstairs process wrappers for this disk.
    downstairs_instances: Vec<Downstairs>,

    /// An optional path to a file to use as a read-only parent for this disk.
    read_only_parent: Option<PathBuf>,

    /// The base64-encoded encryption key to use for this disk. Generated from
    /// 32 random bytes when the disk is constructed.
    encryption_key: String,

    /// The generation number to insert into this disk's
    /// `VolumeConstructionRequest`s. Starts at 1.
    generation: u64,
}

impl Inner {
    /// Constructs a new Crucible disk that stores its files in per-downstairs
    /// subdirectories of the supplied `data_dir_root`.
    ///
    /// For each port in `downstairs_ports` this creates a data directory,
    /// runs the downstairs binary with the `create` subcommand to create a
    /// region in it, and then launches the downstairs binary with the `run`
    /// subcommand listening on `127.0.0.1:<port>`.
    ///
    /// If `read_only_parent` is supplied, the disk is sized to at least the
    /// parent file's size (rounded up to a whole GiB), even if that exceeds
    /// `min_disk_size_gib`.
    ///
    /// # Errors
    ///
    /// Returns an error if the read-only parent's metadata cannot be read, if
    /// the requested size cannot be represented as a byte count or an extent
    /// count, if a data directory cannot be created, if a downstairs process
    /// cannot be spawned, or if region creation exits unsuccessfully.
    ///
    /// # Panics
    ///
    /// Panics if the extent size (64 MiB) is not strictly larger than, and a
    /// multiple of, the block size.
    #[allow(clippy::too_many_arguments)]
    pub(crate) fn new(
        min_disk_size_gib: u64,
        block_size: BlockSize,
        downstairs_binary_path: &impl AsRef<std::ffi::OsStr>,
        downstairs_ports: &[u16],
        data_dir_root: &impl AsRef<Path>,
        read_only_parent: Option<&impl AsRef<Path>>,
        log_config: LogConfig,
        output_dir: &impl AsRef<Path>,
    ) -> anyhow::Result<Self> {
        // To create a region, Crucible requires a block size, an extent size
        // given as a number of blocks, and an extent count. Compute the latter
        // two figures here. The extent size is semi-arbitrarily chosen to be 64
        // MiB (to match Omicron's extent size at the time this module was
        // authored); this can be parameterized later if needed.
        const EXTENT_SIZE: u64 = 64 << 20;
        const GIBIBYTE: u64 = 1 << 30;

        assert!(EXTENT_SIZE > block_size.bytes());
        assert!(EXTENT_SIZE.is_multiple_of(block_size.bytes()));

        let disk_size_gib = match read_only_parent {
            // If there's a read-only parent, ensure the disk is large enough to
            // fit the entire parent, even if its size exceeds the minimum
            // requested disk size.
            Some(parent) => {
                let path = parent.as_ref();
                let meta = std::fs::metadata(path).with_context(|| {
                    format!(
                        "Failed to get fs metadata for read-only parent '{}'",
                        path.display()
                    )
                })?;
                let parent_bytes = meta.len();
                // Round up to a whole number of GiB so that a parent whose
                // size is not evenly divisible by 1 GiB is not truncated.
                let parent_gib = parent_bytes.div_ceil(GIBIBYTE);

                if parent_gib > min_disk_size_gib {
                    info!(
                        parent.size_bytes = parent_bytes,
                        parent.size_gib = parent_gib,
                        min.size_gib = min_disk_size_gib,
                        "Increasing minimum disk size to ensure read-only \
                        parent is not truncated",
                    );
                    parent_gib
                } else {
                    min_disk_size_gib
                }
            }
            // If no read-only parent is specified, just use the minimum
            // requested disk size.
            None => min_disk_size_gib,
        };

        let blocks_per_extent: u64 = EXTENT_SIZE / block_size.bytes();

        // Validate the size arithmetic before touching the filesystem or
        // spawning anything: the extent count handed to the region-creation
        // command must be exactly the one later placed in VCRs, so it must
        // neither overflow nor be truncated.
        let disk_size_bytes =
            disk_size_gib.checked_mul(GIBIBYTE).with_context(|| {
                format!(
                    "disk size of {disk_size_gib} GiB does not fit in a \
                    64-bit byte count"
                )
            })?;
        let extents_in_disk = disk_size_bytes / EXTENT_SIZE;
        let extent_count =
            u32::try_from(extents_in_disk).with_context(|| {
                format!(
                    "disk size of {disk_size_gib} GiB requires \
                    {extents_in_disk} extents, which does not fit in a \
                    32-bit extent count"
                )
            })?;

        // Create the region directories for each region.
        let mut data_dirs = vec![];
        let disk_uuid = Uuid::new_v4();
        for port in downstairs_ports {
            let mut data_dir_path = data_dir_root.as_ref().to_path_buf();
            data_dir_path.push(format!("{disk_uuid}_{port}"));
            std::fs::create_dir_all(&data_dir_path)?;
            data_dirs.push(DataDirectory { path: data_dir_path });
        }

        for dir in &data_dirs {
            let dir_arg = dir.path.to_string_lossy();
            let crucible_args = [
                "create",
                "--block-size",
                &block_size.bytes().to_string(),
                "--data",
                dir_arg.as_ref(),
                "--encrypted",
                "--uuid",
                &disk_uuid.to_string(),
                "--extent-size",
                &blocks_per_extent.to_string(),
                "--extent-count",
                &extent_count.to_string(),
            ];

            // This is a transient process, so just pipe stdout/stderr back into
            // the framework and trace the outputs instead of setting up full
            // log files.
            let create_stdout = Stdio::piped();
            let create_stderr = Stdio::piped();
            let create_proc = run_crucible_downstairs(
                &downstairs_binary_path,
                &crucible_args,
                create_stdout,
                create_stderr,
            )?;

            let create_output = create_proc.wait_with_output()?;

            // The captured output is only logged, so decode it lossily: stray
            // non-UTF-8 bytes must not fail disk creation or hide the exit
            // status check below.
            let stdout_text = String::from_utf8_lossy(&create_output.stdout);
            let stderr_text = String::from_utf8_lossy(&create_output.stderr);
            info!(
                stdout = &*stdout_text,
                stderr = &*stderr_text,
                status = ?create_output.status,
                "Created Crucible region using crucible-downstairs"
            );

            if !create_output.status.success() {
                anyhow::bail!(
                    "Crucible region creation failed with exit code {:?}",
                    create_output.status.code()
                );
            }
        }

        // Spawn the downstairs processes that will serve requests from guest
        // VMs.
        let mut downstairs_instances = vec![];
        for (port, dir) in downstairs_ports.iter().zip(data_dirs.into_iter()) {
            let addr = SocketAddrV4::new(Ipv4Addr::new(127, 0, 0, 1), *port);
            let dir_arg = dir.path.to_string_lossy();
            let crucible_args = [
                "run",
                "--address",
                &addr.ip().to_string(),
                "--port",
                &addr.port().to_string(),
                "--data",
                dir_arg.as_ref(),
            ];

            // NOTE: `log_format` is ignored here because Crucible determines
            // Bunyan or plain formatting based on `atty::is()`. In practice
            // this is fine, and matches what we want right now, but it might be
            // nice to connect this more directly to the output desire expressed
            // by the test runner.
            let (stdout, stderr) = log_config.output_mode.get_handles(
                output_dir,
                &format!("crucible_{disk_uuid}_{port}"),
            )?;

            info!(?crucible_args, "Launching Crucible downstairs server");
            let downstairs = Downstairs {
                process_handle: run_crucible_downstairs(
                    &downstairs_binary_path,
                    &crucible_args,
                    stdout,
                    stderr,
                )?,
                address: SocketAddr::V4(addr),
                vcr_address_override: None,
                data_dir: dir,
            };

            downstairs_instances.push(downstairs);
        }

        Ok(Self {
            block_size,
            blocks_per_extent,
            extent_count,
            downstairs_instances,
            read_only_parent: read_only_parent
                .map(|p| p.as_ref().to_path_buf()),
            encryption_key: base64::Engine::encode(
                &base64::engine::general_purpose::STANDARD,
                {
                    let mut bytes: [u8; 32] = [0; 32];
                    StdRng::from_os_rng().fill_bytes(&mut bytes);
                    bytes
                },
            ),
            generation: 1,
        })
    }

    /// Builds a Crucible storage backend component whose request is the
    /// JSON-serialized form of the disk's current VCR.
    ///
    /// # Panics
    ///
    /// Panics if the VCR cannot be serialized to JSON.
    fn backend_spec(&self, disk_id: Uuid) -> Component {
        let vcr = self.vcr(disk_id);

        Component::CrucibleStorageBackend(CrucibleStorageBackend {
            request_json: serde_json::to_string(&vcr)
                .expect("VolumeConstructionRequest should serialize"),
            readonly: false,
        })
    }

    /// Builds a volume construction request for this disk with the supplied
    /// `disk_id`.
    ///
    /// The volume has a single region sub-volume whose targets are the VCR
    /// addresses of every downstairs (so an address override is reflected
    /// here). If a read-only parent was configured, it is attached as a
    /// file-backed volume that receives a fresh random ID on every call.
    fn vcr(&self, disk_id: Uuid) -> VolumeConstructionRequest {
        let downstairs_addrs = self
            .downstairs_instances
            .iter()
            .map(Downstairs::vcr_address)
            .collect();

        VolumeConstructionRequest::Volume {
            id: disk_id,
            block_size: self.block_size.bytes(),
            sub_volumes: vec![VolumeConstructionRequest::Region {
                block_size: self.block_size.bytes(),
                blocks_per_extent: self.blocks_per_extent,
                extent_count: self.extent_count,
                opts: CrucibleOpts {
                    id: disk_id,
                    target: downstairs_addrs,
                    lossy: false,
                    flush_timeout: None,
                    key: Some(self.encryption_key.clone()),
                    cert_pem: None,
                    key_pem: None,
                    root_cert_pem: None,
                    control: None,
                    read_only: false,
                },
                generation: self.generation,
            }],
            read_only_parent: self.read_only_parent.as_ref().map(|p| {
                Box::new(VolumeConstructionRequest::File {
                    id: Uuid::new_v4(),
                    block_size: self.block_size.bytes(),
                    path: p.to_string_lossy().to_string(),
                })
            }),
        }
    }
}

fn run_crucible_downstairs(
    binary_path: &impl AsRef<std::ffi::OsStr>,
    args: &[&str],
    stdout: impl Into<Stdio>,
    stderr: impl Into<Stdio>,
) -> anyhow::Result<std::process::Child> {
    info!(?args, "Running crucible-downstairs");
    let process_handle = std::process::Command::new(binary_path)
        .args(args)
        .stdout(stdout)
        .stderr(stderr)
        .spawn()?;

    Ok(process_handle)
}


================================================
FILE: phd-tests/framework/src/disk/fat.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Tools for creating a FAT volume that can be used as the contents of a VM
//! disk.

use std::io::{Cursor, Write};

use anyhow::Context;
use fatfs::{FileSystem, FormatVolumeOptions, FsOptions};
use newtype_derive::*;
use thiserror::Error;

/// The maximum size of disk this module can create. This is fixed to put an
/// upper bound on the overhead the FAT filesystem requires.
///
/// This value must be less than or equal to 2,091,008 (4084 * 512). See the
/// docs for [`overhead_sectors`].
const MAX_DISK_SIZE_BYTES: usize = 1 << 20;

/// The size of a sector in this module's produced volumes.
///
/// This module assumes that each FAT cluster is one sector in size.
const BYTES_PER_SECTOR: usize = 512;

/// The size of a directory entry, given by the FAT specification.
const BYTES_PER_DIRECTORY_ENTRY: usize = 32;

/// The number of directory entries in the filesystem's root directory region.
const ROOT_DIRECTORY_ENTRIES: usize = 512;

/// The number of allocation tables that can be found in each FAT volume. This
/// is the default value used by the `fatfs` crate.
const TABLES_PER_VOLUME: usize = 2;

// Enforce the documented bound on `MAX_DISK_SIZE_BYTES` at compile time: the
// overhead computation is only valid while the volume is small enough to
// always be FAT12 (at most 4,084 one-sector clusters).
const _: () = assert!(MAX_DISK_SIZE_BYTES <= 4084 * BYTES_PER_SECTOR);

/// A number of disk sectors.
///
/// This is a newtype over `usize` so that sector counts are not accidentally
/// mixed with byte counts.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, PartialOrd, Ord)]
struct Sectors(usize);

// Arithmetic operators for `Sectors`, generated by the `newtype_derive`
// macros. Elsewhere in this module sector counts are added and subtracted
// (including with `-=`) and multiplied by a plain `usize`.
NewtypeAdd! { () struct Sectors(usize); }
NewtypeAddAssign! { () struct Sectors(usize); }
NewtypeSub! { () struct Sectors(usize); }
NewtypeSubAssign! { () struct Sectors(usize); }
NewtypeMul! { () struct Sectors(usize); }
NewtypeMul! { (usize) struct Sectors(usize); }

impl Sectors {
    /// Yields the number of sectors needed to hold the supplied quantity of
    /// bytes.
    const fn needed_for_bytes(bytes: usize) -> Self {
        Self(bytes.div_ceil(BYTES_PER_SECTOR))
    }
}

/// Computes how many sectors of this module's FAT volumes are consumed by
/// filesystem metadata rather than file contents.
///
/// A FAT volume is laid out as four regions:
///
/// 1. A reserved region for the BIOS parameter block (BPB)
/// 2. The file allocation tables themselves
/// 3. A set of root directory entries (FAT12/FAT16 only)
/// 4. File and directory data
///
/// Region 4 is divided into clusters of one or more sectors. The cluster size
/// is recorded in the BPB; in this module each cluster is exactly one sector.
///
/// Which FAT variant a volume uses is determined by the number of clusters in
/// region 4:
///
/// - 0 <= clusters <= 4084: FAT12
/// - 4085 <= clusters <= 65524: FAT16
/// - clusters >= 65525: FAT32
///
/// This is circular: the variant depends on the cluster count, the cluster
/// count depends on how much space the metadata takes, and the metadata size
/// depends on the variant. This module sidesteps the problem by capping the
/// total volume size at 4,084 sectors. The cluster count can then never exceed
/// the FAT12 limit, so the volume is always FAT12 and its overhead can be
/// computed directly.
fn overhead_sectors() -> Sectors {
    // Region 1: a single sector for the BPB.
    let bpb = Sectors(1);

    // Region 2: the allocation tables. Size each table conservatively, as if
    // every sector on the largest possible disk were an addressable cluster.
    // (Some of those sectors are really overhead and have no table entry, so
    // this is an upper bound.) FAT12 tables use 12 bits per cluster entry.
    let addressable_clusters =
        Sectors::needed_for_bytes(MAX_DISK_SIZE_BYTES).0;
    let table_bytes = (addressable_clusters * 12).div_ceil(8);
    // Note that there are multiple tables per volume!
    let all_tables = Sectors::needed_for_bytes(table_bytes) * TABLES_PER_VOLUME;

    // Region 3: the fixed-size root directory.
    let root_directory = Sectors::needed_for_bytes(
        ROOT_DIRECTORY_ENTRIES * BYTES_PER_DIRECTORY_ENTRY,
    );

    bpb + all_tables + root_directory
}

/// Yields the number of sectors left for file contents in a maximum-size
/// volume once the filesystem overhead has been subtracted.
fn total_usable_sectors() -> Sectors {
    let whole_disk = Sectors::needed_for_bytes(MAX_DISK_SIZE_BYTES);
    whole_disk - overhead_sectors()
}

/// Represents a file that should be inserted into the file system when it's
/// created.
#[derive(Clone, Debug)]
struct File {
    /// The name under which the file is created in the volume's root
    /// directory.
    name: String,

    /// The bytes written to the file.
    contents: Vec<u8>,
}

/// Errors that can occur when adding a file to a [`FatFilesystem`].
#[derive(Debug, Error)]
pub enum Error {
    /// The supplied filename contained a `/` character. The payload is the
    /// offending filename.
    #[error("supplied filename {0} contains path separator")]
    PathSeparatorInFilename(String),

    /// The file's contents need more sectors than the filesystem has left.
    #[error(
        "insufficient space for new file: {required} sectors required, \
        {available} available"
    )]
    NoSpace { required: usize, available: usize },
}

/// Builds a collection of files that can be extruded as an array of bytes
/// containing a FAT filesystem with the collected files.
#[derive(Clone, Debug)]
pub struct FatFilesystem {
    /// The files to write into the volume's root directory.
    files: Vec<File>,

    /// The number of sectors still available for file contents.
    sectors_remaining: Sectors,
}

impl Default for FatFilesystem {
    /// Equivalent to [`FatFilesystem::new`].
    ///
    /// This is implemented by hand because a derived `Default` would start
    /// with zero sectors remaining, producing a filesystem that cannot accept
    /// any non-empty file.
    fn default() -> Self {
        Self::new()
    }
}

impl FatFilesystem {
    /// Creates a new, empty file collection with all of the volume's usable
    /// sectors available.
    pub fn new() -> Self {
        Self { files: vec![], sectors_remaining: total_usable_sectors() }
    }

    /// Converts the supplied `contents` string slice to bytes and adds it to
    /// the filesystem using [`Self::add_file_from_bytes`].
    ///
    /// The supplied `filename` must not contain any path separators (the `/`
    /// character).
    pub fn add_file_from_str(
        &mut self,
        filename: &str,
        contents: &str,
    ) -> Result<(), Error> {
        self.add_file_from_bytes(filename, contents.as_bytes())
    }

    /// Adds a file with the supplied `contents` that will appear in the root
    /// directory of the generated file system. The given `filename` must not
    /// contain any path separators (the `/` character).
    ///
    /// # Errors
    ///
    /// Returns [`Error::PathSeparatorInFilename`] if `filename` contains a
    /// `/`, or [`Error::NoSpace`] if `contents` needs more sectors than
    /// remain. The collection is left unchanged on error.
    pub fn add_file_from_bytes(
        &mut self,
        filename: &str,
        contents: &[u8],
    ) -> Result<(), Error> {
        // The `fatfs` crate will break paths containing separators into their
        // component directories before trying to create the requested file in
        // the appropriate leaf directory. This struct's interface (i.e.
        // FatFilesystem's, not anything in `fatfs`) doesn't give callers a way
        // to create directories, so creating a file with a '/' character in the
        // filename will lead to unexpected behavior (an error at disk
        // generation time at best, or a misnamed or missing file at worst).
        // Return an error to callers who supply such filenames.
        if filename.contains('/') {
            return Err(Error::PathSeparatorInFilename(filename.to_owned()));
        }

        let sectors_needed = Sectors::needed_for_bytes(contents.len());
        if sectors_needed > self.sectors_remaining {
            Err(Error::NoSpace {
                required: sectors_needed.0,
                available: self.sectors_remaining.0,
            })
        } else {
            self.files.push(File {
                name: filename.to_owned(),
                contents: contents.to_vec(),
            });

            self.sectors_remaining -= sectors_needed;
            Ok(())
        }
    }

    /// Formats a FAT12 volume just large enough for the collected files plus
    /// the filesystem overhead, writes every file into its root directory, and
    /// returns the volume's raw bytes.
    ///
    /// # Errors
    ///
    /// Returns an error if the volume cannot be formatted or opened, or if
    /// any file cannot be created or written.
    ///
    /// # Panics
    ///
    /// Panics if the collected files need more sectors than the module's
    /// limits allow.
    pub fn as_bytes(&self) -> anyhow::Result<Vec<u8>> {
        let file_sectors: usize = self
            .files
            .iter()
            .map(|f| Sectors::needed_for_bytes(f.contents.len()).0)
            .sum();

        assert!(file_sectors <= total_usable_sectors().0);

        // `fatfs` requires that the output volume has at least 42 sectors, no
        // matter what.
        let sectors = 42.max(file_sectors + overhead_sectors().0);

        assert!(sectors <= Sectors::needed_for_bytes(MAX_DISK_SIZE_BYTES).0);

        // Some guest software requires that a FAT disk's sector count be a
        // multiple of the sectors-per-track value in its BPB. Trivially enforce
        // this by ensuring there's one track containing all the sectors.
        let sectors_per_track: u16 = sectors.try_into().map_err(|_| {
            anyhow::anyhow!(
                "disk has {sectors} sectors, which is too many for one FAT track"
            )
        })?;

        let mut disk = Cursor::new(vec![0; sectors * BYTES_PER_SECTOR]);
        fatfs::format_volume(
            &mut disk,
            FormatVolumeOptions::new()
                .bytes_per_sector(BYTES_PER_SECTOR.try_into().unwrap())
                .bytes_per_cluster(BYTES_PER_SECTOR.try_into().unwrap())
                .sectors_per_track(sectors_per_track)
                .fat_type(fatfs::FatType::Fat12)
                .volume_label(*b"phd        "),
        )
        .context("formatting FAT volume")?;

        // Scope the filesystem handle so that its borrow of `disk` ends
        // before the backing buffer is taken out of the cursor below.
        {
            let fs = FileSystem::new(&mut disk, FsOptions::new())
                .context("creating FAT filesystem")?;

            let root_dir = fs.root_dir();
            for file in &self.files {
                root_dir
                    .create_file(&file.name)
                    .with_context(|| format!("creating file {}", file.name))?
                    .write_all(file.contents.as_slice())
                    .with_context(|| {
                        format!("writing contents of file {}", file.name)
                    })?;
            }
        }

        Ok(disk.into_inner())
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// A single file as large as the whole disk leaves no room for the
    /// filesystem overhead and must be rejected.
    #[test]
    fn cannot_add_file_too_large_for_disk() {
        let oversized = "a".repeat(MAX_DISK_SIZE_BYTES);
        let mut fs = FatFilesystem::new();

        let result = fs.add_file_from_str("too_big", &oversized);
        assert!(result.is_err());
    }

    /// Filling every usable sector with one-sector files succeeds, and the
    /// next file after that is rejected.
    #[test]
    fn cannot_exceed_disk_size_limit_with_multiple_files() {
        let one_sector = "a".repeat(BYTES_PER_SECTOR);
        let mut fs = FatFilesystem::new();

        for idx in 0..total_usable_sectors().0 {
            let name = format!("file{idx}");
            assert!(
                fs.add_file_from_str(&name, &one_sector).is_ok(),
                "adding file {idx}"
            );
        }

        let overflow = fs.add_file_from_str("file", &one_sector);
        assert!(overflow.is_err());
    }
}


================================================
FILE: phd-tests/framework/src/disk/file.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Abstractions for disks with a raw file backend.

use camino::{Utf8Path, Utf8PathBuf};
use propolis_client::instance_spec::{Component, FileStorageBackend};
use std::num::NonZeroUsize;
use tracing::{debug, error, warn};
use uuid::Uuid;

use crate::{
    disk::DeviceName, guest_os::GuestOsKind, zfs::ClonedFile as ZfsClonedFile,
};

/// Describes the method used to create the backing file for a file-backed disk.
#[derive(Debug)]
enum BackingFile {
    /// The disk is a ZFS clone of the original artifact. The clone is cleaned
    /// up by its own drop impl.
    Zfs(ZfsClonedFile),

    /// The disk is a hard copy of the original artifact. The payload is the
    /// path of the copy, which is deleted when this value is dropped.
    HardCopy(Utf8PathBuf),
}

impl BackingFile {
    /// Creates a new backing file from the artifact with the supplied
    /// `artifact_path`. If possible, this routine will create a ZFS clone of
    /// the dataset containing the file; otherwise it will fall back to creating
    /// a hard copy of the original artifact.
    fn create_from_source(
        artifact_path: &Utf8Path,
        data_dir: &Utf8Path,
    ) -> anyhow::Result<Self> {
        match ZfsClonedFile::create_from_path(artifact_path) {
            Ok(file) => return Ok(Self::Zfs(file)),
            Err(error) => warn!(
                %artifact_path,
                %error,
                "failed to make ZFS clone of backing artifact, will copy it"
            ),
        }

        let mut disk_path = data_dir.to_path_buf();
        disk_path.push(format!("{}.phd_disk", Uuid::new_v4()));
        debug!(
            source = %artifact_path,
            disk_path = %disk_path,
            "Copying source image to create temporary disk",
        );

        std::fs::copy(artifact_path, &disk_path)?;
        Ok(Self::HardCopy(disk_path))
    }

    /// Yields the path to this disk's backing file.
    fn path(&self) -> Utf8PathBuf {
        match self {
            BackingFile::Zfs(zfs) => zfs.path(),
            BackingFile::HardCopy(path) => path.clone(),
        }
    }
}

impl Drop for BackingFile {
    fn drop(&mut self) {
        // ZFS clones are cleaned up by their own drop impls.
        if let BackingFile::HardCopy(ref path) = self {
            debug!(%path, "deleting hard copy of guest disk image");
            if let Err(e) = std::fs::remove_file(path) {
                error!(
                    ?e,
                    %path,
                    "failed to delete hard copy of guest disk image"
                );
            }
        }
    }
}

/// A disk whose storage is a file on the host file system.
///
/// The disk owns its backing file: dropping the disk drops the backing file,
/// which removes a hard-copied image from the host.
#[derive(Debug)]
pub struct FileBackedDisk {
    /// The name to use for instance spec backends that refer to this disk.
    device_name: DeviceName,

    /// The backing file for this disk.
    file: BackingFile,

    /// The kind of guest OS image this guest contains, or `None` if the disk
    /// was not initialized from a guest OS artifact.
    guest_os: Option<GuestOsKind>,
}

impl FileBackedDisk {
    /// Builds a file-backed disk named `device_name` whose contents start out
    /// as those of the artifact at `artifact_path`. Any hard copy of the
    /// artifact is placed in `data_dir`.
    ///
    /// # Errors
    ///
    /// Fails if the backing file cannot be created, or if it cannot be opened
    /// and marked writable afterwards.
    pub(crate) fn new_from_artifact(
        device_name: DeviceName,
        artifact_path: &impl AsRef<Utf8Path>,
        data_dir: &impl AsRef<Utf8Path>,
        guest_os: Option<GuestOsKind>,
    ) -> Result<Self, super::DiskError> {
        let source = artifact_path.as_ref();
        let file = BackingFile::create_from_source(source, data_dir.as_ref())?;

        // The artifact may have been read-only, so explicitly clear the
        // read-only bit on the new backing file.
        let handle = std::fs::File::open(file.path())?;
        let mut perms = handle.metadata()?.permissions();

        // TODO: Clippy is upset that `set_readonly(false)` results in
        // world-writable files on UNIX-like OSes.  Suppress the lint for now
        // until someone gets around to a more specific solution.
        #[allow(clippy::permissions_set_readonly_false)]
        perms.set_readonly(false);
        handle.set_permissions(perms)?;

        Ok(Self { device_name, file, guest_os })
    }
}

impl super::DiskConfig for FileBackedDisk {
    fn device_name(&self) -> &DeviceName {
        let Self { device_name, .. } = self;
        device_name
    }

    /// Describes this disk as a writable file storage backend with 512-byte
    /// blocks and eight workers, pointing at the disk's backing file.
    fn backend_spec(&self) -> Component {
        let backend = FileStorageBackend {
            path: self.file.path().into_string(),
            readonly: false,
            block_size: 512,
            // `NonZeroUsize::new` yields `Some` for any nonzero argument.
            workers: NonZeroUsize::new(8),
        };

        Component::FileStorageBackend(backend)
    }

    fn guest_os(&self) -> Option<GuestOsKind> {
        let Self { guest_os, .. } = self;
        *guest_os
    }
}


================================================
FILE: phd-tests/framework/src/disk/in_memory.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Abstractions for disks with an in-memory backend.

use propolis_client::instance_spec::{BlobStorageBackend, Component};

use super::DiskConfig;
use crate::disk::DeviceName;

/// A disk with an in-memory backend.
#[derive(Debug)]
pub struct InMemoryDisk {
    /// The name of the device that this disk backs.
    device_name: DeviceName,

    /// The raw bytes presented to the guest as the disk's contents.
    contents: Vec<u8>,

    /// Whether the backend spec generated for this disk is marked read-only.
    readonly: bool,
}

impl InMemoryDisk {
    /// Creates an in-memory backend that will present the supplied `contents`
    /// to the guest.
    ///
    /// `readonly` is passed through unchanged to the backend spec produced
    /// for this disk.
    pub fn new(
        device_name: DeviceName,
        contents: Vec<u8>,
        readonly: bool,
    ) -> Self {
        Self { device_name, contents, readonly }
    }
}

impl DiskConfig for InMemoryDisk {
    fn device_name(&self) -> &DeviceName {
        &self.device_name
    }

    fn backend_spec(&self) -> Component {
        let base64 = base64::Engine::encode(
            &base64::engine::general_purpose::STANDARD,
            self.contents.as_slice(),
        );

        Component::BlobStorageBackend(BlobStorageBackend {
            base64,
            readonly: self.readonly,
        })
    }

    fn guest_os(&self) -> Option<crate::guest_os::GuestOsKind> {
        None
    }
}


================================================
FILE: phd-tests/framework/src/disk/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Routines for creating and managing guest disks.
//!
//! Test cases create disks using the `DiskFactory` in their test contexts.
//! They can then pass these disks to the VM factory to connect them to a
//! specific guest VM.

use std::path::Path;
use std::sync::Arc;

use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use in_memory::InMemoryDisk;
use propolis_client::instance_spec::Component;
use thiserror::Error;

use crate::{
    artifacts::ArtifactStore,
    guest_os::GuestOsKind,
    log_config::LogConfig,
    port_allocator::{PortAllocator, PortAllocatorError},
};

use self::{crucible::CrucibleDisk, file::FileBackedDisk};

pub mod crucible;
pub mod fat;
mod file;
pub mod in_memory;

/// Errors that can arise while working with disks.
#[derive(Debug, Error)]
pub enum DiskError {
    /// NOTE(review): not constructed in this module; presumably raised where
    /// the Crucible downstairs artifact is looked up — confirm.
    #[error("Disk factory has no Crucible downstairs artifact")]
    NoCrucibleDownstairs,

    /// The requested kind of disk cannot be created from the given kind of
    /// [`DiskSource`]. `src` is the label returned by `DiskSource::kind`.
    #[error("can't create a {disk_type} disk from source of type {src}")]
    SourceNotSupported { disk_type: &'static str, src: &'static str },

    /// A port could not be allocated.
    #[error(transparent)]
    PortAllocatorError(#[from] PortAllocatorError),

    /// An I/O operation on the host failed.
    #[error(transparent)]
    IoError(#[from] std::io::Error),

    /// An operation on a FAT filesystem disk source failed.
    #[error(transparent)]
    FatFilesystemError(#[from] fat::Error),

    /// Any other failure, carried as an `anyhow::Error`.
    #[error(transparent)]
    Other(#[from] anyhow::Error),
}

/// The block sizes a disk may be created with.
#[derive(Copy, Clone, Debug)]
pub enum BlockSize {
    /// 512-byte blocks.
    Bytes512,
    /// 4096-byte blocks.
    Bytes4096,
}

impl BlockSize {
    /// Returns this block size as a number of bytes.
    fn bytes(&self) -> u64 {
        match *self {
            Self::Bytes4096 => 4096,
            Self::Bytes512 => 512,
        }
    }
}

/// The name of the device that implements a disk, i.e. the name given to the
/// disk when constructing a VM spec for PHD tests. A disk with this device
/// name likely also has a [`BackendName`] derived from it.
///
/// TODO: This exists largely to ensure that PHD matches the same spec
/// construction conventions as `propolis-server` when handling API requests: it
/// is another piece of glue that could reasonably be deleted if/when PHD and
/// sled-agent use the same code to build InstanceEnsureRequest. Until then,
/// carefully match the relationship between names with these newtypes.
///
/// Alternatively, `DeviceName` and `BackendName` could be pulled into
/// `propolis-api-types`.
#[derive(Clone, Debug)]
pub struct DeviceName(String);

impl DeviceName {
    /// Wraps `name` as a device name.
    pub fn new(name: String) -> Self {
        Self(name)
    }

    /// Consumes this device name and produces the name of its backend, which
    /// is the device name with `-backend` appended.
    pub fn into_backend_name(self) -> BackendName {
        // This must match `api_request.rs`' `parse_disk_from_request`.
        let Self(mut backend) = self;
        backend.push_str("-backend");
        BackendName(backend)
    }

    /// Consumes this device name, yielding the underlying string.
    pub fn into_string(self) -> String {
        let Self(name) = self;
        name
    }

    /// Borrows the device name as a string slice.
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

/// The name of the backend that provides storage for a disk. The only way
/// shown here to obtain one is from the corresponding [`DeviceName`].
#[derive(Clone, Debug)]
pub struct BackendName(String);

impl BackendName {
    /// Consumes this backend name, yielding the underlying string.
    pub fn into_string(self) -> String {
        let Self(name) = self;
        name
    }
}

/// A trait for functions exposed by all disk backends (files, Crucible, etc.).
///
/// Implementors must be `Debug`, `Send`, and `Sync`.
pub trait DiskConfig: std::fmt::Debug + Send + Sync {
    /// Yields the device name for this disk.
    fn device_name(&self) -> &DeviceName;

    /// Yields the backend spec for this disk's storage backend.
    fn backend_spec(&self) -> Component;

    /// Yields the guest OS kind of the guest image the disk was created from,
    /// or `None` if the disk was not created from a guest image.
    fn guest_os(&self) -> Option<GuestOsKind>;

    /// If this disk is a Crucible disk, yields `Some` reference to that disk as
    /// a Crucible disk.
    ///
    /// The provided implementation returns `None`; presumably only
    /// `CrucibleDisk` overrides it — confirm against its implementation.
    fn as_crucible(&self) -> Option<&CrucibleDisk> {
        None
    }
}

/// The possible sources for a disk's initial data.
///
/// Not every disk type accepts every source; the `DiskFactory` creation
/// routines return `DiskError::SourceNotSupported` for unsupported
/// combinations.
#[derive(Clone, Debug)]
pub enum DiskSource<'a> {
    /// A blank disk with the supplied size, in bytes.
    Blank(usize),

    /// A disk backed by the guest image artifact with the supplied key.
    Artifact(&'a str),

    /// A disk with the contents of the supplied filesystem.
    FatFilesystem(fat::FatFilesystem),
}

impl DiskSource<'_> {
    pub(crate) fn kind(&self) -> &'static str {
        match self {
            DiskSource::Blank(_) => "blank",
            DiskSource::Artifact(_) => "artifact",
            DiskSource::FatFilesystem(_) => "filesystem",
        }
    }
}

/// A factory that provides tests with the means to create a disk they can
/// attach to a guest VM.
///
/// The factory's `create_*` functions (`create_file_backed_disk`,
/// `create_crucible_disk`, and `create_in_memory_disk`) create disk objects
/// whose initial contents are described by a supplied [`DiskSource`]. They
/// return disks wrapped in an `Arc` that can be passed to `ConfigRequest`
/// routines that add disks to a VM's configuration. This allows tests to manage
/// disks in two ways:
///
/// 1. Tests that don't need a disk's resources to outlive a VM can simply move
///    the disk reference into the VM config (which will move the reference to
///    the VM). In this way the disk is destroyed when its test VM goes away.
/// 2. Tests that want to preserve or reuse a disk after its VM stops can
///    instead clone the reference into the VM and reuse the source reference
///    later in the test. This can be used to, say, launch a VM, destroy it, and
///    attach the same disk to another VM to verify that changes to it are
///    persisted.
///
/// N.B. The disk objects the factory creates take no special care to ensure
///      that they can be used safely by multiple VMs at the same time. If
///      multiple VMs do use a single set of backend resources, the resulting
///      behavior will depend on the chosen backend's semantics and the way the
///      Propolis backend implementations interact with the disk.
pub(crate) struct DiskFactory {
    /// The directory in which disk files should be stored.
    storage_dir: Utf8PathBuf,

    /// A reference to the artifact store to use to look up guest OS artifacts
    /// when those are used as a disk source. The Crucible downstairs binary
    /// is also looked up through this store.
    artifact_store: Arc<ArtifactStore>,

    /// The port allocator to use to allocate ports to Crucible server
    /// processes.
    port_allocator: Arc<PortAllocator>,

    /// The logging discipline to use for Crucible server processes.
    log_config: LogConfig,
}

impl DiskFactory {
    /// Builds a disk factory.
    ///
    /// Disks made by the factory keep their data under `storage_dir` and
    /// resolve guest OS images through `artifact_store`. `port_allocator` and
    /// `log_config` are retained for use when creating Crucible disks.
    pub fn new(
        storage_dir: &impl AsRef<Utf8Path>,
        artifact_store: Arc<ArtifactStore>,
        port_allocator: Arc<PortAllocator>,
        log_config: LogConfig,
    ) -> Self {
        let storage_dir = storage_dir.as_ref().to_owned();
        Self { storage_dir, artifact_store, port_allocator, log_config }
    }
}

impl DiskFactory {
    /// Looks up the guest OS image registered under `artifact_name` in the
    /// artifact store and returns its path along with its guest OS kind.
    ///
    /// # Errors
    ///
    /// Any lookup failure is annotated with the artifact name and returned as
    /// a [`DiskError`].
    async fn get_guest_artifact_info(
        &self,
        artifact_name: &str,
    ) -> Result<(Utf8PathBuf, GuestOsKind), DiskError> {
        self.artifact_store
            .get_guest_os_image(artifact_name)
            .await
            .with_context(|| {
                format!("failed to get guest OS artifact '{artifact_name}'")
            })
            .map_err(Into::into)
    }

    /// Creates a new disk backed by a file whose initial contents are specified
    /// by `source`.
    ///
    /// # Errors
    ///
    /// Returns [`DiskError::SourceNotSupported`] unless `source` is a
    /// [`DiskSource::Artifact`]. Also fails if the artifact cannot be obtained
    /// or the backing file cannot be created.
    pub(crate) async fn create_file_backed_disk(
        &self,
        name: DeviceName,
        source: &DiskSource<'_>,
    ) -> Result<Arc<FileBackedDisk>, DiskError> {
        let artifact_name = match source {
            DiskSource::Artifact(name) => name,
            // It's possible in theory to have a file-backed disk that isn't
            // backed by an artifact by creating a temporary file and copying
            // the supplied disk contents to it, but for now this isn't
            // supported.
            DiskSource::Blank(_) | DiskSource::FatFilesystem(_) => {
                return Err(DiskError::SourceNotSupported {
                    disk_type: "file-backed",
                    src: source.kind(),
                });
            }
        };

        let (artifact_path, guest_os) =
            self.get_guest_artifact_info(artifact_name).await?;

        FileBackedDisk::new_from_artifact(
            name,
            &artifact_path,
            &self.storage_dir,
            Some(guest_os),
        )
        .map(Arc::new)
    }

    /// Creates a new Crucible-backed disk by creating three region files to
    /// hold the disk's data and launching a Crucible downstairs process to
    /// serve each one.
    ///
    /// # Parameters
    ///
    /// - name: The device name to give the new disk.
    /// - source: The data source that supplies the disk's initial contents.
    ///   If the source data is stored as a file on the local disk, the
    ///   resulting disk's `VolumeConstructionRequest`s will specify that this
    ///   file should be used as a read-only parent volume.
    /// - min_disk_size_gib: The disk's minimum size in GiB. The disk's actual
    ///   size is the larger of this size and the source's size.
    /// - block_size: The disk's block size.
    /// - output_dir: Passed through to `CrucibleDisk::new`; presumably the
    ///   directory for the downstairs processes' output — confirm there.
    ///
    /// # Errors
    ///
    /// Returns [`DiskError::SourceNotSupported`] for
    /// [`DiskSource::FatFilesystem`] sources. Also fails if the Crucible
    /// downstairs binary or guest artifact cannot be obtained, if a port
    /// cannot be allocated, or if constructing the disk fails.
    pub(crate) async fn create_crucible_disk(
        &self,
        name: DeviceName,
        source: &DiskSource<'_>,
        mut min_disk_size_gib: u64,
        block_size: BlockSize,
        output_dir: &impl AsRef<Path>,
    ) -> Result<Arc<CrucibleDisk>, DiskError> {
        const BYTES_PER_GIB: u64 = 1024 * 1024 * 1024;

        let binary_path = self.artifact_store.get_crucible_downstairs().await?;

        let (artifact_path, guest_os) = match source {
            DiskSource::Artifact(name) => {
                let (path, os) = self.get_guest_artifact_info(name).await?;
                (Some(path), Some(os))
            }
            DiskSource::Blank(size) => {
                let blank_size =
                    u64::try_from(*size).map_err(anyhow::Error::from)?;

                // The disk must hold at least `min_disk_size_gib` GiB and at
                // least `blank_size` bytes, rounded up to a whole GiB.
                // Rounding the blank size up first and then taking the larger
                // of the two GiB counts gives the same answer as comparing
                // byte counts, without multiplying `min_disk_size_gib` by
                // `BYTES_PER_GIB` (which could overflow a `u64`).
                min_disk_size_gib =
                    min_disk_size_gib.max(blank_size.div_ceil(BYTES_PER_GIB));
                (None, None)
            }
            // It's possible in theory to have a Crucible-backed disk with
            // caller-supplied initial contents by writing those contents out to
            // intermediate files and using them as a read-only parent (or just
            // importing them directly into the Crucible regions), but for now
            // this isn't supported.
            DiskSource::FatFilesystem(_) => {
                return Err(DiskError::SourceNotSupported {
                    disk_type: "Crucible-backed",
                    src: source.kind(),
                });
            }
        };

        // One port per downstairs process.
        let mut ports = [0u16; 3];
        for port in &mut ports {
            *port = self.port_allocator.next()?;
        }

        CrucibleDisk::new(
            name,
            min_disk_size_gib,
            block_size,
            &binary_path.as_std_path(),
            &ports,
            &self.storage_dir,
            artifact_path.as_ref(),
            guest_os,
            self.log_config,
            output_dir,
        )
        .map(Arc::new)
        .map_err(Into::into)
    }

    /// Creates a new in-memory disk whose initial contents are specified by
    /// `source`:
    ///
    /// - an artifact source is read from the host file system in full,
    /// - a blank source yields the requested number of zero bytes, and
    /// - a FAT filesystem source is rendered to its byte representation.
    ///
    /// `readonly` is passed through to the resulting [`InMemoryDisk`].
    ///
    /// # Errors
    ///
    /// Fails if the artifact cannot be obtained or read, or if the FAT
    /// filesystem cannot be converted to bytes.
    pub(crate) async fn create_in_memory_disk(
        &self,
        name: DeviceName,
        source: &DiskSource<'_>,
        readonly: bool,
    ) -> Result<Arc<InMemoryDisk>, DiskError> {
        let contents = match source {
            DiskSource::Artifact(name) => {
                let (path, _) = self.get_guest_artifact_info(name).await?;
                std::fs::read(&path).with_context(|| {
                    format!("reading source artifact {name} from {path}")
                })?
            }
            DiskSource::Blank(size) => vec![0; *size],
            DiskSource::FatFilesystem(fs) => fs.as_bytes()?,
        };

        Ok(Arc::new(InMemoryDisk::new(name, contents, readonly)))
    }
}


================================================
FILE: phd-tests/framework/src/guest_os/alpine.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Guest OS adaptations for Alpine Linux's "virtual" image.

use super::{CommandSequence, CommandSequenceEntry, GuestOs};

/// Guest OS adapter for Alpine Linux guests.
pub(super) struct Alpine;

impl GuestOs for Alpine {
    /// Logs in as `root` at the `localhost login: ` prompt, waits for the
    /// shell prompt, and then widens the terminal.
    fn get_login_sequence(&self) -> CommandSequence<'_> {
        let prompt = self.get_shell_prompt();
        let login = CommandSequence(vec![
            CommandSequenceEntry::wait_for("localhost login: "),
            CommandSequenceEntry::write_str("root"),
            CommandSequenceEntry::wait_for(prompt),
        ]);

        login.extend(super::linux::stty_enable_long_lines(self))
    }

    fn get_shell_prompt(&self) -> &'static str {
        "localhost:~#"
    }

    fn read_only_fs(&self) -> bool {
        true
    }

    /// Issues `cmd` using the raw serial console buffering discipline.
    fn shell_command_sequence<'a>(&self, cmd: &'a str) -> CommandSequence<'a> {
        let buffer_kind = crate::serial::BufferKind::Raw;
        super::shell_commands::shell_command_sequence(
            std::borrow::Cow::from(cmd),
            buffer_kind,
        )
    }

    fn graceful_reboot(&self) -> CommandSequence<'_> {
        // For Alpine guests we've looked at, `reboot` kicks off OpenRC behavior
        // to reboot the system. We *could* wait for a new shell prompt at this
        // point, but it's more reliable to wait for a guest to have fully
        // rebooted and log back in.
        let reboot_cmd = "reboot";
        self.shell_command_sequence(reboot_cmd)
    }
}


================================================
FILE: phd-tests/framework/src/guest_os/debian11_nocloud.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Guest OS adaptations for Debian 11 nocloud images.

use super::{CommandSequence, CommandSequenceEntry, GuestOs};

/// Guest OS adapter for Debian 11 nocloud guests.
pub(super) struct Debian11NoCloud;

impl GuestOs for Debian11NoCloud {
    /// Logs in as `root` at the `debian login: ` prompt, waits for the shell
    /// prompt, and then widens the terminal.
    fn get_login_sequence(&self) -> CommandSequence<'_> {
        let prompt = self.get_shell_prompt();
        let login = CommandSequence(vec![
            CommandSequenceEntry::wait_for("debian login: "),
            CommandSequenceEntry::write_str("root"),
            CommandSequenceEntry::wait_for(prompt),
        ]);

        login.extend(super::linux::stty_enable_long_lines(self))
    }

    fn get_shell_prompt(&self) -> &'static str {
        "root@debian:~#"
    }

    fn read_only_fs(&self) -> bool {
        false
    }

    fn graceful_reboot(&self) -> CommandSequence<'_> {
        // On Debian 11, `reboot` does not seem to be the same wrapper for
        // `systemctl reboot` as it is on more recent Ubuntu. Whatever it *is*,
        // it does its job before a new prompt line is printed, so we can only
        // wait to see a new login sequence.
        //
        // While `systemctl reboot` does exist here, and is mechanically more
        // like Ubuntu's `reboot`, just using `reboot` on Debian gets the job
        // done and keeps our instructions consistent across Linuxes.
        let reboot_cmd = "reboot";
        self.shell_command_sequence(reboot_cmd)
    }
}


================================================
FILE: phd-tests/framework/src/guest_os/linux.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Helper functions for building guest OS adaptations for Linux OSes.

use super::{CommandSequence, CommandSequenceEntry, GuestOs};

/// Builds the command sequence that makes the guest terminal behave as
/// though it is 9,999 columns wide: it issues the `stty` command and then
/// waits for `guest_os`'s shell prompt to reappear.
pub(super) fn stty_enable_long_lines<'a>(
    guest_os: &impl GuestOs,
) -> CommandSequence<'a> {
    let set_width =
        CommandSequenceEntry::write_str("stty -F `tty` cols 9999");
    let await_prompt =
        CommandSequenceEntry::wait_for(guest_os.get_shell_prompt());

    CommandSequence(vec![set_width, await_prompt])
}


================================================
FILE: phd-tests/framework/src/guest_os/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Traits and objects that abstract over differences between guest OS
//! distributions.

use std::{borrow::Cow, str::FromStr};

use serde::{Deserialize, Serialize};

mod alpine;
mod debian11_nocloud;
mod linux;
mod shell_commands;
mod ubuntu22_04;
pub mod windows;
mod windows_server_2016;
mod windows_server_2019;
mod windows_server_2022;

/// An entry in a sequence of interactions with the guest's command prompt.
///
/// String payloads are `Cow`s so that entries can either borrow text from the
/// caller or own text generated on the fly.
#[derive(Debug)]
pub(super) enum CommandSequenceEntry<'a> {
    /// Wait for the supplied string to appear on the guest serial console.
    WaitFor(Cow<'a, str>),

    /// Write the specified string as a command to the guest serial console.
    WriteStr(Cow<'a, str>),

    /// Attempt to establish consistent echoing of characters to the guest
    /// serial console by typing `send` and then waiting for `expect` to appear
    /// within the supplied `timeout`.
    EstablishConsistentEcho {
        send: Cow<'a, str>,
        expect: Cow<'a, str>,
        timeout: std::time::Duration,
    },

    /// Tell the serial console task to clear its buffer.
    ClearBuffer,

    /// Change the serial console buffering discipline to the supplied
    /// discipline.
    ChangeSerialConsoleBuffer(crate::serial::BufferKind),

    /// Set a delay between writing identical bytes to the guest serial console
    /// to avoid keyboard debouncing logic in guests.
    SetRepeatedCharacterDebounce(std::time::Duration),
}

impl<'a> CommandSequenceEntry<'a> {
    fn write_str(s: impl Into<Cow<'a, str>>) -> Self {
        Self::WriteStr(s.into())
    }

    fn wait_for(s: impl Into<Cow<'a, str>>) -> Self {
        Self::WaitFor(s.into())
    }
}

/// An ordered list of interactions with the guest's command prompt.
pub(super) struct CommandSequence<'a>(pub Vec<CommandSequenceEntry<'a>>);

impl<'a> CommandSequence<'a> {
    /// Consumes both sequences and returns one containing this sequence's
    /// entries followed by `other`'s entries.
    fn extend(self, other: CommandSequence<'a>) -> CommandSequence<'a> {
        let CommandSequence(mut entries) = self;
        let CommandSequence(tail) = other;
        entries.extend(tail);
        CommandSequence(entries)
    }
}

/// Describes how the framework interacts with a particular guest OS over its
/// serial console. Implementors must be `Send` and `Sync`.
pub(super) trait GuestOs: Send + Sync {
    /// Retrieves the command sequence used to wait for the OS to boot and log
    /// into it.
    fn get_login_sequence(&self) -> CommandSequence<'_>;

    /// Retrieves the default shell prompt for this OS.
    fn get_shell_prompt(&self) -> &'static str;

    /// Indicates whether the guest has a read-only filesystem.
    fn read_only_fs(&self) -> bool;

    /// Returns the sequence of serial console operations a test VM should issue
    /// in order to execute `cmd` in the guest's shell.
    ///
    /// The provided implementation builds the sequence for the raw serial
    /// console buffering discipline.
    fn shell_command_sequence<'a>(&self, cmd: &'a str) -> CommandSequence<'a> {
        shell_commands::shell_command_sequence(
            Cow::Borrowed(cmd),
            crate::serial::BufferKind::Raw,
        )
    }

    /// Returns the sequence of serial console operations a test VM must perform
    /// in order to perform a graceful (e.g. guest-initiated and expected)
    /// reboot. PHD's expectation following these commands will be to wait for
    /// the guest's login sequence.
    fn graceful_reboot(&self) -> CommandSequence<'_>;
}

/// The guest operating systems for which an adapter exists.
///
/// With `rename_all = "lowercase"`, each variant (de)serializes as its name in
/// lowercase (e.g. `debian11nocloud`), which is the same spelling accepted by
/// this type's `FromStr` implementation.
// NOTE(review): the reason for the blanket `dead_code` allowance is not
// visible from this file — confirm whether it is still needed.
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, Serialize, Deserialize, Eq, PartialEq)]
#[serde(rename_all = "lowercase")]
pub enum GuestOsKind {
    Alpine,
    Debian11NoCloud,
    Ubuntu2204,
    WindowsServer2016,
    WindowsServer2019,
    WindowsServer2022,
}

impl GuestOsKind {
    pub fn is_linux(&self) -> bool {
        match self {
            GuestOsKind::Alpine
            | GuestOsKind::Debian11NoCloud
            | GuestOsKind::Ubuntu2204 => true,
            GuestOsKind::WindowsServer2016
            | GuestOsKind::WindowsServer2019
            | GuestOsKind::WindowsServer2022 => false,
        }
    }

    pub fn is_windows(&self) -> bool {
        match self {
            GuestOsKind::WindowsServer2016
            | GuestOsKind::WindowsServer2019
            | GuestOsKind::WindowsServer2022 => true,
            GuestOsKind::Alpine
            | GuestOsKind::Debian11NoCloud
            | GuestOsKind::Ubuntu2204 => false,
        }
    }
}

impl FromStr for GuestOsKind {
    type Err = std::io::Error;

    /// Parses the all-lowercase name of a guest OS kind (for example
    /// `"ubuntu2204"`). Matching is exact and case-sensitive.
    ///
    /// # Errors
    ///
    /// Returns an `InvalidInput` I/O error naming the unrecognized string.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let kind = match s {
            "alpine" => Self::Alpine,
            "debian11nocloud" => Self::Debian11NoCloud,
            "ubuntu2204" => Self::Ubuntu2204,
            "windowsserver2016" => Self::WindowsServer2016,
            "windowsserver2019" => Self::WindowsServer2019,
            "windowsserver2022" => Self::WindowsServer2022,
            unrecognized => {
                let message =
                    format!("Unrecognized guest OS kind {unrecognized}");
                return Err(std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    message,
                ));
            }
        };

        Ok(kind)
    }
}

/// Returns a boxed [`GuestOs`] adapter for the supplied guest OS `kind`.
pub(super) fn get_guest_os_adapter(kind: GuestOsKind) -> Box<dyn GuestOs> {
    let adapter: Box<dyn GuestOs> = match kind {
        GuestOsKind::Alpine => Box::new(alpine::Alpine),
        GuestOsKind::Debian11NoCloud => {
            Box::new(debian11_nocloud::Debian11NoCloud)
        }
        GuestOsKind::Ubuntu2204 => Box::new(ubuntu22_04::Ubuntu2204),
        GuestOsKind::WindowsServer2016 => {
            Box::new(windows_server_2016::WindowsServer2016)
        }
        GuestOsKind::WindowsServer2019 => {
            Box::new(windows_server_2019::WindowsServer2019)
        }
        GuestOsKind::WindowsServer2022 => {
            Box::new(windows_server_2022::WindowsServer2022)
        }
    };

    adapter
}


================================================
FILE: phd-tests/framework/src/guest_os/shell_commands.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Common helper functions for issuing shell commands to guests and handling
//! their outputs.

use std::borrow::Cow;

use super::{CommandSequence, CommandSequenceEntry};

/// Produces the shell command sequence necessary to execute `cmd` in a guest's
/// shell, given that the guest is using the supplied serial console buffering
/// discipline.
///
/// This routine assumes that multi-line commands will be echoed with `> ` at
/// the start of each line in the command. This is technically shell-dependent
/// but is true for all the shell types in PHD's currently-supported guests.
pub(super) fn shell_command_sequence(
    cmd: Cow<'_, str>,
    buffer_kind: crate::serial::BufferKind,
) -> CommandSequence<'_> {
    let echo = cmd.trim_end().replace('\n', "\n> ");
    match buffer_kind {
        crate::serial::BufferKind::Raw => CommandSequence(vec![
            CommandSequenceEntry::write_str(cmd),
            CommandSequenceEntry::wait_for(echo),
            CommandSequenceEntry::ClearBuffer,
            CommandSequenceEntry::write_str("\n"),
        ]),

        crate::serial::BufferKind::Vt80x24 => {
            // In 80x24 mode, it's simplest to issue multi-line operations one
            // line at a time and wait for each line to be echoed before
            // starting the next. For very long commands (more than 24 lines),
            // this avoids having to deal with lines scrolling off the buffer
            // before they can be waited for.
            let cmd_lines = cmd.trim_end().lines();
            let echo_lines = echo.lines();
            let mut seq = vec![];

            let mut iter = cmd_lines.zip(echo_lines).peekable();
            while let Some((cmd, echo)) = iter.next() {
                seq.push(CommandSequenceEntry::write_str(cmd.to_owned()));
                seq.push(CommandSequenceEntry::wait_for(echo.to_owned()));

                if iter.peek().is_some() {
                    seq.push(CommandSequenceEntry::write_str("\n"));
                }
            }

            // Before issuing the command, clear any stale echoed characters
            // from the serial console buffer. This ensures that the next prompt
            // is preceded in the buffer only by the output of the issued
            // command.
            seq.push(CommandSequenceEntry::ClearBuffer);
            seq.push(CommandSequenceEntry::write_str("\n"));
            CommandSequence(seq)
        }
    }
}


================================================
FILE: phd-tests/framework/src/guest_os/ubuntu22_04.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Guest OS adaptations for Ubuntu 22.04 images. These must be prepped with
//! a cloud-init disk that is configured with the appropriate user and password.

use super::{CommandSequence, CommandSequenceEntry, GuestOs};

/// Guest OS adapter for Ubuntu 22.04 guests.
pub(super) struct Ubuntu2204;

impl GuestOs for Ubuntu2204 {
    /// Logs in as the `ubuntu` user, switches to a root shell in root's home
    /// directory, and then widens the terminal.
    fn get_login_sequence(&self) -> CommandSequence<'_> {
        let root_prompt = self.get_shell_prompt();
        let login = CommandSequence(vec![
            CommandSequenceEntry::wait_for("ubuntu login: "),
            CommandSequenceEntry::write_str("ubuntu"),
            CommandSequenceEntry::wait_for("Password: "),
            CommandSequenceEntry::write_str("1!Passw0rd"),
            CommandSequenceEntry::wait_for("ubuntu@ubuntu:~$"),
            CommandSequenceEntry::write_str("sudo bash\n"),
            CommandSequenceEntry::wait_for("root@ubuntu:/home/ubuntu#"),
            CommandSequenceEntry::write_str("cd ~\n"),
            CommandSequenceEntry::wait_for(root_prompt),
        ]);

        login.extend(super::linux::stty_enable_long_lines(self))
    }

    fn get_shell_prompt(&self) -> &'static str {
        "root@ubuntu:~#"
    }

    fn read_only_fs(&self) -> bool {
        false
    }

    fn graceful_reboot(&self) -> CommandSequence<'_> {
        // Ubuntu `reboot` seems to be mechanically similar to Alpine `reboot`,
        // except mediated by SystemD rather than OpenRC. We'll get a new shell
        // prompt, and then the system reboots shortly after. Just issuing
        // `reboot` and waiting for a login prompt is the lowest common
        // denominator across Linuxes.
        let reboot_cmd = "reboot";
        self.shell_command_sequence(reboot_cmd)
    }
}


================================================
FILE: phd-tests/framework/src/guest_os/windows.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Functionality common to all Windows guests.

use crate::TestVm;

use super::{CommandSequence, CommandSequenceEntry, GuestOsKind};

use tracing::info;

/// A wrapper that provides Windows-specific extensions to the core `TestVm`
/// implementation.
///
/// The wrapper dereferences to the underlying [`TestVm`], so all of the VM's
/// ordinary methods remain available through it.
pub struct WindowsVm<'a> {
    /// The VM being extended by this structure. The framework is required to
    /// ensure that the VM is actually configured to run a Windows guest OS.
    pub(crate) vm: &'a TestVm,
}

impl WindowsVm<'_> {
    /// Runs `cmd` as a Powershell command.
    ///
    /// The command is passed to `powershell -encodedCommand` as base64, and
    /// the result of running that invocation through the VM's
    /// `run_shell_command` is returned unchanged.
    ///
    /// # Panics
    ///
    /// Panics if the wrapped VM's guest OS kind is not a Windows flavor.
    pub async fn run_powershell_command(
        &self,
        cmd: &str,
    ) -> anyhow::Result<String> {
        assert!(self.vm.guest_os_kind().is_windows());

        info!(cmd, "executing Powershell command");

        // Use Powershell's -encodedCommand switch to keep important Powershell
        // sigils in the command (like "$") from being interpreted by whatever
        // shell is being used to invoke Powershell. This switch expects that
        // the encoded string will decode into a UTF-16 string; `str`s are, of
        // course, UTF-8, so switch encodings before converting to base64.
        //
        // Serialize each code unit explicitly as little-endian rather than
        // reinterpreting the `u16` buffer's memory: that needs no `unsafe` and
        // produces the same bytes regardless of the host's byte order.
        let utf16le: Vec<u8> =
            cmd.encode_utf16().flat_map(u16::to_le_bytes).collect();
        let base64 = base64::Engine::encode(
            &base64::engine::general_purpose::STANDARD,
            &utf16le,
        );

        let cmd = format!("powershell -encodedCommand {base64}");
        self.vm.run_shell_command(&cmd).await
    }
}

impl std::ops::Deref for WindowsVm<'_> {
    type Target = TestVm;

    /// Yields the wrapped VM.
    fn deref(&self) -> &Self::Target {
        self.vm
    }
}

/// The text typed at the Windows command prompt to launch Cygwin, including
/// the trailing carriage return.
const CYGWIN_CMD: &str = "C:\\cygwin\\cygwin.bat\r";

/// Returns `cmd` prefixed with a `reset` command (`reset && <cmd>`).
///
/// Windows versions that drive a VT100 terminal can use this to try to force
/// Windows to clear and redraw the entire screen before displaying the
/// command's output. Without this, Windows may not render the post-output
/// command prompt if the post-command terminal state happens to place a prompt
/// at a location that already had one pre-command.
pub(super) fn prepend_reset_to_shell_command(cmd: &str) -> String {
    ["reset && ", cmd].concat()
}

/// Emits the login sequence for the given `guest`, which must be one of the
/// Windows guest OS flavors.
///
/// This login sequence assumes the following:
///
/// - Cygwin is installed to C:\cygwin and can be launched by invoking
///   C:\cygwin\cygwin.bat.
/// - The local administrator account is enabled with password `0xide#1Fan`.
///
/// # Panics
///
/// Panics if `guest` is not one of the Windows Server guest kinds.
pub(super) fn get_login_sequence_for<'a>(
    guest: GuestOsKind,
) -> CommandSequence<'a> {
    use std::time::Duration;

    assert!(matches!(
        guest,
        GuestOsKind::WindowsServer2016
            | GuestOsKind::WindowsServer2019
            | GuestOsKind::WindowsServer2022
    ));

    // Server 2016 and 2019 command prompts try to drive a VT100 terminal
    // themselves; Server 2022 just emits characters.
    let drives_vt100 = matches!(
        guest,
        GuestOsKind::WindowsServer2016 | GuestOsKind::WindowsServer2019
    );
    let is_server_2022 = matches!(guest, GuestOsKind::WindowsServer2022);

    let mut steps = vec![
        // Waiting for `BdsDxe:` first proves that this is a genuinely fresh
        // boot. On a first boot that hardly matters, but when a test logs out
        // and waits for a reboot, leaving the cmd.exe session makes Windows
        // redraw its earlier screen - everything from `Computer is booting,
        // ...` onward.
        //
        // Without this check, a test waiting for a new login sequence would
        // mistake the still-running VM for the freshly-booted one, log in
        // again, and only later see the real reboot.
        //
        // Windows Server 2022 (at least) prints an XML prelude to COM1 that
        // could serve the same purpose, but `BdsDxe: ` comes from OVMF and so
        // is the same no matter which guest OS version is running.
        CommandSequenceEntry::wait_for("BdsDxe: loading "),
        CommandSequenceEntry::wait_for(
            "Computer is booting, SAC started and initialized.",
        ),
        CommandSequenceEntry::wait_for(
            "EVENT: The CMD command is now available.",
        ),
        // Open a command prompt channel from the SAC and switch to it.
        CommandSequenceEntry::wait_for("SAC>"),
        CommandSequenceEntry::write_str("cmd"),
        CommandSequenceEntry::wait_for("Channel: Cmd0001"),
        CommandSequenceEntry::wait_for("SAC>"),
        CommandSequenceEntry::write_str("ch -sn Cmd0001"),
        CommandSequenceEntry::wait_for(
            "Use any other key to view this channel.",
        ),
        CommandSequenceEntry::write_str(""),
        // Supply the administrator credentials, leaving the domain empty.
        CommandSequenceEntry::wait_for("Username:"),
        CommandSequenceEntry::write_str("Administrator"),
        CommandSequenceEntry::wait_for("Domain  :"),
        CommandSequenceEntry::write_str(""),
        CommandSequenceEntry::wait_for("Password:"),
        CommandSequenceEntry::write_str("0xide#1Fan"),
    ];

    // The VT100-driving behavior only starts once the command prompt has been
    // activated, so the buffering mode is switched only after the credentials
    // have been entered.
    if drives_vt100 {
        steps.push(CommandSequenceEntry::ChangeSerialConsoleBuffer(
            crate::serial::BufferKind::Vt80x24,
        ));

        // These versions also like to debounce keystrokes, so insert a delay
        // between repeated characters. The delay is deliberately very
        // conservative to keep tests from flaking; fortunately it applies only
        // when the same character is typed several times in a row.
        steps.push(CommandSequenceEntry::SetRepeatedCharacterDebounce(
            Duration::from_millis(1500),
        ));
    }

    // Observed test reliability suggests a race at command prompt startup
    // that can eat characters typed too soon after the session launches. Work
    // around it by sending "serial console ok" strings until one is echoed
    // back correctly or the entire boot times out.
    steps.push(CommandSequenceEntry::wait_for("C:\\Windows\\system32>"));
    steps.push(CommandSequenceEntry::EstablishConsistentEcho {
        send: "echo serial console ok\\n\r\n".into(),
        expect: "serial console ok".into(),
        timeout: Duration::from_millis(250),
    });

    // Leave a clean command prompt behind once the echo is established.
    steps.push(CommandSequenceEntry::write_str("cls\r"));
    steps.push(CommandSequenceEntry::wait_for("C:\\Windows\\system32>"));

    // On Windows Server 2022, maximize the effective console size before
    // launching Cygwin so that it does not wrap lines unexpectedly. Doing the
    // same on Server 2016 and 2019 just confuses matters, so those guests
    // launch Cygwin directly.
    let launch_cygwin = if is_server_2022 {
        CommandSequenceEntry::write_str(format!(
            "mode con cols=9999 lines=9999 && {CYGWIN_CMD}",
        ))
    } else {
        CommandSequenceEntry::write_str(CYGWIN_CMD)
    };
    steps.push(launch_cygwin);

    steps.push(CommandSequenceEntry::wait_for("$ "));

    // Tweak the command prompt so that it appears on a single line with no
    // leading newlines.
    steps.push(CommandSequenceEntry::write_str("PS1='$ '"));

    CommandSequence(steps)
}


================================================
FILE: phd-tests/framework/src/guest_os/windows_server_2016.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Guest OS adaptations for Windows Server 2016 images. See [the general
//! Windows module](mod@super::windows) documentation for more information.

use std::borrow::Cow;

use super::{
    windows::prepend_reset_to_shell_command, CommandSequence, GuestOs,
    GuestOsKind,
};

/// Guest adapter for Windows Server 2016 images. The configuration this
/// adapter requires is described in [the general Windows
/// module](mod@super::windows) documentation.
pub(super) struct WindowsServer2016;

impl GuestOs for WindowsServer2016 {
    /// The prompt that the shared Windows login sequence configures via `PS1`.
    fn get_shell_prompt(&self) -> &'static str {
        "$ "
    }

    /// Always `false` for this guest.
    fn read_only_fs(&self) -> bool {
        false
    }

    /// Delegates to the shared Windows login sequence for Server 2016.
    fn get_login_sequence(&self) -> CommandSequence<'_> {
        let kind = GuestOsKind::WindowsServer2016;
        super::windows::get_login_sequence_for(kind)
    }

    /// Builds the sequence for running `cmd`, prefixing it with `reset` and
    /// using the 80x24 VT serial console buffer.
    fn shell_command_sequence<'a>(&self, cmd: &'a str) -> CommandSequence<'a> {
        let with_reset = prepend_reset_to_shell_command(cmd);
        super::shell_commands::shell_command_sequence(
            Cow::Owned(with_reset),
            crate::serial::BufferKind::Vt80x24,
        )
    }

    /// Reboots the guest by running `shutdown /r /t 0 /d p:0:0` from the
    /// shell.
    fn graceful_reboot(&self) -> CommandSequence<'_> {
        let reboot_cmd = "shutdown /r /t 0 /d p:0:0";
        self.shell_command_sequence(reboot_cmd)
    }
}


================================================
FILE: phd-tests/framework/src/guest_os/windows_server_2019.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Guest OS adaptations for Windows Server 2019 images. See [the general
//! Windows module](mod@super::windows) documentation for more information.

use std::borrow::Cow;

use super::{
    windows::prepend_reset_to_shell_command, CommandSequence, GuestOs,
    GuestOsKind,
};

/// Guest adapter for Windows Server 2019 images. The configuration this
/// adapter requires is described in [the general Windows
/// module](mod@super::windows) documentation.
pub(super) struct WindowsServer2019;

impl GuestOs for WindowsServer2019 {
    /// The prompt that the shared Windows login sequence configures via `PS1`.
    fn get_shell_prompt(&self) -> &'static str {
        "$ "
    }

    /// Always `false` for this guest.
    fn read_only_fs(&self) -> bool {
        false
    }

    /// Delegates to the shared Windows login sequence for Server 2019.
    fn get_login_sequence(&self) -> CommandSequence<'_> {
        let kind = GuestOsKind::WindowsServer2019;
        super::windows::get_login_sequence_for(kind)
    }

    /// Builds the sequence for running `cmd`, prefixing it with `reset` and
    /// using the 80x24 VT serial console buffer.
    fn shell_command_sequence<'a>(&self, cmd: &'a str) -> CommandSequence<'a> {
        let with_reset = prepend_reset_to_shell_command(cmd);
        super::shell_commands::shell_command_sequence(
            Cow::Owned(with_reset),
            crate::serial::BufferKind::Vt80x24,
        )
    }

    /// Reboots the guest by running `shutdown /r /t 0 /d p:0:0` from the
    /// shell.
    fn graceful_reboot(&self) -> CommandSequence<'_> {
        let reboot_cmd = "shutdown /r /t 0 /d p:0:0";
        self.shell_command_sequence(reboot_cmd)
    }
}


================================================
FILE: phd-tests/framework/src/guest_os/windows_server_2022.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Guest OS adaptations for Windows Server 2022 images. See [the general
//! Windows module](mod@super::windows) documentation for more information.

use super::{CommandSequence, GuestOs, GuestOsKind};

/// Guest adapter for Windows Server 2022 images. The configuration this
/// adapter requires is described in [the general Windows
/// module](mod@super::windows) documentation.
pub(super) struct WindowsServer2022;

impl GuestOs for WindowsServer2022 {
    /// The prompt that the shared Windows login sequence configures via `PS1`.
    fn get_shell_prompt(&self) -> &'static str {
        "$ "
    }

    /// Always `false` for this guest.
    fn read_only_fs(&self) -> bool {
        false
    }

    /// Delegates to the shared Windows login sequence for Server 2022.
    fn get_login_sequence(&self) -> CommandSequence<'_> {
        let kind = GuestOsKind::WindowsServer2022;
        super::windows::get_login_sequence_for(kind)
    }

    /// Reboots the guest by running `shutdown /r /t 0 /d p:0:0` from the
    /// shell.
    fn graceful_reboot(&self) -> CommandSequence<'_> {
        let reboot_cmd = "shutdown /r /t 0 /d p:0:0";
        self.shell_command_sequence(reboot_cmd)
    }
}


================================================
FILE: phd-tests/framework/src/host_api/kvm.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::{ffi::CString, fmt::Display};

use anyhow::{anyhow, Result};
use bhyve_api::ApiVersion;
use errno::errno;
use libc::{
    c_char, c_int, c_long, c_short, c_ushort, c_void, size_t, ssize_t,
    uintptr_t, O_RDWR,
};

/// The structure used to query symbol values from `kvm_nlist`. See the man page
/// for nm(1) for more context.
///
/// The layout is `#[repr(C)]` because a pointer to an array of these entries
/// is handed directly to the C `kvm_nlist` function; do not reorder or retype
/// the fields.
#[allow(non_camel_case_types)]
#[derive(Debug)]
#[repr(C)]
struct nlist {
    /// The name of the symbol to query, or NULL for the sentinel entry in an
    /// array of entries.
    name: *const c_char,

    /// The virtual address of the symbol.
    n_value: c_long,
    // The remaining fields are never read or set to anything but zero in this
    // module; they are present so the struct matches the C layout.
    // NOTE(review): presumably section number, type, storage class and
    // auxiliary-entry count, going by the names - confirm against the header.
    n_scnum: c_short,
    n_type: c_ushort,
    n_sclass: c_char,
    n_numaux: c_char,
}

// The default entry has a NULL name and zeroes everywhere else, so it doubles
// as the sentinel that terminates an array passed to `kvm_nlist`.
impl Default for nlist {
    fn default() -> Self {
        Self {
            name: std::ptr::null(),
            n_value: 0,
            n_scnum: 0,
            n_type: 0,
            n_sclass: 0,
            n_numaux: 0,
        }
    }
}

// Bindings to the libkvm routines this module uses to read and write kernel
// memory. Handles are treated as opaque `*const c_void` pointers.
#[link(name = "kvm")]
extern "C" {
    /// Opens a libkvm handle. `KvmHdl::open` passes NULL for every pointer
    /// argument and treats a NULL return value as failure.
    fn kvm_open(
        namelist: *const c_char,
        corefile: *const c_char,
        swapfile: *const c_char,
        flag: c_int,
        errstr: *const c_char,
    ) -> *const c_void;

    /// Closes a handle obtained from `kvm_open`. Called when a `KvmHdl` is
    /// dropped; the return value is ignored there.
    fn kvm_close(kd: *const c_void) -> c_int;

    /// Looks up the symbols named in the array starting at `nl`.
    /// `find_symbol_va` treats any nonzero return value as failure and
    /// otherwise reads the first entry's `n_value`.
    fn kvm_nlist(kd: *const c_void, nl: *mut nlist) -> c_int;

    /// Reads `nbytes` bytes from kernel address `addr` into `buf`. Callers in
    /// this module treat a negative return value as an error and any other
    /// value as the number of bytes read.
    fn kvm_kread(
        kd: *const c_void,
        addr: uintptr_t,
        buf: *mut c_void,
        nbytes: size_t,
    ) -> ssize_t;

    /// Writes `nbytes` bytes from `buf` to kernel address `addr`. Callers in
    /// this module treat a negative return value as an error and any other
    /// value as the number of bytes written.
    fn kvm_kwrite(
        kd: *const c_void,
        addr: uintptr_t,
        buf: *mut c_void,
        nbytes: size_t,
    ) -> ssize_t;
}

/// RAII wrapper for libkvm handles.
struct KvmHdl {
    hdl: *const c_void,
}

impl KvmHdl {
    fn open() -> Result<Self> {
        // Per the docs, kvm_open(3KVM) defaults to using /dev/ksyms as a symbol
        // file when no symbol table is specified.
        let kvm_hdl = unsafe {
            kvm_open(
                std::ptr::null(),
                std::ptr::null(),
                std::ptr::null(),
                O_RDWR,
                std::ptr::null(),
            )
        };

        if kvm_hdl.is_null() {
            Err(anyhow!(
                "kvm_open failed with code {} ({})",
                errno().0,
                errno()
            ))
        } else {
            Ok(Self { hdl: kvm_hdl })
        }
    }
}

impl Drop for KvmHdl {
    fn drop(&mut self) {
        unsafe {
            kvm_close(self.hdl);
        }
    }
}

/// Returns the virtual address of the symbol with the supplied name, suitable
/// for later use in a call to kvm_kread or kvm_kwrite.
fn find_symbol_va(kvm_hdl: &KvmHdl, symbol: &str) -> Result<uintptr_t> {
    // N.B. This string must be created out-of-line so that it outlives the
    //      nlist that includes a pointer to its buffer.
    let symbol = CString::new(symbol)?;
    let nlist = vec![
        nlist { name: symbol.as_ptr(), ..Default::default() },
        // nlist expects an array of structures terminated by one with a NULL
        // name pointer (or an empty name string).
        nlist::default(),
    ];

    let mut slice = nlist.into_boxed_slice();
    let err = unsafe { kvm_nlist(kvm_hdl.hdl, slice.as_mut_ptr()) };
    if err != 0 {
        return Err(anyhow!(
            "kvm_nlist returned {}, errno {} ({})",
            err,
            errno().0,
            errno()
        ));
    }

    Ok(slice[0].n_value as uintptr_t)
}

/// Fills the supplied `buf` from the supplied kernel VA.
fn read_from_va(kvm_hdl: &KvmHdl, va: uintptr_t, buf: &mut [u8]) -> Result<()> {
    let bytes_read = unsafe {
        kvm_kread(kvm_hdl.hdl, va, buf.as_mut_ptr() as *mut c_void, buf.len())
    };

    if bytes_read < 0 {
        Err(anyhow!(
            "kvm_kread for VA {:x} returned {}, errno {} ({})",
            va,
            bytes_read,
            errno().0,
            errno()
        ))
    } else if bytes_read as usize == buf.len() {
        Ok(())
    } else {
        Err(anyhow!(
            "kvm_kread for VA {:x} read {} bytes, expected {}",
            va,
            bytes_read,
            buf.len()
        ))
    }
}

/// Writes the supplied `buf` to the supplied kernel VA.
fn write_to_va(kvm_hdl: &KvmHdl, va: uintptr_t, buf: &mut [u8]) -> Result<()> {
    let bytes_written = unsafe {
        kvm_kwrite(kvm_hdl.hdl, va, buf.as_mut_ptr() as *mut c_void, buf.len())
    };

    if bytes_written < 0 {
        Err(anyhow!(
            "kvm_kwrite of {} bytes for VA {:x} returned {}, errno {} ({})",
            buf.len(),
            va,
            bytes_written,
            errno().0,
            errno()
        ))
    } else if bytes_written as usize == buf.len() {
        Ok(())
    } else {
        Err(anyhow!(
            "kvm_kwrite of {} bytes for VA {:x} wrote {} bytes, expected {}",
            buf.len(),
            va,
            bytes_written,
            buf.len()
        ))
    }
}

/// Reads a single byte from the supplied kernel VA.
fn read_u8_from_va(kvm_hdl: &KvmHdl, va: uintptr_t) -> Result<u8> {
    let mut raw = [0u8; 1];
    read_from_va(kvm_hdl, va, &mut raw)?;
    Ok(u8::from_le_bytes(raw))
}

/// Reads four bytes from the supplied kernel VA and decodes them as a
/// little-endian `u32`.
fn read_u32_from_va(kvm_hdl: &KvmHdl, va: uintptr_t) -> Result<u32> {
    let mut raw = [0u8; 4];
    read_from_va(kvm_hdl, va, &mut raw)?;
    Ok(u32::from_le_bytes(raw))
}

/// Writes the single byte `value` to the supplied kernel VA.
fn write_u8_to_va(kvm_hdl: &KvmHdl, va: uintptr_t, value: u8) -> Result<()> {
    let mut raw: [u8; 1] = value.to_le_bytes();
    write_to_va(kvm_hdl, va, &mut raw[..])
}

/// Writes `value` to the supplied kernel VA as four little-endian bytes.
fn write_u32_to_va(kvm_hdl: &KvmHdl, va: uintptr_t, value: u32) -> Result<()> {
    let mut raw: [u8; 4] = value.to_le_bytes();
    write_to_va(kvm_hdl, va, &mut raw[..])
}

/// A wrapper trait that allows fixed-size values to be read from and written
/// to a given VA while abstracting away their actual sizes.
///
/// The `Default` bound lets `KernelValueGuard::new` create a placeholder to
/// read the current kernel value into, and the `Display` bound lets it log
/// both the old and the new value.
trait SizedKernelGlobal: Sized + Default + Display {
    /// Populates `self` by reading from the supplied kernel VA.
    fn read_from_va(&mut self, kvm_hdl: &KvmHdl, va: uintptr_t) -> Result<()>;

    /// Writes the data wrapped in `self` to the supplied kernel VA.
    fn write_to_va(&self, kvm_hdl: &KvmHdl, va: uintptr_t) -> Result<()>;
}

/// One-byte kernel globals.
impl SizedKernelGlobal for u8 {
    /// Replaces `self` with the byte stored at `va`; `self` is left untouched
    /// if the read fails.
    fn read_from_va(&mut self, kvm_hdl: &KvmHdl, va: uintptr_t) -> Result<()> {
        let current = read_u8_from_va(kvm_hdl, va)?;
        *self = current;
        Ok(())
    }

    /// Stores `self` at `va`.
    fn write_to_va(&self, kvm_hdl: &KvmHdl, va: uintptr_t) -> Result<()> {
        let byte = *self;
        write_u8_to_va(kvm_hdl, va, byte)
    }
}

/// Four-byte kernel globals.
impl SizedKernelGlobal for u32 {
    /// Replaces `self` with the 32-bit value stored at `va`; `self` is left
    /// untouched if the read fails.
    fn read_from_va(&mut self, kvm_hdl: &KvmHdl, va: uintptr_t) -> Result<()> {
        let current = read_u32_from_va(kvm_hdl, va)?;
        *self = current;
        Ok(())
    }

    /// Stores `self` at `va`.
    fn write_to_va(&self, kvm_hdl: &KvmHdl, va: uintptr_t) -> Result<()> {
        let word = *self;
        write_u32_to_va(kvm_hdl, va, word)
    }
}

/// An RAII wrapper that undoes a change to a kernel global: the value the
/// symbol had before the change is written back when the guard is dropped.
struct KernelValueGuard<T: SizedKernelGlobal> {
    /// The name of the modified symbol.
    symbol: &'static str,

    /// The libkvm handle used to access kernel memory, both for the initial
    /// write and for the restore on drop.
    kvm_hdl: KvmHdl,

    /// The value to restore to this symbol when this wrapper is dropped.
    old_value: T,
}

impl<T: SizedKernelGlobal> KernelValueGuard<T> {
    /// Sets the supplied `symbol` to `value` and returns an RAII guard that
    /// restores `symbol`'s prior value when dropped.
    ///
    /// Fails if the libkvm handle cannot be opened, the symbol cannot be
    /// found, or the old value cannot be read or the new value written.
    ///
    /// # Safety
    ///
    /// The caller must ensure that `value` can safely be written to the kernel
    /// VA described by `symbol`. For example, if `symbol` refers to a 2-byte
    /// value, the caller must ensure that `value` is of a type that will write
    /// no more than 2 bytes to kernel memory.
    ///
    /// NOTE(review): this function documents a safety contract but is not
    /// declared `unsafe`, so the compiler does not enforce it at call sites.
    fn new(symbol: &'static str, value: T) -> Result<Self> {
        let kvm_hdl = KvmHdl::open()?;
        let va = find_symbol_va(&kvm_hdl, symbol)?;

        // Capture the current value first so it can be restored later.
        let old_value = {
            let mut current = T::default();
            current.read_from_va(&kvm_hdl, va)?;
            current
        };

        tracing::info!(symbol, va, %old_value, %value, "Setting kernel global");

        value.write_to_va(&kvm_hdl, va)?;
        Ok(Self { symbol, kvm_hdl, old_value })
    }
}

impl<T: SizedKernelGlobal> Drop for KernelValueGuard<T> {
    /// Writes the saved value back to the symbol.
    ///
    /// # Panics
    ///
    /// Panics if the symbol can no longer be found or the write fails.
    fn drop(&mut self) {
        // The handle stored in this guard was good enough to write the value
        // in the first place, so unless something has gone terribly wrong, the
        // same lookup and a write of the old value should succeed too.
        let va = match find_symbol_va(&self.kvm_hdl, self.symbol) {
            Ok(va) => va,
            Err(_) => panic!("couldn't find symbol {}", self.symbol),
        };

        if self.old_value.write_to_va(&self.kvm_hdl, va).is_err() {
            panic!("couldn't reset value of {}", self.symbol);
        }
    }
}

/// Sets all of the kernel globals needed to run PHD tests. Returns a vector of
/// RAII guards that reset these values to their pre-test values when dropped.
///
/// Fails if the bhyve API version cannot be queried, or if
/// `vmm_allow_state_writes` needs to be set and cannot be. A failure to set
/// `gpt_track_dirty` is ignored.
pub fn set_vmm_globals() -> Result<Vec<Box<dyn std::any::Any>>> {
    let api_version = bhyve_api::api_version()?;
    let mut restore_on_drop: Vec<Box<dyn std::any::Any>> = Vec::new();

    if api_version < ApiVersion::V13 {
        let state_writes =
            KernelValueGuard::new("vmm_allow_state_writes", 1u32)?;
        restore_on_drop.push(Box::new(state_writes));
    }

    if api_version < ApiVersion::V8 {
        // Enable global dirty tracking bit on systems where it exists; when
        // the attempt fails, carry on without a guard for it.
        let dirty_tracking = KernelValueGuard::new("gpt_track_dirty", 1u8)
            .ok()
            .map(|guard| Box::new(guard) as Box<dyn std::any::Any>);
        restore_on_drop.extend(dirty_tracking);
    }

    Ok(restore_on_drop)
}


================================================
FILE: phd-tests/framework/src/host_api/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Host OS hooks used by the PHD framework. The implementation is selected at
//! compile time: illumos targets get the libkvm-backed `kvm` module, and every
//! other target gets the `stubs` module.

cfg_if::cfg_if! {
    // illumos: re-export the real implementation.
    if #[cfg(target_os = "illumos")] {
        mod kvm;
        pub use kvm::*;
    } else {
        // Any other OS: re-export the stub implementation.
        mod stubs;
        pub use stubs::*;
    }
}


================================================
FILE: phd-tests/framework/src/host_api/stubs.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use anyhow::{anyhow, Result};

/// A placeholder type exported by the non-illumos stub module.
///
/// NOTE(review): nothing in this file uses this type, and the illumos `kvm`
/// module exports no item with this name - confirm whether it is still needed.
pub struct VmmStateWriteGuard;

/// Stub used on non-illumos hosts: always fails, because setting the VMM
/// kernel globals requires illumos.
pub fn set_vmm_globals() -> Result<Vec<Box<dyn std::any::Any>>> {
    let unsupported = anyhow!("set_vmm_globals requires illumos");
    Err(unsupported)
}


================================================
FILE: phd-tests/framework/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! The Pheidippides framework: interfaces for creating and interacting with
//! VMs.
//!
//! This module defines a `Framework` object that contains a set of default VM
//! parameters (shape, bootrom, boot disk image) and the context needed to
//! launch new guest VMs (paths, logging options, an artifact store, etc.). The
//! PHD runner process instantiates a `Framework` and then passes a reference to
//! it to each PHD test case. Test cases then use the `Framework`'s public
//! interface to create test VMs with various configurations.
//!
//! Tests are expected to access `Framework` functions to create VMs and public
//! `TestVm` functions to work directly with those VMs. Most other functionality
//! in this crate is private to the crate.
//!
//! To launch a VM, the framework needs to know how to configure the VM itself
//! and how to run the Propolis server process that will host it. The framework
//! supplies builders, `vm_config_builder` and `environment_builder`, that
//! allow tests to configure both of these options.
//!
//! Often, tests will want to spawn a "successor" to an existing VM that
//! maintains the VM's configuration and any related objects but that runs in a
//! separate Propolis server process that may have been spawned in a different
//! environment. The `spawn_successor_vm` function provides a shorthand way to
//! do this.

use std::{fmt, ops::Range, sync::Arc};

use anyhow::Context;
use artifacts::DEFAULT_PROPOLIS_ARTIFACT;
use camino::Utf8PathBuf;

use disk::DiskFactory;
use futures::{stream::FuturesUnordered, StreamExt};
use guest_os::GuestOsKind;
use log_config::LogConfig;
use port_allocator::PortAllocator;
pub use test_vm::TestVm;
use test_vm::{
    environment::EnvironmentSpec, spec::VmSpec, TestVmManualStop, VmConfig,
    VmLocation,
};
use tokio::{
    sync::mpsc::{UnboundedReceiver, UnboundedSender},
    task::JoinHandle,
};

pub mod artifacts;
pub mod disk;
pub mod guest_os;
pub mod host_api;
pub mod lifecycle;
pub mod log_config;
mod port_allocator;
mod serial;
pub mod test_vm;
pub(crate) mod zfs;

/// A test context for an individual PHD test, containing a `Framework` plus
/// test specific information.
pub struct TestCtx {
    /// The shared framework instance that this context's builder and query
    /// methods delegate to.
    pub(crate) framework: Arc<Framework>,
    /// NOTE(review): presumably the directory that receives this test's
    /// output files - confirm against the code that reads this field.
    pub(crate) output_dir: Utf8PathBuf,
    /// Set by `set_cleanup_task_outcome_receiver`; `None` until then.
    pub(crate) manual_stop: Option<TestVmManualStop>,
}

/// An instance of the PHD test framework.
pub struct Framework {
    pub(crate) tmp_directory: Utf8PathBuf,
    pub(crate) log_config: LogConfig,

    // Defaults applied to VMs that tests do not configure explicitly.
    pub(crate) default_guest_cpus: u8,
    pub(crate) default_guest_memory_mib: u64,
    pub(crate) default_guest_os_artifact: String,
    pub(crate) default_bootrom_artifact: String,

    // The disk factory used to be a freestanding struct that took references to
    // an artifact store and port allocator that were owned by someone else.
    // Putting all these components into a single struct makes the struct
    // self-referencing. Avoid arguing with anyone about lifetimes by wrapping
    // the relevant shared components in an `Arc`.
    //
    // NOTE(review): an earlier version of this comment justified an `Rc` by
    // saying the runner is single-threaded; the fields are `Arc`s now.
    pub(crate) artifact_store: Arc<artifacts::ArtifactStore>,
    pub(crate) disk_factory: DiskFactory,
    pub(crate) port_allocator: Arc<PortAllocator>,

    /// Whether a Crucible downstairs source was supplied when the framework
    /// was built. Reported to tests by `TestCtx::crucible_enabled`.
    pub(crate) crucible_enabled: bool,
    /// Whether a "migration base" Propolis source was supplied when the
    /// framework was built. Reported to tests by
    /// `TestCtx::migration_base_enabled`.
    pub(crate) migration_base_enabled: bool,

    /// Buffers cleanup tasks that need to be run after a test case completes.
    /// [`Self::cleanup_task_channel`] returns a clone of this sender that
    /// framework users can use to register these tasks (without having to hold
    /// a reference to the `Framework`).
    cleanup_task_tx: UnboundedSender<JoinHandle<()>>,

    /// The receiver side of `cleanup_task_tx`, wrapped in an async mutex.
    cleanup_task_rx: tokio::sync::Mutex<UnboundedReceiver<JoinHandle<()>>>,
}

/// The inputs to `Framework::new`.
pub struct FrameworkParameters<'a> {
    /// Path to the Propolis server binary, registered with the artifact store
    /// as a local command.
    pub propolis_server_path: Utf8PathBuf,
    /// Where to obtain a Crucible downstairs. When `None`, Crucible is
    /// disabled and a warning says Crucible tests will be skipped.
    pub crucible_downstairs: Option<CrucibleDownstairsSource>,
    /// Where to obtain the "migration base" Propolis server, if any.
    pub base_propolis: Option<BasePropolisSource<'a>>,

    pub tmp_directory: Utf8PathBuf,
    /// Directory handed to the artifact store when it is created.
    pub artifact_directory: Utf8PathBuf,
    /// Path to the TOML file from which the artifact store is loaded.
    pub artifact_toml: Utf8PathBuf,
    pub log_config: LogConfig,

    // NOTE(review): these presumably seed the `Framework` fields of the same
    // names - confirm in `Framework::new`.
    pub default_guest_cpus: u8,
    pub default_guest_memory_mib: u64,
    pub default_guest_os_artifact: String,
    pub default_bootrom_artifact: String,

    /// NOTE(review): presumably the ports the port allocator may hand out -
    /// confirm in `Framework::new`.
    pub port_range: Range<u16>,
    /// Passed to the artifact store when it is created.
    pub max_buildomat_wait: std::time::Duration,
}

/// Identifies where the framework should obtain a Crucible downstairs.
#[derive(Debug)]
pub enum CrucibleDownstairsSource {
    /// Identified by a Buildomat commit.
    /// NOTE(review): presumably fetched from Buildomat - confirm in the
    /// artifact store.
    BuildomatGitRev(artifacts::buildomat::Commit),
    /// Identified by a path.
    /// NOTE(review): presumably a local binary - confirm in the artifact
    /// store.
    Local(Utf8PathBuf),
}

/// Identifies where the framework should obtain the "migration base" Propolis
/// server. All variants borrow their payloads, so the type is `Copy`.
#[derive(Debug, Copy, Clone)]
pub enum BasePropolisSource<'a> {
    /// Identified by a Buildomat commit.
    BuildomatGitRev(&'a artifacts::buildomat::Commit),
    /// Identified by a branch name.
    /// NOTE(review): presumably resolved through Buildomat - confirm in the
    /// artifact store.
    BuildomatBranch(&'a str),
    /// Identified by a path.
    /// NOTE(review): presumably a local binary - confirm in the artifact
    /// store.
    Local(&'a Utf8PathBuf),
}

impl TestCtx {
    /// Creates a new VM configuration builder using the default configuration
    /// from this framework instance.
    pub fn vm_config_builder(&self, vm_name: &str) -> VmConfig<'_> {
        self.framework.vm_config_builder(vm_name)
    }

    /// Yields an environment builder with default settings (run the VM on the
    /// test runner's machine using the default Propolis from the command line).
    pub fn environment_builder(&self) -> EnvironmentSpec {
        self.framework.environment_builder()
    }

    /// Yields this framework instance's default guest OS artifact name. This
    /// can be used to configure boot disks with different parameters than the
    /// builder defaults.
    pub fn default_guest_os_artifact(&self) -> &str {
        self.framework.default_guest_os_artifact()
    }

    /// Yields the guest OS adapter corresponding to the default guest OS
    /// artifact.
    pub async fn default_guest_os_kind(&self) -> anyhow::Result<GuestOsKind> {
        self.framework.default_guest_os_kind().await
    }

    /// Indicates whether the disk factory in this framework supports the
    /// creation of Crucible disks. This can be used to skip tests that require
    /// Crucible support.
    pub fn crucible_enabled(&self) -> bool {
        self.framework.crucible_enabled
    }

    /// Indicates whether a "migration base" Propolis server artifact is
    /// available for migration-from-base tests.
    pub fn migration_base_enabled(&self) -> bool {
        self.framework.migration_base_enabled
    }
    /// Spawns a test VM using the default configuration returned from
    /// `vm_builder` and the default environment returned from
    /// `environment_builder`.
    pub async fn spawn_default_vm(
        &self,
        vm_name: &str,
    ) -> anyhow::Result<TestVm> {
        self.spawn_vm(&self.vm_config_builder(vm_name), None).await
    }

    /// Spawns a new test VM using the supplied `config`. If `environment` is
    /// `Some`, the VM is spawned using the supplied environment; otherwise it
    /// is spawned using the default `environment_builder`.
    pub async fn spawn_vm(
        &self,
        config: &VmConfig<'_>,
        environment: Option<&EnvironmentSpec>,
    ) -> anyhow::Result<TestVm> {
        self.spawn_vm_with_spec(
            config
                .vm_spec(self)
                .await
                .context("building VM spec from VmConfig")?,
            environment,
        )
        .await
    }

    /// Spawns a new test VM using the supplied `spec`. If `environment` is
    /// `Some`, the VM is spawned using the supplied environment; otherwise it
    /// is spawned using the default `environment_builder`.
    pub async fn spawn_vm_with_spec(
        &self,
        spec: VmSpec,
        environment: Option<&EnvironmentSpec>,
    ) -> anyhow::Result<TestVm> {
        TestVm::new(
            self,
            spec,
            environment.unwrap_or(&self.environment_builder()),
        )
        .await
        .context("constructing test VM")
    }

    /// Spawns a "successor" to the supplied `vm`. The successor has the same
    /// configuration and takes additional references to all of its
    /// predecessor's backing objects (e.g. disk handles). If `environment` is
    /// `None`, the successor is launched using the predecessor's environment
    /// spec.
    pub async fn spawn_successor_vm(
        &self,
        vm_name: &str,
        vm: &TestVm,
        environment: Option<&EnvironmentSpec>,
    ) -> anyhow::Result<TestVm> {
        let mut vm_spec = vm.vm_spec().clone();
        vm_spec.set_vm_name(vm_name.to_owned());

        // Create new metadata for an instance based on this predecessor. It
        // should have the same project and silo IDs, but the sled identifiers
        // will be different.
        vm_spec.refresh_sled_identifiers();

        TestVm::new(
            self,
            vm_spec,
            environment.unwrap_or(&vm.environment_spec()),
        )
        .await
    }

    /// When phd-runner is configured to leave instances running on failed
    /// tests, the watch channel whose Receiver is passed to this function is
    /// used to indicate to the instance cleanup task that a test *has* failed.
    pub fn set_cleanup_task_outcome_receiver(
        &mut self,
        manual_stop: TestVmManualStop,
    ) {
        self.manual_stop = Some(manual_stop);
    }
}

// The framework implementation includes some "runner-only" functions
// (constructing and resetting a framework) that are marked `pub`. This could be
// improved by splitting the "test case" functions into a trait and giving test
// cases trait objects.
impl Framework {
    /// Constructs a fresh framework from `params`. The test runner calls this
    /// once and then shares the resulting framework with every test.
    ///
    /// Construction loads the artifact store from its TOML description,
    /// registers the Propolis server named on the command line, and then
    /// registers the optional Crucible downstairs and "migration base"
    /// Propolis artifacts. A missing optional artifact is not an error: a
    /// warning is logged and the corresponding `*_enabled` flag is cleared.
    ///
    /// # Errors
    ///
    /// Fails if the artifact store cannot be created or if any supplied
    /// artifact cannot be added to it.
    pub async fn new(params: FrameworkParameters<'_>) -> anyhow::Result<Self> {
        let mut store = artifacts::ArtifactStore::from_toml_path(
            params.artifact_directory.clone(),
            &params.artifact_toml,
            params.max_buildomat_wait,
        )
        .context("creating PHD framework")?;

        store
            .add_propolis_from_local_cmd(&params.propolis_server_path)
            .with_context(|| {
                format!(
                    "adding Propolis server '{}' from options",
                    &params.propolis_server_path
                )
            })?;

        // Crucible support is optional; tests that need it consult
        // `crucible_enabled`.
        let crucible_enabled = if let Some(source) = params.crucible_downstairs
        {
            store.add_crucible_downstairs(&source).await.with_context(|| {
                format!("adding Crucible downstairs {source} from options")
            })?;
            true
        } else {
            tracing::warn!("Crucible disabled. Crucible tests will be skipped");
            false
        };

        // Likewise for the Propolis that migration-from-base tests start from.
        let migration_base_enabled = if let Some(source) = params.base_propolis
        {
            store.add_current_propolis(source).await.with_context(|| {
                format!("adding 'migration base' Propolis server {source} from options")
            })?;
            true
        } else {
            tracing::warn!("No 'migration base' Propolis server provided. Migration-from-base tests will be skipped.");
            false
        };

        // The store is complete; share it and the port allocator with the
        // disk factory.
        let artifact_store = Arc::new(store);
        let port_allocator = Arc::new(PortAllocator::new(params.port_range));
        let disk_factory = DiskFactory::new(
            &params.tmp_directory,
            Arc::clone(&artifact_store),
            Arc::clone(&port_allocator),
            params.log_config,
        );

        let (cleanup_task_tx, cleanup_task_rx) =
            tokio::sync::mpsc::unbounded_channel();
        let cleanup_task_rx = tokio::sync::Mutex::new(cleanup_task_rx);

        Ok(Self {
            tmp_directory: params.tmp_directory,
            log_config: params.log_config,
            default_guest_cpus: params.default_guest_cpus,
            default_guest_memory_mib: params.default_guest_memory_mib,
            default_guest_os_artifact: params.default_guest_os_artifact,
            default_bootrom_artifact: params.default_bootrom_artifact,
            artifact_store,
            disk_factory,
            port_allocator,
            crucible_enabled,
            migration_base_enabled,
            cleanup_task_tx,
            cleanup_task_rx,
        })
    }

    /// Creates a `TestCtx` for the test case named `fully_qualified_name`.
    ///
    /// The context shares this framework, starts with no manual-stop handle,
    /// and uses an output directory named after the test beneath the
    /// framework's temporary directory.
    pub fn test_ctx(self: &Arc<Self>, fully_qualified_name: String) -> TestCtx {
        let framework = Arc::clone(self);
        let output_dir =
            self.tmp_directory.as_path().join(&fully_qualified_name);
        TestCtx { framework, output_dir, manual_stop: None }
    }

    /// Resets the state of any stateful objects in the framework to prepare it
    /// to run a new test case.
    ///
    /// This resets the port allocator and then waits for every cleanup task
    /// that is currently queued to run to completion.
    pub async fn reset(&self) {
        self.port_allocator.reset();
        self.wait_for_cleanup_tasks().await;
    }

    /// Returns a `VmConfig` builder for a VM named `vm_name`, seeded with this
    /// framework's default CPU count, memory size, bootrom artifact, and guest
    /// OS artifact.
    pub fn vm_config_builder(&self, vm_name: &str) -> VmConfig<'_> {
        let cpus = self.default_guest_cpus;
        let memory_mib = self.default_guest_memory_mib;
        let bootrom = &self.default_bootrom_artifact;
        let guest_os = &self.default_guest_os_artifact;

        VmConfig::new(vm_name, cpus, memory_mib, bootrom, guest_os)
    }
    /// Yields an environment builder with default settings (run the VM on the
    /// test runner's machine using the default Propolis from the command line).
    ///
    /// Every call builds a new `EnvironmentSpec` from `VmLocation::Local` and
    /// `DEFAULT_PROPOLIS_ARTIFACT`; nothing is read from this framework
    /// instance.
    pub fn environment_builder(&self) -> EnvironmentSpec {
        EnvironmentSpec::new(VmLocation::Local, DEFAULT_PROPOLIS_ARTIFACT)
    }

    /// Yields this framework instance's default guest OS artifact name. This
    /// can be used to configure boot disks with different parameters than the
    /// builder defaults.
    ///
    /// The returned string borrows from the framework; it is the value
    /// supplied in the framework parameters at construction time.
    pub fn default_guest_os_artifact(&self) -> &str {
        &self.default_guest_os_artifact
    }

    /// Looks up the default guest OS artifact in the artifact store and
    /// returns the guest OS kind associated with it.
    ///
    /// # Errors
    ///
    /// Propagates any error from fetching the guest OS image.
    pub async fn default_guest_os_kind(&self) -> anyhow::Result<GuestOsKind> {
        let image = self
            .artifact_store
            .get_guest_os_image(&self.default_guest_os_artifact)
            .await?;

        // The kind is the second element of the store's result.
        Ok(image.1)
    }

    /// Indicates whether the disk factory in this framework supports the
    /// creation of Crucible disks. This can be used to skip tests that require
    /// Crucible support.
    ///
    /// The flag is fixed at construction: it is `true` only if a Crucible
    /// downstairs source was supplied and successfully added to the artifact
    /// store.
    pub fn crucible_enabled(&self) -> bool {
        self.crucible_enabled
    }

    /// Indicates whether a "migration base" Propolis server artifact is
    /// available for migration-from-base tests.
    ///
    /// The flag is fixed at construction: it is `true` only if a base
    /// Propolis source was supplied and successfully added to the artifact
    /// store.
    pub fn migration_base_enabled(&self) -> bool {
        self.migration_base_enabled
    }

    /// Yields a sender to which the caller can submit tasks that will be
    /// `await`ed after the calling test case completes.
    ///
    /// Each call returns a clone of the framework's sender. Handles sent on
    /// it are drained and awaited by `wait_for_cleanup_tasks`, which `reset`
    /// invokes.
    pub(crate) fn cleanup_task_channel(
        &self,
    ) -> UnboundedSender<JoinHandle<()>> {
        self.cleanup_task_tx.clone()
    }

    /// Runs any currently-queued cleanup tasks in this `Framework` to
    /// completion.
    ///
    /// This routine synchronizes access to the cleanup task queue such that
    /// when it returns, any cleanup tasks that were previously queued by the
    /// calling thread are guaranteed to have been completed (though not
    /// necessarily by the calling thread itself, i.e., another, earlier caller
    /// may have awaited the task).
    async fn wait_for_cleanup_tasks(&self) {
        let futs = FuturesUnordered::new();

        let mut guard = self.cleanup_task_rx.lock().await;
        while let Ok(task) = guard.try_recv() {
            futs.push(task);
        }

        // Hold the lock while awaiting the tasks to block subsequent callers.
        // This is needed to guarantee that all tasks submitted by a thread are
        // retired when that thread returns from this call: without the lock, T1
        // can submit a task, T2 can remove it from the queue but not fully
        // retire it, and a subsequent call from T1 will see an empty queue and
        // return immediately even though its task is still active.
        let _results: Vec<_> = futs.collect().await;
    }
}

impl fmt::Display for CrucibleDownstairsSource {
    /// Renders a short human-readable description of where the Crucible
    /// downstairs artifact comes from, for use in log and error messages.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::BuildomatGitRev(commit) => {
                f.write_fmt(format_args!("Buildomat Git commit '{commit}'"))
            }
            Self::Local(path) => {
                f.write_fmt(format_args!("local path '{path}'"))
            }
        }
    }
}

impl fmt::Display for BasePropolisSource<'_> {
    /// Renders a short human-readable description of where the "migration
    /// base" Propolis artifact comes from, for use in log and error messages.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::BuildomatBranch(branch) => {
                f.write_fmt(format_args!("Buildomat branch '{branch}'"))
            }
            Self::BuildomatGitRev(commit) => {
                f.write_fmt(format_args!("Buildomat Git commit '{commit}'"))
            }
            Self::Local(path) => {
                f.write_fmt(format_args!("local path '{path}'"))
            }
        }
    }
}


================================================
FILE: phd-tests/framework/src/lifecycle.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use anyhow::Context;
use futures::future::BoxFuture;
use tracing::info;
use uuid::Uuid;

use crate::{test_vm::MigrationTimeout, TestCtx, TestVm};

/// The set of actions that can be taken on a VM undergoing lifecycle testing.
///
/// The `'a` lifetime is that of the artifact name borrowed by
/// [`Action::MigrateToPropolis`].
pub enum Action<'a> {
    /// Reset the VM using the Propolis server reset API. This sort of reboot
    /// does not involve the guest OS. It can be used to verify that components'
    /// reset implementations don't change properties that shouldn't change
    /// without fully stopping and restarting a VM.
    Reset,

    /// Stop the VM and restart it in a successor Propolis using the same
    /// environment as its predecessor.
    StopAndStart,

    /// Migrate the VM to a new Propolis server. The wrapped `&str` names a
    /// Propolis server artifact to migrate to.
    //
    // N.B. This isn't used in any lifecycle tests yet, mostly because there are
    // no well-known, stable Propolis artifact names other than the name of the
    // default artifact supplied on the command line. This will change in the
    // future as new well-known artifacts (like "Buildomat HEAD") are added.
    MigrateToPropolis(&'a str),
}

impl TestCtx {
    /// Drives `vm` through each entry of `actions` in order, calling
    /// `check_fn` on the VM that results from every action so the caller can
    /// verify its invariants.
    ///
    /// `StopAndStart` and `MigrateToPropolis` replace the VM with a successor
    /// named `<original name>_lifecycle_<action index>`. Later actions and
    /// checks operate on that successor.
    ///
    /// # Errors
    ///
    /// Returns the first error produced while resetting, stopping, spawning,
    /// launching, booting, migrating, or querying a VM.
    ///
    /// # Panics
    ///
    /// After a migration, panics if the source has no outbound migration
    /// record, the target has no inbound one, or either reports a state other
    /// than `Finish`.
    pub async fn lifecycle_test(
        &self,
        vm: TestVm,
        actions: &[Action<'_>],
        check_fn: impl for<'v> Fn(&'v TestVm) -> BoxFuture<'v, ()>,
    ) -> anyhow::Result<()> {
        let original_name = vm.name().to_owned();

        // Successor names are always derived from the *original* VM's name
        // and the index of the action that created the successor.
        let successor_name =
            |idx: usize| format!("{original_name}_lifecycle_{idx}");

        let mut vm = vm;
        for (idx, action) in actions.iter().enumerate() {
            match action {
                Action::Reset => {
                    info!(
                        vm_name = original_name,
                        "rebooting VM for lifecycle test"
                    );
                    vm.reset().await?;
                    vm.wait_to_boot().await?;
                }
                Action::StopAndStart => {
                    info!(
                        vm_name = original_name,
                        "stopping and starting VM for lifecycle test"
                    );

                    // The predecessor must be fully stopped before its
                    // successor is created and started.
                    vm.stop().await?;
                    let mut successor = self
                        .spawn_successor_vm(&successor_name(idx), &vm, None)
                        .await?;
                    successor.launch().await?;
                    successor.wait_to_boot().await?;
                    vm = successor;
                }
                Action::MigrateToPropolis(propolis) => {
                    use propolis_client::types::MigrationState;
                    info!(
                        vm_name = original_name,
                        propolis_artifact = propolis,
                        "migrating to new Propolis artifact for lifecycle test"
                    );

                    // The target runs in the default environment, except
                    // that it uses the requested Propolis artifact.
                    let mut env = self.environment_builder();
                    env.propolis(propolis);
                    let mut target = self
                        .spawn_successor_vm(
                            &successor_name(idx),
                            &vm,
                            Some(&env),
                        )
                        .await?;
                    target
                        .migrate_from(
                            &vm,
                            Uuid::new_v4(),
                            MigrationTimeout::default(),
                        )
                        .await?;

                    // Explicitly check migration status on both the source and
                    // target to make sure it is available even after migration
                    // has finished.
                    let source_status = vm
                        .get_migration_state()
                        .await
                        .context("Failed to get source VM migration state")?;
                    let outbound = source_status
                        .migration_out
                        .expect("source VM should have migrated out");
                    assert_eq!(outbound.state, MigrationState::Finish);

                    let target_status = target
                        .get_migration_state()
                        .await
                        .context("Failed to get target VM migration state")?;
                    let inbound = target_status
                        .migration_in
                        .expect("target VM should have migrated in");
                    assert_eq!(inbound.state, MigrationState::Finish);

                    vm = target;
                }
            }

            check_fn(&vm).await;
        }

        Ok(())
    }
}


================================================
FILE: phd-tests/framework/src/log_config.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Types and helpers specifying how logs should be formatted and where they
//! should be directed.

use std::{path::Path, process::Stdio, str::FromStr};

use anyhow::Context;
use tracing::info;

/// Specifies how a test's logging should be managed.
#[derive(Debug, Clone, Copy)]
pub struct LogConfig {
    /// Where the output of a test's processes is written.
    pub output_mode: OutputMode,
    /// How the log output of a test's processes is structured.
    pub log_format: LogFormat,
}

/// Specifies where the output of a test's processes should be written.
///
/// Parsed case-insensitively from `"file"`/`"tmpfile"`, `"stdio"`, or
/// `"null"` by this type's `FromStr` implementation.
#[derive(Debug, Clone, Copy)]
pub enum OutputMode {
    /// Write to files in the server's factory's temporary directory.
    TmpFile,

    /// Write stdout/stderr to the console.
    Stdio,

    /// Redirect stdout/stderr to /dev/null.
    Null,
}

impl FromStr for OutputMode {
    type Err = std::io::Error;

    /// Parses an output mode name, ignoring case.
    ///
    /// Accepts `"file"` or `"tmpfile"` for [`OutputMode::TmpFile`], `"stdio"`
    /// for [`OutputMode::Stdio`], and `"null"` for [`OutputMode::Null`].
    ///
    /// # Errors
    ///
    /// Any other input yields an `InvalidData` I/O error whose message is the
    /// input exactly as supplied.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        const NAMES: [(&str, OutputMode); 4] = [
            ("file", OutputMode::TmpFile),
            ("tmpfile", OutputMode::TmpFile),
            ("stdio", OutputMode::Stdio),
            ("null", OutputMode::Null),
        ];

        let lowered = s.to_lowercase();
        NAMES
            .iter()
            .find(|(name, _)| *name == lowered)
            .map(|(_, mode)| *mode)
            .ok_or_else(|| {
                std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    s.to_string(),
                )
            })
    }
}

impl OutputMode {
    /// Returns the stdout/stderr handles to pass to processes using the
    /// specified logging mode.
    ///
    /// # Parameters
    ///
    /// - `directory`: The directory in which to store any files written under
    ///   the selected discipline.
    ///
    ///   If this directory does not already exist, it (and any parents) will
    ///   be created.
    /// - `file_prefix`: The prefix to add to the names of any files written
    ///   under the selected discipline.
    pub(crate) fn get_handles(
        &self,
        directory: &impl AsRef<Path>,
        file_prefix: &str,
    ) -> anyhow::Result<(Stdio, Stdio)> {
        match self {
            OutputMode::TmpFile => {
                let directory = directory.as_ref();

                // Make sure the output dir actually exists before creating log
                // files.
                std::fs::create_dir_all(directory).with_context(|| {
                    format!(
                        "failed to create log file directory {}",
                        directory.display()
                    )
                })?;

                let stdout_path =
                    directory.join(format!("{file_prefix}.stdout.log"));
                let stderr_path =
                    directory.join(format!("{file_prefix}.stderr.log"));
                info!(?stdout_path, ?stderr_path, "Opening server log files");
                let stdout = create_file(&stdout_path)?.into();
                let stderr = create_file(&stderr_path)?.into();

                Ok((stdout, stderr))
            }
            OutputMode::Stdio => Ok((Stdio::inherit(), Stdio::inherit())),
            OutputMode::Null => Ok((Stdio::null(), Stdio::null())),
        }
    }
}

/// Specifies how output for a test's processes should be structured.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LogFormat {
    /// Format logs as plain, hopefully human-readable, output.
    Plain,

    /// Format logs as Bunyan output, more suitable for machine processing (such
    /// as in CI).
    Bunyan,
}

/// Creates the file at `path` via `std::fs::File::create`, attaching the path
/// to the error if creation fails.
fn create_file(path: &impl AsRef<Path>) -> anyhow::Result<std::fs::File> {
    let target = path.as_ref();
    let created = std::fs::File::create(target);
    created
        .with_context(|| format!("failed to create file {}", target.display()))
}


================================================
FILE: phd-tests/framework/src/port_allocator.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! A small allocator for selecting port numbers.

use std::{
    ops::Range,
    sync::atomic::{AtomicU16, Ordering},
};

use thiserror::Error;

/// Errors that can be returned when allocating a port.
#[derive(Debug, Error)]
pub enum PortAllocatorError {
    /// Every port in the allocator's range has been handed out since the
    /// allocator was created or last reset.
    #[error("No more ports available")]
    NoMorePorts,
}

/// Hands out port numbers sequentially from a fixed range.
pub struct PortAllocator {
    /// The half-open range of ports this allocator may hand out.
    range: Range<u16>,

    /// PHD tests run in a `catch_unwind` block and so require mutable state
    /// that is created outside a test case to be unwind-safe. Allow the port
    /// allocator to be used in this context by guaranteeing that the next
    /// available port can be accessed atomically.
    next: AtomicU16,
}

impl PortAllocator {
    /// Creates an allocator that hands out the ports in `range`, starting at
    /// `range.start`.
    pub fn new(range: Range<u16>) -> Self {
        let start = range.start;
        Self { range, next: AtomicU16::new(start) }
    }

    /// Allocates the next unused port in the range.
    ///
    /// # Errors
    ///
    /// Returns [`PortAllocatorError::NoMorePorts`] once every port in the
    /// range has been handed out since construction or the last `reset`.
    pub fn next(&self) -> Result<u16, PortAllocatorError> {
        let end = self.range.end;

        // Check and advance the counter in a single atomic update. A separate
        // load followed by `fetch_add` lets concurrent callers push the
        // counter past `end`; near `u16::MAX` it would then wrap around and
        // later calls would hand out ports outside the range. Here the counter
        // is only ever advanced while it is below `end`, so it cannot pass
        // `end` and `port + 1` cannot overflow.
        self.next
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |port| {
                (port < end).then(|| port + 1)
            })
            .map_err(|_| PortAllocatorError::NoMorePorts)
    }

    /// Makes every port in the range available for allocation again.
    pub fn reset(&self) {
        self.next.store(self.range.start, Ordering::Relaxed);
    }
}


================================================
FILE: phd-tests/framework/src/serial/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Interfaces to access a guest's serial console.

use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use futures::SinkExt;
use propolis_client::support::InstanceSerialConsoleHelper;
use tokio::sync::{
    mpsc::{UnboundedReceiver, UnboundedSender},
    oneshot,
};
use tokio_tungstenite::tungstenite::Message;
use tracing::{debug, error, info};

mod raw_buffer;
mod vt80x24;

/// Describes a request to wait for a string to appear on the serial console or
/// in the console's back buffer.
///
/// A `Buffer` holds at most one waiter at a time (see
/// `Buffer::cancel_wait_for_output`).
struct OutputWaiter {
    /// The string this waiter should wait for.
    wanted: String,

    /// When the wait is satisfied, send the contents of the buffer prior to
    /// (and exclusive of) the waited-for string to this channel.
    preceding_tx: oneshot::Sender<String>,
}

/// An interface for objects that handle and buffer characters and commands a
/// guest writes to its serial console.
///
/// Implementations must be `Send` because the buffer lives inside the spawned
/// serial console task.
trait Buffer: Send {
    /// Processes the supplied `bytes` as input to the buffer.
    fn process_bytes(&mut self, bytes: &[u8]);

    /// Clears the unprocessed contents of the buffer.
    fn clear(&mut self);

    /// Registers a new request to wait for a string to appear in the buffer.
    fn register_wait_for_output(&mut self, waiter: OutputWaiter);

    /// Ensures there is no active request to wait for a string to appear in
    /// this buffer. Returns the previous active request if there was one.
    fn cancel_wait_for_output(&mut self) -> Option<OutputWaiter>;
}

/// The kind of buffering discipline to use for a guest's serial output.
///
/// Each kind maps to one `Buffer` implementation; see `new_buffer`.
#[derive(Debug)]
pub enum BufferKind {
    /// Assume that the guest will output characters and command bytes (like
    /// carriage returns and line feeds) "in the raw" without trying to
    /// implement its own buffering or scrollback.
    Raw,

    /// Assume that the guest believes it is sending commands to drive a
    /// VT100-compatible 80x24 terminal and emulate that terminal.
    Vt80x24,
}

/// The set of commands that the serial console can send to its processing task.
enum TaskCommand {
    /// Send the supplied bytes to the VM. The task signals `done` once it has
    /// attempted to send all of them; send failures are logged, not reported
    /// through `done`.
    SendBytes { bytes: Vec<u8>, done: oneshot::Sender<()> },

    /// Clears the contents of the task's console buffer. This does not cancel
    /// the active wait, if there is one.
    Clear,

    /// Register to be notified if and when a supplied string appears in the
    /// serial console's buffer.
    RegisterWait(OutputWaiter),

    /// Cancel any outstanding wait for bytes to appear in the buffer.
    CancelWait,

    /// Change the buffer kind to the supplied kind. Note that this command
    /// discards the current buffer's contents and cancels any active waits.
    ChangeBufferKind(BufferKind),

    /// Set the delay used to avoid keyboard debouncing logic in the guest.
    /// While the delay is nonzero, the serial task sends each byte in its own
    /// message and sleeps for the delay whenever the next byte to send is
    /// identical to the one just sent. If the delay is set to 0, the serial
    /// task will send Vecs of bytes to the guest in a single message.
    SetGuestWriteDelay(std::time::Duration),
}

/// A connection to a guest serial console made available on a particular guest
/// serial port.
///
/// Clones share the same underlying processing task.
#[derive(Clone)]
pub struct SerialConsole {
    /// Used to send commands to the worker task for this console.
    cmd_tx: UnboundedSender<TaskCommand>,
}

impl SerialConsole {
    /// Creates a new serial console connection.
    ///
    /// # Arguments
    ///
    /// - `serial_conn`: An upgraded websocket connection obtained from
    ///   successfully connecting to Propolis's serial console API.
    /// - `buffer_kind`: Supplies the buffering discipline to start with.
    pub async fn new(
        serial_helper: InstanceSerialConsoleHelper,
        buffer_kind: BufferKind,
        log_path: Utf8PathBuf,
    ) -> Result<Self> {
        let (cmd_tx, cmd_rx) = tokio::sync::mpsc::unbounded_channel();
        tokio::spawn(serial_task(serial_helper, buffer_kind, log_path, cmd_rx));

        Ok(Self { cmd_tx })
    }

    /// Directs the console worker thread to send the supplied `bytes` to the
    /// guest. Returns a `oneshot::Receiver` that the console worker thread
    /// signals once all the bytes have been set.
    pub fn send_bytes(
        &self,
        bytes: Vec<u8>,
    ) -> anyhow::Result<oneshot::Receiver<()>> {
        let (done, done_rx) = oneshot::channel();
        self.cmd_tx.send(TaskCommand::SendBytes { bytes, done })?;
        Ok(done_rx)
    }

    /// Directs the console worker thread to clear the serial console buffer.
    pub fn clear(&self) -> anyhow::Result<()> {
        self.cmd_tx.send(TaskCommand::Clear)?;
        Ok(())
    }

    /// Registers with the current buffer a request to wait for `wanted` to
    /// appear in the console buffer. When a match is found, the buffer sends
    /// all buffered characters preceding the match to `preceding_tx`. If the
    /// buffer already contains one or more matches at the time the waiter is
    /// registered, the last match is used to satisfy the wait immediately.
    ///
    /// Note that this function *does not* clear any characters from the buffer.
    /// Callers who want to retire previously-echoed characters in the buffer
    /// must explicitly call `clear`.
    pub fn register_wait_for_string(
        &self,
        wanted: String,
        preceding_tx: oneshot::Sender<String>,
    ) -> Result<()> {
        self.cmd_tx.send(TaskCommand::RegisterWait(OutputWaiter {
            wanted,
            preceding_tx,
        }))?;
        Ok(())
    }

    /// Cancels the outstanding wait on the current buffer, if there was one.
    pub fn cancel_wait_for_string(&self) -> Result<()> {
        self.cmd_tx.send(TaskCommand::CancelWait)?;
        Ok(())
    }

    /// Changes the buffering discipline for this console.
    pub fn change_buffer_kind(&self, kind: BufferKind) -> Result<()> {
        self.cmd_tx.send(TaskCommand::ChangeBufferKind(kind))?;
        Ok(())
    }

    /// Sets the delay to insert between sending individual bytes to the guest.
    pub fn set_repeated_character_debounce(
        &self,
        delay: std::time::Duration,
    ) -> Result<()> {
        self.cmd_tx.send(TaskCommand::SetGuestWriteDelay(delay))?;
        Ok(())
    }
}

/// Builds the `Buffer` implementation that corresponds to `kind`.
///
/// `log_path` is only used by the raw buffer, which receives an owned copy of
/// it.
///
/// # Errors
///
/// Fails if the raw buffer cannot be constructed.
fn new_buffer(
    kind: BufferKind,
    log_path: impl AsRef<Utf8Path>,
) -> Result<Box<dyn Buffer>> {
    let buffer: Box<dyn Buffer> = match kind {
        BufferKind::Raw => {
            let owned_path = log_path.as_ref().to_path_buf();
            Box::new(raw_buffer::RawBuffer::new(owned_path)?)
        }
        BufferKind::Vt80x24 => Box::new(vt80x24::Vt80x24::new()),
    };

    Ok(buffer)
}

/// Runs the serial console processing loop until either the command channel
/// or the websocket connection closes.
///
/// # Arguments
///
/// - `stream`: The connection to the target Propolis's serial console. Bytes
///   received from it are fed to the current buffer; bytes supplied via
///   `TaskCommand::SendBytes` are written to it.
/// - `initial_buffer_kind`: The buffering discipline to start with.
/// - `log_path`: Passed to `new_buffer` every time a buffer is constructed.
/// - `cmd_rx`: Receives commands from the `SerialConsole` handle(s) that own
///   this task.
///
/// # Panics
///
/// Panics if a buffer cannot be constructed, either initially or in response
/// to `TaskCommand::ChangeBufferKind`.
#[tracing::instrument(level = "info", name = "serial console task", skip_all)]
async fn serial_task(
    mut stream: InstanceSerialConsoleHelper,
    initial_buffer_kind: BufferKind,
    log_path: Utf8PathBuf,
    mut cmd_rx: UnboundedReceiver<TaskCommand>,
) {
    let mut buffer = new_buffer(initial_buffer_kind, &log_path).unwrap();
    // A zero delay means "send each `SendBytes` payload as one message".
    let mut debounce = std::time::Duration::from_secs(0);
    loop {
        tokio::select! {
            cmd = cmd_rx.recv() => {
                // `None` means every sender has been dropped, so nobody can
                // use this console any more.
                let Some(cmd) = cmd else {
                    debug!("serial console command channel was closed");
                    break;
                };
                match cmd {
                    TaskCommand::SendBytes { bytes, done } => {
                        if debounce.is_zero() {
                            if let Err(e) = stream.send(Message::Binary(bytes)).await {
                                error!(
                                    ?e,
                                    "failed to send input to serial console websocket"
                                );
                            }
                        } else {
                            // Send one byte per message, pausing only before
                            // a byte that repeats the one just sent.
                            let mut bytes = bytes.iter().peekable();
                            while let Some(b) = bytes.next() {
                                if let Err(e) = stream.send(Message::Binary(vec![*b])).await {
                                    error!(
                                        ?e,
                                        "failed to send input to serial console websocket"
                                    );
                                }

                                if let Some(next) = bytes.peek() {
                                    if *next == b {
                                        tokio::time::sleep(debounce).await;
                                    }
                                }
                            }
                        }

                        // The requester may have stopped listening; that is
                        // not an error. `done` is signaled even if a send
                        // failed above.
                        let _ = done.send(());
                    }
                    TaskCommand::Clear => buffer.clear(),
                    TaskCommand::RegisterWait(waiter) => {
                        buffer.register_wait_for_output(waiter);
                    }
                    TaskCommand::CancelWait => {
                        buffer.cancel_wait_for_output();
                    }
                    TaskCommand::ChangeBufferKind(kind) => {
                        // Replacing the buffer drops the old one along with
                        // its contents and any waiter it held.
                        buffer = new_buffer(kind, &log_path).unwrap();
                    }
                    TaskCommand::SetGuestWriteDelay(delay) => {
                        debounce = delay;
                    }
                }
            }
            msg = stream.recv() => {
                // Both "stream ended" and "receive error" end the task.
                let Some(Ok(msg)) = msg else {
                    info!("serial websocket closed unexpectedly");
                    break;
                };

                match msg.process().await {
                    Ok(Message::Binary(bytes)) => {
                        buffer.process_bytes(&bytes);
                    }
                    Ok(Message::Close(..)) => {
                        debug!("serial websocket closed");
                        break;
                    }
                    Ok(Message::Text(s)) => {
                        info!(s, "serial socket control message");
                    }
                    // Other message kinds and processing errors are ignored.
                    _ => continue,
                }
            },

        };
    }
}


================================================
FILE: phd-tests/framework/src/serial/raw_buffer.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Implements a "raw" buffer for serial console output that processes
//! characters and newlines but ignores VT100 control characters.

use std::io::{BufWriter, Write};

use anyhow::{Context, Result};
use camino::Utf8PathBuf;
use termwiz::escape::{
    csi::{Cursor::Right, CSI},
    parser::Parser,
};
use tracing::{error, trace};

use super::{Buffer, OutputWaiter};

/// A "raw" serial console buffer that handles incoming characters and newline
/// control bytes and nothing else.
pub(super) struct RawBuffer {
    /// Buffered writer for the serial console log file.
    log: std::io::BufWriter<std::fs::File>,
    /// Characters received since the last newline; written to `log` when the
    /// next newline arrives.
    line_buffer: String,
    /// Characters that the active (or a future) waiter is matched against.
    /// Emptied by `clear` and trimmed whenever a wait is satisfied.
    wait_buffer: String,
    /// The outstanding wait request, if there is one.
    waiter: Option<OutputWaiter>,
    /// Escape-sequence parser that turns incoming bytes into actions.
    parser: Parser,
}

impl RawBuffer {
    /// Constructs an empty buffer that logs completed lines to the file at
    /// `log_path`.
    ///
    /// # Errors
    ///
    /// Fails if the log file cannot be created.
    pub(super) fn new(log_path: Utf8PathBuf) -> Result<Self> {
        let log_file = std::fs::File::create(&log_path).with_context(|| {
            format!("opening serial console log file {log_path}")
        })?;

        Ok(Self {
            log: BufWriter::new(log_file),
            line_buffer: String::new(),
            wait_buffer: String::new(),
            waiter: None,
            parser: Parser::new(),
        })
    }

    /// Appends `c` to the buffer's contents and then re-checks the active
    /// wait, if there is one.
    ///
    /// A newline completes the pending line: the line is written to the log,
    /// the log is flushed, and the pending line is reset.
    ///
    /// # Panics
    ///
    /// Panics if writing to or flushing the log fails.
    fn push_character(&mut self, c: char) {
        match c {
            '\n' => {
                self.log.write_all(self.line_buffer.as_bytes()).unwrap();
                self.log.write_all(b"\n").unwrap();
                self.log.flush().unwrap();
                self.line_buffer.clear();
            }
            other => self.line_buffer.push(other),
        }

        // The newline itself is visible to waiters even though it is not kept
        // in the pending line.
        self.wait_buffer.push(c);
        self.recheck_wait();
    }

    /// Appends `s` to the buffer's contents and then re-checks the active
    /// wait, if there is one. `s` is presumed not to contain any control
    /// characters.
    fn push_str(&mut self, s: &str) {
        self.line_buffer.push_str(s);
        self.wait_buffer.push_str(s);
        self.recheck_wait();
    }

    /// Re-evaluates the stored wait, if any, against the current contents of
    /// the wait buffer.
    fn recheck_wait(&mut self) {
        if let Some(waiter) = self.waiter.take() {
            self.satisfy_or_set_wait(waiter);
        }
    }

    /// Satisfies `waiter` right away if possible; otherwise stores it so it
    /// can be checked again when more output arrives.
    ///
    /// The wait is satisfied when `wait_buffer` contains the waiter's string.
    /// Everything before the match is sent on the waiter's channel, the
    /// matched text is discarded, and whatever follows the match stays in the
    /// wait buffer.
    ///
    /// When the string occurs more than once, the *last* occurrence is the
    /// one that satisfies the wait.
    ///
    /// # Panics
    ///
    /// Panics if a wait is already stored, whether or not the new wait would
    /// have needed to be stored.
    fn satisfy_or_set_wait(&mut self, waiter: OutputWaiter) {
        assert!(self.waiter.is_none());
        trace!(
            contents = self.wait_buffer,
            target = waiter.wanted,
            "checking wait on raw serial buffer"
        );

        let Some(match_start) = self.wait_buffer.rfind(&waiter.wanted) else {
            self.waiter = Some(waiter);
            return;
        };

        // Remove the text before the match, then the match itself, leaving
        // only what followed it.
        let preceding: String =
            self.wait_buffer.drain(..match_start).collect();
        self.wait_buffer.replace_range(..waiter.wanted.len(), "");

        // Because incoming bytes from Propolis may be processed on a
        // separate task than the task that registered the wait, this can
        // race such that the wait is satisfied just as the waiter times out
        // and closes its half of the channel. There's nothing to be done
        // about this, so just ignore any errors here.
        let _ = waiter.preceding_tx.send(preceding);
    }
}

impl Buffer for RawBuffer {
    /// Parses `bytes` into terminal actions and feeds the printable ones into
    /// the buffer. Actions other than prints, line feeds, and cursor-right
    /// movements are logged at trace level and otherwise ignored.
    fn process_bytes(&mut self, bytes: &[u8]) {
        use termwiz::escape::{Action, ControlCode};

        for action in self.parser.parse_as_vec(bytes) {
            match action {
                Action::Print(c) => self.push_character(c),
                Action::PrintString(s) => self.push_str(&s),
                Action::Control(ControlCode::LineFeed) => {
                    self.push_character('\n');
                }
                Action::CSI(CSI::Cursor(Right(n))) => {
                    // Moving the cursor right by `n` cells is recorded as
                    // `n` spaces.
                    let padding = " ".repeat(n as usize);
                    self.push_str(&padding);
                }
                _ => trace!(?action, "raw buffer ignored action"),
            }
        }
    }

    /// Empties the wait buffer. The line buffer is left untouched.
    fn clear(&mut self) {
        self.wait_buffer.clear();
    }

    /// Registers `waiter`, completing it immediately if the wait buffer
    /// already contains the wanted text.
    fn register_wait_for_output(&mut self, waiter: OutputWaiter) {
        self.satisfy_or_set_wait(waiter);
    }

    /// Removes and returns the currently registered wait, if there is one.
    fn cancel_wait_for_output(&mut self) -> Option<OutputWaiter> {
        self.waiter.take()
    }
}

impl Drop for RawBuffer {
    /// Flushes the serial console log, reporting (but not propagating) any
    /// flush failure.
    fn drop(&mut self) {
        let Err(e) = self.log.flush() else {
            return;
        };

        error!(%e, "failed to flush serial console log during drop");
    }
}

#[cfg(test)]
mod test {
    use tokio::sync::oneshot;

    use super::*;

    /// Builds an empty buffer whose log output is discarded.
    fn make_buffer() -> RawBuffer {
        let null_sink = std::fs::OpenOptions::new()
            .write(true)
            .open("/dev/null")
            .unwrap();

        RawBuffer {
            log: std::io::BufWriter::new(null_sink),
            line_buffer: String::new(),
            wait_buffer: String::new(),
            waiter: None,
            parser: Parser::new(),
        }
    }

    /// Creates a waiter for `wanted` along with the receiver on which the
    /// text preceding the match will be delivered.
    fn wait_for(wanted: &str) -> (OutputWaiter, oneshot::Receiver<String>) {
        let (preceding_tx, rx) = oneshot::channel();
        (OutputWaiter { wanted: wanted.to_string(), preceding_tx }, rx)
    }

    #[tokio::test]
    async fn successful_wait_consumes_buffer_contents() {
        const SENTENCE: &str =
            "the quick brown fox jumped over the lazy propolis";

        // First case: the output is already buffered when the wait is
        // registered.
        let mut buf = make_buffer();
        let (waiter, mut rx) = wait_for("jumped over");
        buf.push_str(SENTENCE);
        buf.satisfy_or_set_wait(waiter);
        assert_eq!(rx.try_recv().unwrap(), "the quick brown fox ");
        assert_eq!(buf.wait_buffer, " the lazy propolis");

        // Second case: the wait is registered first and the output arrives
        // afterward.
        buf.clear();
        let (waiter, mut rx) = wait_for("jumped over");
        buf.satisfy_or_set_wait(waiter);
        buf.push_str(SENTENCE);
        assert_eq!(rx.try_recv().unwrap(), "the quick brown fox ");
        assert_eq!(buf.wait_buffer, " the lazy propolis");
    }

    #[tokio::test]
    async fn successful_wait_consumes_last_match() {
        let mut buf = make_buffer();
        let (waiter, mut rx) = wait_for("you");
        buf.push_str(
            "I put some Oxide in your Oxide so you can Oxide while you Oxide",
        );
        buf.satisfy_or_set_wait(waiter);

        // Of the three occurrences of "you", only the final one is consumed.
        assert_eq!(
            rx.try_recv().unwrap(),
            "I put some Oxide in your Oxide so you can Oxide while "
        );
        assert_eq!(buf.wait_buffer, " Oxide");
    }
}


================================================
FILE: phd-tests/framework/src/serial/vt80x24.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use termwiz::{
    color::ColorAttribute,
    escape::{
        csi::{Cursor, Edit, EraseInLine, CSI},
        parser::Parser,
        Action, ControlCode,
    },
    surface::{Change, Position, Surface},
};
use tracing::trace;

/// Number of rows in the simulated terminal. `trace_buffer_contents` uses this
/// as the number of lines it expects in the rendered screen contents.
const SURFACE_ROWS: usize = 24;

use super::{Buffer, OutputWaiter};

/// Simulates a VT100-compatible 80-by-24 terminal to buffer console output from
/// guests that assume they are driving such a terminal.
pub(super) struct Vt80x24 {
    /// Contains the currently registered request to wait for a string to appear
    /// on the virtual terminal, if such a request exists.
    ///
    /// This buffer does not support text wrapping; that is, a wait for a string
    /// longer than 80 characters with no intervening newline will never be
    /// satisfied, because all lines in the virtual buffer implicitly break at
    /// 80 characters.
    waiter: Option<OutputWaiter>,

    /// The virtual terminal contents, represented as 24 rows of 80 character
    /// cells each.
    surface: Surface,

    /// The parsing state machine that converts incoming bytes into VT100
    /// commands.
    parser: Parser,
}

impl Vt80x24 {
    pub(super) fn new() -> Self {
        Self {
            waiter: None,
            parser: Parser::new(),
            surface: Surface::new(80, 24),
        }
    }

    /// Converts a list of VT100 actions into a set of changes to the 80x24
    /// buffer, applies those changes, and attempts to satisfy the active wait.
    fn apply_actions(&mut self, actions: &[Action]) {
        // Unfortunately, there is no one-to-one mapping between VT100 actions
        // and `Change`s to a termwiz `Surface`. The match below is enough to
        // make buffering work for simple terminals that only print characters,
        // position the cursor, and use the 'erase to end of line' VT100
        // command. These commands are (fortunately) each representable in a
        // single `Change`. More complex commands (e.g. 'erase to start of
        // line') will have to be composed of multiple `Change`s if support for
        // them is needed.
        let to_change = |action: &Action| -> Option<Change> {
            let change = match action {
                Action::Print(c) => Some(Change::from(*c)),
                Action::PrintString(s) => Some(Change::from(s)),
                Action::Control(ctrl) => match ctrl {
                    ControlCode::LineFeed => Some(Change::from('\n')),
                    ControlCode::CarriageReturn => Some(Change::from('\r')),
                    _ => None,
                },
                Action::CSI(csi) => match csi {
                    CSI::Cursor(Cursor::Position { line, col }) => {
                        Some(make_absolute_cursor_position(
                            col.as_zero_based() as usize,
                            line.as_zero_based() as usize,
                        ))
                    }
                    CSI::Edit(Edit::EraseInLine(
                        EraseInLine::EraseToEndOfLine,
                    )) => {
                        Some(Change::ClearToEndOfLine(ColorAttribute::Default))
                    }
                    _ => None,
                },
                _ => None,
            };

            trace!(?action, ?change, "termwiz VT100 action");
            change
        };

        let changes = actions.iter().filter_map(to_change).collect();
        let seq = self.surface.add_changes(changes);
        self.surface.flush_changes_older_than(seq);

        if tracing::enabled!(tracing::Level::TRACE) {
            let contents = self.surface.screen_chars_to_string();
            trace_buffer_contents(&contents);
        }

        if let Some(waiter) = self.waiter.take() {
            self.satisfy_or_set_wait(waiter);
        }
    }

    /// Attempts to satisfy the wait described by `waiter`. If the wait is not
    /// immediately satisfiable, stores `waiter` to try again later.
    fn satisfy_or_set_wait(&mut self, waiter: OutputWaiter) {
        assert!(self.waiter.is_none());

        let _too_long = waiter.wanted.lines().find(|line| line.len() > 80);
        assert_eq!(
            _too_long, None,
            "vt80x24 waits for lines of more than 80 characters will never be \
            satisfied"
        );

        let mut contents = self.surface.screen_chars_to_string();
        if let Some(idx) = contents.rfind(&waiter.wanted) {
            contents.truncate(idx);
            let _ = waiter.preceding_tx.send(contents);
        } else {
            self.waiter = Some(waiter);
        }
    }
}

impl Buffer for Vt80x24 {
    /// Parses `bytes` into VT100 actions and applies them to the virtual
    /// screen.
    fn process_bytes(&mut self, bytes: &[u8]) {
        let parsed = self.parser.parse_as_vec(bytes);
        self.apply_actions(&parsed);
    }

    /// Blanks the virtual screen, then puts the cursor back at the position
    /// it occupied before the screen was cleared.
    fn clear(&mut self) {
        let (col, row) = self.surface.cursor_position();
        let changes = vec![
            Change::ClearScreen(ColorAttribute::Default),
            make_absolute_cursor_position(col, row),
        ];
        let seq = self.surface.add_changes(changes);
        self.surface.flush_changes_older_than(seq);
    }

    /// Registers `waiter`, completing it immediately if the screen already
    /// shows the wanted text.
    fn register_wait_for_output(&mut self, waiter: OutputWaiter) {
        self.satisfy_or_set_wait(waiter);
    }

    /// Removes and returns the currently registered wait, if there is one.
    fn cancel_wait_for_output(&mut self) -> Option<OutputWaiter> {
        self.waiter.take()
    }
}

/// Builds a termwiz `Change::CursorPosition` that moves the cursor to the
/// absolute, zero-based column `col` and row `row`.
fn make_absolute_cursor_position(col: usize, row: usize) -> Change {
    let x = Position::Absolute(col);
    let y = Position::Absolute(row);
    Change::CursorPosition { x, y }
}

/// Emits one trace event per screen row, from the top of `contents` down to
/// the last row that contains anything other than spaces. If every row is
/// blank, a single "buffer is empty" event is emitted instead.
///
/// The row arithmetic presumes `contents` holds exactly `SURFACE_ROWS` lines.
fn trace_buffer_contents(contents: &str) {
    let has_text = |row: &str| row.chars().any(|c| c != ' ');

    // Scanning from the bottom, count the blank rows below the last row that
    // has text in it.
    let Some(trailing_blank) = contents.lines().rev().position(has_text) else {
        trace!("termwiz buffer is empty");
        return;
    };

    let last_non_empty = SURFACE_ROWS - trailing_blank - 1;
    for (idx, line) in contents.lines().enumerate().take(last_non_empty + 1) {
        trace!(idx, line, "termwiz buffer contents");
    }
}


================================================
FILE: phd-tests/framework/src/test_vm/config.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;

use anyhow::Context;
use cpuid_utils::CpuidIdent;
use propolis_client::{
    instance_spec::{
        Board, BootOrderEntry, BootSettings, Chipset, Component, Cpuid,
        CpuidEntry, CpuidVendor, GuestHypervisorInterface, InstanceMetadata,
        InstanceSpec, MigrationFailureInjector, NvmeDisk, PciPath, SerialPort,
        SerialPortNumber, SpecKey, VirtioDisk, VirtioSocket,
    },
    support::nvme_serial_from_str,
};
use uuid::Uuid;

use crate::{
    disk::{DeviceName, DiskConfig, DiskSource},
    test_vm::spec::VmSpec,
    TestCtx,
};

/// The disk interface to use for a given guest disk.
#[derive(Clone, Copy, Debug)]
pub enum DiskInterface {
    /// Present the disk to the guest as a `VirtioDisk` component.
    Virtio,
    /// Present the disk to the guest as an `NvmeDisk` component.
    Nvme,
}

/// The kind of backend the disk factory should create for a guest disk.
#[derive(Clone, Copy, Debug)]
pub enum DiskBackend {
    /// A file-backed disk.
    File,
    /// A Crucible-backed disk, created with the given minimum size (in GiB)
    /// and block size.
    Crucible { min_disk_size_gib: u64, block_size: crate::disk::BlockSize },
    /// An in-memory disk, created with the given `readonly` setting.
    InMemory { readonly: bool },
}

/// A request to attach one disk to a VM, recorded by `VmConfig` and turned
/// into a disk handle and spec components when the VM spec is built.
#[derive(Clone, Debug)]
struct DiskRequest<'a> {
    /// The disk's name; also used as its device name when the disk is created.
    name: &'a str,
    /// The interface through which the guest sees the disk.
    interface: DiskInterface,
    /// The kind of backend to create for the disk.
    backend: DiskBackend,
    /// Where the disk's contents come from.
    source: DiskSource<'a>,
    /// The PCI device number for the disk; the bus and function are always 0.
    pci_device_num: u8,
}

/// A builder-style description of a test VM, converted into a `VmSpec` by
/// `vm_spec`.
pub struct VmConfig<'dr> {
    /// The name given to the resulting VM spec.
    vm_name: String,
    /// Number of vCPUs on the VM's board.
    cpus: u8,
    /// Amount of guest memory, in MiB.
    memory_mib: u64,
    /// Explicit CPUID entries for the guest, if any were requested.
    cpuid: Option<Vec<CpuidEntry>>,
    /// Name of the bootrom artifact to look up in the artifact store.
    bootrom_artifact: String,
    /// Names of the disks to boot from, in order, if a boot order is set.
    boot_order: Option<Vec<&'dr str>>,
    /// Every disk requested for this VM, in the order they were added.
    disks: Vec<DiskRequest<'dr>>,
    /// Migration failure injection settings, if any were requested.
    migration_failure: Option<MigrationFailureInjector>,
    /// The hypervisor interface to expose to the guest; the default is used
    /// when unset.
    guest_hv_interface: Option<GuestHypervisorInterface>,
    /// The virtio socket device to attach, if one was requested.
    vsock: Option<VirtioSocket>,
}

impl<'dr> VmConfig<'dr> {
    pub(crate) fn new(
        vm_name: &str,
        cpus: u8,
        memory_mib: u64,
        bootrom: &str,
        guest_artifact: &'dr str,
    ) -> Self {
        let mut config = Self {
            vm_name: vm_name.to_owned(),
            cpus,
            memory_mib,
            cpuid: None,
            bootrom_artifact: bootrom.to_owned(),
            boot_order: None,
            disks: Vec::new(),
            migration_failure: None,
            guest_hv_interface: None,
            vsock: None,
        };

        config.boot_disk(
            guest_artifact,
            DiskInterface::Nvme,
            DiskBackend::File,
            4,
        );

        config
    }

    pub fn cpus(&mut self, cpus: u8) -> &mut Self {
        self.cpus = cpus;
        self
    }

    pub fn memory_mib(&mut self, mem: u64) -> &mut Self {
        self.memory_mib = mem;
        self
    }

    pub fn bootrom(&mut self, artifact: &str) -> &mut Self {
        artifact.clone_into(&mut self.bootrom_artifact);
        self
    }

    pub fn named(&mut self, name: impl ToString) -> &mut Self {
        self.vm_name = name.to_string();
        self
    }

    pub fn cpuid(&mut self, entries: Vec<CpuidEntry>) -> &mut Self {
        self.cpuid = Some(entries);
        self
    }

    pub fn guest_hv_interface(
        &mut self,
        interface: GuestHypervisorInterface,
    ) -> &mut Self {
        self.guest_hv_interface = Some(interface);
        self
    }

    pub fn vsock(&mut self, guest_cid: u64, pci_device_num: u8) -> &mut Self {
        let pci_path = PciPath::new(0, pci_device_num, 0).unwrap();
        self.vsock = Some(VirtioSocket { guest_cid, pci_path });
        self
    }

    pub fn fail_migration_exports(&mut self, exports: u32) -> &mut Self {
        let injector =
            self.migration_failure.get_or_insert(MigrationFailureInjector {
                fail_exports: 0,
                fail_imports: 0,
            });
        injector.fail_exports = exports;
        self
    }

    pub fn fail_migration_imports(&mut self, imports: u32) -> &mut Self {
        let injector =
            self.migration_failure.get_or_insert(MigrationFailureInjector {
                fail_exports: 0,
                fail_imports: 0,
            });
        injector.fail_imports = imports;
        self
    }

    pub fn boot_order(&mut self, disks: Vec<&'dr str>) -> &mut Self {
        self.boot_order = Some(disks);
        self
    }

    pub fn clear_boot_order(&mut self) -> &mut Self {
        self.boot_order = None;
        self
    }

    /// Add a new disk to the VM config, and add it to the front of the VM's
    /// boot order.
    ///
    /// The added disk will have the name `boot-disk`, and replace the previous
    /// existing `boot-disk`.
    pub fn boot_disk(
        &mut self,
        artifact: &'dr str,
        interface: DiskInterface,
        backend: DiskBackend,
        pci_device_num: u8,
    ) -> &mut Self {
        let boot_order = self.boot_order.get_or_insert(Vec::new());
        if let Some(prev_boot_item) =
            boot_order.iter().position(|d| *d == "boot-disk")
        {
            boot_order.remove(prev_boot_item);
        }

        if let Some(prev_boot_disk) =
            self.disks.iter().position(|d| d.name == "boot-disk")
        {
            self.disks.remove(prev_boot_disk);
        }

        boot_order.insert(0, "boot-disk");

        self.data_disk(
            "boot-disk",
            DiskSource::Artifact(artifact),
            interface,
            backend,
            pci_device_num,
        );

        self
    }

    pub fn data_disk(
        &mut self,
        name: &'dr str,
        source: DiskSource<'dr>,
        interface: DiskInterface,
        backend: DiskBackend,
        pci_device_num: u8,
    ) -> &mut Self {
        self.disks.push(DiskRequest {
            name,
            interface,
            backend,
            source,
            pci_device_num,
        });
        self
    }

    pub async fn vm_spec(&self, ctx: &TestCtx) -> anyhow::Result<VmSpec> {
        let VmConfig {
            vm_name,
            cpus,
            memory_mib,
            cpuid,
            bootrom_artifact,
            boot_order,
            disks,
            migration_failure,
            guest_hv_interface,
            vsock,
        } = self;
        let framework = &ctx.framework;
        let bootrom_path = framework
            .artifact_store
            .get_bootrom(bootrom_artifact)
            .await
            .context("looking up bootrom artifact")?;

        // The first disk in the boot list might not be the disk a test
        // *actually* expects to boot.
        //
        // If there are multiple bootable disks in the boot order, we'll assume
        // they're all the same guest OS kind. So look for `boot-disk` - if
        // there isn't a disk named `boot-disk` then fall back to hoping the
        // first disk in the boot order is a bootable disk, and if *that* isn't
        // a bootable disk, maybe the first disk is.
        //
        // TODO: theoretically we might want to accept configuration of a
        // specific guest OS adapter and avoid the guessing games. So far the
        // above supports existing tests and makes them "Just Work", but a more
        // complicated test may want more control here.
        let boot_disk = disks
            .iter()
            .find(|d| d.name == "boot-disk")
            .or_else(|| {
                if let Some(boot_order) = boot_order.as_ref() {
                    boot_order
                        .first()
                        .and_then(|name| disks.iter().find(|d| &d.name == name))
                } else {
                    None
                }
            })
            .or_else(|| disks.first())
            .expect("VM config includes at least one disk");

        // XXX: assuming all bootable images are equivalent to the first, or at
        // least the same guest OS kind.
        let DiskSource::Artifact(boot_artifact) = boot_disk.source else {
            unreachable!("boot disks always use artifacts as sources");
        };

        let (_, guest_os_kind) = framework
            .artifact_store
            .get_guest_os_image(boot_artifact)
            .await
            .context("getting guest OS kind for boot disk")?;

        let mut disk_handles = Vec::new();
        for disk in disks.iter() {
            disk_handles.push(
                make_disk(disk.name.to_owned(), ctx, disk)
                    .await
                    .context("creating disk")?,
            );
        }

        let host_leaf_0 = cpuid_utils::host::query(CpuidIdent::leaf(0));
        let host_vendor = cpuid_utils::CpuidVendor::try_from(host_leaf_0)
            .map_err(|_| {
                anyhow::anyhow!(
                    "unknown host CPU vendor (leaf 0: {host_leaf_0:?})"
                )
            })?;

        let mut spec = InstanceSpec {
            board: Board {
                cpus: *cpus,
                memory_mb: *memory_mib,
                chipset: Chipset::default(),
                cpuid: cpuid.as_ref().map(|entries| Cpuid {
                    entries: entries.clone(),
                    vendor: match host_vendor {
                        cpuid_utils::CpuidVendor::Amd => CpuidVendor::Amd,
                        cpuid_utils::CpuidVendor::Intel => CpuidVendor::Intel,
                    },
                }),
                guest_hv_interface: guest_hv_interface
                    .as_ref()
                    .cloned()
                    .unwrap_or_default(),
            },
            components: Default::default(),
            smbios: None,
        };

        // Iterate over the collection of disks and handles and add spec
        // elements for all of them. This assumes the disk handles were created
        // in the correct order: boot disk first, then in the data disks'
        // iteration order.
        let all_disks = disks.iter().zip(disk_handles.iter());
        for (req, hdl) in all_disks {
            let pci_path = PciPath::new(0, req.pci_device_num, 0).unwrap();
            let backend_spec = hdl.backend_spec();
            let device_name = hdl.device_name().clone();
            let backend_name = device_name.clone().into_backend_name();
            let device_spec = match req.interface {
                DiskInterface::Virtio => Component::VirtioDisk(VirtioDisk {
                    backend_id: SpecKey::Name(
                        backend_name.clone().into_string(),
                    ),
                    pci_path,
                }),
                DiskInterface::Nvme => Component::NvmeDisk(NvmeDisk {
                    backend_id: SpecKey::Name(
                        backend_name.clone().into_string(),
                    ),
                    pci_path,
                    serial_number: nvme_serial_from_str(
                        device_name.as_str(),
                        // Omicron supplies (or will supply, as of this writing)
                        // 0 as the padding byte to maintain compatibility for
                        // existing disks. Match that behavior here so that PHD
                        // and Omicron VM configurations are as similar as
                        // possible.
                        0,
                    ),
                }),
            };

            let _old = spec
                .components
                .insert(device_name.into_string().into(), device_spec);
            assert!(_old.is_none());
            let _old = spec
                .components
                .insert(backend_name.into_string().into(), backend_spec);
            assert!(_old.is_none());
        }

        let _old = spec.components.insert(
            "com1".into(),
            Component::SerialPort(SerialPort { num: SerialPortNumber::Com1 }),
        );
        assert!(_old.is_none());

        if let Some(boot_order) = boot_order.as_ref() {
            let _old = spec.components.insert(
                "boot-settings".into(),
                Component::BootSettings(BootSettings {
                    order: boot_order
                        .iter()
                        .map(|item| BootOrderEntry {
                            id: SpecKey::Name(item.to_string()),
                        })
                        .collect(),
                }),
            );
            assert!(_old.is_none());
        }

        if let Some(vsock) = vsock {
            let _old = spec
                .components
                .insert("vsock".into(), Component::VirtioSocket(*vsock));
            assert!(_old.is_none());
        }

        if let Some(mig) = migration_failure.as_ref() {
            let _old = spec.components.insert(
                "migration-failure".into(),
                Component::MigrationFailureInjector(mig.clone()),
            );
            assert!(_old.is_none());
        }

        // Generate random identifiers for this instance's timeseries metadata.
        let sled_id = Uuid::new_v4();
        let metadata = InstanceMetadata {
            project_id: Uuid::new_v4(),
            silo_id: Uuid::new_v4(),
            sled_id,
            sled_model: "pheidippes".into(),
            sled_revision: 1,
            sled_serial: sled_id.to_string(),
        };

        Ok(VmSpec::new(
            vm_name.clone(),
            spec,
            disk_handles,
            guest_os_kind,
            bootrom_path,
            metadata,
        ))
    }
}

/// Asks the framework's disk factory to create the disk described by `req`,
/// using `device_name` as the new disk's device name.
///
/// # Errors
///
/// Returns the factory's error, annotated with the backend kind and the disk
/// source, if the disk cannot be created.
async fn make_disk(
    device_name: String,
    ctx: &TestCtx,
    req: &DiskRequest<'_>,
) -> anyhow::Result<Arc<dyn DiskConfig>> {
    let device_name = DeviceName::new(device_name);
    let factory = &ctx.framework.disk_factory;
    let source = &req.source;

    // Each factory method yields its own concrete disk type; every arm
    // erases it to the common `DiskConfig` trait object.
    let disk = match req.backend {
        DiskBackend::File => factory
            .create_file_backed_disk(device_name, source)
            .await
            .with_context(|| {
                format!("creating new file-backed disk from {source:?}")
            })? as Arc<dyn DiskConfig>,
        DiskBackend::Crucible { min_disk_size_gib, block_size } => factory
            .create_crucible_disk(
                device_name,
                source,
                min_disk_size_gib,
                block_size,
                &ctx.output_dir,
            )
            .await
            .with_context(|| {
                format!("creating new Crucible-backed disk from {source:?}")
            })?
            as Arc<dyn DiskConfig>,
        DiskBackend::InMemory { readonly } => factory
            .create_in_memory_disk(device_name, source, readonly)
            .await
            .with_context(|| {
                format!("creating new in-memory disk from {source:?}")
            })?
            as Arc<dyn DiskConfig>,
    };

    Ok(disk)
}


================================================
FILE: phd-tests/framework/src/test_vm/environment.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::net::{Ipv4Addr, SocketAddrV4};

use anyhow::Context;

use crate::{test_vm::server::ServerProcessParameters, TestCtx};

/// Specifies where the framework should start a new test VM.
///
/// Only local VMs are supported at present.
#[derive(Clone, Copy, Debug)]
pub enum VmLocation {
    /// Start the VM on the system where the test runner is executing.
    Local,
    // TODO: Support remote VMs.
}

/// Specifies where test VMs should report metrics to, if anywhere.
///
/// "Nowhere" is expressed by wrapping this type in an `Option` and using
/// `None`, as `EnvironmentSpec` does.
#[derive(Clone, Copy, Debug)]
pub enum MetricsLocation {
    /// Oximeter metrics should be reported to a server colocated with the test
    /// VM to be started.
    Local,
    // When the time comes to support remote VMs, it will presumably be useful
    // to have local and (perhaps multiple) remote VMs report metrics to the
    // same server. But we don't support remote VMs yet.
}

/// A builder-style description of the environment a test VM should run in.
#[derive(Clone, Debug)]
pub struct EnvironmentSpec {
    /// Where the VM should be started.
    pub(crate) location: VmLocation,
    /// Name of the Propolis server artifact to run the VM with.
    pub(crate) propolis_artifact: String,
    /// Where the VM should report metrics, or `None` for no metrics.
    pub(crate) metrics: Option<MetricsLocation>,
}

impl EnvironmentSpec {
    /// Creates a spec for the given location and Propolis server artifact,
    /// with metrics reporting disabled.
    pub(crate) fn new(location: VmLocation, propolis_artifact: &str) -> Self {
        let propolis_artifact = String::from(propolis_artifact);
        Self { location, propolis_artifact, metrics: None }
    }

    /// Sets where the VM should be started.
    pub fn location(&mut self, location: VmLocation) -> &mut Self {
        self.location = location;
        self
    }

    /// Sets the name of the Propolis server artifact to use.
    pub fn propolis(&mut self, artifact_name: &str) -> &mut Self {
        self.propolis_artifact.clear();
        self.propolis_artifact.push_str(artifact_name);
        self
    }

    /// Sets where the VM should report metrics; `None` disables reporting.
    pub fn metrics(&mut self, metrics: Option<MetricsLocation>) -> &mut Self {
        self.metrics = metrics;
        self
    }

    /// Resolves this spec against `ctx` into a concrete `Environment`.
    pub(crate) async fn build<'a>(
        &self,
        ctx: &'a TestCtx,
    ) -> anyhow::Result<Environment<'a>> {
        Environment::from_builder(self, ctx).await
    }
}

/// Specifies all of the details the framework needs to stand up a VM in a
/// specific environment.
///
/// When tests want to spawn a new VM, they pass a `VmLocation` to the
/// framework, and the framework augments that with the parameters needed to
/// launch a server process there: the Propolis server artifact, the server
/// and VNC addresses, the output directory, and the logging configuration.
#[derive(Clone, Debug)]
pub(crate) enum Environment<'a> {
    /// Run the VM in a Propolis server process started with these parameters.
    Local(ServerProcessParameters<'a>),
}

impl<'a> Environment<'a> {
    /// Resolves `builder` into a concrete environment: fetches the requested
    /// Propolis server artifact and allocates ports for the server and its
    /// VNC endpoint, both on the loopback address.
    ///
    /// # Errors
    ///
    /// Returns an error if the Propolis server artifact cannot be obtained or
    /// if a port cannot be allocated.
    async fn from_builder(
        builder: &EnvironmentSpec,
        ctx: &'a TestCtx,
    ) -> anyhow::Result<Self> {
        let framework = &ctx.framework;
        match builder.location {
            VmLocation::Local => {
                let server_path = framework
                    .artifact_store
                    .get_propolis_server(&builder.propolis_artifact)
                    .await
                    .context("setting up VM execution environment")?;

                let ports = &framework.port_allocator;
                let server_port =
                    ports.next().context("getting Propolis server port")?;
                let vnc_port =
                    ports.next().context("getting VNC server port")?;

                let metrics_addr = match builder.metrics {
                    // If the test requests metrics are local, we'll start
                    // an Oximeter stand-in for this VM when setting up this
                    // environment later. `start_local_vm` will patch in the
                    // actual server address when it has created one.
                    //
                    // If the VM is to be started remotely but requests
                    // "Local" metrics, that's probably an error.
                    Some(MetricsLocation::Local) | None => None,
                };

                let loopback =
                    |port| SocketAddrV4::new(Ipv4Addr::LOCALHOST, port);
                Ok(Self::Local(ServerProcessParameters {
                    server_path,
                    output_dir: ctx.output_dir.as_path(),
                    server_addr: loopback(server_port),
                    metrics_addr,
                    vnc_addr: loopback(vnc_port),
                    log_config: framework.log_config,
                }))
            }
        }
    }
}


================================================
FILE: phd-tests/framework/src/test_vm/metrics.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::net::SocketAddr;
use std::sync::{Arc, Mutex};
use std::time::Duration;

use crate::log_config::{LogConfig, LogFormat};
use dropshot::{
    endpoint, ApiDescription, ConfigDropshot, HttpError, HttpResponseCreated,
    HttpServer, HttpServerStarter, RequestContext, TypedBody,
};
use omicron_common::api::internal::nexus::{
    ProducerEndpoint, ProducerKind, ProducerRegistrationResponse,
};
use oximeter::types::ProducerResults;
use slog::{Drain, Logger};
use tokio::sync::watch;
use tracing::trace;
use uuid::Uuid;

/// Re-registration interval for tests (300 seconds). A long value here helps
/// avoid log spew from Oximeter, which will re-register after about 1/6th of
/// this interval elapses.
const INTERVAL: Duration = Duration::from_secs(300);

/// Builds the logger for the fake Oximeter server. Output is written through
/// `slog_term::TestStdoutWriter`, in Bunyan format when `log_config` asks for
/// it and in slog-term's full format otherwise.
fn oximeter_logger(log_config: LogConfig) -> Logger {
    // Morally the fake Oximeter server is a distinct process that happens to
    // cohabitate with the test process. If the log config is such that we want
    // to log supporting processes to their own files, the Oximeter server's
    // logs probably should be in distinct files too.
    let component = slog::o!("component" => "phd-oximeter-consumer");

    if log_config.log_format == LogFormat::Bunyan {
        let bunyan = slog_bunyan::default(slog_term::TestStdoutWriter);
        let drain = Arc::new(Mutex::new(bunyan)).fuse();
        return Logger::root(drain, component);
    }

    let decorator =
        slog_term::PlainSyncDecorator::new(slog_term::TestStdoutWriter);
    let drain = slog_term::FullFormat::new(decorator).build().fuse();
    Logger::root(drain, component)
}

/// Identifies a metrics producer that has registered with the fake Oximeter
/// server.
struct OximeterProducerInfo {
    /// The address taken from the producer's registration.
    addr: std::net::SocketAddr,
    /// The ID taken from the producer's registration.
    uuid: Uuid,
}

/// A stand-in for an Oximeter server, implemented as a Dropshot HTTP server.
pub(crate) struct FakeOximeterServer {
    /// The underlying HTTP server, which owns the shared server state.
    server: HttpServer<FakeOximeterServerState>,
}

/// State held by the fake Oximeter server: both ends of a watch channel that
/// carries the most recently registered producer, if any.
pub(crate) struct FakeOximeterServerState {
    /// Publishes a new value whenever producer info is set.
    sampler_sender: watch::Sender<Option<OximeterProducerInfo>>,
    /// Receiving end of the channel; cloned to create samplers.
    sampler: watch::Receiver<Option<OximeterProducerInfo>>,
}

impl FakeOximeterServer {
    /// Returns the address the underlying HTTP server is bound to.
    pub fn local_addr(&self) -> SocketAddr {
        self.server.local_addr()
    }

    /// Creates a sampler that observes the producer info recorded in this
    /// server's state.
    pub fn sampler(&self) -> FakeOximeterSampler {
        let state = self.server.app_private();
        FakeOximeterSampler { sampler: state.sampler.clone() }
    }
}

/// A handle for observing the producer registered with a
/// `FakeOximeterServer`.
pub struct FakeOximeterSampler {
    /// Receives the most recently registered producer info, if any.
    sampler: watch::Receiver<Option<OximeterProducerInfo>>,
}

impl FakeOximeterServerState {
    /// Creates server state with no producer registered yet.
    fn new() -> Self {
        let (sampler_sender, sampler) = watch::channel(None);

        Self { sampler_sender, sampler }
    }

    /// Records `info` as the current producer and notifies all samplers.
    ///
    /// Panics if the producer is of any kind other than
    /// `ProducerKind::Instance`.
    async fn set_producer_info(&self, info: ProducerEndpoint) {
        // Just don't know what to do with other ProducerKinds, if or when we'll
        // see them here..
        assert_eq!(info.kind, ProducerKind::Instance);

        let registration =
            Some(OximeterProducerInfo { uuid: info.id, addr: info.address });

        // `self` keeps a Receiver of its own, so the channel can never be
        // without subscribers and the send cannot fail.
        let sent = self.sampler_sender.send(registration);
        sent.expect("channel is subscribed");
    }
}

impl FakeOximeterSampler {
    /// Sample Propolis' Oximeter metrics, taking some function that determines
    /// if a sample is satisfactory for the caller to proceed with.
    ///
    /// `wait_for_propolis_stats` will poll the corresponding Oximeter producer
    /// and call `f` with each returned set of results.
    ///
    /// Panics if `f` does not return `Some` after some number of retries and
    /// `ProducerResults` updates.
    pub async fn wait_for_propolis_stats<U>(
        &self,
        f: impl Fn(ProducerResults) -> Option<U>,
    ) -> U {
        let result = backoff::future::retry(
            backoff::ExponentialBackoff {
                max_interval: Duration::from_secs(1),
                max_elapsed_time: Some(Duration::from_secs(10)),
                ..Default::default()
            },
            || async {
                let producer_results = self.sample_propolis_stats().await
                    .map_err(backoff::Error::transient)?;

                if let Some(metrics) = f(producer_results) {
                    Ok(metrics)
                } else {
                    Err(backoff::Error::transient(anyhow::anyhow!(
                        "full metrics sample not available or fresh enough (yet?)"
                    )))
                }
            },
        )
        .await;

        result.expect("propolis-server Oximeter stats should become available")
    }

    /// Sample Propolis' Oximeter metrics, including the timestamp of the oldest
    /// metric reflected in the sample.
    ///
    /// Returns `None` for some kinds of incomplete stats or when no stats are
    /// available at all.
    async fn sample_propolis_stats(
        &self,
    ) -> Result<ProducerResults, anyhow::Error> {
        let metrics_url = {
            self.sampler
                .clone()
                .wait_for(Option::is_some)
                .await
                .expect("can recv");
            let sampler = self.sampler.borrow();
            let stats = sampler.as_ref().expect("sampler does not become None");
            format!("http://{}/{}", stats.addr, stats.uuid)
        };
        let res = reqwest::Client::new()
            .get(metrics_url)
            .send()
            .await
            .expect("can send oximeter stats request");
        assert!(
            res.status().is_success(),
            "failed to fetch stats from propolis-server"
        );
        trace!(?res, "got stats response");
        Ok(res.json::<ProducerResults>().await?)
    }
}

// Stand-in for the producer registration endpoint of our fake Nexus. Oximeter
// producers under test (`propolis-server`) register here; the registration is
// recorded in the server state and a fixed lease duration is returned.
#[endpoint {
    method = POST,
    path = "/metrics/producers",
}]
async fn register_producer(
    rqctx: RequestContext<FakeOximeterServerState>,
    producer_info: TypedBody<ProducerEndpoint>,
) -> Result<HttpResponseCreated<ProducerRegistrationResponse>, HttpError> {
    let info = producer_info.into_inner();
    trace!(?info, "producer registration");

    let state = rqctx.context();
    state.set_producer_info(info).await;

    let response = ProducerRegistrationResponse { lease_duration: INTERVAL };
    Ok(HttpResponseCreated(response))
}

// Start a Dropshot server mocking the Oximeter registration endpoint we would
// expect from Nexus. The server binds an ephemeral port on the IPv6 loopback
// address; panics if the endpoint cannot be registered or the server cannot
// be started.
pub fn spawn_fake_oximeter_server(log_config: LogConfig) -> FakeOximeterServer {
    let log = oximeter_logger(log_config);

    let config = ConfigDropshot {
        bind_address: "[::1]:0".parse().unwrap(),
        default_request_body_max_bytes: 2048,
        ..Default::default()
    };

    let mut api = ApiDescription::new();
    api.register(register_producer).expect("Expected to register endpoint");

    let starter = HttpServerStarter::new(
        &config,
        api,
        FakeOximeterServerState::new(),
        &log,
    )
    .expect("Expected to start Dropshot server");
    let server = starter.start();

    slog::info!(
        log,
        "fake oximeter test server listening";
        "address" => ?server.local_addr(),
    );

    FakeOximeterServer { server }
}


================================================
FILE: phd-tests/framework/src/test_vm/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Routines for starting VMs, changing their states, and interacting with their
//! guest OSes.

use std::{
    collections::HashMap, fmt::Debug, net::SocketAddr, sync::Arc,
    time::Duration,
};

use crate::{
    disk::{crucible::CrucibleDisk, DiskConfig},
    guest_os::{
        self, windows::WindowsVm, CommandSequence, CommandSequenceEntry,
        GuestOs, GuestOsKind,
    },
    serial::{BufferKind, SerialConsole},
    test_vm::{
        environment::Environment, server::ServerProcessParameters, spec::VmSpec,
    },
    TestCtx,
};

use anyhow::{anyhow, bail, Context, Result};
use camino::Utf8PathBuf;
use core::result::Result as StdResult;
use futures::FutureExt;
use propolis_client::{
    instance_spec::{
        Component, InstanceProperties, InstanceSpecGetResponse,
        ReplacementComponent,
    },
    support::{InstanceSerialConsoleHelper, WSClientOffset},
    types::{
        InstanceEnsureRequest, InstanceGetResponse,
        InstanceInitializationMethod, InstanceMigrateStatusResponse,
        InstanceSerialConsoleHistoryResponse, InstanceState,
        InstanceStateRequested, MigrationState,
    },
};
use propolis_client::{Client, ResponseValue};
use thiserror::Error;
use tokio::{
    sync::{mpsc::UnboundedSender, oneshot, watch},
    task::JoinHandle,
    time::timeout,
};
use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
use uuid::Uuid;

/// Error type produced by Propolis client requests issued from this module.
type PropolisClientError =
    propolis_client::Error<propolis_client::types::Error>;
/// Outcome of a Propolis client request: the response wrapped in a
/// `ResponseValue` on success, or a `PropolisClientError`.
type PropolisClientResult<T> = StdResult<ResponseValue<T>, PropolisClientError>;

pub(crate) mod config;
pub(crate) mod environment;
pub(crate) mod metrics;
mod server;
pub(crate) mod spec;

pub use config::*;
pub use environment::{MetricsLocation, VmLocation};
pub use metrics::FakeOximeterSampler;

use self::environment::EnvironmentSpec;

/// Errors raised when an operation is attempted on a VM that is in the wrong
/// lifecycle state for it.
#[derive(Debug, Error)]
pub enum VmStateError {
    /// The operation requires a VM that has already been ensured.
    #[error("Operation can only be performed on a VM that has been ensured")]
    InstanceNotEnsured,

    /// The operation requires a VM that has not been ensured yet, e.g. an
    /// ensure or migrate-in attempt on an already-ensured VM.
    #[error(
        "Operation can only be performed on a new VM that has not been ensured"
    )]
    InstanceAlreadyEnsured,
}

/// Components to substitute into a migration target's spec, keyed by
/// component ID.
type ReplacementComponents = HashMap<String, ReplacementComponent>;

/// Parameters needed to initialize a VM as the target of a live migration.
#[derive(Clone, Debug)]
struct MigrationInfo {
    /// ID of the migration attempt, passed through to the ensure request.
    migration_id: Uuid,
    /// Address of the source Propolis server to migrate from.
    src_addr: SocketAddr,
    /// Components the target should use in place of the source's.
    replace_components: ReplacementComponents,
}

/// Specifies the timeout to apply to an attempt to migrate.
///
/// The default is [`MigrationTimeout::InferFromMemorySize`]. A
/// `std::time::Duration` converts into [`MigrationTimeout::Explicit`].
#[derive(Debug, Default)]
pub enum MigrationTimeout {
    /// Time out after the specified duration.
    Explicit(std::time::Duration),

    /// Allow MIGRATION_SECS_PER_GUEST_GIB seconds per GiB of guest memory.
    #[default]
    InferFromMemorySize,
}

/// The number of seconds to add to the migration timeout per GiB of memory in
/// the migrating VM.
const MIGRATION_SECS_PER_GUEST_GIB: u64 = 90;

impl From<std::time::Duration> for MigrationTimeout {
    /// Wraps an explicit duration as a migration timeout.
    fn from(value: std::time::Duration) -> Self {
        Self::Explicit(value)
    }
}

/// How long to wait for expected output to show up on the serial console.
#[derive(Debug)]
pub enum SerialOutputTimeout {
    /// Give up once this much time has passed.
    Explicit(Duration),

    /// The wait is nested inside a larger operation that enforces its own
    /// timeout, so no separate limit is applied here.
    CallerTimeout,
}

impl From<Duration> for SerialOutputTimeout {
    /// A bare duration is treated as an explicit timeout.
    fn from(duration: Duration) -> Self {
        SerialOutputTimeout::Explicit(duration)
    }
}

impl From<SerialOutputTimeout> for Duration {
    /// Converts to a concrete duration; a caller-managed timeout maps to the
    /// largest representable duration.
    fn from(timeout: SerialOutputTimeout) -> Self {
        match timeout {
            SerialOutputTimeout::CallerTimeout => Self::MAX,
            SerialOutputTimeout::Explicit(duration) => duration,
        }
    }
}

/// Specifies the mechanism a new VM should use to obtain a serial console.
enum InstanceConsoleSource<'a> {
    /// Connect a new console to the VM's server's serial console endpoint.
    New,

    /// Clone an existing console connection from the supplied VM. Fails if
    /// that VM has not been ensured, since it then has no console to clone.
    InheritFrom(&'a TestVm),
}

/// Lifecycle state of a `TestVm` as tracked by the test framework.
enum VmState {
    /// No ensure request has succeeded yet; the VM has no serial console.
    New,
    /// The instance has been ensured and `serial` is its serial console.
    Ensured { serial: SerialConsole },
}

/// Description of the acceptable status codes from executing a command in a
/// [`TestVm::run_shell_command`].
// This could reasonably have a `Status(u16)` variant to check specific non-zero
// statuses, but specific codes are not terribly portable! In the few cases we
// can expect a specific status for errors, those specific codes change between
// f.ex illumos and Linux guests.
enum StatusCheck {
    /// The command must exit with status 0.
    Ok,
    /// The command must exit with a non-zero status.
    NotOk,
}

/// A pending shell command for a guest. Awaiting it (via `IntoFuture`) runs
/// the command over the serial console and yields its trimmed output.
pub struct ShellOutputExecutor<'ctx> {
    /// The VM whose guest shell will run the command.
    vm: &'ctx TestVm,
    /// The command text to run.
    cmd: &'ctx str,
    /// Exit-status requirement to enforce after the command runs; `None`
    /// skips the status check entirely.
    status_check: Option<StatusCheck>,
}

impl<'a> ShellOutputExecutor<'a> {
    /// Do not inspect the command's exit status at all.
    pub fn ignore_status(self) -> ShellOutputExecutor<'a> {
        Self { status_check: None, ..self }
    }

    /// Require the command to exit with status 0.
    pub fn check_ok(self) -> ShellOutputExecutor<'a> {
        Self { status_check: Some(StatusCheck::Ok), ..self }
    }

    /// Require the command to exit with a non-zero status.
    pub fn check_err(self) -> ShellOutputExecutor<'a> {
        Self { status_check: Some(StatusCheck::NotOk), ..self }
    }
}

impl<'a> std::future::IntoFuture for ShellOutputExecutor<'a> {
    type Output = Result<String>;
    type IntoFuture = futures::future::BoxFuture<'a, Result<String>>;

    /// Runs the command in the guest and resolves to its trimmed output.
    ///
    /// If a status check was requested, `echo $?` is issued afterwards and
    /// the reported exit status is validated against it.
    ///
    /// # Errors
    ///
    /// Fails if a command sequence cannot be run, if the shell prompt does
    /// not reappear within the output timeout, if the exit status cannot be
    /// parsed, or if the status does not satisfy the requested check.
    fn into_future(self) -> Self::IntoFuture {
        // How long to wait for the shell prompt to come back after issuing a
        // command.
        const SHELL_OUTPUT_TIMEOUT: Duration = Duration::from_secs(300);

        // `boxed()` performs the one heap allocation `BoxFuture` needs;
        // wrapping the block in `Box::pin` first would box the future twice.
        async move {
            // Allow the guest OS to transform the input command into a
            // guest-specific command sequence. This accounts for the guest's
            // shell type (which affects e.g. how it displays multi-line
            // commands) and serial console buffering discipline.
            let command_sequence =
                self.vm.guest_os.shell_command_sequence(self.cmd);
            self.vm.run_command_sequence(command_sequence).await?;

            // `shell_command_sequence` promises that the generated command
            // sequence clears buffer of everything up to and including the
            // input command before actually issuing the final '\n' that issues
            // the command.  This ensures that the buffer contents returned by
            // this call contain only the command's output.
            let output = self
                .vm
                .wait_for_serial_output(
                    self.vm.guest_os.get_shell_prompt(),
                    SHELL_OUTPUT_TIMEOUT,
                )
                .await?;

            // Trim any leading newlines inserted when the command was issued
            // and any trailing whitespace that isn't actually part of the
            // command output. Any other embedded whitespace is the caller's
            // problem.
            let output = output.trim().to_string();

            if let Some(check) = self.status_check {
                let status_command_sequence =
                    self.vm.guest_os.shell_command_sequence("echo $?");
                self.vm.run_command_sequence(status_command_sequence).await?;
                let status = self
                    .vm
                    .wait_for_serial_output(
                        self.vm.guest_os.get_shell_prompt(),
                        SHELL_OUTPUT_TIMEOUT,
                    )
                    .await?;
                // Say what the guest actually printed if it is not a number;
                // a bare parse error is very hard to diagnose from test logs.
                let status =
                    status.trim().parse::<u16>().with_context(|| {
                        format!("parsing command exit status from {status:?}")
                    })?;

                match check {
                    StatusCheck::Ok => {
                        if status != 0 {
                            bail!("expected status 0, got {}", status);
                        }
                    }
                    StatusCheck::NotOk => {
                        if status == 0 {
                            bail!("expected non-zero status, got {}", status);
                        }
                    }
                }
            }

            Ok(output)
        }
        .boxed()
    }
}

/// A virtual machine running in a Propolis server. Test cases create these VMs
/// using the `factory::VmFactory` embedded in their test contexts.
///
/// Once a VM has been created, tests will usually want to issue [`TestVm::run`]
/// and [`TestVm::wait_to_boot`] calls so they can begin interacting with the
/// serial console.
pub struct TestVm {
    /// The instance ID reported to Propolis in the instance properties.
    id: Uuid,
    /// Client used to issue API requests to this VM's Propolis server.
    client: Client,
    /// Handle to the Propolis server process hosting this VM.
    server: Option<server::PropolisServer>,
    /// Fake Oximeter server collecting this VM's metrics registrations, if the
    /// environment spec asked for one.
    metrics: Option<metrics::FakeOximeterServer>,
    /// The spec this VM was created from.
    spec: VmSpec,
    /// The environment spec this VM was created with.
    environment_spec: EnvironmentSpec,
    /// Output directory copied from the server process parameters.
    output_dir: Utf8PathBuf,

    /// Adapter describing how to drive this VM's guest OS (login sequence,
    /// shell prompt, command formatting).
    guest_os: Box<dyn GuestOs>,

    /// Whether the instance has been ensured, and its serial console if so.
    state: VmState,

    /// If we should wait for operator intervention before terminating this
    /// instance, this will be populated.
    manual_stop: Option<TestVmManualStop>,

    /// Sending a task handle to this channel will ensure that the task runs to
    /// completion as part of the post-test cleanup fixture (i.e. before any
    /// other tests run).
    cleanup_task_tx: UnboundedSender<JoinHandle<()>>,
}

impl TestVm {
    /// Creates a new Propolis server for the VM described by `spec` and
    /// attaches a client to it.
    ///
    /// The returned VM is left in its initial, un-ensured state: no
    /// `instance_ensure` request is sent until [`TestVm::instance_ensure`],
    /// [`TestVm::launch`], or [`TestVm::migrate_from`] is called.
    ///
    /// # Arguments
    ///
    /// - ctx: The test context. It is used to build the environment and
    ///   supplies the cleanup task channel and manual-stop setting the VM
    ///   will use.
    /// - spec: The VM spec (name, guest OS kind, instance spec, etc.) the VM
    ///   will use.
    /// - environment: The environment spec describing where and how the VM's
    ///   server should be started.
    ///
    /// # Errors
    ///
    /// Fails if the environment cannot be built or the server cannot be
    /// started.
    #[instrument(skip_all)]
    pub(crate) async fn new(
        ctx: &TestCtx,
        spec: VmSpec,
        environment: &EnvironmentSpec,
    ) -> Result<Self> {
        let id = Uuid::new_v4();
        let guest_os_kind = spec.guest_os_kind;

        let vm_name = &spec.vm_name;

        // TODO(#735): It would be nice to log the instance spec here too, but
        // this is extremely noisy for disks with an in-memory disk backend. The
        // problem is that this spec is a propolis-client generated type with a
        // derived Debug impl. This can be fixed by making propolis-client
        // re-export the instance spec types from propolis_api_types (instead of
        // generating them) so that it can pick up the latter crate's explicit
        // Debug impls for verbose component types.
        info!(%vm_name, ?guest_os_kind, ?environment);

        match environment
            .build(ctx)
            .await
            .context("building environment for new VM")?
        {
            Environment::Local(params) => Self::start_local_vm(
                id,
                spec,
                environment.clone(),
                params,
                ctx.framework.cleanup_task_channel(),
                ctx.manual_stop.clone(),
            ),
        }
    }

    /// Starts a Propolis server process on the local machine and wraps it in
    /// a `TestVm` that is still in the `New` (un-ensured) state.
    ///
    /// If the environment spec asks for local metrics, a fake Oximeter server
    /// is started first and its address is handed to the Propolis server via
    /// `params.metrics_addr`.
    fn start_local_vm(
        vm_id: Uuid,
        vm_spec: VmSpec,
        environment_spec: EnvironmentSpec,
        mut params: ServerProcessParameters,
        cleanup_task_tx: UnboundedSender<JoinHandle<()>>,
        manual_stop: Option<TestVmManualStop>,
    ) -> Result<Self> {
        let metrics = match environment_spec.metrics.as_ref() {
            None => None,
            Some(MetricsLocation::Local) => {
                // The fake oximeter server follows the same logging
                // discipline as any other subprocess started in support of
                // the test, so reuse the config from the server parameters.
                let fake_oximeter =
                    metrics::spawn_fake_oximeter_server(params.log_config);
                params.metrics_addr = Some(fake_oximeter.local_addr());
                Some(fake_oximeter)
            }
        };

        // Grab what we need from `params` before the server consumes it.
        let server_addr = params.server_addr;
        let output_dir = params.output_dir.to_path_buf();

        let server = server::PropolisServer::new(
            &vm_spec.vm_name,
            params,
            &vm_spec.bootrom_path,
        )?;

        let base_url = format!("http://{server_addr}");
        let client = Client::new(&base_url);
        let guest_os = guest_os::get_guest_os_adapter(vm_spec.guest_os_kind);

        Ok(Self {
            id: vm_id,
            client,
            server: Some(server),
            metrics,
            spec: vm_spec,
            environment_spec,
            output_dir,
            guest_os,
            state: VmState::New,
            manual_stop,
            cleanup_task_tx,
        })
    }

    /// Returns the logical name of this VM, as given in its spec.
    pub fn name(&self) -> &str {
        &self.spec.vm_name
    }

    /// Returns clones of the shared disk handles held in this VM's spec.
    pub fn cloned_disk_handles(&self) -> Vec<Arc<dyn crate::disk::DiskConfig>> {
        self.spec.disk_handles.clone()
    }

    /// Returns a copy of the spec this VM was created from.
    pub fn vm_spec(&self) -> VmSpec {
        self.spec.clone()
    }

    /// Returns a copy of the environment spec this VM was created with.
    pub fn environment_spec(&self) -> EnvironmentSpec {
        self.environment_spec.clone()
    }

    /// Builds the instance properties sent to Propolis when ensuring this VM:
    /// its ID, a name derived from that ID, the spec's metadata, and a fixed
    /// description.
    pub fn instance_properties(&self) -> InstanceProperties {
        let id = self.id;
        let name = format!("phd-vm-{}", self.id);
        let description = "Pheidippides-managed VM".to_string();
        let metadata = self.spec.metadata.clone();

        InstanceProperties { id, name, metadata, description }
    }

    /// Returns a sampler for this VM's metrics, or `None` if the VM was
    /// started without a fake Oximeter server.
    pub fn metrics_sampler(&self) -> Option<FakeOximeterSampler> {
        let fake_oximeter = self.metrics.as_ref()?;
        Some(fake_oximeter.sampler())
    }

    /// Sends an instance ensure request to this VM's server, allowing it to
    /// transition into the running state.
    #[instrument(skip_all, fields(vm = self.spec.vm_name, vm_id = %self.id))]
    async fn instance_ensure_internal<'a>(
        &self,
        migrate: Option<MigrationInfo>,
        console_source: InstanceConsoleSource<'a>,
    ) -> Result<SerialConsole> {
        if let VmState::Ensured { .. } = self.state {
            return Err(VmStateError::InstanceAlreadyEnsured.into());
        }

        let init = match migrate {
            None => InstanceInitializationMethod::Spec {
                spec: self.spec.instance_spec(),
            },
            Some(info) => InstanceInitializationMethod::MigrationTarget {
                migration_id: info.migration_id,
                replace_components: info.replace_components,
                src_addr: info.src_addr.to_string(),
            },
        };
        let ensure_req = InstanceEnsureRequest {
            properties: self.instance_properties(),
            init,
        };

        // There is a brief period where the Propolis server process has begun
        // to run but hasn't started its Dropshot server yet. Ensure requests
        // that land in that window will fail, so retry them.
        //
        // The `instance_ensure` and `instance_spec_ensure` endpoints return the
        // same response type, so (with some gnarly writing out of the types)
        // it's possible to create a boxed future that abstracts over the
        // caller's chosen endpoint.
        let ensure_fn = || async {
            let result = self
                .client
                .instance_ensure()
                .body(ensure_req.clone())
                .send()
                .await;
            if let Err(e) = result {
                match e {
                    propolis_client::Error::CommunicationError(_) => {
                        info!(%e, "retriable error from instance_spec_ensure");
                        Err(backoff::Error::transient(e))
                    }
                    _ => {
                        error!(%e, "permanent error from instance_spec_ensure");
                        Err(backoff::Error::permanent(e))
                    }
                }
            } else {
                Ok(())
            }
        };

        // It shouldn't ever take more than a couple of seconds for the Propolis
        // server to come to life. (If it does, that should be considered a bug
        // impacting VM startup times.)
        backoff::future::retry(
            backoff::ExponentialBackoff {
                max_elapsed_time: Some(std::time::Duration::from_secs(2)),
                ..Default::default()
            },
            ensure_fn,
        )
        .await?;

        let helper = InstanceSerialConsoleHelper::new(
            std::net::SocketAddr::V4(
                self.server
                    .as_ref()
                    .expect("server should be alive")
                    .server_addr(),
            ),
            WSClientOffset::MostRecent(0),
            None,
        )
        .await?;

        let console = match console_source {
            InstanceConsoleSource::New => {
                SerialConsole::new(
                    helper,
                    BufferKind::Raw,
                    self.serial_log_file_path(),
                )
                .await?
            }
            InstanceConsoleSource::InheritFrom(vm) => match &vm.state {
                VmState::New => anyhow::bail!(
                    "tried to inherit console from an unstarted VM"
                ),
                VmState::Ensured { serial } => (*serial).clone(),
            },
        };

        let instance_description =
            self.client.instance_get().send().await.with_context(|| {
                anyhow!("failed to get instance properties")
            })?;

        info!(
            ?instance_description.instance,
            "Started instance"
        );

        Ok(console)
    }

    /// Returns the kind of guest OS running in this VM.
    pub fn guest_os_kind(&self) -> GuestOsKind {
        self.spec.guest_os_kind
    }

    /// Returns a wrapper exposing Windows-specific VM functions, or `None` if
    /// this VM's guest is not Windows.
    pub fn get_windows_vm(&self) -> Option<WindowsVm<'_>> {
        if self.guest_os_kind().is_windows() {
            Some(WindowsVm { vm: self })
        } else {
            None
        }
    }

    /// Brings the VM to the running state, first sending a Propolis
    /// instance-ensure request if the VM has not been ensured yet.
    pub async fn launch(&mut self) -> Result<()> {
        self.instance_ensure().await?;
        let _response = self.run().await?;
        Ok(())
    }

    /// Ensures the instance in this VM's server without running it. Does
    /// nothing if the VM has already been ensured.
    pub async fn instance_ensure(&mut self) -> Result<()> {
        if matches!(self.state, VmState::New) {
            let serial = self
                .instance_ensure_internal(None, InstanceConsoleSource::New)
                .await?;
            self.state = VmState::Ensured { serial };
        }

        Ok(())
    }

    /// Requests the `Run` state from the server. No instance ensure request
    /// is sent first.
    pub async fn run(&self) -> PropolisClientResult<()> {
        let requested = InstanceStateRequested::Run;
        self.put_instance_state(requested).await
    }

    /// Requests the `Stop` state from the server.
    pub async fn stop(&self) -> PropolisClientResult<()> {
        let requested = InstanceStateRequested::Stop;
        self.put_instance_state(requested).await
    }

    /// Resets the VM by asking the server for the `Reboot` state. This is a
    /// server-driven reset, not a reboot initiated from inside the guest.
    pub async fn reset(&self) -> PropolisClientResult<()> {
        let requested = InstanceStateRequested::Reboot;
        self.put_instance_state(requested).await
    }

    /// Logs and sends a state-change request for `state` to this VM's server.
    #[instrument(skip_all, fields(vm = self.spec.vm_name, vm_id = %self.id))]
    async fn put_instance_state(
        &self,
        state: InstanceStateRequested,
    ) -> PropolisClientResult<()> {
        info!(?state, "Requesting instance state change");
        let request = self.client.instance_state_put().body(state);
        request.send().await
    }

    /// Queries the server for this instance via a Propolis client
    /// `instance_get` request and returns the response body.
    #[instrument(skip_all, fields(vm = self.spec.vm_name, vm_id = %self.id))]
    pub async fn get(&self) -> Result<InstanceGetResponse> {
        info!("Sending instance get request to server");
        let response = self
            .client
            .instance_get()
            .send()
            .await
            .with_context(|| anyhow!("failed to query instance properties"))?;
        Ok(response.into_inner())
    }

    /// Queries the server for this instance's spec via a Propolis client
    /// `instance_spec_get` request and returns the response body.
    #[instrument(skip_all, fields(vm = self.spec.vm_name, vm_id = %self.id))]
    pub async fn get_spec(&self) -> Result<InstanceSpecGetResponse> {
        info!("Sending instance spec get request to server");
        let response = self
            .client
            .instance_spec_get()
            .send()
            .await
            .with_context(|| anyhow!("failed to query instance spec"))?;
        Ok(response.into_inner())
    }

    /// Starts this instance by issuing an ensure request that specifies a
    /// migration from `source`, then waits for the inbound migration to
    /// finish.
    ///
    /// The target inherits `source`'s serial console connection. Migration
    /// progress is polled with exponential backoff until the server reports
    /// the migration as finished or failed, or until `timeout` elapses.
    ///
    /// # Errors
    ///
    /// Fails with [`VmStateError::InstanceAlreadyEnsured`] if this VM has
    /// already been ensured. Also fails if the ensure request fails, if the
    /// migration status cannot be queried, if the server reports a migration
    /// error, or if the migration does not finish within the timeout.
    ///
    /// # Panics
    ///
    /// Panics if `source` has no server, or if the target reports no inbound
    /// migration while its status is being polled.
    #[instrument(
        skip_all,
        fields(
            source = source.spec.vm_name,
            target = self.spec.vm_name,
            source_id = %source.id,
            target_id = %self.id
        )
    )]
    pub async fn migrate_from(
        &mut self,
        source: &Self,
        migration_id: Uuid,
        timeout: impl Into<MigrationTimeout>,
    ) -> Result<()> {
        let timeout_duration = match Into::<MigrationTimeout>::into(timeout) {
            MigrationTimeout::Explicit(val) => val,
            MigrationTimeout::InferFromMemorySize => {
                let mem_mib = self.spec.instance_spec().board.memory_mb;
                std::time::Duration::from_secs(
                    (MIGRATION_SECS_PER_GUEST_GIB * mem_mib) / 1024,
                )
            }
        };

        match self.state {
            VmState::New => {
                let server_addr = source
                    .server
                    .as_ref()
                    .expect("source server should be alive")
                    .server_addr();

                info!(
                    ?migration_id,
                    ?timeout_duration,
                    "Migrating from source at address {}",
                    server_addr
                );

                let serial = self
                    .instance_ensure_internal(
                        Some(MigrationInfo {
                            migration_id,
                            src_addr: SocketAddr::V4(server_addr),
                            replace_components: self
                                .generate_replacement_components(),
                        }),
                        InstanceConsoleSource::InheritFrom(source),
                    )
                    .await?;

                self.state = VmState::Ensured { serial };

                let span = info_span!("migrate", ?migration_id);
                let migrate_fn = || async {
                    let state = self
                        .get_migration_state()
                        .await
                        .map_err(backoff::Error::Permanent)?
                        .migration_in
                        .expect("instance should be migrating in")
                        .state;

                    match state {
                        MigrationState::Finish => {
                            info!("Migration completed successfully");
                            Ok(())
                        }
                        MigrationState::Error => {
                            info!("Instance reported error during migration");
                            Err(backoff::Error::Permanent(anyhow!(
                                "error during migration"
                            )))
                        }
                        _ => Err(backoff::Error::transient(anyhow!(
                            "migration not done yet"
                        ))),
                    }
                };

                // Attach the span to the future rather than holding an
                // `enter()` guard across the `.await`: a guard kept alive
                // over a suspension point leaves the span entered while
                // other work runs, which produces incorrect traces.
                backoff::future::retry(
                    backoff::ExponentialBackoff {
                        max_elapsed_time: Some(timeout_duration),
                        ..Default::default()
                    },
                    migrate_fn,
                )
                .instrument(span)
                .await
                .context("live migration")?;

                Ok(())
            }
            VmState::Ensured { .. } => {
                Err(VmStateError::InstanceAlreadyEnsured.into())
            }
        }
    }

    /// Collects the components of this VM's instance spec that are passed
    /// along as replacements when migrating: migration failure injectors and
    /// Crucible storage backends. All other components are left out.
    fn generate_replacement_components(&self) -> ReplacementComponents {
        let spec = self.spec.instance_spec();
        let replacements: ReplacementComponents = spec
            .components
            .iter()
            .filter_map(|(id, comp)| {
                let replacement = match comp {
                    Component::MigrationFailureInjector(inj) => {
                        ReplacementComponent::MigrationFailureInjector(
                            inj.clone(),
                        )
                    }
                    Component::CrucibleStorageBackend(be) => {
                        ReplacementComponent::CrucibleStorageBackend(
                            be.clone(),
                        )
                    }
                    _ => return None,
                };
                Some((id.to_string(), replacement))
            })
            .collect();

        replacements
    }

    /// Queries the server for this instance's migration status.
    pub async fn get_migration_state(
        &self,
    ) -> Result<InstanceMigrateStatusResponse> {
        let response = self.client.instance_migrate_status().send().await?;
        Ok(response.into_inner())
    }

    pub async fn replace_crucible_vcr(
        &self,
        disk: &CrucibleDisk,
    ) -> anyhow::Result<()> {
        let vcr = disk.vcr();
        let body = propolis_client::types::InstanceVcrReplace {
            vcr_json: serde_json::to_string(&vcr)
                .with_context(|| format!("serializing VCR {vcr:?}"))?,
        };

        info!(
            disk_name = disk.device_name().as_str(),
            vcr = ?vcr,
            "issuing Crucible VCR replacement request"
        );

        let response_value = self
            .client
            .instance_issue_crucible_vcr_request()
            .id(disk.device_name().clone().into_backend_name().into_string())
            .body(body)
            .send()
            .await?;

        anyhow::ensure!(
            response_value.status().is_success(),
            "VCR replacement request returned an error value: \
            {response_value:?}"
        );

        Ok(())
    }

    /// Fetches serial console history from the server, passing `from_start`
    /// through as the request's `from_start` parameter.
    pub async fn get_serial_console_history(
        &self,
        from_start: u64,
    ) -> Result<InstanceSerialConsoleHistoryResponse> {
        let request =
            self.client.instance_serial_history_get().from_start(from_start);
        let response = request.send().await?;
        Ok(response.into_inner())
    }

    #[instrument(skip_all, fields(vm = self.spec.vm_name, vm_id = %self.id))]
    pub async fn wait_for_state(
        &self,
        target: InstanceState,
        timeout_duration: Duration,
    ) -> Result<()> {
        info!(
            "Waiting {:?} for server to reach state {:?}",
            timeout_duration, target
        );

        let wait_fn = || async {
            let current = self
                .get()
                .await
                .map_err(backoff::Error::Permanent)?
                .instance
                .state;

            if current == target {
                Ok(())
            } else {
                Err(backoff::Error::transient(anyhow!(
                    "not in desired state yet: current {current:?}, target {target:?}"
                )))
            }
        };

        backoff::future::retry(
            backoff::ExponentialBackoff {
                max_elapsed_time: Some(timeout_duration),
                ..Default::default()
            },
            wait_fn,
        )
        .await
        .context("waiting for instance state")?;

        Ok(())
    }

    /// Waits for the guest to reach a login prompt and then logs in by
    /// executing the guest OS adapter's login sequence. Login is not
    /// otherwise automated: this call is required to get to a shell prompt
    /// to allow the use of [`Self::run_shell_command`].
    ///
    /// This routine consumes all of the serial console input that precedes the
    /// initial login prompt and the login prompt itself.
    ///
    /// The whole sequence is given 300 seconds to complete. If it does not
    /// finish in that time, a core of the Propolis server process is
    /// collected and an error is returned.
    pub async fn wait_to_boot(&self) -> Result<()> {
        let timeout_duration = Duration::from_secs(300);
        let boot_sequence = self.guest_os.get_login_sequence();
        let boot = async move {
            info!(
                vm = self.spec.vm_name,
                vm_id = %self.id,
                ?timeout_duration,
                "waiting for guest to boot"
            );

            for step in boot_sequence.0 {
                debug!(?step, "executing command in boot sequence");
                match step {
                    CommandSequenceEntry::WaitFor(s) => {
                        // The overall boot timeout below bounds this wait.
                        self.wait_for_serial_output(
                            s.as_ref(),
                            SerialOutputTimeout::CallerTimeout,
                        )
                        .await?;
                    }
                    CommandSequenceEntry::WriteStr(s) => {
                        // Submit the written string by following it with a
                        // newline.
                        self.send_serial_str(s.as_ref()).await?;
                        self.send_serial_str("\n").await?;
                    }
                    CommandSequenceEntry::EstablishConsistentEcho {
                        send,
                        expect,
                        timeout: echo_timeout,
                    } => {
                        self.establish_serial_console_echo(
                            send.as_ref(),
                            expect.as_ref(),
                            echo_timeout,
                            SerialOutputTimeout::CallerTimeout,
                        )
                        .await?;
                    }
                    CommandSequenceEntry::ClearBuffer => {
                        self.clear_serial_buffer()?;
                    }
                    CommandSequenceEntry::ChangeSerialConsoleBuffer(kind) => {
                        self.change_serial_buffer_kind(kind)?;
                    }
                    CommandSequenceEntry::SetRepeatedCharacterDebounce(
                        duration,
                    ) => {
                        self.set_serial_repeated_character_debounce(duration)?;
                    }
                }
            }

            info!("Guest has booted");
            Ok::<(), anyhow::Error>(())
        }
        .instrument(info_span!("wait_to_boot"));

        // An outer `Err` means the timer expired before the login sequence
        // finished; grab a core of the server for later debugging.
        let Ok(boot_result) = timeout(timeout_duration, boot).await else {
            error!(
                "Guest did not boot after {}ms! Collecting core..",
                timeout_duration.as_millis()
            );
            let proc = self.server.as_ref().unwrap();
            proc.core();
            anyhow::bail!("timed out while waiting to boot")
        };

        boot_result.context("executing guest login sequence")?;
        Ok(())
    }

    /// Waits for up to `timeout_duration` for `line` to appear on the guest
    /// serial console, then returns the contents of the console buffer that
    /// preceded the requested string.
    #[instrument(skip_all, fields(vm = self.spec.vm_name, vm_id = %self.id))]
    pub async fn wait_for_serial_output(
        &self,
        line: &str,
        timeout_duration: impl Into<SerialOutputTimeout>,
    ) -> Result<String> {
        let timeout_duration: SerialOutputTimeout = timeout_duration.into();
        info!(
            target = line,
            ?timeout_duration,
            "Waiting for output on serial console"
        );

        let received = {
            let line = line.to_string();
            let (preceding_tx, preceding_rx) = oneshot::channel();
            match &self.state {
                VmState::Ensured { serial } => {
                    serial
                        .register_wait_for_string(line.clone(), preceding_tx)?;
                    let t =
                        timeout(timeout_duration.into(), preceding_rx).await;
                    match t {
                        Err(timeout_elapsed) => {
                            serial.cancel_wait_for_string()?;
                            Err(anyhow!(timeout_elapsed))
                        }
                        Ok(Err(e)) => Err(e.into()),
                        Ok(Ok(received_string)) => Ok(Some(received_string)),
                    }
                }
                VmState::New => Err(VmStateError::InstanceNotEnsured.into()),
            }
        };

        received?.ok_or_else(|| {
            anyhow!("wait_for_serial_output recv channel unexpectedly closed")
        })
    }

    /// Attempts to establish that the guest serial console consistently echoes
    /// characters by writing `send` and waiting for `expect` to appear within
    /// the supplied `timeout`.
    ///
    /// This function will back off between attempts to send and await
    /// characters (but will *not* change the delay used to wait for characters
    /// to be echoed) and will retry for up to the duration specified by
    /// `overall_timeout`.
    async fn establish_serial_console_echo(
        &self,
        send: &str,
        expect: &str,
        expect_timeout: std::time::Duration,
        overall_timeout: impl Into<SerialOutputTimeout>,
    ) -> Result<()> {
        let overall_timeout: SerialOutputTimeout = overall_timeout.into();
        info!(
            send,
            expect,
            ?expect_timeout,
            ?overall_timeout,
            "establishing serial console echo"
        );

        let send_and_expect = || async {
            self.send_serial_str(send).await?;
            self.wait_for_serial_output(expect, expect_timeout)
                .await
                .map(|_| ())
                .map_err(backoff::Error::transient)
        };

        backoff::future::retry(
            backoff::ExponentialBackoff {
                max_elapsed_time: match overall_timeout {
                    SerialOutputTimeout::Explicit(d) => Some(d),
                    SerialOutputTimeout::CallerTimeout => None,
                },
                ..Default::default()
            },
            send_and_expect,
        )
        .await?;

        Ok(())
    }

    /// Builds an executor that runs the shell command `cmd` by sending it to
    /// the serial console, then waits for another shell prompt to appear using
    /// [`Self::wait_for_serial_output`] and returns any text that was buffered
    /// to the serial console after the command was sent.
    ///
    /// After running the shell command, sends `echo $?` to query and return the
    /// command's return status as well.
    ///
    /// By default the executor is configured to check for a successful
    /// status, so a non-zero status is reported as an error; to ignore the
    /// status or expect a non-zero as a positive condition, see
    /// [`ShellOutputExecutor::ignore_status`] or
    /// [`ShellOutputExecutor::check_err`].
    pub fn run_shell_command<'a>(
        &'a self,
        cmd: &'a str,
    ) -> ShellOutputExecutor<'a> {
        let status_check = Some(StatusCheck::Ok);
        ShellOutputExecutor { vm: self, cmd, status_check }
    }

    /// Runs the guest OS adapter's graceful reboot command sequence, then
    /// waits for the guest to boot and logs back in via
    /// [`Self::wait_to_boot`].
    pub async fn graceful_reboot(&self) -> Result<()> {
        let reboot_sequence = self.guest_os.graceful_reboot();
        self.run_command_sequence(reboot_sequence).await?;
        self.wait_to_boot().await
    }

    /// Run a [`CommandSequence`] in the context of a booted and logged-in
    /// guest. The guest is expected to be at a shell prompt when this sequence
    /// is begun.
    ///
    /// Only `WaitFor`, `WriteStr`, and `ClearBuffer` entries are supported
    /// here; any other entry causes the sequence to fail with an error. Each
    /// `WaitFor` entry is given 15 seconds to be satisfied.
    async fn run_command_sequence(
        &self,
        command_sequence: CommandSequence<'_>,
    ) -> Result<()> {
        for step in command_sequence.0 {
            match step {
                CommandSequenceEntry::WaitFor(s) => {
                    self.wait_for_serial_output(
                        s.as_ref(),
                        std::time::Duration::from_secs(15),
                    )
                    .await?;
                }
                CommandSequenceEntry::WriteStr(s) => {
                    // As in `wait_to_boot`, follow the written string with a
                    // newline so that the guest shell actually executes it
                    // instead of leaving it sitting at the prompt.
                    self.send_serial_str(s.as_ref()).await?;
                    self.send_serial_str("\n").await?;
                }
                CommandSequenceEntry::ClearBuffer => {
                    self.clear_serial_buffer()?
                }
                _ => {
                    anyhow::bail!(
                        "Unexpected command sequence entry {step:?} while \
                        running shell command"
                    );
                }
            }
        }

        Ok(())
    }

    /// Sends `string` to the guest's serial console worker, then waits for the
    /// entire string to be sent to the guest before returning.
    ///
    /// An empty `string` is a no-op that succeeds immediately.
    pub async fn send_serial_str(&self, string: &str) -> Result<()> {
        if string.is_empty() {
            return Ok(());
        }

        let sent = self.send_serial_bytes(string.as_bytes().to_vec())?;
        sent.await?;
        Ok(())
    }

    /// Returns this VM's serial console, or
    /// [`VmStateError::InstanceNotEnsured`] if the VM is still in the `New`
    /// state and so has no console yet.
    fn serial_console(&self) -> Result<&SerialConsole> {
        let VmState::Ensured { serial } = &self.state else {
            return Err(VmStateError::InstanceNotEnsured.into());
        };

        Ok(serial)
    }

    /// Hands `bytes` to this VM's serial console for transmission and
    /// returns the receiver that the console supplies for the send.
    ///
    /// Fails if the VM has no serial console yet.
    fn send_serial_bytes(
        &self,
        bytes: Vec<u8>,
    ) -> Result<oneshot::Receiver<()>> {
        let console = self.serial_console()?;
        console.send_bytes(bytes)
    }

    /// Asks this VM's serial console to clear its buffer. Fails if the VM has
    /// no serial console yet.
    fn clear_serial_buffer(&self) -> Result<()> {
        let console = self.serial_console()?;
        console.clear()
    }

    /// Asks this VM's serial console to switch to the buffer kind `kind`.
    /// Fails if the VM has no serial console yet.
    fn change_serial_buffer_kind(&self, kind: BufferKind) -> Result<()> {
        let console = self.serial_console()?;
        console.change_buffer_kind(kind)
    }

    /// Passes `delay` to this VM's serial console as its repeated-character
    /// debounce setting. Fails if the VM has no serial console yet.
    fn set_serial_repeated_character_debounce(
        &self,
        delay: std::time::Duration,
    ) -> Result<()> {
        let console = self.serial_console()?;
        console.set_repeated_character_debounce(delay)
    }

    /// Indicates whether this VM's guest OS has a read-only filesystem, as
    /// reported by the VM's guest OS adapter.
    pub fn guest_os_has_read_only_fs(&self) -> bool {
        self.guest_os.read_only_fs()
    }

    /// Generates a path to a file into which the VM's serial console adapter
    /// can log serial console output. The file is named
    /// `<vm_name>.serial.log` and lives in this VM's output directory.
    fn serial_log_file_path(&self) -> Utf8PathBuf {
        self.output_dir.join(format!("{}.serial.log", self.spec.vm_name))
    }
}

impl Drop for TestVm {
    /// Tears down the VM asynchronously. A VM that was never ensured needs no
    /// cleanup; otherwise the client, server process, and disk handles are
    /// moved into a spawned task that stops the VM, kills the server, and
    /// only then releases the disks. The task's handle is sent over
    /// `cleanup_task_tx`.
    fn drop(&mut self) {
        if let VmState::New = self.state {
            return;
        }

        // Propolis processes don't automatically release their bhyve VMMs on
        // process shutdown--this has to be done explicitly by stopping the VM.
        // Unfortunately, the Propolis client is fully asynchronous, and because
        // VMs might get dropped from an async context, it's not possible to use
        // `block_on` here to guarantee that VMMs are synchronously cleaned up
        // when a `TestVm` is dropped.
        //
        // To drop the VMM safely, destructure this VM into its client, server,
        // and attached disk objects, and hand them all off to a separate
        // destructor task. Once the task is spawned, send it back to the
        // framework so that the test runner can wait for all the VMs destroyed
        // by a test case to be reaped before starting another test.
        let client = self.client.clone();
        let mut server = self.server.take().expect(
            "TestVm should always have a valid server until it's dropped",
        );

        let disks: Vec<_> = self.vm_spec().disk_handles.drain(..).collect();

        let manual_stop_opt = self.manual_stop.take();
        let vm_name = self.vm_spec().vm_name.to_owned();

        // The order in which the task destroys objects is important: the server
        // can't be killed until the client has gotten a chance to shut down
        // the VM, and the disks can't be destroyed until the server process has
        // been killed.
        let task = tokio::spawn(
            async move {
                // The task doesn't use the disks directly, but they need to be
                // kept alive until the server process is gone.
                let _disks = disks;

                // Check if we should let the user access the instance of a
                // failed testcase before ensuring its demolition
                if let Some(manual_stop) = manual_stop_opt {
                    manual_stop.wait_for_stop(vm_name, &client, &server).await;
                }

                // Try to make sure the server's kernel VMM is cleaned up before
                // killing the server process. This is best-effort; if it fails,
                // the kernel VMM is leaked. This generally indicates a bug in
                // Propolis (e.g. a VMM reference leak or an instance taking an
                // unexpectedly long time to stop).
                try_ensure_vm_destroyed(&client).await;

                // Make sure the server process is dead before trying to clean
                // up any disks. Otherwise, ZFS may refuse to delete a cloned
                // disk because the server process still has it open.
                server.kill();
            }
            .instrument(
                info_span!("VM cleanup", vm = self.spec.vm_name, vm_id = %self.id),
            ),
        );

        // A failure to hand the task off is deliberately ignored; presumably
        // it means the receiving side has already gone away, and there is
        // nothing useful a destructor could do about that.
        let _ = self.cleanup_task_tx.send(task);
    }
}

/// Attempts to ensure that the Propolis server referred to by `client` is in
/// the `Destroyed` state by stopping any VM that happens to be running in that
/// server.
///
/// This function is best-effort.
async fn try_ensure_vm_destroyed(client: &Client) {
    match client.instance_get().send().await.map(|r| r.instance.state) {
        Ok(InstanceState::Destroyed) => return,
        Err(error) => warn!(
            %error,
            "error getting instance state from dropped VM"
        ),
        Ok(_) => {}
    }

    debug!("trying to ensure Propolis server VM is destroyed");
    if let Err(error) = client
        .instance_state_put()
        .body(InstanceStateRequested::Stop)
        .send()
        .await
    {
        // If the put fails because the instance was already run down, there's
        // nothing else to do. If it fails for some other reason, there's
        // nothing else that *can* be done, but the error is unusual and should
        // be logged.
        match error.status() {
            Some(reqwest::StatusCode::FAILED_DEPENDENCY) => {}
            _ => {
                error!(
                    %error,
                    "error stopping VM to move it to Destroyed"
                );
            }
        }

        return;
    }

    let check_destroyed = || async {
        match client.instance_get().send().await.map(|r| r.instance.state) {
            Ok(InstanceState::Destroyed) => Ok(()),
            Ok(state) => Err(backoff::Error::transient(anyhow::anyhow!(
                "instance not destroyed yet (state: {state:?})"
            ))),
            Err(error) => {
                error!(
                    %error,
                    "failed to get state of VM being destroyed"
                );
                Err(backoff::Error::permanent(error.into()))
            }
        }
    };

    let destroyed = backoff::future::retry(
        backoff::ExponentialBackoff {
            max_elapsed_time: Some(std::time::Duration::from_secs(5)),
            ..Default::default()
        },
        check_destroyed,
    )
    .await;

    if let Err(error) = destroyed {
        error!(%error, "VM not destroyed after 5 seconds");
    }
}

/// For waiting for instances of failed testcases to be manually shut down,
/// when phd-runner is invoked with --manual-stop-on-failure
#[derive(Clone)]
pub struct TestVmManualStop {
    /// The name of the test this VM belongs to; reported in the message
    /// logged when the VM is left running for inspection.
    test_name: String,
    /// If we should wait for operator intervention before terminating this
    /// instance, this will be sent a `Some(false)`.
    success_rx: watch::Receiver<Option<bool>>,
    /// While waiting for instance shutdown, this may be sent a `true` if the
    /// user sends a keyboard interrupt to indicate we should stop waiting.
    sigint_rx: watch::Receiver<bool>,
}
impl TestVmManualStop {
    /// Bundles a test's name with the channels that report the test's
    /// outcome and any keyboard interrupt.
    pub fn new(
        test_name: String,
        success_rx: watch::Receiver<Option<bool>>,
        sigint_rx: watch::Receiver<bool>,
    ) -> Self {
        Self { test_name, success_rx, sigint_rx }
    }

    /// Waits for the test outcome to be published and, if the outcome is a
    /// failure (`Some(false)`), keeps polling the instance once per second
    /// for as long as it remains in a state worth inspecting.
    ///
    /// The first time the instance is seen in such a state, a message telling
    /// the operator how to reach and stop the server is logged. The wait ends
    /// when the instance leaves those states, when its state can no longer be
    /// queried, or when a keyboard interrupt has been signalled.
    async fn wait_for_stop(
        mut self,
        vm_name: String,
        client: &Client,
        server: &server::PropolisServer,
    ) {
        // If the outcome channel closed without publishing anything, there
        // is nothing to wait for.
        if self.success_rx.changed().await.is_err() {
            return;
        }

        let success_opt = *self.success_rx.borrow();
        if success_opt != Some(false) {
            return;
        }

        let sock = server.server_addr();
        let (ip, port) = (sock.ip(), sock.port());
        let test_name = self.test_name;
        let mut operator_informed = false;

        loop {
            let state = client
                .instance_get()
                .send()
                .await
                .map(|inst| inst.instance.state);

            // States that might be worth inspecting out-of-band
            let worth_inspecting = matches!(
                state,
                Ok(InstanceState::Running
                    | InstanceState::Migrating
                    | InstanceState::Rebooting
                    | InstanceState::Repairing)
            );

            if !worth_inspecting || *self.sigint_rx.borrow() {
                break;
            }

            if !operator_informed {
                error!(
                    r#"
test {test_name:?} failed. propolis-server {vm_name:?} was left running,
with API accessible at http://{sock}
phd-runner will resume when this instance is shut down; e.g. by one of:

$ propolis-cli -s {ip} -p {port} serial
localhost:~# poweroff

$ propolis-cli -s {ip} -p {port} state stop
"#
                );
                operator_informed = true;
            }

            tokio::time::sleep(Duration::from_secs(1)).await;
        }
    }
}


================================================
FILE: phd-tests/framework/src/test_vm/server.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Routines and data structures for working with Propolis server processes.

use std::{
    fmt::Debug,
    net::{SocketAddr, SocketAddrV4},
    os::unix::process::CommandExt,
    time::SystemTime,
};

use anyhow::Result;
use camino::{Utf8Path, Utf8PathBuf};
use tracing::{debug, info, warn};

use crate::log_config::LogConfig;

/// Parameters used to launch and configure the Propolis server process. These
/// are distinct from the parameters used to configure the VM that the process
/// will host.
#[derive(Clone, Debug)]
pub struct ServerProcessParameters<'a> {
    /// The path to the server binary to launch.
    pub server_path: Utf8PathBuf,

    /// The directory in which to place files that are written by this server
    /// process.
    pub output_dir: &'a Utf8Path,

    /// The address at which the server should serve.
    pub server_addr: SocketAddrV4,

    /// The address of HTTP server with which the spawned server should register
    /// as an Oximeter producer.
    pub metrics_addr: Option<SocketAddr>,

    /// The address at which the server should offer its VNC server.
    pub vnc_addr: SocketAddrV4,

    /// The logging configuration for the server process. Its output mode
    /// supplies the handles to which the process's stdout and stderr are
    /// attached.
    pub log_config: LogConfig,
}

/// A handle to a Propolis server process launched by the framework. Dropping
/// the handle kills the process if it has not been killed already.
pub struct PropolisServer {
    /// The spawned server process, or `None` once the process has been
    /// killed.
    server: Option<std::process::Child>,

    /// The address the server was told to serve on.
    address: SocketAddrV4,

    /// The directory into which the framework writes files on behalf of this
    /// server (e.g. cores).
    output_dir: Utf8PathBuf,
}

impl PropolisServer {
    /// Launches a Propolis server process (via `pfexec`) configured by
    /// `process_params`, passing it `bootrom_path`, and returns a handle to
    /// it. `vm_name` is used when obtaining the handles for the process's
    /// stdout and stderr.
    ///
    /// Returns an error if the output handles cannot be obtained or if the
    /// process cannot be spawned.
    pub(crate) fn new(
        vm_name: &str,
        process_params: ServerProcessParameters,
        bootrom_path: &Utf8Path,
    ) -> Result<Self> {
        let ServerProcessParameters {
            server_path,
            output_dir,
            server_addr,
            metrics_addr,
            vnc_addr,
            log_config,
        } = process_params;

        info!(
            ?server_path,
            ?bootrom_path,
            ?server_addr,
            "Launching Propolis server"
        );

        let (server_stdout, server_stderr) =
            log_config.output_mode.get_handles(&output_dir, vm_name)?;

        // The resulting command line is:
        //   pfexec <server> run [--metric-addr <addr>] <bootrom> <addr> <vnc>
        let mut args = vec![server_path.into_string(), "run".to_string()];

        if let Some(metrics_addr) = metrics_addr {
            args.extend_from_slice(&[
                "--metric-addr".to_string(),
                metrics_addr.to_string(),
            ]);
        }

        args.extend_from_slice(&[
            bootrom_path.as_str().to_string(),
            server_addr.to_string(),
            vnc_addr.to_string(),
        ]);

        let mut server_cmd = std::process::Command::new("pfexec");
        server_cmd.args(args).stdout(server_stdout).stderr(server_stderr);

        // Gracefully shutting down a Propolis server requires PHD to send an
        // instance stop request to the server before it is actually terminated.
        // This ensures that the server has a chance to clean up kernel VMM
        // resources. It's desirable for the server to do this and not PHD
        // because a failure to clean up VMMs on stop is a Propolis bug.
        //
        // The PHD runner sets up a SIGINT handler that tries to give the
        // framework an opportunity to issue these requests before the runner
        // exits. However, pressing Ctrl-C in a shell will typically broadcast
        // SIGINT to all of the processes in the foreground process's group, not
        // just to the foreground process itself. This means that a Ctrl-C press
        // will usually kill all of PHD's Propolis servers before the cleanup
        // logic can run.
        //
        // To avoid this problem, add a pre-`exec` hook that directs Propolis
        // servers to ignore SIGINT. On Ctrl-C, the runner will drop all active
        // `TestVm`s, and this drop path (if allowed to complete) will kill all
        // these processes.
        //
        // SAFETY: a pre-`exec` hook runs in the forked child before `exec`,
        // where only async-signal-safe operations are permitted. The hook
        // captures nothing and does nothing but call `signal`, which POSIX
        // lists as async-signal-safe.
        unsafe {
            server_cmd.pre_exec(move || {
                libc::signal(libc::SIGINT, libc::SIG_IGN);
                Ok(())
            });
        }

        let server = PropolisServer {
            server: Some(server_cmd.spawn()?),
            address: server_addr,
            // Stash the same output directory in case the framework has to
            // write any files on behalf of the test run.
            output_dir: output_dir.to_owned(),
        };

        info!(
            "Launched server with pid {}",
            server.server.as_ref().unwrap().id()
        );
        Ok(server)
    }

    /// Returns the address this server was told to serve on.
    pub(crate) fn server_addr(&self) -> SocketAddrV4 {
        self.address
    }

    /// Collect a core of this server process, placing it in the same output
    /// directory as other artifacts of this test.
    ///
    /// Does nothing (other than logging a warning) if the process has already
    /// been killed.
    ///
    /// # Panics
    ///
    /// Panics if `gcore` cannot be spawned or waited on.
    pub(super) fn core(&self) {
        let Some(server_proc) = self.server.as_ref() else {
            warn!("Tried to produce a core without a propolis-server?");
            return;
        };

        // Name the core after the current time (in milliseconds since the
        // Unix epoch).
        let core_name = format!(
            "core-{}",
            SystemTime::now()
                .duration_since(SystemTime::UNIX_EPOCH)
                .expect("Time is gone, the song is over")
                .as_millis()
        );
        let core_path = self.output_dir.join(core_name);

        // NOTE(review): gcore's exit status is discarded, so the message
        // below is logged even if gcore itself reported a failure.
        std::process::Command::new("pfexec")
            .args([
                "gcore".as_ref(),
                "-o".as_ref(),
                core_path.as_os_str(),
                server_proc.id().to_string().as_ref(),
            ])
            .spawn()
            .expect("can try to gcore a process")
            .wait()
            .expect("can gcore a propolis-server we spawned");

        warn!("core written to {}", core_path);
    }

    /// Kills this server process if it hasn't been killed already.
    ///
    /// The process is signalled by running `pfexec kill <pid>` and is then
    /// waited on.
    ///
    /// # Panics
    ///
    /// Panics if the `kill` command cannot be spawned or waited on, or if
    /// waiting on the server process fails.
    pub(super) fn kill(&mut self) {
        // Taking the child out of `self` makes later calls (including the one
        // made on drop) no-ops.
        let Some(mut server) = self.server.take() else {
            return;
        };

        let pid = server.id();
        debug!(
            pid,
            %self.address,
            "Killing Propolis server process"
        );

        std::process::Command::new("pfexec")
            .args(["kill", &pid.to_string()])
            .spawn()
            .expect("should be able to kill a phd-spawned propolis")
            .wait()
            .expect("kill of phd-spawned propolis was run");

        server
            .wait()
            .expect("should be able to wait on a phd-spawned propolis");
    }
}

impl Drop for PropolisServer {
    /// Kills the server process, if it is still running, when the handle is
    /// dropped.
    fn drop(&mut self) {
        self.kill();
    }
}


================================================
FILE: phd-tests/framework/src/test_vm/spec.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;

use crate::{
    disk::{self, DiskConfig},
    guest_os::GuestOsKind,
};
use camino::Utf8PathBuf;
use propolis_client::instance_spec::{
    Component, InstanceMetadata, InstanceSpec,
};
use uuid::Uuid;

/// The set of objects needed to start and run a guest in a `TestVm`.
#[derive(Clone)]
pub struct VmSpec {
    /// The name of the VM.
    pub vm_name: String,

    /// The instance spec to pass to the VM when starting the guest. Crucible
    /// backend entries are refreshed from `disk_handles` each time the spec
    /// is handed out.
    base_instance_spec: InstanceSpec,

    /// A set of handles to disk files that the VM's disk backends refer to.
    pub disk_handles: Vec<Arc<dyn disk::DiskConfig>>,

    /// The guest OS adapter to use for the VM.
    pub guest_os_kind: GuestOsKind,

    /// The bootrom path to pass to this VM's Propolis server processes.
    pub bootrom_path: Utf8PathBuf,

    /// Metadata used to track instance timeseries data.
    pub metadata: InstanceMetadata,
}

impl VmSpec {
    /// Returns the first disk handle whose device name equals `name`, or
    /// `None` if this spec has no such disk.
    pub fn get_disk_by_device_name(
        &self,
        name: &str,
    ) -> Option<&Arc<dyn disk::DiskConfig>> {
        self.disk_handles
            .iter()
            .find(|handle| handle.device_name().as_str() == name)
    }

    /// Assembles a VM specification from its constituent parts.
    pub(crate) fn new(
        vm_name: String,
        instance_spec: InstanceSpec,
        disk_handles: Vec<Arc<dyn disk::DiskConfig>>,
        guest_os_kind: GuestOsKind,
        bootrom_path: Utf8PathBuf,
        metadata: InstanceMetadata,
    ) -> Self {
        let base_instance_spec = instance_spec;
        Self {
            vm_name,
            base_instance_spec,
            disk_handles,
            guest_os_kind,
            bootrom_path,
            metadata,
        }
    }

    /// Replaces this specification's VM name with `name`.
    pub(crate) fn set_vm_name(&mut self, name: String) {
        self.vm_name = name;
    }

    /// Returns a copy of the base instance spec in which the Crucible
    /// backends reflect the current state of this specification's disk
    /// handles.
    pub(crate) fn instance_spec(&self) -> InstanceSpec {
        let mut current_spec = self.base_instance_spec.clone();
        self.set_crucible_backends(&mut current_spec);
        current_spec
    }

    /// Update the Crucible backend specs in the instance spec to match the
    /// current backend specs given by this specification's disk handles.
    ///
    /// Only components that already exist in `spec` as Crucible storage
    /// backends are overwritten; no new components are added.
    fn set_crucible_backends(&self, spec: &mut InstanceSpec) {
        for handle in &self.disk_handles {
            // Disks that aren't Crucible disks have nothing to update.
            let Some(disk) = handle.as_crucible() else {
                continue;
            };

            let backend_spec = disk.backend_spec();
            let backend_name = disk
                .device_name()
                .clone()
                .into_backend_name()
                .into_string()
                .into();

            let is_crucible_backend = matches!(
                spec.components.get(&backend_name),
                Some(Component::CrucibleStorageBackend(_))
            );
            if is_crucible_backend {
                spec.components.insert(backend_name, backend_spec);
            }
        }
    }

    /// Generate new sled-identifiers for self.
    ///
    /// This creates new metadata for the instance appropriate for a successor
    /// VM. In that case, the sled identifiers will be different, but the
    /// project / instance identifiers should be the same.
    ///
    /// This models the case we're interested in during a migration: the
    /// sled-agent will provide the same silo / project / instance IDs to that
    /// destination Propolis, but it will have different sled identifiers,
    /// because the hosting sled differs. In general, all the IDs could be
    /// different, but we only change the actual ID and serial number here.
    pub(crate) fn refresh_sled_identifiers(&mut self) {
        // A single fresh UUID serves as both the sled ID and (in string form)
        // the sled serial number.
        let fresh_id = Uuid::new_v4();
        self.metadata.sled_serial = fresh_id.to_string();
        self.metadata.sled_id = fresh_id;
    }
}


================================================
FILE: phd-tests/framework/src/zfs.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Support functions for working with ZFS snapshots and clones.

use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use tracing::{debug, error};
use uuid::Uuid;

/// The name of a ZFS dataset, as used to refer to it in a ZFS operation.
#[derive(Debug)]
struct DatasetName(String);

/// The name of a ZFS snapshot, as used to refer to it in a ZFS operation.
#[derive(Debug)]
struct SnapshotName(String);

/// The name of a ZFS clone, as used to refer to it in a ZFS operation.
#[derive(Debug)]
struct CloneName(String);

/// Describes a dataset that's mounted at a specific point in the global
/// directory hierarchy.
#[derive(Debug)]
struct Dataset {
    /// The name of this dataset, used to refer to it as a subject of a ZFS
    /// operation.
    name: DatasetName,

    /// The mount point of this dataset. Stripping this prefix from the absolute
    /// path of a file that lies in this dataset yields the path to the file
    /// relative to the dataset root. This is needed to find the file if the
    /// dataset (or a clone of it) is mounted someplace else.
    mount_point: Utf8PathBuf,
}

/// Describes a snapshot of a specific dataset. When dropped, attempts to delete
/// itself using `zfs destroy`.
#[derive(Debug)]
struct Snapshot {
    /// The name of this snapshot, used to refer to it as a subject of a ZFS
    /// operation. Snapshots created by this module are named
    /// `<dataset>@phd-<uuid>`.
    name: SnapshotName,
}

impl Snapshot {
    /// Takes a snapshot of the supplied `dataset`.
    fn create_from_dataset(dataset: &DatasetName) -> anyhow::Result<Self> {
        let snapshot_name = format!("{}@phd-{}", dataset.0, Uuid::new_v4());
        zfs_command("snapshot", &[&snapshot_name])?;

        Ok(Self { name: SnapshotName(snapshot_name) })
    }
}

impl Drop for Snapshot {
    /// Attempts to destroy this snapshot with `zfs destroy`.
    fn drop(&mut self) {
        debug!(name = self.name.0, "zfs snapshot dropped");
        // Best-effort: the result of the destroy command is ignored.
        let _ = zfs_command("destroy", &[&self.name.0]);
    }
}

/// Describes a clone of a specific snapshot. When dropped, attempts to delete
/// itself using `zfs destroy`.
///
/// Note that within this module the name `Clone` refers to this struct, not
/// to the standard library's `Clone` trait.
#[derive(Debug)]
struct Clone {
    /// The name of this clone, used to refer to it as a subject of a ZFS
    /// operation.
    name: CloneName,

    /// The point at which this clone is mounted in the global directory
    /// hierarchy.
    mount_point: Utf8PathBuf,

    /// The snapshot this clone derives from. Snapshots can't be deleted until
    /// all their clones are gone; this reference helps to ensure that clones
    /// and snapshots are deleted in the correct order irrespective of when the
    /// clones are dropped.
    _snapshot: Snapshot,
}

impl Drop for Clone {
    /// Logs the drop and then makes a best-effort attempt to destroy the
    /// clone; any error from `zfs destroy` is discarded.
    fn drop(&mut self) {
        debug!(name = self.name.0, "zfs clone dropped");

        let destroy_args = [self.name.0.as_str()];
        let _ = zfs_command("destroy", &destroy_args);
    }
}

/// Represents a specific copy-on-write file within a ZFS clone. When this is
/// dropped, attempts to delete the associated clone.
///
/// The absolute location of the file is obtained by joining the clone's mount
/// point with `relative_path`; see the `path` method.
#[derive(Debug)]
pub struct ClonedFile {
    /// The clone to which this file belongs.
    clone: Clone,

    /// The path to this file relative to the mount point of the clone.
    relative_path: Utf8PathBuf,
}

impl ClonedFile {
    /// Creates a snapshot and clone of the dataset that contains the canonical
    /// location of the file indicated by `path`.
    pub fn create_from_path(path: &Utf8Path) -> anyhow::Result<Self> {
        // Canonicalize the path to resolve any symbolic links before doing any
        // prefix matching.
        let canonical_path = path.canonicalize_utf8()?;

        let containing_dataset = Dataset::from_path(&canonical_path)
            .with_context(|| format!("getting dataset containing {path}"))?;

        let relative_file_path = canonical_path
            .strip_prefix(&containing_dataset.mount_point)
            .context("getting relative path to file to clone")?;

        let snapshot = Snapshot::create_from_dataset(&containing_dataset.name)?;
        Self::create_from_paths_and_snapshot(
            containing_dataset,
            relative_file_path,
            snapshot,
        )
        .with_context(|| {
            format!(
                "creating zfs clone of {canonical_path} with original path \
                {path}"
            )
        })
    }

    /// Yields the absolute path to this cloned file in the global directory
    /// hierarchy.
    pub fn path(&self) -> Utf8PathBuf {
        let mut path = self.clone.mount_point.clone();
        path.push(&self.relative_path);
        path
    }

    /// Given a path to a file relative to the root of its (mounted) dataset,
    /// and the name of a snapshot of that dataset, clones the snapshot and
    /// returns a handle to the clone. The [`path`] method can be used to find
    /// the absolute path to the file within the clone.
    fn create_from_paths_and_snapshot(
        dataset: Dataset,
        relative_file_path: &Utf8Path,
        snapshot: Snapshot,
    ) -> anyhow::Result<Self> {
        let clone_name =
            format!("{}/phd-clone-{}", dataset.name.0, Uuid::new_v4());

        zfs_command("clone", &[&snapshot.name.0, &clone_name])?;

        // If any errors occur between this point and the construction of a
        // `Clone` wrapper, this function needs to destroy the new clone
        // manually. The only thing needed to construct a `Clone` is its mount
        // point, so put that logic in a function and clean up manually if it
        // fails.
        fn get_clone_mount_point(
            clone_name: &str,
        ) -> anyhow::Result<Utf8PathBuf> {
            let output = zfs_command("list", &[clone_name])?;
            let (object_name, mount_point) = parse_zfs_list_output(output)?;

            anyhow::ensure!(
                object_name == clone_name,
                "zfs list returned object {object_name} when asked about clone \
                {clone_name}"
            );

            let Some(mount_point) = mount_point else {
                anyhow::bail!("new zfs clone {clone_name} not mounted");
            };

            Ok(mount_point)
        }

        let mount_point = match get_clone_mount_point(&clone_name) {
            Ok(mount_point) => mount_point,
            Err(e) => {
                let _ = zfs_command("destroy", &[&clone_name]);
                return Err(e);
            }
        };

        Ok(Self {
            clone: Clone {
                name: CloneName(clone_name),
                mount_point,
                _snapshot: snapshot,
            },
            relative_path: relative_file_path.to_path_buf(),
        })
    }
}

impl Dataset {
    /// Looks up the dataset containing `path`.
    ///
    /// This routine fails if any `zfs` command line operations fail or return
    /// no output, or if the dataset found has no mount point. It also fails if
    /// the found dataset's mount point is not a prefix of the supplied `path`,
    /// e.g. because of a symbolic link somewhere in the path.
    fn from_path(path: &Utf8Path) -> anyhow::Result<Self> {
        let output = zfs_command("list", &[path.as_str()])?;
        let (name, mount_point) = parse_zfs_list_output(output)
            .with_context(|| format!("parsing output from zfs list {path}"))?;

        let Some(mount_point) = mount_point else {
            anyhow::bail!(
                "`zfs list {path}` produced a dataset with no mount point"
            );
        };

        // The rest of this module needs to be able to strip the mount point
        // from the original path to get a dataset-relative path to the target
        // file. If the file path isn't prefixed by the mount point, this won't
        // work. This should generally not happen if the caller was diligent
        // about providing canonicalized paths.
        //
        // The trailing `\` keeps the message on a single line; without it the
        // source line break and indentation become part of the error text.
        anyhow::ensure!(
            path.starts_with(&mount_point),
            "zfs dataset containing '{path}' is not prefixed by dataset mount \
            point {mount_point} (is the path canonicalized?)"
        );

        // `name` is already an owned `String`, so it can be moved as-is.
        Ok(Self { name: DatasetName(name), mount_point })
    }
}

/// Parses the output returned from a `zfs list` command into an object name and
/// a mountpoint.
///
/// This routine assumes the caller scoped its `zfs list` command so that it
/// returns exactly one (non-header) line of output. If it finds more, this
/// routine fails.
fn parse_zfs_list_output(
    output: std::process::Output,
) -> anyhow::Result<(String, Option<Utf8PathBuf>)> {
    let output = String::from_utf8(output.stdout)
        .context("converting `zfs list` output to string")?;

    debug!(stdout = output, "parsing zfs list output");

    // The expected output format from this command is
    //
    // NAME              USED  AVAIL     REFER  MOUNTPOINT
    // rpool/home/user   263G  549G       263G  /home/user
    let mut lines = output.lines();

    // Consume the header line and make sure it looks like it's sensibly
    // formatted. In particular, if the supplied path isn't part of a dataset,
    // `zfs` will return `cannot open 'path'`.
    let header = lines.next().ok_or_else(|| {
        anyhow::anyhow!("`zfs list` unexpectedly printed nothing")
    })?;

    anyhow::ensure!(
        header.starts_with("NAME"),
        "expected first line of `zfs list` output to start with NAME \
        (got '{header}')"
    );

    // Capture the first line of actual output for splitting and parsing. If
    // there are more output lines than this, fail instead of ignoring some
    // output.
    let answer = lines.next().ok_or_else(|| {
        anyhow::anyhow!("`zfs list` didn't have an output line")
    })?;

    if lines.next().is_some() {
        anyhow::bail!("`zfs list` returned more than one output line");
    }

    // `zfs list` output looks something like this (with a header line for
    // reference):
    //
    // NAME              USED  AVAIL     REFER  MOUNTPOINT
    // rpool/home/user   263G  549G       263G  /home/user
    //
    // The object name is the first token and the mount point is the fifth
    // (fourth after consuming the name).
    let mut words = answer.split_whitespace();
    let name = words.next().ok_or_else(|| {
        anyhow::anyhow!("`zfs list` didn't produce a dataset name")
    })?;

    // An unmounted object's mount point displays as "-", so this token should
    // always be present, even for unmounted objects.
    let mount_point = words.nth(3).ok_or_else(|| {
        anyhow::anyhow!("`zfs list` didn't produce a mount point")
    })?;

    let mount_point = mount_point
        .starts_with('/')
        .then_some(mount_point)
        .map(Utf8PathBuf::from);

    Ok((name.to_owned(), mount_point))
}

/// Runs `pfexec zfs <verb> <args...>` and waits for it to finish.
///
/// Returns the captured process output when the command exits successfully.
/// Fails if the process can't be run at all, or if it exits unsuccessfully; in
/// the latter case the command's stdout and stderr are logged before the error
/// is returned.
fn zfs_command(
    verb: &str,
    args: &[&str],
) -> anyhow::Result<std::process::Output> {
    debug!(verb, ?args, "executing ZFS command");

    let mut command = std::process::Command::new("pfexec");
    command.arg("zfs").arg(verb).args(args);

    let output = command
        .output()
        .with_context(|| format!("running `zfs {verb}` with args {args:?}"))?;

    if output.status.success() {
        return Ok(output);
    }

    // The command ran but reported failure: record everything it printed so
    // the cause can be diagnosed from the logs.
    let stdout = String::from_utf8_lossy(&output.stdout);
    let stderr = String::from_utf8_lossy(&output.stderr);
    error!(
        verb,
        ?args,
        error_code = output.status.code(),
        %stdout,
        %stderr,
        "zfs command failed"
    );

    Err(anyhow::anyhow!(
        "`zfs {verb}` with args {args:?} returned error {:?}",
        output.status.code()
    ))
}


================================================
FILE: phd-tests/quickstart.sh
================================================
#!/bin/bash
#
# Runs the PHD test suite with a default configuration, using a fixed
# directory under /tmp for both temporary files and artifacts.
#
# Usage: quickstart.sh <propolis-server-cmd>

if [ "$#" -lt 1 ]; then
	echo "usage: $0 <propolis-server-cmd>"
	exit 1
fi

PHD_QUICKSTART_DIR=/tmp/propolis-phd

# Refuse to continue if anything other than a directory (regular file, socket,
# FIFO, ...) already occupies the quickstart path.
if [ -e "$PHD_QUICKSTART_DIR" ] && [ ! -d "$PHD_QUICKSTART_DIR" ]; then
	echo "$PHD_QUICKSTART_DIR exists and is not a directory"
	exit 1
fi

if [ ! -d "$PHD_QUICKSTART_DIR" ]; then
	mkdir "$PHD_QUICKSTART_DIR"
fi

# Quote every expansion so paths containing whitespace are passed through as
# single arguments.
pfexec cargo run -p phd-runner -- \
	run \
	--artifact-toml-path ./artifacts.toml \
	--tmp-directory "$PHD_QUICKSTART_DIR" \
	--artifact-directory "$PHD_QUICKSTART_DIR" \
	--propolis-server-cmd "$1"


================================================
FILE: phd-tests/runner/Cargo.toml
================================================
[package]
name = "phd-runner"
version = "0.1.0"
license = "MPL-2.0"
edition = "2021"

[[bin]]
name = "phd-runner"
test = false
doctest = false

[dependencies]
anyhow.workspace = true
backtrace.workspace = true
camino.workspace = true
clap = { workspace = true, features = ["derive"] }
phd-framework.workspace = true
phd-tests.workspace = true
tokio = { workspace = true, features = ["full"] }
tracing.workspace = true
tracing-appender.workspace = true
tracing-bunyan-formatter.workspace = true
tracing-subscriber = { workspace = true, features = ["env-filter"] }
uuid.workspace = true

[build-dependencies]
anyhow.workspace = true
cargo_metadata.workspace = true


================================================
FILE: phd-tests/runner/build.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use anyhow::Context;

/// Build script entry point: exports the Crucible Git revision to the crate
/// being built, attaching context to any error that is returned.
fn main() -> anyhow::Result<()> {
    set_crucible_git_rev().context("Failed to determine Crucible Git revision")
}

/// Determines which Git revision of Crucible this workspace depends on and
/// exports it to the crate being built as the `PHD_CRUCIBLE_GIT_REV`
/// compile-time environment variable.
///
/// If the revision can't be determined from the `crucible` package's source
/// (e.g. the dependency is patched with a local checkout), a Cargo warning is
/// printed and the variable is instead set to `CANT_GET_YE_CRUCIBLE_SHA`
/// followed by the reason, so the failure can be reported at runtime.
///
/// Returns an error only if `cargo metadata` fails or lists no `crucible`
/// package.
fn set_crucible_git_rev() -> anyhow::Result<()> {
    const CRUCIBLE_REPO: &str = "https://github.com/oxidecomputer/crucible";

    /// Extracts the revision from a package source of the form
    /// `git+<url>?rev=<rev>#<hash>`. Fails (rather than panicking) if the
    /// source is not shaped like that or if `<rev>` and `<hash>` differ, so
    /// that the caller can degrade gracefully.
    fn extract_crucible_dep_sha(
        src: &cargo_metadata::Source,
    ) -> anyhow::Result<&str> {
        let src = src.repr.strip_prefix("git+").ok_or_else(|| {
            anyhow::anyhow!("Crucible is not a Git dependency")
        })?;

        if !src.starts_with(CRUCIBLE_REPO) {
            println!("cargo:warning=expected Crucible package's source to be {CRUCIBLE_REPO:?}, but is {src:?}");
        }

        let rev = src.split("?rev=").nth(1).ok_or_else(|| {
            anyhow::anyhow!("Crucible package's source did not have a revision")
        })?;
        let mut parts = rev.split('#');
        let sha = parts.next().ok_or_else(|| {
            anyhow::anyhow!("Crucible package's source did not have a revision")
        })?;

        // The requested revision is only usable as a commit hash if it
        // matches the hash recorded after the `#`. A mismatch or a missing
        // hash is reported as an ordinary error instead of aborting the build.
        let locked = parts.next().ok_or_else(|| {
            anyhow::anyhow!(
                "Crucible package's source did not have a locked commit hash"
            )
        })?;
        anyhow::ensure!(
            sha == locked,
            "Crucible package's revision {sha} does not match its locked \
             commit {locked}"
        );

        Ok(sha)
    }

    let metadata = cargo_metadata::MetadataCommand::new()
        .exec()
        .context("Failed to get cargo metadata")?;

    let crucible_pkg = metadata
        .packages
        .iter()
        .find(|pkg| pkg.name == "crucible")
        .ok_or_else(|| {
            anyhow::anyhow!("Failed to find Crucible package in cargo metadata")
        })?;

    // Holds the sentinel value when no revision could be determined; declared
    // out here so the fallback closure can hand back a reference to it.
    let mut errmsg = String::new();
    let crucible_sha = crucible_pkg
        .source
        .as_ref()
        .ok_or_else(|| {
            anyhow::anyhow!(
                "Crucible dependency is patched with a local checkout"
            )
        })
        .and_then(extract_crucible_dep_sha)
        .unwrap_or_else(|err| {
            println!(
                "cargo:warning={err}, so the `--crucible-downstairs-commit auto` \
                 flag will be disabled in this PHD build",
            );
            errmsg = format!("CANT_GET_YE_CRUCIBLE_SHA{err}");
            &errmsg
        });

    println!("cargo:rustc-env=PHD_CRUCIBLE_GIT_REV={crucible_sha}");

    Ok(())
}


================================================
FILE: phd-tests/runner/src/config.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use anyhow::Context;
use camino::Utf8PathBuf;
use clap::{Args, Parser, Subcommand};
use phd_framework::{
    artifacts, log_config::OutputMode, BasePropolisSource,
    CrucibleDownstairsSource,
};
use std::str::FromStr;

// The subcommands accepted by the runner.
//
// NOTE: plain `//` comments are used here on purpose. clap's derive macros
// turn `///` doc comments into user-visible help text, so adding doc comments
// to this enum or its variants would change the program's `--help` output.
#[derive(Debug, Subcommand)]
#[allow(clippy::large_enum_variant)]
pub enum Command {
    // Carries the options used when running tests.
    Run(RunOptions),
    // Carries the filters used when listing tests.
    List(ListOptions),
}

/// Runtime configuration options for the runner.
// NOTE: clap's derive macros turn the `///` doc comments on this struct and
// its fields into `--help` text, so maintainer-only notes use plain `//`
// comments instead.
#[derive(Debug, Parser)]
#[clap(verbatim_doc_comment)]
pub struct ProcessArgs {
    // The subcommand to execute, along with its own options.
    #[clap(subcommand)]
    pub command: Command,

    /// Suppress emission of terminal control codes in the runner's log output.
    // Declared as conflicting with `emit_bunyan`, so the two flags cannot be
    // passed together.
    #[clap(long, conflicts_with = "emit_bunyan")]
    pub disable_ansi: bool,

    /// Emit Bunyan-formatted logs.
    #[clap(long)]
    pub emit_bunyan: bool,
}

// Options for the `Run` subcommand.
//
// NOTE: clap's derive macros turn the `///` doc comments on the fields below
// into `--help` text (kept verbatim by `verbatim_doc_comment`), so they are
// written for end users. Maintainer-only notes use plain `//` comments. The
// private fields are exposed through accessor methods on `RunOptions`.
#[derive(Args, Debug)]
#[clap(verbatim_doc_comment)]
pub struct RunOptions {
    /// The command to use to launch the Propolis server.
    #[clap(long, value_parser)]
    pub propolis_server_cmd: Utf8PathBuf,

    /// Git branch name to use for the "migration base" Propolis server artifact
    /// for migration-from-base tests.
    ///
    /// If this argument is provided, PHD will download the latest Propolis
    /// server artifact from Buildomat for the provided branch name, and use it
    /// to test migration from that Propolis version to the Propolis revision
    /// under test.
    ///
    /// This argument conflicts with the `--base-propolis-commit` and
    /// `--base-propolis-cmd` arguments. If none of these arguments are
    /// provided, no "base" Propolis server artifact will be added to the
    /// artifact store, and migration-from-base tests will be skipped.
    #[clap(
        long,
        conflicts_with("base_propolis_commit"),
        conflicts_with("base_propolis_cmd"),
        value_parser
    )]
    base_propolis_branch: Option<String>,

    /// Git commit hash to use for the "migration base" Propolis server artifact
    /// for migration-from-base tests.
    ///
    /// If this argument is provided, PHD will download the Propolis server
    /// artifact from Buildomat for the provided commit hash, and use it
    /// to test migration from that Propolis version to the Propolis revision
    /// under test.
    ///
    /// This argument conflicts with the `--base-propolis-branch` and
    /// `--base-propolis-cmd` arguments. If none of these arguments are
    /// provided, no "base" Propolis server artifact will be added to the
    /// artifact store, and migration-from-base tests will be skipped.
    #[clap(
        long,
        conflicts_with("base_propolis_branch"),
        conflicts_with("base_propolis_cmd"),
        value_parser
    )]
    base_propolis_commit: Option<artifacts::buildomat::Commit>,

    /// The path of a local command to use as the "migration base" Propolis
    /// server for migration-from-base tests.
    ///
    /// If this argument is provided, PHD will use the provided command
    /// to test migration from that Propolis binary to the Propolis revision
    /// under test.
    ///
    /// This argument conflicts with the `--base-propolis-branch` and
    /// `--base-propolis-commit` arguments. If none of these arguments are
    /// provided, no "base" Propolis server artifact will be added to the
    /// artifact store, and migration-from-base tests will be skipped.
    #[clap(
        long,
        conflicts_with("base_propolis_commit"),
        conflicts_with("base_propolis_branch"),
        value_parser
    )]
    base_propolis_cmd: Option<Utf8PathBuf>,

    /// The path of a local command to use to launch Crucible downstairs
    /// servers.
    ///
    /// This argument conflicts with the `--crucible-downstairs-commit`
    /// argument, which configures PHD to download a Crucible downstairs
    /// artifact from Buildomat. If neither the `--crucible-downstairs-cmd` OR
    /// `--crucible-downstairs-commit` arguments are provided, then PHD will not
    /// run tests that require Crucible.
    #[clap(long, value_parser)]
    crucible_downstairs_cmd: Option<Utf8PathBuf>,

    /// Git revision to use to download Crucible downstairs artifacts from
    /// Buildomat.
    ///
    /// This may either be the string 'auto' or a 40-character Git commit
    /// hash. If this is 'auto', then the Git revision of Crucible is determined
    /// automatically based on the Propolis workspace's Cargo git dependency on
    /// the `crucible` crate (determined when `phd-runner` is built). If an
    /// explicit commit hash is provided, that commit is downloaded from
    /// Buildomat, regardless of which version of the `crucible` crate Propolis
    /// depends on.
    ///
    /// This argument conflicts with the `--crucible-downstairs-cmd`
    /// argument, which configures PHD to use a local command for running
    /// Crucible downstairs servers. If neither the `--crucible-downstairs-cmd`
    /// OR `--crucible-downstairs-commit` arguments are provided, then PHD will
    /// not run tests that require Crucible.
    #[clap(long, conflicts_with("crucible_downstairs_cmd"), value_parser)]
    crucible_downstairs_commit: Option<ArtifactCommit>,

    /// The directory into which to write temporary files (config TOMLs, log
    /// files, etc.) generated during test execution.
    #[clap(long, value_parser)]
    pub tmp_directory: Utf8PathBuf,

    /// The directory in which artifacts (guest OS images, bootroms, etc.)
    /// are to be stored.
    ///
    /// If this argument is not provided, artifacts will be stored in the
    /// directory passed to `--tmp-directory`.
    #[clap(long, value_parser)]
    artifact_directory: Option<Utf8PathBuf>,

    /// Configure where Propolis servers and other processes created by the
    /// runner log to.
    ///
    /// Valid options are:
    ///
    /// - file, tmpfile: Log to a temporary file under tmp-directory.
    ///
    /// - stdio: Log to stdout/stderr.
    ///
    /// - null: Don't log anywhere.
    #[clap(long, default_value = "file")]
    pub output_mode: OutputMode,

    /// The number of CPUs to assign to the guest in tests where the test is
    /// using the default machine configuration.
    #[clap(long, value_parser, default_value = "2")]
    pub default_guest_cpus: u8,

    /// The amount of memory, in MiB, to assign to the guest in tests where the
    /// test is using the default machine configuration.
    #[clap(long, value_parser, default_value = "512")]
    pub default_guest_memory_mib: u64,

    /// The path to a TOML file describing the artifact store to use for this
    /// run.
    #[clap(long, value_parser)]
    pub artifact_toml_path: Utf8PathBuf,

    /// The default artifact store key to use to load a guest OS image in tests
    /// that do not explicitly specify one.
    #[clap(long, value_parser, default_value = "alpine")]
    pub default_guest_artifact: String,

    /// The default artifact store key to use to load a guest bootrom in tests
    /// that do not explicitly specify one.
    #[clap(long, value_parser, default_value = "ovmf")]
    pub default_bootrom_artifact: String,

    /// Only run tests whose fully-qualified names contain this string.
    /// Can be specified multiple times.
    #[clap(long, value_parser)]
    pub include_filter: Vec<String>,

    /// Only run tests whose fully-qualified names do not contain this
    /// string. Can be specified multiple times.
    #[clap(long, value_parser)]
    pub exclude_filter: Vec<String>,

    /// Maximum duration (in seconds) to wait for an artifact to become
    /// available in Buildomat.
    ///
    /// This determines the total amount of time that PHD will spend retrying
    /// failed attempts to download a particular artifact from Buildomat. A
    /// fairly generous duration allows PHD to wait for some time in case an
    /// artifact that does not currently exist is in the process of being built.
    // TODO(eliza): this could parse a `Duration` with units, instead of a
    // number of seconds, but i'm lazy...
    #[clap(long, value_parser, default_value_t = 60 * 20)]
    pub max_buildomat_wait_secs: u64,

    /// When a testcase fails while this is enabled, any instances started by
    /// the failed test are left running and phd-runner waits until they are
    /// manually shut down out-of-band by the operator.
    ///
    /// This feature is intended to give the operator a chance to inspect the
    /// state of the guest(s) easily without necessarily having to reconstruct
    /// the scenario by hand.
    #[clap(long, value_parser)]
    pub manual_stop_on_failure: bool,
}

// Options for the `List` subcommand.
//
// NOTE: the `///` doc comments on the fields below are turned into `--help`
// text by clap's derive macros, so maintainer-only notes use plain `//`
// comments. The two filters have the same names and types as the
// corresponding fields of `RunOptions`.
#[derive(Args, Debug)]
#[clap(verbatim_doc_comment)]
pub struct ListOptions {
    /// Only list tests whose fully-qualified names contain this string.
    /// Can be specified multiple times.
    #[clap(long, value_parser)]
    pub include_filter: Vec<String>,

    /// Only list tests whose fully-qualified names do not contain this
    /// string. Can be specified multiple times.
    #[clap(long, value_parser)]
    pub exclude_filter: Vec<String>,
}

/// The parsed value of the `--crucible-downstairs-commit` argument.
#[derive(Debug, Clone)]
enum ArtifactCommit {
    /// The literal string `auto` was supplied: use the Crucible revision that
    /// was recorded by the build script when `phd-runner` was built.
    Auto,
    /// An explicit Git commit hash was supplied.
    Explicit(artifacts::buildomat::Commit),
}

impl FromStr for ArtifactCommit {
    type Err = anyhow::Error;
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let s = s.trim();

        if s.eq_ignore_ascii_case("auto") {
            return Ok(ArtifactCommit::Auto);
        }

        s.parse().context(
            "Crucible commit must be either 'auto' or a valid Git commit hash",
        ).map(ArtifactCommit::Explicit)
    }
}

impl RunOptions {
    pub fn artifact_directory(&self) -> Utf8PathBuf {
        self.artifact_directory.as_ref().unwrap_or(&self.tmp_directory).clone()
    }

    pub fn crucible_downstairs(
        &self,
    ) -> anyhow::Result<Option<CrucibleDownstairsSource>> {
        // If a local crucible-downstairs command was provided on the command
        // line, use that.
        if let Some(cmd) = self.crucible_downstairs_cmd.clone() {
            return Ok(Some(CrucibleDownstairsSource::Local(cmd)));
        }

        match self.crucible_downstairs_commit {
            Some(ArtifactCommit::Explicit(ref commit)) => Ok(Some(
                CrucibleDownstairsSource::BuildomatGitRev(commit.clone()),
            )),
            Some(ArtifactCommit::Auto) => {
                // Otherwise, use the Git revision of the workspace's Cargo git dep on
                // crucible-upstairs, and use the same revision for the downstairs
                // binary artifact.
                //
                // The Git revision of Crucible we depend on is determined when building
                // `phd-runner` by the build script, so that the `phd-runner` binary can
                // be run even after moving it out of the Propolis cargo workspace.
                let commit = env!("PHD_CRUCIBLE_GIT_REV");
                if let Some(reason) =
                    commit.strip_prefix("CANT_GET_YE_CRUCIBLE_SHA")
                {
                    anyhow::bail!(
                        "Because {reason}, phd-runner's build script could not determine \
                         the Crucible Git SHA, so the `--crucible-downstairs-commit auto` \
                         option has been disabled.\n\tYou can provide a local Crucible \
                         binary using `--crucible-downstairs-cmd`.",
                    )
                }

                let commit = commit.parse().context(
                    "PHD_CRUCIBLE_GIT_REV must be set to a valid Git \
                        revision by the build script",
                )?;
                Ok(Some(CrucibleDownstairsSource::BuildomatGitRev(commit)))
            }
            None => Ok(None),
        }
    }

    pub fn base_propolis(&self) -> Option<BasePropolisSource<'_>> {
        // If a local command for the "base" propolis artifact was provided,
        // use that.
        if let Some(ref cmd) = self.base_propolis_cmd {
            return Some(BasePropolisSource::Local(cmd));
        }

        if let Some(ref branch) = self.base_propolis_branch {
            return Some(BasePropolisSource::BuildomatBranch(branch));
        }

        if let Some(ref commit) = self.base_propolis_commit {
            return Some(BasePropolisSource::BuildomatGitRev(commit));
        }

        None
    }
}


================================================
FILE: phd-tests/runner/src/execute.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;
use std::time::{Duration, Instant};

use phd_framework::test_vm::TestVmManualStop;
use phd_tests::phd_testcase::{Framework, TestCase, TestOutcome};
use tokio::signal::unix::{signal, SignalKind};
use tokio::sync::watch;
use tracing::{error, info, warn};

use crate::config::RunOptions;
use crate::fixtures::TestFixtures;

/// Statistics returned after executing a set of tests.
///
/// The four counters partition the selected tests: every selected test starts
/// out counted in `tests_not_run` and moves to exactly one of the other
/// counters once it has been run.
pub struct ExecutionStats {
    /// The number of tests that passed.
    pub tests_passed: u32,

    /// The number of tests that failed.
    pub tests_failed: u32,

    /// The number of tests that marked themselves as skipped.
    pub tests_skipped: u32,

    /// The number of tests that the runner decided not to run (e.g. because of
    /// a failure in a fixture).
    pub tests_not_run: u32,

    /// The total time spent running tests and fixtures. This spans the time
    /// from just before the first test setup fixture runs to the time just
    /// after the last fixture finishes.
    pub duration: Duration,

    /// A collection of test cases that returned a failed result.
    pub failed_test_cases: Vec<&'static TestCase>,
}

/// The execution status of a single selected test case.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Status {
    /// The test was run and produced the contained outcome.
    Ran(TestOutcome),
    /// The test has not been run; this is the initial status of every
    /// selected test.
    NotRun,
}

/// Pairs a selected test case with its current execution status.
struct Execution {
    /// The test case to run.
    tc: &'static TestCase,
    /// Whether the test case has run yet and, if so, how it turned out.
    status: Status,
}

/// Executes a set of tests using the supplied test context.
///
/// The tests to run are selected by `run_opts.include_filter` and
/// `run_opts.exclude_filter`. Each selected test is run in its own Tokio task,
/// bracketed by the per-test setup and cleanup fixtures in `fixtures`. The run
/// stops early if SIGINT is received or if a per-test fixture fails; tests
/// that were never started remain counted in `tests_not_run`.
///
/// # Panics
///
/// Panics if the execution-wide setup or cleanup fixture returns an error.
pub async fn run_tests_with_ctx(
    ctx: &Arc<Framework>,
    mut fixtures: TestFixtures,
    run_opts: &RunOptions,
) -> ExecutionStats {
    let mut executions = Vec::new();

    for tc in phd_tests::phd_testcase::filtered_test_cases(
        &run_opts.include_filter,
        &run_opts.exclude_filter,
    ) {
        executions.push(Execution { tc, status: Status::NotRun });
    }

    // Every selected test starts out as "not run"; the counter is decremented
    // as each test is actually started.
    let mut stats = ExecutionStats {
        tests_passed: 0,
        tests_failed: 0,
        tests_skipped: 0,
        tests_not_run: executions.len() as u32,
        duration: Duration::default(),
        failed_test_cases: Vec::new(),
    };

    if executions.is_empty() {
        info!("No tests selected for execution");
        return stats;
    }

    fixtures.execution_setup().unwrap();
    let sigint_rx = set_sigint_handler();
    info!("Running {} test(s)", executions.len());
    let start_time = Instant::now();
    for execution in &mut executions {
        // Don't start another test once an interrupt has been requested.
        if *sigint_rx.borrow() {
            info!("Test run interrupted by SIGINT");
            break;
        }

        info!("Starting test {}", execution.tc.fully_qualified_name());

        // Failure to run a setup fixture is fatal to the rest of the run, but
        // it's still possible to report results, so return gracefully instead
        // of panicking.
        if let Err(e) = fixtures.test_setup() {
            error!("Error running test setup fixture: {}", e);
            break;
        }

        stats.tests_not_run -= 1;
        let framework = ctx.clone();
        let tc = execution.tc;
        let mut sigint_rx_task = sigint_rx.clone();
        let mut test_ctx = framework.test_ctx(tc.fully_qualified_name());

        // When manual stop-on-failure is enabled, give the test context a
        // receiver through which it later learns whether the test succeeded;
        // the sending half is kept here until the outcome is known.
        let mut success_tx = None;
        if run_opts.manual_stop_on_failure {
            let (tx, success_rx) = watch::channel(None);
            test_ctx.set_cleanup_task_outcome_receiver(TestVmManualStop::new(
                tc.fully_qualified_name(),
                success_rx,
                sigint_rx.clone(),
            ));
            success_tx = Some(tx);
        }

        // Run the test in its own task so that a panic in the test surfaces
        // as a join error here instead of unwinding through the runner.
        let test_outcome = tokio::spawn(async move {
            tokio::select! {
                // Ensure interrupt signals are always handled instead of
                // continuing to run the test.
                biased;
                result = sigint_rx_task.changed() => {
                    assert!(
                        result.is_ok(),
                        "SIGINT channel shouldn't drop while tests are running"
                    );

                    TestOutcome::Failed(
                        Some("test interrupted by SIGINT".to_string())
                    )
                }
                outcome = tc.run(&test_ctx) => outcome
            }
        })
        .await
        .unwrap_or_else(|_| {
            TestOutcome::Failed(Some(
                "test task panicked, see test logs".to_string(),
            ))
        });

        // Log a one-line summary: the outcome, followed by the test's message
        // (if any) for failed and skipped tests.
        info!(
            "test {} ... {}{}",
            execution.tc.fully_qualified_name(),
            match test_outcome {
                TestOutcome::Passed => "ok",
                TestOutcome::Failed(_) => "FAILED: ",
                TestOutcome::Skipped(_) => "skipped: ",
            },
            match &test_outcome {
                TestOutcome::Failed(Some(s))
                | TestOutcome::Skipped(Some(s)) => s,
                TestOutcome::Failed(None) | TestOutcome::Skipped(None) =>
                    "[no message]",
                _ => "",
            }
        );

        // Publish the outcome to the manual-stop receiver, if one was set up.
        // Anything other than a failure counts as success. A send error is
        // deliberately ignored.
        if let Some(tx) = success_tx {
            let succeeded = !matches!(&test_outcome, TestOutcome::Failed(_));
            let _: Result<_, _> = tx.send(Some(succeeded));
        }

        match test_outcome {
            TestOutcome::Passed => stats.tests_passed += 1,
            TestOutcome::Failed(_) => {
                stats.tests_failed += 1;
                stats.failed_test_cases.push(execution.tc);
            }
            TestOutcome::Skipped(_) => stats.tests_skipped += 1,
        }

        execution.status = Status::Ran(test_outcome);

        // A cleanup fixture failure ends the run, but the results gathered so
        // far are still reported.
        if let Err(e) = fixtures.test_cleanup().await {
            error!("Error running cleanup fixture: {}", e);
            break;
        }
    }
    stats.duration = start_time.elapsed();

    fixtures.execution_cleanup().unwrap();

    stats
}

/// Sets a global handler for SIGINT and hands the resulting signal channel over
/// to a task that handles this signal. Returns a receiver to which the signal
/// handler task publishes `true` to the channel when SIGINT is received.
fn set_sigint_handler() -> watch::Receiver<bool> {
    let mut sigint =
        signal(SignalKind::interrupt()).expect("failed to set SIGINT handler");

    let (sigint_tx, sigint_rx) = watch::channel(false);
    tokio::spawn(async move {
        loop {
            sigint.recv().await;

            // If a signal was previously dispatched to the channel, exit
            // immediately with the customary SIGINT exit code (130 is 128 +
            // SIGINT). This allows users to interrupt tests even if they aren't
            // at an await point (at the cost of not having destructors run).
            if *sigint_tx.borrow() {
                error!(
                    "SIGINT received while shutting down, rudely terminating"
                );
                error!("some processes and resources may have been leaked!");
                std::process::exit(130);
            }

            warn!("SIGINT received, sending shutdown signal to tests");
            let _ = sigint_tx.send(true);
        }
    });

    sigint_rx
}


================================================
FILE: phd-tests/runner/src/fixtures.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::sync::Arc;

use anyhow::Result;
use tracing::instrument;

use crate::Framework;

/// A wrapper containing the objects needed to run the executor's test fixtures.
pub struct TestFixtures {
    /// The shared test framework. The per-test cleanup fixture resets it after
    /// each test case.
    test_context: Arc<Framework>,
}

impl TestFixtures {
    /// Creates a new set of test fixtures that operate on the supplied shared
    /// test framework.
    ///
    /// This currently always returns `Ok`.
    pub fn new(test_context: Arc<Framework>) -> Result<Self> {
        Ok(Self { test_context })
    }

    /// Calls fixture routines that need to run before any tests run.
    ///
    /// There are currently no such routines, so this is a no-op.
    #[instrument(skip_all)]
    pub fn execution_setup(&mut self) -> Result<()> {
        Ok(())
    }

    /// Calls fixture routines that need to run after all tests run.
    ///
    /// Unless the runner panics, or a test panics in a way that can't be caught
    /// during unwinding, this cleanup fixture will run even if a test run is
    /// interrupted.
    ///
    /// There are currently no such routines, so this is a no-op.
    #[instrument(skip_all)]
    pub fn execution_cleanup(&mut self) -> Result<()> {
        Ok(())
    }

    /// Calls fixture routines that run before each test case is invoked.
    ///
    /// There are currently no such routines, so this is a no-op.
    #[instrument(skip_all)]
    pub fn test_setup(&mut self) -> Result<()> {
        Ok(())
    }

    /// Calls fixture routines that run after each test case is invoked.
    ///
    /// Unless the runner panics, or a test panics in a way that can't be caught
    /// during unwinding, this cleanup fixture will run whenever the
    /// corresponding setup fixture has run.
    ///
    /// Currently this resets the shared test framework.
    #[instrument(skip_all)]
    pub async fn test_cleanup(&mut self) -> Result<()> {
        self.test_context.reset().await;
        Ok(())
    }
}


================================================
FILE: phd-tests/runner/src/main.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

mod config;
mod execute;
mod fixtures;

use clap::Parser;
use config::{ListOptions, ProcessArgs, RunOptions};
use phd_framework::log_config::{LogConfig, LogFormat};
use phd_tests::phd_testcase::{Framework, FrameworkParameters};
use std::sync::Arc;
use std::time::Duration;
use tracing::{debug, info, warn};
use tracing_bunyan_formatter::{BunyanFormattingLayer, JsonStorageLayer};
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::{EnvFilter, Registry};

use crate::execute::ExecutionStats;
use crate::fixtures::TestFixtures;

/// Entry point for the PHD runner.
///
/// Parses the command line, installs the tracing subscriber, and then either
/// lists the selected tests or runs them. After a test run the process exits
/// with the number of failed tests as its exit code, saturated at 255.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let runner_args = ProcessArgs::parse();
    set_tracing_subscriber(&runner_args);

    let state_write_guard = phd_framework::host_api::set_vmm_globals();
    if let Err(e) = state_write_guard {
        warn!(
            error = ?e,
            "Failed to enable one or more kernel options, some tests may not work",
        );
    }

    info!(?runner_args);

    match &runner_args.command {
        config::Command::Run(opts) => {
            // Unix-like platforms only report the low 8 bits of an exit code
            // to the parent, so an unclamped count of 256 failures would look
            // like success. Saturate at the largest code that survives.
            const MAX_EXIT_CODE: i32 = 255;

            let tests_failed =
                run_tests(opts, &runner_args).await?.tests_failed;
            let exit_code = i32::try_from(tests_failed)
                .unwrap_or(i32::MAX)
                .min(MAX_EXIT_CODE);
            debug!(exit_code);
            std::process::exit(exit_code);
        }
        config::Command::List(opts) => list_tests(opts),
    }

    Ok(())
}

/// Builds a test framework from the supplied options, runs the selected tests
/// against it, and prints the failed tests and a one-line summary to stdout.
///
/// Returns the statistics gathered during the run, or an error if
/// `run_opts.crucible_downstairs()` fails.
///
/// # Panics
///
/// Panics if the test framework or the test fixtures cannot be created.
async fn run_tests(
    run_opts: &RunOptions,
    runner_args: &ProcessArgs,
) -> anyhow::Result<ExecutionStats> {
    // A full `LogConfig` has to be synthesized here: its log format half was
    // already chosen on the command line for the runner's own output, and that
    // choice is reused for tasks the runner starts during tests.
    let log_format = if runner_args.emit_bunyan {
        LogFormat::Bunyan
    } else {
        LogFormat::Plain
    };
    let log_config = LogConfig { output_mode: run_opts.output_mode, log_format };
    let max_buildomat_wait =
        Duration::from_secs(run_opts.max_buildomat_wait_secs);

    let ctx_params = FrameworkParameters {
        propolis_server_path: run_opts.propolis_server_cmd.clone(),
        crucible_downstairs: run_opts.crucible_downstairs()?,
        base_propolis: run_opts.base_propolis(),
        tmp_directory: run_opts.tmp_directory.clone(),
        artifact_directory: run_opts.artifact_directory(),
        artifact_toml: run_opts.artifact_toml_path.clone(),
        log_config,
        default_guest_cpus: run_opts.default_guest_cpus,
        default_guest_memory_mib: run_opts.default_guest_memory_mib,
        default_guest_os_artifact: run_opts.default_guest_artifact.clone(),
        default_bootrom_artifact: run_opts.default_bootrom_artifact.clone(),
        port_range: 9000..10000,
        max_buildomat_wait,
    };

    let framework = Framework::new(ctx_params)
        .await
        .expect("should be able to set up a test context");
    let ctx = Arc::new(framework);

    let fixtures = TestFixtures::new(ctx.clone()).unwrap();

    // Run the tests, then report which ones failed (if any).
    let execution_stats =
        execute::run_tests_with_ctx(&ctx, fixtures, run_opts).await;

    let failed_cases = &execution_stats.failed_test_cases;
    if !failed_cases.is_empty() {
        println!("\nfailures:");
        for tc in failed_cases {
            println!("    {}", tc.fully_qualified_name());
        }
        println!();
    }

    let verdict = match execution_stats.tests_failed {
        0 => "ok",
        _ => "FAILED",
    };
    println!(
        "test result: {}. {} passed; {} failed; {} skipped; {} not run; \
        finished in {:.2}s\n",
        verdict,
        execution_stats.tests_passed,
        execution_stats.tests_failed,
        execution_stats.tests_skipped,
        execution_stats.tests_not_run,
        execution_stats.duration.as_secs_f64()
    );

    Ok(execution_stats)
}

/// Prints the fully qualified name of every test case selected by the include
/// and exclude filters in `list_opts`, followed by the number of selected
/// tests.
fn list_tests(list_opts: &ListOptions) {
    println!("Tests enabled after applying filters:\n");

    let selected: Vec<_> = phd_tests::phd_testcase::filtered_test_cases(
        &list_opts.include_filter,
        &list_opts.exclude_filter,
    )
    .collect();

    for tc in &selected {
        println!("    {}", tc.fully_qualified_name());
    }

    println!("\n{} test(s) selected", selected.len());
}

/// Installs the global `tracing` subscriber for the runner.
///
/// Events pass through an `EnvFilter` read from the environment, with `INFO`
/// as the default directive. When `args.emit_bunyan` is set, events are
/// written to stdout in Bunyan format; otherwise they are written by the
/// default `fmt` layer with line numbers, and with ANSI colors unless
/// `args.disable_ansi` is set.
///
/// # Panics
///
/// Panics if the global default subscriber cannot be set.
fn set_tracing_subscriber(args: &ProcessArgs) {
    let env_filter = EnvFilter::builder()
        .with_default_directive(tracing::Level::INFO.into())
        .from_env_lossy();
    let registry = Registry::default().with(env_filter);

    if !args.emit_bunyan {
        let plain_layer = tracing_subscriber::fmt::layer()
            .with_line_number(true)
            .with_ansi(!args.disable_ansi);
        tracing::subscriber::set_global_default(registry.with(plain_layer))
            .unwrap();
        return;
    }

    let bunyan_layer =
        BunyanFormattingLayer::new("phd-runner".into(), std::io::stdout);
    tracing::subscriber::set_global_default(
        registry.with(JsonStorageLayer).with(bunyan_layer),
    )
    .unwrap();
}


================================================
FILE: phd-tests/testcase/Cargo.toml
================================================
[package]
name = "phd-testcase"
version = "0.1.0"
edition = "2021"

[lib]
test = false
doctest = false

[dependencies]
futures.workspace = true
linkme.workspace = true
thiserror.workspace = true
phd-framework.workspace = true
phd-testcase-macros.workspace = true
anyhow = { workspace = true, features = ["backtrace"] }


================================================
FILE: phd-tests/testcase/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

pub use anyhow::{Context, Result};
pub use phd_framework;
pub use phd_testcase_macros::*;
use thiserror::Error;

pub use phd_framework::Framework;
pub use phd_framework::FrameworkParameters;
pub use phd_framework::TestCtx;

/// The error a test body returns to report that it chose to be skipped.
///
/// The `phd_skip!` macro returns this error from the test body, and the
/// `#[phd_testcase]` wrapper converts it into [`TestOutcome::Skipped`].
#[derive(Debug, Error)]
pub enum TestSkippedError {
    /// The test was skipped. The payload is an optional message.
    #[error("Test skipped: {0:?}")]
    TestSkipped(Option<String>),
}

/// The outcome from executing a specific test case.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum TestOutcome {
    /// The test passed.
    Passed,

    /// The test failed. The payload is an optional message describing the
    /// failure.
    Failed(Option<String>),

    /// The test chose to be skipped, i.e. it detected a parameter or condition
    /// that makes it impossible to execute the test or to meaningfully provide
    /// a pass/fail outcome. The payload is an optional message.
    Skipped(Option<String>),
}

/// A wrapper for test functions. The `const` constructor [`TestCase::new`]
/// takes this wrapper rather than a bare function pointer.
///
/// NOTE(review): the previous comment said the wrapper was needed "for the
/// inventory crate", but test cases are now collected with `linkme` (see
/// [`TEST_CASES`]). Confirm whether the wrapper is still required.
pub struct TestFunction {
    /// The test body: takes the test context and returns a boxed future that
    /// resolves to the test's outcome.
    pub f: fn(&TestCtx) -> futures::future::BoxFuture<'_, TestOutcome>,
}

/// A description of a single test case: where it lives, what it is called, and
/// the function that runs it.
pub struct TestCase {
    /// The path to the module containing the test case. This is generally
    /// derived from the `module_path!()` macro, which the `#[phd_testcase]`
    /// attribute macro uses when constructing the test case's entry in
    /// [`TEST_CASES`].
    pub(crate) module_path: &'static str,

    /// The name of this test case, which is generally its function name.
    pub(crate) name: &'static str,

    /// The test function to execute to run this test.
    pub(crate) function: TestFunction,
}

#[allow(dead_code)]
impl TestCase {
    /// Constructs a new [`TestCase`].
    pub const fn new(
        module_path: &'static str,
        name: &'static str,
        function: TestFunction,
    ) -> Self {
        Self { module_path, name, function }
    }

    /// Returns the test case's fully qualified name, i.e. `module_path::name`.
    pub fn fully_qualified_name(&self) -> String {
        format!("{}::{}", self.module_path, self.name)
    }

    /// Returns the test case's name.
    pub fn name(&self) -> &str {
        self.name
    }

    /// Runs the test case's body with the supplied test context and returns its
    /// outcome.
    pub async fn run(&self, ctx: &TestCtx) -> TestOutcome {
        (self.function.f)(ctx).await
    }
}

/// The collection of all registered test cases. The `#[phd_testcase]`
/// attribute macro adds one entry to this distributed slice for each test
/// function it is applied to.
#[linkme::distributed_slice]
pub static TEST_CASES: [TestCase];

/// Returns an iterator over every test case in [`TEST_CASES`], without any
/// filtering.
pub fn all_test_cases() -> impl Iterator<Item = &'static TestCase> {
    TEST_CASES.into_iter()
}

/// Returns an iterator over the subset of tests for which (a) the fully
/// qualified name of the test includes every string in `must_include`, and (b)
/// the fully qualified name does not include any strings in `must_exclude`.
///
/// Empty filter lists impose no restriction, so passing two empty slices
/// selects every test case.
pub fn filtered_test_cases<'rule>(
    must_include: &'rule [String],
    must_exclude: &'rule [String],
) -> impl Iterator<Item = &'static TestCase> + 'rule {
    TEST_CASES.into_iter().filter(move |tc| {
        // Building the name allocates, so do it once per test case instead of
        // once per filter string.
        let name = tc.fully_qualified_name();
        must_include.iter().all(|inc| name.contains(inc.as_str()))
            && !must_exclude.iter().any(|exc| name.contains(exc.as_str()))
    })
}


================================================
FILE: phd-tests/testcase_macro/Cargo.toml
================================================
[package]
name = "phd-testcase-macros"
version = "0.1.0"
edition = "2021"

[lib]
proc-macro = true
test = false
doctest = false

[dependencies]
heck.workspace = true
proc-macro2.workspace = true
proc-macro-error.workspace = true
quote.workspace = true
syn = { workspace = true, features = ["full"] }


================================================
FILE: phd-tests/testcase_macro/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use heck::ToShoutySnakeCase;
use proc_macro::TokenStream;
use proc_macro_error::{abort, proc_macro_error};
use quote::{format_ident, quote};
use syn::{parse_macro_input, spanned::Spanned, ItemFn};

/// The macro for labeling PHD testcases.
///
/// PHD testcase functions have the signature `fn test(ctx:
/// phd_testcase::TestContext)`. The macro inserts the function body into a
/// wrapper function that returns a `phd_testcase::TestOutcome` and creates an
/// entry in the test case inventory that allows the PHD runner to enumerate the
/// test.
#[proc_macro_error]
#[proc_macro_attribute]
pub fn phd_testcase(_attrib: TokenStream, input: TokenStream) -> TokenStream {
    let item_fn = parse_macro_input!(input as ItemFn);

    // Build the inventory record for this test. The `module_path!()` in the
    // generated code allows the test case to report the fully-qualified path to
    // itself regardless of where it's located.
    let fn_ident = item_fn.sig.ident.clone();
    let fn_name = fn_ident.to_string();
    let static_ident = format_ident!("{}", fn_name.to_shouty_snake_case());
    let submit: proc_macro2::TokenStream = quote! {
        #[linkme::distributed_slice(phd_testcase::TEST_CASES)]
        static #static_ident: phd_testcase::TestCase = phd_testcase::TestCase::new(
            module_path!(),
            #fn_name,
            phd_testcase::TestFunction { f: |ctx| Box::pin(#fn_ident(ctx)) }
        );
    };

    if item_fn.sig.asyncness.is_none() {
        abort!(item_fn.sig.span(), "PHD test cases must be async");
    }

    // Rebuild the test body into an immediately-executed function that returns
    // an `anyhow::Result`. This allows tests to use the `?` operator and to
    // `return Ok(())` to allow a test to pass early.
    let fn_vis = item_fn.vis.clone();
    let fn_sig = item_fn.sig.clone();
    let fn_block = item_fn.block.stmts;

    let remade_fn = quote! {
        #fn_vis #fn_sig -> TestOutcome {
            use tracing::Instrument;
            let res: phd_testcase::Result<()> = async {
                #(#fn_block)*
                Ok(())
            }.instrument(tracing::info_span!("test", path = %concat!(module_path!(), "::", #fn_name))).await;
            match res {
                Ok(()) => phd_testcase::TestOutcome::Passed,
                Err(e) => {
                    // Treat the test as skipped if the error downcasts to the
                    // phd_testcase "skipped" error type; otherwise, treat
                    // errors as failures.
                    if let Some(skipped) = e.downcast_ref::<phd_testcase::TestSkippedError>() {
                        let phd_testcase::TestSkippedError::TestSkipped(msg) = skipped;
                        phd_testcase::TestOutcome::Skipped(msg.clone())
                    } else {
                        let msg = format!("{e:?}");
                        phd_testcase::TestOutcome::Failed(Some(msg))
                    }
                }
            }
        }
    };

    quote! {
        #remade_fn

        #submit
    }
    .into()
}

/// Marks a test as skipped by returning early from the test body.
///
/// Called with no arguments, the skip carries no message. Otherwise the
/// argument may be any expression that has a `to_string` method; its string
/// form becomes the skip message.
#[proc_macro]
pub fn phd_skip(args: TokenStream) -> TokenStream {
    // Tokens for the `Option<String>` payload of the skip error.
    let message = if args.is_empty() {
        quote! { None }
    } else {
        let expr = parse_macro_input!(args as proc_macro2::TokenStream);
        quote! { Some((#expr).to_string()) }
    };

    // The early return below yields a `phd_testcase::TestSkippedError`. The
    // `phd_testcase` macro tries to downcast any error returned from the test
    // body to that type, and marks the test as skipped when the downcast
    // succeeds.
    let early_return = quote! {
        return Err(phd_testcase::TestSkippedError::TestSkipped(#message).into());
    };
    early_return.into()
}


================================================
FILE: phd-tests/tests/Cargo.toml
================================================
[package]
name = "phd-tests"
version = "0.1.0"
edition = "2021"

[lib]
test = false
doctest = false

[dependencies]
anyhow.workspace = true
backoff.workspace = true
byteorder.workspace = true
chrono.workspace = true
cpuid_utils.workspace = true
dropshot.workspace = true
futures.workspace = true
http.workspace = true
itertools.workspace = true
linkme.workspace = true
omicron-common.workspace = true
oximeter-producer.workspace = true
oximeter.workspace = true
phd-testcase.workspace = true
propolis-client.workspace = true
reqwest.workspace = true
slog-term.workspace = true
slog.workspace = true
strum.workspace = true
tokio = { workspace = true, features = ["time"] }
tracing.workspace = true
uuid.workspace = true


================================================
FILE: phd-tests/tests/src/boot_order/efi_utils.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! EFI variable parsing and manipulation utilities.
//!
//! Conceptually, this would be a separate crate. Something like `uefi`, or
//! maybe more accurately, `uefi-raw`. Those crates are very oriented towards
//! *being* the platform firmware though - it's not clear how to use them to
//! parse a boot option into a device path, for example, though they clearly are
//! able to support processing device paths.
//!
//! So instead, this is enough supporting logic for our tests in Propolis.

use anyhow::{bail, Error};
use byteorder::{LittleEndian, ReadBytesExt};
use phd_testcase::*;
use std::collections::HashMap;
use std::fmt::Write;
use std::io::{Cursor, Read};
use tracing::{info, trace, warn};

// First, some GUIDs. These GUIDs come from EDK2, and OVMF reuses them. Notably
// these are the raw bytes of the GUID: textual values will have slightly
// different ordering of bytes.
//
// Some source references, as you won't find these GUIDs in a UEFI or related
// spec document. The firmware volume is identified by what seems to be the DXE
// firmware volume:
// https://github.com/tianocore/edk2/blob/712797c/OvmfPkg/OvmfPkgIa32.fdf#L181
// introduced in
// https://github.com/tianocore/edk2/commit/16f26de663967b5a64140b6abba2c145ea50194c,
// note this is the DXEFV entry.
//
// The *files* we'll care about in this test are identified by other GUIDs in
// the above *volume*.
//
// EFI Internal Shell:
// https://github.com/tianocore/edk2/blob/a445e1a/ShellPkg/ShellPkg.dec#L59-L60
// UiApp:
// https://github.com/tianocore/edk2/blob/a445e1a/MdeModulePkg/Application/UiApp/UiApp.inf#L13
/// Raw bytes of the GUID of the EDK2 firmware volume holding the files below.
/// Textual form: `7cb8bdc9-f8eb-4f34-aaea-3ee4af6516a1`.
pub(crate) const EDK2_FIRMWARE_VOL_GUID: &[u8; 16] = &[
    0xc9, 0xbd, 0xb8, 0x7c, 0xeb, 0xf8, 0x34, 0x4f, 0xaa, 0xea, 0x3e, 0xe4,
    0xaf, 0x65, 0x16, 0xa1,
];
/// Raw bytes of the GUID of the UiApp firmware file.
/// Textual form: `462caa21-7614-4503-836e-8ab6f4662331`.
pub(crate) const EDK2_UI_APP_GUID: &[u8; 16] = &[
    0x21, 0xaa, 0x2c, 0x46, 0x14, 0x76, 0x03, 0x45, 0x83, 0x6e, 0x8a, 0xb6,
    0xf4, 0x66, 0x23, 0x31,
];
/// Raw bytes of the GUID of the EFI Internal Shell firmware file.
/// Textual form: `7c04a583-9e3e-4f1c-ad65-e05268d0b4d1`.
pub(crate) const EDK2_EFI_SHELL_GUID: &[u8; 16] = &[
    0x83, 0xa5, 0x04, 0x7c, 0x3e, 0x9e, 0x1c, 0x4f, 0xad, 0x65, 0xe0, 0x52,
    0x68, 0xd0, 0xb4, 0xd1,
];

// The variable namespace `8be4df61-93ca-11d2-aa0d-00e098032b8c` comes from
// UEFI, as do the variable names here. The presentation as
// `{varname}-{namespace}`, and at a path like `/sys/firmware/efi/efivars/`, are
// both Linux `efivars`-isms.
//
// These tests likely will not pass when run with other guest OSes.
/// The `BootCurrent` variable, named in the `{varname}-{namespace}` form.
pub(crate) const BOOT_CURRENT_VAR: &str =
    "BootCurrent-8be4df61-93ca-11d2-aa0d-00e098032b8c";
/// The `BootOrder` variable, named in the `{varname}-{namespace}` form.
pub(crate) const BOOT_ORDER_VAR: &str =
    "BootOrder-8be4df61-93ca-11d2-aa0d-00e098032b8c";

/// Returns the `{varname}-{namespace}` name of the `Boot####` variable for
/// boot option `num`, where `####` is `num` as four uppercase hex digits.
pub(crate) fn bootvar(num: u16) -> String {
    let index = format!("{num:04X}");
    ["Boot", index.as_str(), "-8be4df61-93ca-11d2-aa0d-00e098032b8c"].concat()
}

/// Returns the path under `/sys/firmware/efi/efivars/` for the variable named
/// `varname`.
pub(crate) fn efipath(varname: &str) -> String {
    let mut path = String::from("/sys/firmware/efi/efivars/");
    path.push_str(varname);
    path
}

/// A (very limited) parse of an `EFI_LOAD_OPTION` descriptor.
#[derive(Debug)]
pub(crate) struct EfiLoadOption {
    /// The option's description, decoded from its null-terminated UTF-16
    /// `Description` field.
    pub description: String,
    /// The load path parsed from the start of the option's file path list.
    pub path: EfiLoadPath,
}

/// The two-element device path prefixes that the load option parser
/// recognizes.
#[derive(Debug)]
pub(crate) enum EfiLoadPath {
    /// An ACPI device path element followed by a PCI device path element.
    Device { acpi_root: DevicePath, pci_device: DevicePath },
    /// A firmware volume element followed by a firmware file element.
    FirmwareFile { volume: DevicePath, file: DevicePath },
}

impl EfiLoadPath {
    /// Returns `true` if this path is a firmware volume/file pair whose volume
    /// GUID is `fw_vol` and whose file GUID is `fw_file`.
    pub fn matches_fw_file(
        &self,
        fw_vol: &[u8; 16],
        fw_file: &[u8; 16],
    ) -> bool {
        match self {
            EfiLoadPath::FirmwareFile {
                volume: DevicePath::FirmwareVolume { guid: volume_guid },
                file: DevicePath::FirmwareFile { guid: file_guid },
            } => volume_guid == fw_vol && file_guid == fw_file,
            _ => false,
        }
    }

    /// Returns `true` if this path is an ACPI/PCI pair whose PCI element has
    /// the given device and function numbers.
    pub fn matches_pci_device_function(
        &self,
        pci_device: u8,
        pci_function: u8,
    ) -> bool {
        self.as_pci_device_function() == Some((pci_device, pci_function))
    }

    /// Returns the `(device, function)` numbers of the PCI element if this
    /// path is an ACPI/PCI pair, and `None` otherwise.
    pub fn as_pci_device_function(&self) -> Option<(u8, u8)> {
        match self {
            EfiLoadPath::Device {
                acpi_root: DevicePath::Acpi { .. },
                pci_device: DevicePath::Pci { device, function },
            } => Some((*device, *function)),
            _ => None,
        }
    }
}

/// A single device path element, limited to the kinds `parse_from` supports.
// The `Acpi` fields are not explicitly used (yet), but are useful for `Debug`
// purposes.
#[allow(dead_code)]
#[derive(Debug, Clone, Copy)]
pub(crate) enum DevicePath {
    /// An ACPI device path element (type 2, subtype 1).
    Acpi { hid: u32, uid: u32 },
    /// A PCI device path element (type 1, subtype 1).
    Pci { device: u8, function: u8 },

    // These two are described in sections 8.2 and 8.3 of the UEFI PI spec,
    // respectively. Version 1.6 can be found at
    // https://uefi.org/sites/default/files/resources/PI_Spec_1.6.pdf
    /// A firmware volume element (type 4, subtype 7), identified by raw GUID
    /// bytes.
    FirmwareVolume { guid: [u8; 16] },
    /// A firmware file element (type 4, subtype 6), identified by raw GUID
    /// bytes.
    FirmwareFile { guid: [u8; 16] },
}

impl DevicePath {
    fn parse_from(bytes: &mut Cursor<&[u8]>) -> Result<DevicePath, Error> {
        let ty = bytes.read_u8()?;
        let subtype = bytes.read_u8()?;

        macro_rules! check_size {
            ($desc:expr, $size: expr, $expect:expr) => {
                if $size != $expect {
                    bail!(
                        "{} size is wrong (was {:#04x}, not {:#04x}",
                        $desc,
                        $size,
                        $expect,
                    );
                }
            };
        }

        match (ty, subtype) {
            (2, 1) => {
                // ACPI Device Path
                let size = bytes.read_u16::<LittleEndian>()?;
                check_size!("ACPI Device Path", size, 0xc);
                let hid = bytes.read_u32::<LittleEndian>().unwrap();
                let uid = bytes.read_u32::<LittleEndian>().unwrap();
                Ok(DevicePath::Acpi { hid, uid })
            }
            (1, 1) => {
                // PCI device path
                let size = bytes.read_u16::<LittleEndian>()?;
                check_size!("PCI Device Path", size, 0x6);
                let function = bytes.read_u8().unwrap();
                let device = bytes.read_u8().unwrap();
                Ok(DevicePath::Pci { device, function })
            }
            (4, 6) => {
                // "PIWG Firmware File" aka "Firmware File" in UEFI PI spec
                let size = bytes.read_u16::<LittleEndian>()?;
                check_size!("Firmware File", size, 0x14);
                let mut guid = [0u8; 16];
                bytes.read_exact(&mut guid)?;
                Ok(DevicePath::FirmwareFile { guid })
            }
            (4, 7) => {
                // "PIWG Firmware Volume" aka "Firmware Volume" in UEFI PI spec
                let size = bytes.read_u16::<LittleEndian>()?;
                check_size!("Firmware Volume", size, 0x14);
                let mut guid = [0u8; 16];
                bytes.read_exact(&mut guid)?;
                Ok(DevicePath::FirmwareVolume { guid })
            }
            (ty, subtype) => {
                bail!(
                    "Device path type/subtype unsupported: ({ty:#02x}/{subtype:#02x})"
                );
            }
        }
    }
}

impl EfiLoadOption {
    // parsing here brought to you by rereading
    // * https://uefi.org/specs/UEFI/2.10/10_Protocols_Device_Path_Protocol.html
    // * https://uefi.org/specs/UEFI/2.10/03_Boot_Manager.html
    pub(crate) fn parse_from(
        bytes: &mut Cursor<&[u8]>,
    ) -> Result<EfiLoadOption, Error> {
        let _attributes = bytes.read_u32::<LittleEndian>()?;
        let file_path_list_length = bytes.read_u16::<LittleEndian>()?;

        // The `Description` field is a null-terminated string of char16.
        let mut description_chars: Vec<u16> = Vec::new();

        loop {
            let c = bytes.read_u16::<LittleEndian>()?;
            if c == 0 {
                break;
            }
            description_chars.push(c);
        }

        let description = String::from_utf16(&description_chars)
            .expect("description is valid utf16");

        let mut device_path_cursor = Cursor::new(
            &bytes.get_ref()[bytes.position() as usize..]
                [..file_path_list_length as usize],
        );

        let path_entry = DevicePath::parse_from(&mut device_path_cursor)
            .map_err(|e| {
                anyhow::anyhow!("unable to parse device path element: {e:?}")
            })?;
        let load_path = match path_entry {
            acpi_root @ DevicePath::Acpi { .. } => {
                let pci_device =
                    DevicePath::parse_from(&mut device_path_cursor)
                        .expect("can read device path element");
                if !matches!(pci_device, DevicePath::Pci { .. }) {
                    bail!(
                        "expected ACPI Device Path entry to be followed by \
                        a PCI Device Path, but was {pci_device:?}"
                    );
                }

                EfiLoadPath::Device { acpi_root, pci_device }
            }
            volume @ DevicePath::FirmwareVolume { .. } => {
                let file = DevicePath::parse_from(&mut device_path_cursor)
                    .expect("can read device path element");
                if !matches!(file, DevicePath::FirmwareFile { .. }) {
                    bail!(
                        "expected Firmware Volume entry to be followed by \
                        a Firmware File, but was {file:?}"
                    );
                }

                EfiLoadPath::FirmwareFile { volume, file }
            }
            other => {
                bail!("unexpected root EFI Load Option path item: {other:?}");
            }
        };

        // Not strictly necessary, but advance `bytes` by the number of bytes we
        // read from `device_path_cursor`. To callers, this keeps it as if we
        // had just been reading `bytes` all along.
        bytes.set_position(bytes.position() + device_path_cursor.position());

        Ok(EfiLoadOption { description, path: load_path })
    }

    pub fn pci_device_function(&self) -> (u8, u8) {
        let EfiLoadPath::Device {
            pci_device: DevicePath::Pci { device, function },
            ..
        } = self.path
        else {
            panic!(
                "expected load path to be an ACPI/PCI pair, but was {:?}",
                self.path
            );
        };
        (device, function)
    }
}

/// Decodes a string of two-character hexadecimal pairs into bytes, ignoring
/// any newlines in the input.
///
/// # Panics
///
/// Panics if, once newlines are removed, the input has an odd number of bytes
/// or contains a pair that is not valid hexadecimal.
fn unhex(s: &str) -> Vec<u8> {
    let digits = s.replace("\n", "");
    trace!("unhexing {}", digits);

    digits
        .as_bytes()
        .chunks(2)
        .map(|pair| {
            assert_eq!(pair.len(), 2);

            let text = std::str::from_utf8(pair).expect("valid string");

            u8::from_str_radix(text, 16).expect("can parse")
        })
        .collect()
}

/// Reads the EFI variable `varname` from inside the VM and returns its data as
/// bytes, without the attribute bytes that precede it.
pub(crate) async fn read_efivar(
    vm: &phd_framework::TestVm,
    varname: &str,
) -> Result<Vec<u8>, Error> {
    let var_path = efipath(varname);

    // Linux's `efivarfs` prepends 4 bytes of attributes to EFI variables, so
    // skip them and hex-dump only the data.
    let cmd = format!("dd status=none if={var_path} bs=1 skip=4 | xxd -p -");

    let hex_dump = vm.run_shell_command(&cmd).await?;
    let data = unhex(&hex_dump);

    Ok(data)
}

/// Writes `data` to the EFI variable `varname` from inside the VM.
///
/// For Linux guests: pass only the variable's data. The variable's current
/// attribute bytes are read back and prepended automatically, and default
/// attributes are used if the variable does not exist yet.
///
/// Returns an error if a shell command fails or if the write prints any
/// output.
pub(crate) async fn write_efivar(
    vm: &phd_framework::TestVm,
    varname: &str,
    data: &[u8],
) -> Result<(), Error> {
    let var_path = efipath(varname);

    let attr_cmd =
        format!("dd status=none if={var_path} bs=1 count=4 | xxd -p -");
    let attr_output = vm.run_shell_command(&attr_cmd).await?;

    // Start the new value with the attribute bytes, then append the data.
    let mut payload =
        if !attr_output.ends_with(": No such file or directory") {
            unhex(&attr_output)
        } else {
            // Default attributes if the variable does not exist yet. We expect
            // it to be non-volatile because we are writing it, available to
            // boot services (not strictly required, but boot configuration
            // needs it), and available at runtime (which is where we read and
            // write it).
            //
            // so:
            // NON_VOLATILE | BOOTSERVICE_ACCESS | RUNTIME_ACCESS
            const FRESH_ATTRS: u32 = 0x00_00_00_07;
            FRESH_ATTRS.to_le_bytes().to_vec()
        };
    payload.extend_from_slice(data);

    // The command to write this data back out will be, roughly:
    // ```
    // printf "\xAA\xAA\xAA\xAA\xDD\xDD\xDD\xDD" | \
    //   dd obs={inlen} of=/sys/firmware/efi/efivars/... status=none
    // ```
    // where AAAAAAAA are the attribute bytes and DDDDDDDD are caller-provided
    // data.
    //
    // Notably, do not printf directly to /sys/firmware/efi/efivars/*!! printf
    // may flush output early if the data to write contains a `\n` (observed at
    // least on Debian 11), and such a partial write to efivars may be rejected
    // as invalid UEFI variable data.
    let mut escaped = String::new();
    for byte in &payload {
        write!(escaped, "\\x{byte:02x}").expect("can append to String");
    }

    let cmd = format!(
        "printf \"{escaped}\" | dd obs={} of={var_path} status=none",
        payload.len()
    );

    let output = vm.run_shell_command(&cmd).await?;
    // A successful write is silent. Any output means something went sideways,
    // e.g. the write failed with something like `invalid argument`.
    if output.is_empty() {
        Ok(())
    } else {
        bail!("writing efi produced unexpected output: {output}")
    }
}

/// Build a map from a recognizable boot option name to the `Boot####` number
/// the guest firmware currently uses for it.
///
/// Option numbers cannot be assumed up front: NvVar state is persisted in guest
/// disks, so an image may carry `Boot####` entries from an earlier life that
/// were never removed, pushing entries for the current system to later numbers
/// than a blank variable store would have produced. Instead, walk `BootOrder`
/// and classify every entry it references:
///
/// * the EDK2 UI application is recorded as `"uiapp"`,
/// * the EDK2 EFI shell is recorded as `"efi shell"`,
/// * a PCI device whose `(device, function)` pair appears in `device_names`
///   is recorded under the paired name,
/// * any other option that parses is recorded under its own description.
///
/// Options that fail to parse, and PCI devices that are not listed in
/// `device_names`, are logged and left out of the map.
///
/// # Panics
///
/// Panics if `BootOrder` is not a whole number of two-byte entries, or if two
/// entries classify to the same non-PCI name.
pub(crate) async fn discover_boot_option_numbers(
    vm: &phd_framework::TestVm,
    device_names: &[((u8, u8), &'static str)],
) -> Result<HashMap<String, u16>> {
    let boot_order_bytes = read_efivar(vm, BOOT_ORDER_VAR).await?;
    info!("Initial boot order var: {:?}", boot_order_bytes);

    let mut found: HashMap<String, u16> = HashMap::new();

    for entry in boot_order_bytes.chunks(2) {
        assert_eq!(entry.len(), 2);
        let number = u16::from_le_bytes(entry.try_into().unwrap());

        let raw_option = read_efivar(vm, &bootvar(number)).await?;
        let mut reader = Cursor::new(raw_option.as_slice());

        let load_option = match EfiLoadOption::parse_from(&mut reader) {
            Err(e) => {
                warn!("Unhandled boot option: {:?}", e);
                continue;
            }
            Ok(option) => option,
        };

        let path = &load_option.path;

        if path.matches_fw_file(EDK2_FIRMWARE_VOL_GUID, EDK2_UI_APP_GUID) {
            let displaced = found.insert("uiapp".to_string(), number);
            assert_eq!(displaced, None);
        } else if path
            .matches_fw_file(EDK2_FIRMWARE_VOL_GUID, EDK2_EFI_SHELL_GUID)
        {
            let displaced = found.insert("efi shell".to_string(), number);
            assert_eq!(displaced, None);
        } else if let Some((device, function)) = path.as_pci_device_function()
        {
            // Look the device up in the caller's table of known devices.
            let known = device_names.iter().find_map(|(bdf, name)| {
                (bdf.0 == device && bdf.1 == function).then_some(name)
            });

            match known {
                Some(name) => {
                    found.insert(name.to_string(), number);
                }
                None => {
                    warn!(
                        "Unknown PCI boot device {:#x}.{:#x}",
                        device, function
                    );
                }
            }
        } else {
            warn!("Unknown boot option: {:?}", load_option);

            let displaced =
                found.insert(load_option.description.to_string(), number);
            assert_eq!(displaced, None);
        }
    }

    info!("Found boot options: {:?}", found);

    Ok(found)
}

/// Locate `option` within `order`, the raw contents of an EFI `BootOrder`
/// variable.
///
/// `order` is read as a packed sequence of little-endian `u16` option numbers.
/// The returned index counts two-byte entries, not bytes, and refers to the
/// first matching entry. Returns `None` if the option is not present; a
/// trailing odd byte never matches.
pub(crate) fn find_option_in_boot_order(
    order: &[u8],
    option: u16,
) -> Option<usize> {
    let needle = option.to_le_bytes();
    order.chunks(2).position(|entry| entry == needle)
}

/// Remove the boot option from `vm`'s EFI BootOrder variable. `boot_option_num`
/// is assumed to refer to a boot option named like
/// `format!("Boot{boot_option_num:04X}-*")`.
///
/// If the option is not in the boot order, nothing is written and `Ok(None)`
/// is returned. If the boot order was actually modified, returns the index
/// (counted in two-byte entries) that `boot_option_num` was removed at.
///
/// # Panics
///
/// Panics if `boot_option_num` appears in the boot order more than once.
pub(crate) async fn remove_boot_entry(
    vm: &phd_framework::TestVm,
    boot_option_num: u16,
) -> Result<Option<usize>> {
    let mut without_option = read_efivar(vm, BOOT_ORDER_VAR).await?;

    let Some(option_idx) =
        find_option_in_boot_order(&without_option, boot_option_num)
    else {
        return Ok(None);
    };

    // Boot option variables are named with four zero-padded hex digits, so
    // format the number the same way to keep the log greppable.
    info!(
        "Removing Boot{:04X} from the boot order. It was at index {}",
        boot_option_num, option_idx
    );

    // Each entry is two bytes wide; drop both bytes of the matching entry.
    let byte_offset = option_idx * 2;
    without_option.drain(byte_offset..byte_offset + 2);

    // Technically it's fine if an option is present multiple times, but
    // typically an option is present only once. Only the first occurrence was
    // removed above, so assert that it was also the only one and the option
    // is now entirely absent from the new order.
    assert_eq!(
        find_option_in_boot_order(&without_option, boot_option_num),
        None
    );

    write_efivar(vm, BOOT_ORDER_VAR, &without_option).await?;

    Ok(Some(option_idx))
}


================================================
FILE: phd-tests/tests/src/boot_order.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use anyhow::bail;
use phd_framework::{
    disk::{fat::FatFilesystem, DiskSource},
    test_vm::{DiskBackend, DiskInterface},
};
use phd_testcase::*;
use std::io::Cursor;
use tracing::warn;

mod efi_utils;

use efi_utils::{
    bootvar, discover_boot_option_numbers, efipath, find_option_in_boot_order,
    read_efivar, remove_boot_entry, write_efivar, EfiLoadOption,
    BOOT_CURRENT_VAR, BOOT_ORDER_VAR, EDK2_EFI_SHELL_GUID,
    EDK2_FIRMWARE_VOL_GUID, EDK2_UI_APP_GUID,
};

// This test checks that with a specified boot order, the guest boots whichever
// disk we wanted to come first. This is simple enough, until you want to know
// "what you booted from"..
//
// For live CDs, such as Alpine's, the system boots into a tmpfs loaded from a
// boot disk, but there's no clear line to what disk that live image *came
// from*. If you had two Alpine 3.20.3 images attached to one VM, you'd
// certainly boot into Alpine 3.20.3, but I don't see a way to tell *which
// disk* that Alpine would be sourced from, from Alpine alone.
//
// So instead, check EFI variables. To do this, then, we have to.. parse EFI
// variables. That is what this test does below, but it's probably not fully
// robust to what we might do with PCI devices in the future.
//
// A more "future-proof" setup might be to just boot an OS, see that we ended up
// in the OS we expected, and check some attribute about it like that the kernel
// version is what we expected the booted OS to be. That's still a good fallback
// if we discover that parsing EFI variables is difficult to stick with for any
// reason. It has a downside though: we'd have to keep a specific image around
// with a specific kernel version as either the "we expect to boot into this"
// image or the "we expected to boot into not this" cases.
//
// The simplest case: show that we can configure the guest's boot order from
// outside the machine.  This is the most likely common case, where Propolis is
// told what the boot order should be by Nexus and we simply make it happen.
//
// Unlike later tests, this test does not manipulate boot configuration from
// inside the guest OS.
#[phd_testcase]
async fn configurable_boot_order(ctx: &TestCtx) {
    let mut cfg = ctx.vm_config_builder("configurable_boot_order");

    // Attach a second disk built from the same artifact as the default
    // `boot-disk`. The guest environment is then identical whichever disk is
    // booted, and EFI variables tell us which disk was actually used.
    cfg.data_disk(
        "alt-boot",
        DiskSource::Artifact(ctx.default_guest_os_artifact()),
        DiskInterface::Virtio,
        DiskBackend::File,
        24,
    );

    // First pass: no boot order has been specified, so the guest is expected
    // to boot from the lower-numbered PCI device (4).
    let mut default_vm = ctx.spawn_vm(&cfg, None).await?;
    if !default_vm.guest_os_kind().is_linux() {
        phd_skip!("boot order tests require efivarfs to manipulate UEFI vars");
    }
    default_vm.launch().await?;
    default_vm.wait_to_boot().await?;

    let current = read_efivar(&default_vm, BOOT_CURRENT_VAR).await?;
    let current_num: u16 = u16::from_le_bytes(current.try_into().unwrap());
    let current_raw = read_efivar(&default_vm, &bootvar(current_num)).await?;
    let mut reader = Cursor::new(current_raw.as_slice());
    let booted = EfiLoadOption::parse_from(&mut reader).unwrap();

    assert!(booted.path.matches_pci_device_function(4, 0));

    // Second pass: same configuration, but with an explicit boot order that
    // lists the later PCI device first. This must change what gets booted.
    cfg.boot_order(vec!["alt-boot", "boot-disk"]);

    let mut ordered_vm = ctx.spawn_vm(&cfg, None).await?;
    ordered_vm.launch().await?;
    ordered_vm.wait_to_boot().await?;

    let current = read_efivar(&ordered_vm, BOOT_CURRENT_VAR).await?;
    let current_num: u16 = u16::from_le_bytes(current.try_into().unwrap());
    let current_raw = read_efivar(&ordered_vm, &bootvar(current_num)).await?;
    let mut reader = Cursor::new(current_raw.as_slice());
    let booted = EfiLoadOption::parse_from(&mut reader).unwrap();

    // Checking the PCI bus number as well would mean inspecting the AHCI
    // Device Path entry that precedes these PCI values. Only PCI bus 0 is in
    // use today, and the mapping from an AHCI Device Path to a PCI root is
    // not immediately obvious, so only device and function are compared.
    assert!(booted.path.matches_pci_device_function(24, 0));
}

// Closely related to the `in_memory_backend_smoke_test` test, with one extra
// requirement: the unbootable disk must be first in the boot order. A guest
// that still boots shows both that the boot order is respected and that a
// non-bootable disk ahead of the real one does not wedge startup.
#[phd_testcase]
async fn unbootable_disk_skipped(ctx: &TestCtx) {
    let mut cfg = ctx.vm_config_builder("unbootable_disk_skipped");

    cfg.data_disk(
        "unbootable",
        DiskSource::FatFilesystem(FatFilesystem::new()),
        DiskInterface::Virtio,
        DiskBackend::InMemory { readonly: true },
        16,
    );

    // `boot-disk` is the boot disk implicitly created from the default guest
    // OS artifact. Listing it second forces OVMF to try, and fail, to boot
    // `unbootable` before reaching it.
    cfg.boot_order(vec!["unbootable", "boot-disk"]);

    let mut vm = ctx.spawn_vm(&cfg, None).await?;
    if !vm.guest_os_kind().is_linux() {
        phd_skip!("boot order tests require efivarfs to manipulate UEFI vars");
    }
    vm.launch().await?;
    vm.wait_to_boot().await?;

    let current = read_efivar(&vm, BOOT_CURRENT_VAR).await?;
    let current_num: u16 = u16::from_le_bytes(current.try_into().unwrap());
    let current_raw = read_efivar(&vm, &bootvar(current_num)).await?;
    let mut reader = Cursor::new(current_raw.as_slice());
    let booted = EfiLoadOption::parse_from(&mut reader).unwrap();

    // PCI device 4 is the implicitly-used `boot-disk`. Seeing 16 here would
    // mean `unbootable` somehow booted.
    assert_eq!(booted.pci_device_function(), (4, 0));

    let boot_order_bytes = read_efivar(&vm, BOOT_ORDER_VAR).await?;

    // When a boot order is specified via fwcfg, OVMF adds two entries of its
    // own:
    // * "UiApp", about which not much is documented
    // * "EFI Internal Shell", the EFI shell the system drops into if no disks
    //   are bootable
    //
    // Their exact positions don't matter much. What does matter is that the
    // order we specified appears intact and ahead of "EFI Internal Shell".
    // Walk the boot order with a small state machine to check that.
    #[derive(Debug, PartialEq, Eq)]
    enum TestState {
        SeekingUnbootable,
        FoundUnbootable,
        AfterBootOrder,
    }

    let mut state = TestState::SeekingUnbootable;

    for entry in boot_order_bytes.chunks(2) {
        let number: u16 = u16::from_le_bytes(entry.try_into().unwrap());
        let raw_option = read_efivar(&vm, &bootvar(number)).await?;
        let mut reader = Cursor::new(raw_option.as_slice());
        let load_option = EfiLoadOption::parse_from(&mut reader).unwrap();
        let path = &load_option.path;

        // Each entry either advances the state machine, leaves it where it
        // is, or fails the test.
        state = match state {
            TestState::SeekingUnbootable => {
                if path.matches_pci_device_function(16, 0) {
                    TestState::FoundUnbootable
                } else if path
                    .matches_fw_file(EDK2_FIRMWARE_VOL_GUID, EDK2_UI_APP_GUID)
                {
                    // `UiApp` may precede our entries; skip over it.
                    TestState::SeekingUnbootable
                } else {
                    bail!(
                        "Did not expect to find {load_option:?} yet (test state = {state:?})"
                    );
                }
            }
            TestState::FoundUnbootable => {
                if path.matches_pci_device_function(4, 0) {
                    TestState::AfterBootOrder
                } else {
                    bail!(
                        "Did not expect to find {load_option:?} (test state = {state:?})"
                    );
                }
            }
            TestState::AfterBootOrder => {
                let is_ui_app = path
                    .matches_fw_file(EDK2_FIRMWARE_VOL_GUID, EDK2_UI_APP_GUID);
                let is_efi_shell = path
                    .matches_fw_file(EDK2_FIRMWARE_VOL_GUID, EDK2_EFI_SHELL_GUID);
                if is_ui_app || is_efi_shell {
                    TestState::AfterBootOrder
                } else {
                    bail!(
                        "Did not expect to find {load_option:?} (test state = {state:?})"
                    );
                }
            }
        };
    }

    assert_eq!(state, TestState::AfterBootOrder);
}

// Start with the boot order being `["boot-disk", "unbootable"]`, then, from
// inside the guest, duplicate the booted option as `BootFFF0` and `BootFFFF`
// and insert both into `BootOrder` directly after the booted entry. Then
// reboot and verify that the modified boot order persisted and that the guest
// booted from the same option as before.
#[phd_testcase]
async fn guest_can_adjust_boot_order(ctx: &TestCtx) {
    let mut cfg = ctx.vm_config_builder("guest_can_adjust_boot_order");

    cfg.data_disk(
        "unbootable",
        DiskSource::FatFilesystem(FatFilesystem::new()),
        DiskInterface::Virtio,
        DiskBackend::InMemory { readonly: true },
        16,
    );

    cfg.boot_order(vec!["boot-disk", "unbootable"]);

    let mut vm = ctx.spawn_vm(&cfg, None).await?;
    if !vm.guest_os_kind().is_linux() {
        phd_skip!("boot order tests require efivarfs to manipulate UEFI vars");
    }
    vm.launch().await?;
    vm.wait_to_boot().await?;

    // If the guest doesn't have an EFI partition then there's no way for boot
    // order preferences to be persisted.
    let mountline =
        vm.run_shell_command("mount | grep efivarfs").ignore_status().await?;

    if !mountline.starts_with("efivarfs on ") {
        warn!(
            "guest doesn't have an efivarfs, cannot manage boot order! \
            exiting test WITHOUT VALIDATING ANYTHING"
        );
        return Ok(());
    }

    // Try adding a few new boot options, then add them to the boot order,
    // reboot, and make sure they're all as we set them.
    let bootffff_path = efipath(&bootvar(0xffff));
    let bootffff_res = vm
        .run_shell_command(&format!("ls {bootffff_path}"))
        .ignore_status()
        .await?;
    // `ls` just prints the file path if it exists, but the error text varies a
    // bit depending on Alpine, Ubuntu, Busybox, etc. Notionally we could
    // `check_err()` above, but having a `BootFFFF` entry already is merely
    // weird; we can still replace it and continue with the test.
    if bootffff_res == bootffff_path {
        warn!(
            "guest environment already has a BootFFFF entry; \
            is this not a fresh image?"
        );
    }

    let boot_num: u16 = {
        let bytes = read_efivar(&vm, BOOT_CURRENT_VAR).await?;
        u16::from_le_bytes(bytes.try_into().unwrap())
    };

    // The entry we booted from is clearly valid, so we should be able to insert
    // a few duplicate entries. We won't boot into them, but if something bogus
    // happens and we did boot one of these, at least it'll work and we can
    // detect the misbehavior.
    //
    // But here's a weird one: if we just append these to the end, on reboot
    // they'll be moved somewhat up the boot order. This occurs both if setting
    // variables through `efibootmgr` or by writing to
    // /sys/firmware/efi/efivars/BootOrder-* directly. As an example, say we had
    // a boot order of "0004,0001,0003,0000" where boot options were as follows:
    // * 0000: UiApp
    // * 0001: PCI device 4, function 0
    // * 0003: EFI shell (Firmware volume+file)
    // * 0004: Ubuntu (HD partition 15, GPT formatted)
    //
    // If we duplicate entry 4 to new options FFF0 and FFFF, reset the boot
    // order to "0004,0001,0003,0000,FFF0,FFFF", then reboot the VM, the boot
    // order when it comes back up will be "0004,0001,FFF0,FFFF,0003,0000".
    //
    // This almost makes sense, but with other devices in the mix I've seen
    // reorderings like `0004,0001,<PCI 16.0>,0003,0000,FFF0,FFFF` turning into
    // `0004,0001,FFF0,FFFF,<PCI 16.0>,0003,0000`. This is particularly strange
    // in that the new options were reordered around some other PCI device. It's
    // not the boot order we set!
    //
    // So, to at least confirm we *can* modify the boot order in a stable way,
    // make a somewhat less ambitious change: insert the duplicate boot options
    // in the order directly after the option they are duplicates of. This seems
    // to not get reordered.
    let boot_option_bytes = read_efivar(&vm, &bootvar(boot_num)).await?;

    // Finally, seeing a read-write `efivarfs` is not sufficient to know that
    // writes to EFI variables will actually stick. For example, an Alpine live
    // image backed by an ISO 9660 filesystem may have an EFI System Partition
    // and `efivarfs`, but certainly cannot persist state and will drop writes
    // to EFI variables.
    //
    // Check for this condition and exit early if the guest OS configuration
    // will not let us perform a useful test.
    write_efivar(&vm, &bootvar(0xfff0), &boot_option_bytes).await?;
    let reread = read_efivar(&vm, &bootvar(0xfff0)).await?;
    if reread.is_empty() {
        phd_skip!("Guest environment drops EFI variable writes");
    }
    assert_eq!(
        boot_option_bytes, reread,
        "EFI variable write wrote something, but not what we expected?"
    );

    // Build the new boot order: everything up to and including the booted
    // entry, then the two duplicates, then whatever followed the booted entry.
    let mut new_boot_order = read_efivar(&vm, BOOT_ORDER_VAR).await?;
    let booted_idx = find_option_in_boot_order(&new_boot_order, boot_num)
        .expect("booted entry exists");
    let suffix = new_boot_order.split_off((booted_idx + 1) * 2);
    new_boot_order.extend_from_slice(&[0xf0, 0xff]);
    new_boot_order.extend_from_slice(&[0xff, 0xff]);
    new_boot_order.extend_from_slice(&suffix);

    write_efivar(&vm, &bootvar(0xfff0), &boot_option_bytes).await?;
    assert_eq!(boot_option_bytes, read_efivar(&vm, &bootvar(0xfff0)).await?);
    write_efivar(&vm, &bootvar(0xffff), &boot_option_bytes).await?;
    assert_eq!(boot_option_bytes, read_efivar(&vm, &bootvar(0xffff)).await?);

    write_efivar(&vm, BOOT_ORDER_VAR, &new_boot_order).await?;
    let written_boot_order = read_efivar(&vm, BOOT_ORDER_VAR).await?;
    assert_eq!(new_boot_order, written_boot_order);

    // Now, reboot and check that the settings stuck.
    vm.graceful_reboot().await?;

    let boot_order_after_reboot = read_efivar(&vm, BOOT_ORDER_VAR).await?;
    assert_eq!(new_boot_order, boot_order_after_reboot);

    let boot_num_after_reboot: u16 = {
        let bytes = read_efivar(&vm, BOOT_CURRENT_VAR).await?;
        u16::from_le_bytes(bytes.try_into().unwrap())
    };
    assert_eq!(boot_num, boot_num_after_reboot);

    let boot_option_bytes_after_reboot =
        read_efivar(&vm, &bootvar(boot_num)).await?;
    assert_eq!(boot_option_bytes, boot_option_bytes_after_reboot);
}

// This test is less demonstrating specific desired behavior, and more the
// observed behavior of OVMF with configuration we can offer today. If Propolis
// or other changes break this test, the test may well be what needs changing.
//
// If a `bootorder` file is present in fwcfg, there are two relevant
// consequences demonstrated here:
// * The order of devices in `bootorder` is the order that will be used; on
//   reboot any persisted configuration will be replaced with one derived from
//   `bootorder` and corresponding OVMF logic.
// * Guests cannot meaningfully change boot order. If an entry is in
//   `bootorder`, that determines its order. If it is not in `bootorder` but is
//   retained for booting, it is appended to the end of the boot order in what
//   seems to be the order that OVMF discovers the device.
//
// If `bootorder` is removed for subsequent reboots, the EFI System Partition's
// store of NvVar variables is the source of boot order, and guests can control
// their boot fates.
#[phd_testcase]
async fn boot_order_source_priority(ctx: &TestCtx) {
    let mut cfg = ctx.vm_config_builder("boot_order_source_priority");

    cfg.data_disk(
        "unbootable",
        DiskSource::FatFilesystem(FatFilesystem::new()),
        DiskInterface::Virtio,
        DiskBackend::InMemory { readonly: true },
        16,
    );

    cfg.data_disk(
        "unbootable-2",
        DiskSource::FatFilesystem(FatFilesystem::new()),
        DiskInterface::Virtio,
        DiskBackend::InMemory { readonly: true },
        20,
    );

    // For the first stage of this test, we want to leave the boot procedure up
    // to whatever the guest firmware will do.
    cfg.clear_boot_order();

    let mut vm_no_bootorder = ctx.spawn_vm(&cfg, None).await?;
    if !vm_no_bootorder.guest_os_kind().is_linux() {
        phd_skip!("boot order tests require efivarfs to manipulate UEFI vars");
    }
    vm_no_bootorder.launch().await?;
    vm_no_bootorder.wait_to_boot().await?;

    let boot_option_numbers = discover_boot_option_numbers(
        &vm_no_bootorder,
        &[
            ((4, 0), "boot-disk"),
            ((16, 0), "unbootable"),
            ((20, 0), "unbootable-2"),
        ],
    )
    .await?;

    // `unbootable` should be somewhere in the middle of the boot order:
    // definitely between `boot-disk` and `unbootable-2`, for the options
    // enumerated from PCI devices.
    let unbootable_num = boot_option_numbers["unbootable"];

    let unbootable_idx = remove_boot_entry(&vm_no_bootorder, unbootable_num)
        .await?
        .expect("unbootable was in the boot order");

    vm_no_bootorder.graceful_reboot().await?;

    let reloaded_order = read_efivar(&vm_no_bootorder, BOOT_ORDER_VAR).await?;

    // Somewhat unexpected, but where OVMF gets us: `unbootable` is back in the
    // boot order, but at the end of the list. One might hope it would be
    // entirely removed from the boot order now, but no such luck. The good news
    // is that we can in fact influence the boot order.
    let unbootable_idx_after_reboot =
        find_option_in_boot_order(&reloaded_order, unbootable_num)
            .expect("unbootable is back in the order");

    let last_boot_option = &reloaded_order[reloaded_order.len() - 2..];
    assert_eq!(last_boot_option, &unbootable_num.to_le_bytes());

    // But this new position for `unbootable` definitely should be different
    // from before.
    assert_ne!(unbootable_idx, unbootable_idx_after_reboot);

    // And if we do the whole dance again with an explicit boot order provided
    // to the guest, we'll get different results!
    drop(vm_no_bootorder);
    cfg.boot_order(vec!["boot-disk", "unbootable", "unbootable-2"]);

    let mut vm = ctx.spawn_vm(&cfg, None).await?;
    vm.launch().await?;
    vm.wait_to_boot().await?;

    let boot_option_numbers = discover_boot_option_numbers(
        &vm,
        &[
            ((4, 0), "boot-disk"),
            ((16, 0), "unbootable"),
            ((20, 0), "unbootable-2"),
        ],
    )
    .await?;

    let unbootable_num = boot_option_numbers["unbootable"];

    // Try removing a fw_cfg-defined boot option.
    let unbootable_idx = remove_boot_entry(&vm, unbootable_num)
        .await?
        .expect("unbootable was in the boot order");

    vm.graceful_reboot().await?;

    let reloaded_order = read_efivar(&vm, BOOT_ORDER_VAR).await?;

    // The option will be back in the boot order, where it was before! This is
    // because fwcfg still has a `bootorder` file.
    assert_eq!(
        find_option_in_boot_order(&reloaded_order, unbootable_num),
        Some(unbootable_idx)
    );
}

// Pin the description OVMF assigns to the boot option of an NVMe disk, so that
// a change to that description is noticed rather than introduced silently. The
// comments at the end of the test explain what the expected value is derived
// from and why an unnoticed change could matter to guests.
#[phd_testcase]
async fn nvme_boot_option_description(ctx: &TestCtx) {
    let mut cfg = ctx.vm_config_builder("nvme_boot_option_description");

    cfg.data_disk(
        "nvme-test-disk",
        DiskSource::Artifact(ctx.default_guest_os_artifact()),
        DiskInterface::Nvme,
        DiskBackend::File,
        8,
    );

    // We'll boot to `boot-disk`, but this test actually cares about the
    // description of `nvme-test-disk`. Ensure it's in the boot order list so
    // that we'll have a `BootNNNN` option for it.
    cfg.boot_order(vec!["boot-disk", "nvme-test-disk"]);

    let mut vm = ctx.spawn_vm(&cfg, None).await?;
    if !vm.guest_os_kind().is_linux() {
        phd_skip!("boot option description test depends on efivarfs");
    }
    vm.launch().await?;
    vm.wait_to_boot().await?;

    // Map the two disks' PCI (device, function) pairs to their `Boot####`
    // option numbers.
    let boot_option_numbers = discover_boot_option_numbers(
        &vm,
        &[((4, 0), "boot-disk"), ((8, 0), "nvme-test-disk")],
    )
    .await?;

    let test_disk_option: u16 = boot_option_numbers["nvme-test-disk"];

    let test_disk_option_bytes =
        read_efivar(&vm, &bootvar(test_disk_option)).await?;

    let mut cursor = Cursor::new(test_disk_option_bytes.as_slice());

    let load_option = EfiLoadOption::parse_from(&mut cursor).unwrap();

    // Just a quick integrity check: we just put `nvme-test-disk` at PCI slot 8,
    // so we should be comparing to a load option describing PCI slot 8. If
    // these don't match, the description checking below would probably be a red
    // herring.
    assert!(load_option.path.matches_pci_device_function(8, 0));

    // The test assertion here is "UEFI  2" because we currently expect an NVMe
    // boot option to be named via the following procedure:
    // * fw_cfg bootorder (via `cfg.boot_order()` above) specifies `boot-disk`
    //   first, and `nvme-test-disk` second.
    // * OVMF processes boot options in that order. For each option:
    //   * try determining a boot description via these handlers in order:
    //     https://github.com/oxidecomputer/edk2/blob/propolis/edk2-stable202105/MdeModulePkg/Library/UefiBootManagerLib/BmBootDescription.c#L749-L756
    // * `boot-disk` is NVMe and described by BmGetNvmeDescription
    // * in that function, OVMF sends an NVMe IDENTIFY CONTROLLER command:
    //   https://github.com/oxidecomputer/edk2/blob/propolis/edk2-stable202105/MdeModulePkg/Library/UefiBootManagerLib/BmBootDescription.c#L600-L618
    // * the returned identification information has the following Mn/Sn:
    //   - Mn: default (`[0; 40]`):
    //     https://github.com/oxidecomputer/propolis/blob/5fe523a/lib/propolis/src/hw/nvme/bits.rs#L1001
    //   - Sn: the first 20 characters of the disk name. Here: "boot-disk"
    //     https://github.com/oxidecomputer/propolis/blob/5fe523a/lib/propolis/src/hw/nvme/mod.rs#L507-L532
    // * OVMF assembles the identification information into a wide-char string
    //   like "\x00\x00\x00\x00\x00... boot-disk\x00\x00...":
    //   https://github.com/oxidecomputer/edk2/blob/propolis/edk2-stable202105/MdeModulePkg/Library/UefiBootManagerLib/BmBootDescription.c#L628-L643
    // * The preliminary description has "UEFI " prepended to it:
    //   https://github.com/oxidecomputer/edk2/blob/propolis/edk2-stable202105/MdeModulePkg/Library/UefiBootManagerLib/BmBootDescription.c#L788-L790
    // * `StrCatS` appends the preliminary description to this new string.
    //   Because the model number is all nulls, the first character of
    //   "boot-disk"'s description is \x00, and `StrCatS` immediately returns
    //   having added nothing to the description:
    //   https://github.com/oxidecomputer/edk2/blob/propolis/edk2-stable202105/MdeModulePkg/Library/UefiBootManagerLib/BmBootDescription.c#L791
    // * At this point "boot-disk"'s description is "UEFI ". The same procedure
    //   runs for "nvme-test-disk" and describes it "UEFI " as well.
    // * Finally, `BmMakeBootOptionDescriptionUnique` runs and appends " 2" to
    //   make "nvme-test-disk"'s description distinct from "boot-disk". At this
    //   point, "nvme-test-disk"'s description is "UEFI  2":
    //   https://github.com/oxidecomputer/edk2/blob/propolis/edk2-stable202105/MdeModulePkg/Library/UefiBootManagerLib/BmBootDescription.c#L863-L868
    const EXPECTED_BOOT_DESCRIPTION: &str = "UEFI  2";

    // Hey! If this assertion failed, you may have done a good thing!
    //
    // This test's primary purpose is to ensure we do not *unknowingly* change
    // the description of OVMF-determined boot options. It is not unacceptable
    // that these options change, but changing them requires careful
    // consideration. Specifically, as of writing this test, if a device has:
    // * a boot option automatically determined by EDK2
    // * has that option persisted to NvVars
    // * a boot option with changed name on subsequent boot
    // the previously valid automatically-added boot option will be removed from
    // the boot order, and a new option with the new name will be added to the
    // end of the boot order.
    //
    // At this point, if the EFI shell is in the boot order list and in front of
    // a disk with a bootable OS on it, a guest VM could end up simply booting
    // into the EFI shell and get "stuck" there. This is not ideal, especially
    // since operating the EFI shell is not very well documented.
    //
    // So, if this assertion failed, something caused the
    // automatically-determined boot option description to change. You may be
    // providing a model number in the NVMe IDENTIFY CONTROLLER command, or OVMF
    // may be using different logic to determine descriptions. Presumably you've
    // changed something, so you'd have a better guess than me. If UEFI NvVars
    // are still retained in user-managed disks, where we are not managing the
    // ESP or NvVars data ourselves, then we probably should preserve existing
    // disk boot option descriptions. This test would be a great place to ensure
    // any new compatibility mechanism also works correctly. If UEFI NvVars are
    // provided through an emulated firmware device, or we're being more
    // invasive with changes to OVMF including boot order determination, then
    // maybe the assertion should fail and this test is no longer useful!
    assert_eq!(load_option.description, EXPECTED_BOOT_DESCRIPTION);
}


================================================
FILE: phd-tests/tests/src/cpuid.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use cpuid_utils::{CpuidIdent, CpuidSet, CpuidValues};
use itertools::Itertools;
use phd_framework::{test_vm::MigrationTimeout, TestVm};
use phd_testcase::*;
use propolis_client::instance_spec::{CpuidEntry, InstanceSpecStatus};
use tracing::info;
use uuid::Uuid;

/// Builds a [`CpuidEntry`] for the given leaf/subleaf from the four register
/// values (`eax`, `ebx`, `ecx`, `edx`) that should be reported for it.
///
/// This is a positional shorthand for the struct literal so that lists of
/// entries in tests stay compact.
fn cpuid_entry(
    leaf: u32,
    subleaf: Option<u32>,
    eax: u32,
    ebx: u32,
    ecx: u32,
    edx: u32,
) -> CpuidEntry {
    CpuidEntry { leaf, subleaf, eax, ebx, ecx, edx }
}

// Checks that CPUID entries supplied when a VM is created are reported back,
// unchanged and in the same order, by the instance spec query.
#[phd_testcase]
async fn cpuid_instance_spec_round_trip_test(ctx: &TestCtx) {
    // These values are nonsense, and the guest is not expected to boot with
    // them. The only thing under test is that the ensure API records the
    // entries and reflects them back out on request.
    let expected = vec![
        cpuid_entry(0, None, 0xaaaa, 0xbbbb, 0xcccc, 0xdddd),
        cpuid_entry(0x8000_0000, None, 0x88aa, 0x88bb, 0x88cc, 0x88dd),
    ];

    let mut config =
        ctx.vm_config_builder("cpuid_instance_spec_round_trip_test");
    config.cpuid(expected.clone());
    let mut vm = ctx.spawn_vm(&config, None).await?;
    vm.launch().await?;

    let InstanceSpecStatus::Present(spec) = vm.get_spec().await?.spec else {
        panic!("instance spec should be present for a running VM");
    };

    let reported =
        spec.board.cpuid.expect("board should have explicit CPUID").entries;
    assert_eq!(reported.len(), expected.len());
    itertools::assert_equal(reported, expected);
}

/// A synthetic brand string that can be injected into guest CPUID leaves
/// 0x8000_0002-0x8000_0004.
///
/// The text is padded with NUL bytes to exactly 48 bytes.
/// `inject_brand_string` consumes it four bytes at a time, producing one
/// little-endian `u32` per register value, and `verify_guest_brand_string`
/// strips the NUL padding again before searching the guest's output.
const BRAND_STRING: &[u8; 48] =
    b"Oxide Cloud Computer Company Cloud Computer\0\0\0\0\0";

/// Injects a fake CPU brand string into CPUID leaves 0x8000_0002-0x8000_0004.
///
/// [`BRAND_STRING`] is consumed four bytes at a time; each chunk is decoded as
/// a little-endian `u32` and written to the next register value of leaf
/// 0x8000_0002, then 0x8000_0003, then 0x8000_0004.
///
/// # Panics
///
/// Panics if the input CPUID set does not include the brand string leaves
/// (that is, if it has no extended base leaf, or that leaf's `eax` is below
/// 0x8000_0004), or if inserting any of the three leaves into the set fails.
fn inject_brand_string(cpuid: &mut CpuidSet) {
    // The brand string leaves have been defined for long enough that they
    // should be present on virtually any host that's modern enough to run
    // Propolis and PHD. Assert (instead of returning a "skipped" result) if
    // they're missing, since that may indicate a latent bug in the
    // `cpuid_utils` crate.
    let ext_leaf_0 = cpuid
        .get(CpuidIdent::leaf(cpuid_utils::bits::EXTENDED_BASE_LEAF))
        .expect("PHD-capable processors should have some extended leaves");

    // Print the reported value in hex so that it can be compared directly
    // with the leaf number named in the message.
    assert!(
        ext_leaf_0.eax >= 0x8000_0004,
        "PHD-capable processors should support at least leaf 0x8000_0004 \
        (reported {:#x})",
        ext_leaf_0.eax
    );

    let chunks = BRAND_STRING.chunks_exact(4);
    let mut ext_leaf_2 = CpuidValues::default();
    let mut ext_leaf_3 = CpuidValues::default();
    let mut ext_leaf_4 = CpuidValues::default();

    // Walk the register values of the three leaves in order so that each
    // four-byte chunk of the brand string lands in the next register.
    let dst = ext_leaf_2
        .iter_mut()
        .chain(ext_leaf_3.iter_mut())
        .chain(ext_leaf_4.iter_mut());

    for (chunk, dst) in chunks.zip(dst) {
        *dst = u32::from_le_bytes(chunk.try_into().unwrap());
    }

    cpuid.insert(CpuidIdent::leaf(0x8000_0002), ext_leaf_2).unwrap();
    cpuid.insert(CpuidIdent::leaf(0x8000_0003), ext_leaf_3).unwrap();
    cpuid.insert(CpuidIdent::leaf(0x8000_0004), ext_leaf_4).unwrap();
}

/// Reads `/proc/cpuinfo` in the guest and asserts that its contents include
/// [`BRAND_STRING`] with the NUL padding removed.
///
/// Returns an error if the shell command itself cannot be run.
async fn verify_guest_brand_string(vm: &TestVm) -> anyhow::Result<()> {
    let cpuinfo = vm.run_shell_command("cat /proc/cpuinfo").await?;
    info!(cpuinfo, "/proc/cpuinfo output");

    // The NUL bytes only pad the string out to its fixed length; they are not
    // part of the text expected in the guest's output.
    let expected_brand =
        std::str::from_utf8(BRAND_STRING).unwrap().trim_matches('\0');
    assert!(cpuinfo.contains(expected_brand));

    Ok(())
}

/// Launches a test VM with a synthetic brand string injected into its CPUID
/// leaves.
async fn launch_cpuid_smoke_test_vm(
    ctx: &TestCtx,
    vm_name: &str,
) -> anyhow::Result<TestVm> {
    let mut host_cpuid = cpuid_utils::host::query_complete(
        cpuid_utils::host::CpuidSource::BhyveDefault,
    )?;

    info!(?host_cpuid, "read bhyve default CPUID");

    inject_brand_string(&mut host_cpuid);

    let mut cfg = ctx.vm_config_builder(vm_name);
    cfg.cpuid(
        host_cpuid
            .iter()
            .map(|(leaf, value)| CpuidEntry {
                leaf: leaf.leaf,
                subleaf: leaf.subleaf,
                eax: value.eax,
                ebx: value.ebx,
                ecx: value.ecx,
                edx: value.edx,
            })
            .collect(),
    );
    let mut vm = ctx.spawn_vm(&cfg, None).await?;
    vm.launch().await?;
    vm.wait_to_boot().await?;

    Ok(vm)
}

// Boots a VM whose CPUID set carries the synthetic brand string and checks
// that the guest's `/proc/cpuinfo` reports that string.
#[phd_testcase]
async fn cpuid_boot_test(ctx: &TestCtx) {
    let vm = launch_cpuid_smoke_test_vm(ctx, "cpuid_boot_test").await?;
    verify_guest_brand_string(&vm).await?;
}

// Checks that a customized CPUID brand string survives live migration: the
// string is verified on the source, the VM is migrated, and the string is
// verified again on the target after a reset.
#[phd_testcase]
async fn cpuid_migrate_smoke_test(ctx: &TestCtx) {
    // Give the VMs names derived from this test rather than reusing the names
    // from `cpuid_boot_test`, so the two tests' VMs can be told apart.
    let vm =
        launch_cpuid_smoke_test_vm(ctx, "cpuid_migrate_smoke_test_source")
            .await?;
    verify_guest_brand_string(&vm).await?;

    // Migrate the VM and make sure the brand string setting persists.
    let mut target = ctx
        .spawn_successor_vm("cpuid_migrate_smoke_test_target", &vm, None)
        .await?;

    target
        .migrate_from(&vm, Uuid::new_v4(), MigrationTimeout::default())
        .await?;

    // Reset the target to force it to reread its CPU information.
    target.reset().await?;
    target.wait_to_boot().await?;
    verify_guest_brand_string(&target).await?;
}

/// A view of the CPU topology that a Linux guest reports through sysfs and
/// procfs.
///
/// Every query runs shell commands in the wrapped VM, so the guest must
/// already be booted to a shell before this type is used.
struct LinuxGuestTopo<'a> {
    vm: &'a TestVm,
}

impl<'a> LinuxGuestTopo<'a> {
    /// Returns the sysfs directory that holds topology data for `vcpu`.
    fn cpu_stem(vcpu: u8) -> String {
        format!("/sys/devices/system/cpu/cpu{vcpu}/topology")
    }

    /// Wraps `vm` after checking that the guest's sysfs CPU layout looks the
    /// way the rest of this type expects.
    ///
    /// # Panics
    ///
    /// Panics if the shell commands cannot be run, if `cpu0` has no
    /// `thread_siblings` entry, or if a `cpu<vCPU count>` directory exists.
    async fn new(vm: &'a TestVm) -> Self {
        let this = Self { vm };
        // Expect Linux numbers CPUs as 0 through vCPU-1 (inclusive).
        //
        // cpu0 should always exist (if it does not, /sys/devices/system is not
        // what we expect), and cpu<vCPU> should not (if it does, again,
        // /sys/devices/system is not what we expect).
        let out = this
            .vm
            .run_shell_command(&format!("ls {}", Self::cpu_stem(0)))
            .await
            .expect("can run ls of a directory that exists");
        assert!(out.contains("thread_siblings"));

        // `ls` is expected to fail here, so its exit status is ignored and the
        // error text is checked instead.
        let out = this
            .vm
            .run_shell_command(&format!(
                "ls {}",
                Self::cpu_stem(this.cpus().await)
            ))
            .ignore_status()
            .await
            .expect("can run ls of a directory that doesn't exist");
        assert!(out.contains("No such file or directory"));

        this
    }

    /// Returns the `vendor_id` value of the first processor listed in the
    /// guest's `/proc/cpuinfo`, with surrounding whitespace removed.
    async fn vendor_string(&self) -> String {
        let command = "cat /proc/cpuinfo | \
            grep vendor_id | \
            head -n 1 | \
            cut -d':' -f 2";
        let out = self
            .vm
            .run_shell_command(command)
            .await
            .expect("can grep vendor_id out of cpuinfo");

        out.trim().to_string()
    }

    /// Returns the number of vCPUs recorded in the VM's instance spec.
    ///
    /// # Panics
    ///
    /// Panics if the spec cannot be fetched or is not present.
    async fn cpus(&self) -> u8 {
        let spec_get_response =
            self.vm.get_spec().await.expect("can get the instance's spec back");
        let InstanceSpecStatus::Present(spec) = spec_get_response.spec else {
            panic!("instance spec should be present for a running VM");
        };

        spec.board.cpus
    }

    /// Returns the `physical_package_id` that the guest reports for each
    /// vCPU, in vCPU order.
    ///
    /// # Panics
    ///
    /// Panics if a value cannot be read or does not parse as a `u32`.
    async fn physical_package_ids(&self) -> impl Iterator<Item = u32> {
        let mut result = Vec::new();
        for cpu_num in 0..self.cpus().await {
            let out = self
                .vm
                .run_shell_command(&format!(
                    "cat {}/physical_package_id",
                    Self::cpu_stem(cpu_num)
                ))
                .await
                .expect("can get cores' physical package ID");
            // Linux' `Documentation/API/stable/sysfs-devices-system-cpu` says
            // this is "integer" but drivers/base/topology.c says it is "%d"
            // specifically.
            //
            // Trim before parsing, as the other accessors do, so stray
            // whitespace around the number cannot cause a spurious failure.
            result.push(
                out.trim().parse::<u32>().expect("physical package id parses"),
            );
        }
        result.into_iter()
    }

    /// Returns an iterator of hexadecimal bitmaps from Linux's determined
    /// thread sharing in the guest VM, one per vCPU in vCPU order.
    ///
    /// Since Linux uses one bit per CPU to show if threads are shared in a
    /// processor core, this would use 65 bits for a 65-vCPU guest, or 256 bits
    /// for a 256-vCPU guest. So use `String` here instead of parsing into a
    /// numeric type so at least this "just works" when we get to larger core
    /// counts.
    ///
    /// Linux separates groups of 32 bits in these masks with commas. The
    /// commas are removed here so that each returned string is a plain run of
    /// hex digits, most significant digit first.
    ///
    /// # Panics
    ///
    /// Panics if a mask cannot be read or contains anything other than hex
    /// digits and group-separating commas.
    async fn thread_siblings(&self) -> impl Iterator<Item = String> {
        let mut result = Vec::new();
        for cpu_num in 0..self.cpus().await {
            let out = self
                .vm
                .run_shell_command(&format!(
                    "cat {}/thread_siblings",
                    Self::cpu_stem(cpu_num)
                ))
                .await
                .expect("can get thread siblings of a core that exists");
            let mask: String =
                out.trim().chars().filter(|c| *c != ',').collect();
            // Linux' `Documentation/API/stable/sysfs-devices-system-cpu` says
            // this is "hexadecimal bitmask." This is kept a string for reasons
            // described above so just validate the characters in it are
            // reasonable.
            assert!(mask.chars().all(|c| c.is_ascii_hexdigit()));
            result.push(mask);
        }
        result.into_iter()
    }
}

// Checks that a Linux guest sees the expected CPU topology: every vCPU in
// socket 0, and vCPUs grouped into successive pairs of sibling threads.
#[phd_testcase]
async fn guest_cpu_topo_test(ctx: &TestCtx) {
    let vm = launch_cpuid_smoke_test_vm(ctx, "guest_cpu_topo_test").await?;

    // The topology-checking is Linux-specific, though it should be appropriate
    // for all Linux distributions. For other OSes, just warn and skip for now.
    if !vm.guest_os_kind().is_linux() {
        phd_skip!(format!(
            "guest topo test does not yet have support for {:?}",
            vm.guest_os_kind()
        ));
    }

    // Between the way we set initial APIC IDs and the way Linux numbers logical
    // processors, a 4-vCPU VM should report a topology like:
    // * core 0: thread 0, thread 1
    // * core 1: thread 2, thread 3
    //
    // over in sysfs, the CPU topology files then represent sibling-ness as a
    // bitfield. So core 0 has two threads (0, 1) and their bits are set in core
    // 0's thread_siblings (0b0000_0011 -> "3").  Core 1 does the same for
    // threads 2 and 3, which yields an expected core 1 thread_siblings of
    // (0b0000_1100 -> "c").
    let guest_topo = LinuxGuestTopo::new(&vm).await;

    // Except that for the time being we exclude leaf B from specialization (see
    // the comment over in `propolis-server/src/lib/initializer.rs`), meaning
    // that on AMD there isn't enough topology information for Linux to
    // determine anything other than "single-thread single-core many-socket".
    // That makes this test Intel-only for the time being.
    if guest_topo.vendor_string().await != "GenuineIntel" {
        phd_skip!("guest topo test is Intel-only for the moment");
    }

    // All cores should be in socket 0
    assert!(guest_topo.physical_package_ids().await.all(|item| item == 0));

    // We currently number CPUs such that Linux numbers them as successive pairs
    // of thread twins.
    let siblings = guest_topo.thread_siblings().await;
    for (pair_idx, mut pair) in siblings.chunks(2).into_iter().enumerate() {
        let lower = pair.next().expect("sibling pair has a pair of cores");
        let upper = pair.next().expect("pairs have even numbers of cores");

        // Each pair of siblings should see that they have the same siblings
        assert_eq!(lower, upper);

        // Pair N covers CPUs 2N and 2N+1. Each hex digit of the mask describes
        // four CPUs, so two pairs share one digit.
        let first_cpu = pair_idx * 2;

        // The mask is printed most significant digit first, so the digit for
        // these CPUs is found by counting from the end of the string.
        let digit_from_lsb = first_cpu / 4;

        // Within that digit the pair occupies either the low two bits
        // (0b0011 => 3) or the high two bits (0b1100 => c).
        let sibling_char = match first_cpu % 4 {
            0 => '3',
            2 => 'c',
            o => {
                unreachable!("even CPU index has remainder {o} modulo four");
            }
        };

        // Without this check, a mask too short to contain the digit would
        // pass the comparison below as long as it was all zeroes.
        assert!(
            digit_from_lsb < lower.len(),
            "thread_siblings mask {lower:?} is too short to describe CPU \
             {first_cpu}"
        );
        assert!(
            lower.chars().rev().enumerate().all(|(i, ch)| {
                if i == digit_from_lsb {
                    ch == sibling_char
                } else {
                    ch == '0'
                }
            }),
            "unexpected thread_siblings mask {lower:?} for CPUs {first_cpu} \
             and {}",
            first_cpu + 1
        );
    }
}


================================================
FILE: phd-tests/tests/src/crucible/migrate.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use phd_framework::test_vm::MigrationTimeout;
use phd_testcase::*;
use tracing::info;
use uuid::Uuid;

// Migration smoke test for a VM with a Crucible-backed boot disk: a file
// created and synced on the source must be visible on the migration target.
#[phd_testcase]
async fn smoke_test(ctx: &TestCtx) {
    let mut cfg = ctx.vm_config_builder("crucible_migrate_smoke_source");
    super::add_default_boot_disk(ctx, &mut cfg)?;
    let mut source = ctx.spawn_vm(&cfg, None).await?;
    let handles = source.cloned_disk_handles();
    let boot_disk = handles[0].as_crucible().unwrap();
    boot_disk.set_generation(1);

    source.launch().await?;
    source.wait_to_boot().await?;

    // The marker file must not exist yet; create it and flush it to disk.
    let listing_before =
        source.run_shell_command("ls foo.bar 2> /dev/null").check_err().await?;
    assert_eq!(listing_before, "");
    source.run_shell_command("touch ./foo.bar").await?;
    source.run_shell_command("sync ./foo.bar").await?;

    // Advance the disk's generation before spawning the target that will use
    // it.
    boot_disk.set_generation(2);
    let mut target = ctx
        .spawn_successor_vm("crucible_migrate_smoke_target", &source, None)
        .await?;

    target
        .migrate_from(&source, Uuid::new_v4(), MigrationTimeout::default())
        .await?;

    let listing_after = target.run_shell_command("ls foo.bar").await?;
    assert_eq!(listing_after, "foo.bar");
}

// Migrates a VM with a Crucible-backed boot disk while the guest is copying a
// file, then checks that the finished copy has the same SHA-256 hash as the
// original.
#[phd_testcase]
async fn load_test(ctx: &TestCtx) {
    let mut cfg = ctx.vm_config_builder("crucible_load_test_source");
    super::add_default_boot_disk(ctx, &mut cfg)?;
    let mut source = ctx.spawn_vm(&cfg, None).await?;
    let handles = source.cloned_disk_handles();
    let boot_disk = handles[0].as_crucible().unwrap();
    boot_disk.set_generation(1);

    source.launch().await?;
    source.wait_to_boot().await?;

    boot_disk.set_generation(2);
    let mut target = ctx
        .spawn_successor_vm("crucible_load_test_target", &source, None)
        .await?;

    // Create some random data.
    let block_count = 10;
    let dd_command =
        format!("dd if=/dev/random of=./rand.txt bs=5M count={block_count}");
    let ddout = source.run_shell_command(&dd_command).await?;
    let expected_records = format!("{block_count}+0 records in");
    assert!(ddout.contains(expected_records.as_str()));

    // Compute the data's hash.
    let sha256sum_out = source.run_shell_command("sha256sum rand.txt").await?;
    let checksum = sha256sum_out.split_whitespace().next().unwrap();
    info!("Generated SHA256 checksum: {}", checksum);

    // Start copying the generated file into a second file, then start a
    // migration while that copy is in progress.
    source.run_shell_command("dd if=./rand.txt of=./rand_new.txt &").await?;
    target
        .migrate_from(&source, Uuid::new_v4(), MigrationTimeout::default())
        .await?;

    // Wait for the background command to finish running, then compute the
    // hash of the copied file. If all went well this will match the hash of
    // the source file.
    target.run_shell_command("wait $!").await?;
    let sha256sum_target =
        target.run_shell_command("sha256sum rand_new.txt").await?;
    let checksum_target = sha256sum_target.split_whitespace().next().unwrap();
    assert_eq!(checksum, checksum_target);
}


================================================
FILE: phd-tests/tests/src/crucible/mod.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use phd_testcase::{
    phd_framework::{
        disk::BlockSize,
        test_vm::{DiskBackend, DiskInterface, VmConfig},
    },
    *,
};

mod migrate;
mod smoke;

/// Adds a Crucible-backed boot disk built from `artifact` to `config`, or
/// skips the calling test when Crucible backends are not enabled.
///
/// The disk is attached through `interface` at `pci_slot`; the Crucible
/// backend is configured with `min_disk_size_gib` and `block_size`.
fn add_crucible_boot_disk_or_skip<'a>(
    ctx: &TestCtx,
    config: &mut VmConfig<'a>,
    artifact: &'a str,
    interface: DiskInterface,
    pci_slot: u8,
    min_disk_size_gib: u64,
    block_size: BlockSize,
) -> phd_testcase::Result<()> {
    if !ctx.crucible_enabled() {
        phd_skip!("Crucible backends not enabled (no downstairs path)");
    }

    let backend = DiskBackend::Crucible { min_disk_size_gib, block_size };
    config.boot_disk(artifact, interface, backend, pci_slot);

    Ok(())
}

/// Adds the default Crucible boot disk used by the tests in this module: the
/// default guest OS artifact on an NVMe interface with 512-byte blocks.
///
/// Skips the calling test when Crucible backends are not enabled.
fn add_default_boot_disk<'a>(
    ctx: &'a TestCtx,
    config: &mut VmConfig<'a>,
) -> phd_testcase::Result<()> {
    // PCI slot at which the boot disk is attached.
    const PCI_SLOT: u8 = 4;
    // Minimum size of the Crucible disk, in GiB.
    const MIN_DISK_SIZE_GIB: u64 = 10;

    let artifact = ctx.default_guest_os_artifact();
    add_crucible_boot_disk_or_skip(
        ctx,
        config,
        artifact,
        DiskInterface::Nvme,
        PCI_SLOT,
        MIN_DISK_SIZE_GIB,
        BlockSize::Bytes512,
    )
}


================================================
FILE: phd-tests/tests/src/crucible/smoke.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::time::Duration;

use phd_framework::{
    disk::{BlockSize, DiskSource},
    test_vm::{DiskBackend, DiskInterface},
};
use phd_testcase::*;
use propolis_client::types::InstanceState;

// Checks that a VM with the default Crucible-backed boot disk launches and
// boots. `add_default_boot_disk` skips the test when Crucible backends are
// not enabled.
#[phd_testcase]
async fn boot_test(ctx: &TestCtx) {
    let mut config = ctx.vm_config_builder("crucible_boot_test");
    super::add_default_boot_disk(ctx, &mut config)?;
    let mut vm = ctx.spawn_vm(&config, None).await?;
    vm.launch().await?;
    vm.wait_to_boot().await?;
}

// Checks that a VM with the default Crucible-backed boot disk boots again
// after being reset through the VM's `reset` call.
#[phd_testcase]
async fn api_reboot_test(ctx: &TestCtx) {
    // Use a name specific to this test; the previous name was shared with
    // `guest_reboot_test`, which made the two tests' VMs indistinguishable.
    let mut config = ctx.vm_config_builder("crucible_api_reboot_test");
    super::add_default_boot_disk(ctx, &mut config)?;

    let mut vm = ctx.spawn_vm(&config, None).await?;
    vm.launch().await?;
    vm.wait_to_boot().await?;
    vm.reset().await?;
    vm.wait_to_boot().await?;
}

// Checks that a VM with the default Crucible-backed boot disk can be rebooted
// with `graceful_reboot` after it has booted once. This differs from
// `api_reboot_test`, which restarts the VM with `reset`.
#[phd_testcase]
async fn guest_reboot_test(ctx: &TestCtx) {
    let mut config = ctx.vm_config_builder("crucible_guest_reboot_test");
    super::add_default_boot_disk(ctx, &mut config)?;

    let mut vm = ctx.spawn_vm(&config, None).await?;
    vm.launch().await?;
    vm.wait_to_boot().await?;

    vm.graceful_reboot().await?;
}

// Checks that data written to a Crucible-backed boot disk survives a VM
// shutdown: a file is created and synced, the VM is stopped, and a successor
// VM using the same disk must still see the file.
#[phd_testcase]
async fn shutdown_persistence_test(ctx: &TestCtx) {
    let mut config =
        ctx.vm_config_builder("crucible_shutdown_persistence_test");
    super::add_default_boot_disk(ctx, &mut config)?;
    let mut vm = ctx.spawn_vm(&config, None).await?;
    if vm.guest_os_has_read_only_fs() {
        // The trailing backslash joins the two source lines into a single
        // line of text; without it the message would contain a newline and
        // the indentation of the second line.
        phd_skip!(
            "Can't run data persistence test on a guest with a read-only file \
             system"
        );
    }

    let disk_handles = vm.cloned_disk_handles();
    let disk = disk_handles[0].as_crucible().unwrap();
    disk.set_generation(1);
    vm.launch().await?;
    vm.wait_to_boot().await?;

    // Verify that the test file doesn't exist yet, then touch it, flush it, and
    // shut down the VM.
    let lsout =
        vm.run_shell_command("ls foo.bar 2> /dev/null").check_err().await?;
    assert_eq!(lsout, "");
    vm.run_shell_command("touch ./foo.bar").await?;
    vm.run_shell_command("sync ./foo.bar").await?;
    vm.stop().await?;
    vm.wait_for_state(InstanceState::Destroyed, Duration::from_secs(60))
        .await?;

    // Increment the disk's generation before attaching it to a new VM.
    disk.set_generation(2);
    let mut vm = ctx
        .spawn_successor_vm("crucible_shutdown_persistence_test_2", &vm, None)
        .await?;

    vm.launch().await?;
    vm.wait_to_boot().await?;

    // The touched file from the previous VM should be present in the new one.
    let lsout = vm.run_shell_command("ls foo.bar 2> /dev/null").await?;
    assert_eq!(lsout, "foo.bar");
}

// Checks that a Crucible VCR replacement is accepted while a VM is stuck
// starting (because a disk has an invalid downstairs address), that the
// replacement lets the VM boot, and that further replacements are accepted
// once it is running.
#[phd_testcase]
async fn vcr_replace_during_start_test(ctx: &TestCtx) {
    const DATA_DISK_NAME: &str = "vcr-replacement-target";

    if !ctx.crucible_enabled() {
        phd_skip!("Crucible backends not enabled (no downstairs path)");
    }

    let mut config =
        ctx.vm_config_builder("crucible_vcr_replace_during_start_test");

    // Create a blank data disk on which to perform VCR replacement. This is
    // necessary because Crucible doesn't permit VCR replacements for volumes
    // whose read-only parents are local files (which is true for artifact-based
    // Crucible disks).
    let data_disk_backend = DiskBackend::Crucible {
        min_disk_size_gib: 1,
        block_size: BlockSize::Bytes512,
    };
    config.data_disk(
        DATA_DISK_NAME,
        DiskSource::Blank(1024 * 1024 * 1024),
        DiskInterface::Nvme,
        data_disk_backend,
        5,
    );

    // Configure the disk so that when the VM starts, it will have an invalid
    // downstairs address.
    let spec = config.vm_spec(ctx).await?;
    let data_disk_handle =
        spec.get_disk_by_device_name(DATA_DISK_NAME).cloned().unwrap();
    let data_disk = data_disk_handle.as_crucible().unwrap();
    data_disk.enable_vcr_black_hole();

    // Try to start the VM, but don't wait for it to boot; it should get stuck
    // while activating using an invalid downstairs address.
    let mut vm = ctx.spawn_vm_with_spec(spec, None).await?;
    vm.launch().await?;

    // The VM is expected not to reach the Running state. Unfortunately, there's
    // no great way to test that this is never going to happen; as a best-effort
    // alternative, wait for a short while and assert that the VM doesn't reach
    // Running in the timeout interval.
    let short_wait = Duration::from_secs(5);
    vm.wait_for_state(InstanceState::Running, short_wait).await.unwrap_err();

    // Fix the disk's downstairs address and send a replacement request. This
    // should be processed and should allow the VM to boot.
    data_disk.disable_vcr_black_hole();
    data_disk.set_generation(2);
    vm.replace_crucible_vcr(data_disk).await?;
    vm.wait_to_boot().await?;

    assert_eq!(vm.get().await?.instance.state, InstanceState::Running);

    // VCR replacements should continue to be accepted now that the instance is
    // running.
    data_disk.set_generation(3);
    vm.replace_crucible_vcr(data_disk).await?;
}


================================================
FILE: phd-tests/tests/src/disk.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use phd_framework::{
    disk::{fat::FatFilesystem, DiskSource},
    guest_os::GuestOsKind,
    test_vm::{DiskBackend, DiskInterface, MigrationTimeout},
    TestVm,
};
use phd_testcase::*;
use uuid::Uuid;

/// Creates a VM with an in-memory disk backed by the supplied `data`, waits for
/// it to boot, and issues some shell commands to find the path of the guest
/// device that corresponds to that disk.
///
/// `readonly` is passed through to the in-memory backend. On Windows guests,
/// a writable disk additionally has its read-only attribute cleared with
/// `diskpart`.
///
/// Returns a tuple containing the created VM and the path to the guest disk
/// device representing the in-memory disk.
///
/// # Panics
///
/// Panics on a Windows guest if the disk's drive number is 26 or greater.
async fn launch_vm_and_find_in_memory_disk(
    ctx: &TestCtx,
    vm_name: &str,
    data: DiskSource<'_>,
    readonly: bool,
) -> anyhow::Result<(TestVm, String)> {
    let mut cfg = ctx.vm_config_builder(vm_name);
    // NOTE(review): the trailing `24` is presumably the disk's PCI slot; it
    // matches the `00:18.0` (0x18 == 24) device in the sysfs path used for
    // non-Windows guests below. Confirm against `data_disk`'s signature.
    cfg.data_disk(
        "data-disk-0",
        data,
        DiskInterface::Virtio,
        DiskBackend::InMemory { readonly },
        24,
    );
    let mut vm = ctx.spawn_vm(&cfg, None).await?;
    vm.launch().await?;
    vm.wait_to_boot().await?;

    let device_path = if let Some(vm) = vm.get_windows_vm() {
        // Cygwin documents that \Device\HardDisk devices in the NT device
        // namespace map to /dev/sd devices in the emulated POSIX namespace:
        // disk 0 is /dev/sda, disk 1 is /dev/sdb, and so on. Get the NT device
        // number of the attached in-memory disk.
        let cmd = "(Get-PhysicalDisk | Where {$_.BusType -ne 'NVMe'}).DeviceId";
        let num = vm.run_powershell_command(cmd).await?.parse::<u8>()?;

        // If the test requires the disk to be writable, run diskpart to ensure
        // that its readonly attribute is cleared.
        if !readonly {
            // Build the diskpart script one line at a time in the guest, then
            // run it.
            vm.run_shell_command(&format!(
                "echo 'select disk {num}' >> diskpart.txt"
            ))
            .await?;
            vm.run_shell_command(
                "echo 'attributes disk clear readonly' >> diskpart.txt",
            )
            .await?;
            vm.run_shell_command("diskpart /s diskpart.txt").await?;
        }

        // Crudely map from the drive number to the appropriate letter suffix.
        // Cygwin supports more than 26 drives (up to /dev/sddx), but the data
        // disk shouldn't map into that range unless Windows does something
        // unexpected with its drive number assignments.
        assert!(
            num < 26,
            "physical drive number must be less than 26 to map to a Cygwin dev"
        );

        // The assertion above guarantees that `b'a' + num` stays within
        // `a`..=`z`.
        format!("/dev/sd{}", (b'a' + num) as char)
    } else {
        // On other guests, the block device name is whatever appears under
        // the virtio device's `block` directory in sysfs.
        let ls = vm
            .run_shell_command(
                "ls /sys/devices/pci0000:00/0000:00:18.0/virtio0/block",
            )
            .await?;

        format!("/dev/{ls}")
    };

    Ok((vm, device_path))
}

/// Mounts the in-memory disk at `device_path` on `/phd` in the guest.
///
/// A read-only disk is mounted directly with the `ro` option. A writable disk
/// is first given a `vfat` entry in `/etc/fstab` and then mounted with
/// `mount -a`. Either way the mount command is expected to print nothing.
///
/// Skips the calling test on Windows guests.
async fn mount_in_memory_disk(
    vm: &mut TestVm,
    device_path: &str,
    readonly: bool,
) -> anyhow::Result<()> {
    if vm.guest_os_kind().is_windows() {
        phd_skip!(
            "in-memory disk tests use mount options not supported by Cygwin"
        );
    }

    vm.run_shell_command("mkdir /phd").await?;

    let mount_command = if readonly {
        // Add the `ro` qualifier so that the mount command doesn't complain
        // about being unable to mount the disk for writing.
        format!("mount -o ro {device_path} /phd")
    } else {
        let add_fstab_entry = format!(
            "echo '{device_path} /phd vfat defaults 0 2' >> /etc/fstab"
        );
        vm.run_shell_command(&add_fstab_entry).await?;

        String::from("mount -a")
    };

    let mount_output = vm.run_shell_command(&mount_command).await?;
    assert_eq!(mount_output, "");

    Ok(())
}

// Checks that a read-only in-memory disk holding a FAT filesystem can be
// mounted in the guest and that a file placed in it has the expected name
// and contents.
#[phd_testcase]
async fn in_memory_backend_smoke_test(ctx: &TestCtx) {
    // Skip before spawning a VM; `mount_in_memory_disk` would skip on Windows
    // anyway, but only after the guest had been booted.
    if ctx.default_guest_os_kind().await?.is_windows() {
        phd_skip!(
            "in-memory disk tests use mount options not supported by Cygwin"
        );
    }

    const HELLO_MSG: &str = "hello oxide!";

    let readonly = true;
    let mut data = FatFilesystem::new();
    data.add_file_from_str("hello_oxide.txt", HELLO_MSG)?;
    let (mut vm, device_path) = launch_vm_and_find_in_memory_disk(
        ctx,
        "in_memory_backend_test",
        DiskSource::FatFilesystem(data),
        readonly,
    )
    .await?;

    mount_in_memory_disk(&mut vm, &device_path, readonly).await?;

    // The file should be there and have the expected contents.
    let ls = vm.run_shell_command("ls /phd").await?;
    assert_eq!(ls, "hello_oxide.txt");

    let cat = vm.run_shell_command("cat /phd/hello_oxide.txt").await?;
    assert_eq!(cat, HELLO_MSG);
}

// Checks that the contents of a writable in-memory disk are carried across a
// live migration: data written to the disk on the source must read back
// identically on the target.
#[phd_testcase]
async fn in_memory_backend_migration_test(ctx: &TestCtx) {
    // A blank disk is fine for this test: the rest of the test will address the
    // disk device directly instead of assuming it has a file system. This works
    // around #824 for Windows guests (which may not recognize the FAT
    // filesystems PHD produces).
    let (vm, device_path) = launch_vm_and_find_in_memory_disk(
        ctx,
        "in_memory_backend_migration_test_source",
        DiskSource::Blank(16 * 1024),
        false,
    )
    .await?;

    // Scribble random data into the first kilobyte of the data disk, passing
    // the appropriate flags to ensure that the guest actually writes the data
    // to the disk (instead of just holding it in memory).
    let force_sync = match vm.guest_os_kind() {
        GuestOsKind::Alpine => "conv=sync",
        _ => "oflag=sync",
    };
    let scribble = format!(
        "dd if=/dev/random of={device_path} {force_sync} bs=1K count=1"
    );
    vm.run_shell_command(&scribble).await?;

    // Read the scribbled data out to a file on the main OS disk.
    let save_before =
        format!("dd if={device_path} of=/tmp/before iflag=direct bs=1K");
    vm.run_shell_command(&save_before).await?;

    // Migrate the VM.
    let target_name = "in_memory_backend_migration_test_target";
    let mut target = ctx.spawn_successor_vm(target_name, &vm, None).await?;
    target
        .migrate_from(&vm, Uuid::new_v4(), MigrationTimeout::default())
        .await?;

    // Read the scribbled data back from the disk. On most guests, adding
    // `iflag=direct` to the `dd` invocation is sufficient to bypass the guest's
    // caches and read from the underlying disk. Alpine guests appear also to
    // need a procfs poke to drop page caches before they'll read from the disk.
    if matches!(vm.guest_os_kind(), GuestOsKind::Alpine) {
        target.run_shell_command("sync").await?;
        target.run_shell_command("echo 3 > /proc/sys/vm/drop_caches").await?;
    }

    let save_after =
        format!("dd if={device_path} of=/tmp/after iflag=direct bs=1K");
    target.run_shell_command(&save_after).await?;

    // The data that was scribbled before migrating should match what was read
    // back from the disk. If it doesn't, migration restored the original
    // (blank) disk contents, which is incorrect.
    let diff_output = target
        .run_shell_command("diff --report-identical /tmp/before /tmp/after")
        .await?;

    assert_eq!(diff_output, "Files /tmp/before and /tmp/after are identical");
}


================================================
FILE: phd-tests/tests/src/framework.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Tests that primarily exercise the PHD framework itself.

use phd_framework::guest_os::GuestOsKind;
use phd_testcase::*;

// Checks that the framework can run a shell command that spans several lines
// and still collect its output.
#[phd_testcase]
async fn multiline_serial_test(ctx: &TestCtx) {
    let mut vm = ctx.spawn_default_vm("multiline_test").await?;
    vm.launch().await?;
    vm.wait_to_boot().await?;

    // The command sent to the guest is `echo \`, `hello \`, `world` on three
    // separate lines: each `\\` is a literal backslash and each `\n` a real
    // newline, i.e. shell line continuations.
    let out = vm.run_shell_command("echo \\\nhello \\\nworld").await?;
    assert_eq!(out, "hello world");
}

// Checks that a long single-line command can be sent over the serial console
// and that its output comes back intact. Skipped on guest OSes that don't
// support long serial lines.
#[phd_testcase]
async fn long_line_serial_test(ctx: &TestCtx) {
    let os = ctx.default_guest_os_kind().await?;
    let long_lines_unsupported = matches!(
        os,
        GuestOsKind::WindowsServer2016 | GuestOsKind::WindowsServer2019
    );
    if long_lines_unsupported {
        phd_skip!(format!(
            "long serial lines not supported for guest OS {os:?}"
        ));
    }

    let mut vm = ctx.spawn_default_vm("long_line_serial_test").await?;
    vm.launch().await?;
    vm.wait_to_boot().await?;

    let long_str = "In my younger and more vulnerable years my father gave \
    me some advice that I've been turning over in my mind ever since. \
    \"Whenever you feel like sending a long serial console line,\" he told me, \
    \"just remember that all the guest OSes in this world haven't had the tty \
    settings you've had.\"";

    // Fitzgerald didn't have to deal with nested Bash quotes, but this test
    // does. Replace apostrophes in the input string with a string-terminating
    // `'`, followed by an escaped single quote that serves as the apostrophe,
    // followed by a string-opening `'`.
    let quoted = long_str.replace("'", "'\\''");
    let command = format!("echo '{}'", quoted);

    let out = vm.run_shell_command(&command).await?;
    assert_eq!(out, long_str);
}


================================================
FILE: phd-tests/tests/src/hw.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use phd_framework::lifecycle::Action;
use phd_testcase::*;

// Checks that the guest's `lspci` and `lshw` output is the same before and
// after a stop-and-start of the VM.
#[phd_testcase]
async fn lspci_lifecycle_test(ctx: &TestCtx) {
    const LSPCI: &str = "sudo lspci -vvx";
    const LSHW: &str = "sudo lshw -notime";

    let config = ctx.vm_config_builder("lspci_lifecycle_test");
    let mut vm = ctx.spawn_vm(&config, None).await?;

    vm.launch().await?;
    vm.wait_to_boot().await?;

    // XXX: ideally these commands would not need `ignore_status()`, but they
    // fail for any number of reasons on different guests:
    // * sudo may not exist (some Alpine)
    // * lshw may not exist (Debian)
    // * we may not input a sudo password (Ubuntu)
    //
    // see also: https://github.com/oxidecomputer/propolis/issues/792

    let baseline_lspci = vm.run_shell_command(LSPCI).ignore_status().await?;
    let baseline_lshw = vm.run_shell_command(LSHW).ignore_status().await?;

    ctx.lifecycle_test(vm, &[Action::StopAndStart], move |restarted| {
        // Each invocation of the check gets its own copy of the baselines.
        let expected_lspci = baseline_lspci.clone();
        let expected_lshw = baseline_lshw.clone();
        Box::pin(async move {
            let new_lspci = restarted
                .run_shell_command(LSPCI)
                .ignore_status()
                .await
                .unwrap();
            assert_eq!(new_lspci, expected_lspci);

            let new_lshw = restarted
                .run_shell_command(LSHW)
                .ignore_status()
                .await
                .unwrap();
            assert_eq!(new_lshw, expected_lshw);
        })
    })
    .await?;
}


================================================
FILE: phd-tests/tests/src/hyperv.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::time::{Duration, Instant};

use phd_framework::{
    artifacts, lifecycle::Action, test_vm::MigrationTimeout, TestVm,
};
use phd_testcase::*;
use propolis_client::instance_spec::{
    GuestHypervisorInterface, HyperVFeatureFlag,
};
use tracing::{info, warn};
use uuid::Uuid;

/// Makes a best-effort check that the guest noticed it is running on a
/// Hyper-V-compatible hypervisor. Not every PHD guest image ships a tool that
/// reports the hypervisor vendor, so the check is only performed on Linux.
///
/// NOTE: On guests where the vendor can't be queried, this routine only logs a
/// warning and still returns `Ok`; it does not produce a "Skipped" result. That
/// lets the smoke tests report a Pass for having exercised VM startup and
/// shutdown with Hyper-V emulation enabled.
async fn guest_detect_hyperv(vm: &TestVm) -> anyhow::Result<()> {
    if vm.guest_os_kind().is_linux() {
        // `systemd-detect-virt` would be the obvious tool for this, but Alpine
        // doesn't include it out of the box. To keep Hyper-V covered in
        // typical CI runs on a standard Alpine image, fall back to a cruder
        // but dependable signal: the relevant line in dmesg. That line should
        // be present on any Linux newer than roughly May 2010 (>2.6.34 or so).
        // If it's missing, then either Hyper-V really wasn't detected, the
        // kernel is very old, or the kernel is new and the dmesg text changed.
        const HV_TEXT: &str = "Hypervisor detected: Microsoft Hyper-V";

        // Alpine has no "sudo", so none is used here; Linux test commands are
        // expected to run as root anyway.
        let probe = format!("dmesg | grep \"{HV_TEXT}\"");
        vm.run_shell_command(&probe).await?;
        return Ok(());
    }

    if vm.guest_os_kind().is_windows() {
        // Windows readily indicates when it is running in a Hyper-V *root
        // partition*, but gives no clear indication that it has detected a
        // Hyper-V host when running as a non-root guest. (It is possible to
        // tell that Windows is running as a guest, but not which hypervisor
        // type it detected.)
        warn!("running on Windows, can't verify it detected Hyper-V support");
    } else {
        warn!("unknown guest type, can't verify it detected Hyper-V support");
    }

    Ok(())
}

// Boots a VM with the Hyper-V guest interface enabled (default feature set)
// and, where possible, checks that the guest detected it.
#[phd_testcase]
async fn hyperv_smoke_test(ctx: &TestCtx) {
    let hyperv =
        GuestHypervisorInterface::HyperV { features: Default::default() };

    let mut config = ctx.vm_config_builder("hyperv_smoke_test");
    config.guest_hv_interface(hyperv);

    // For reasons that remain a mystery, Alpine (3.16, kernel 5.15.41-0-virt)
    // appears to drop some early dmesg lines when booted with fewer than four
    // vCPUs. One of those early lines is the `Hypervisor detected: Microsoft
    // Hyper-V` message this test looks for to confirm that Linux knows a
    // Hyper-V-like hypervisor is present.
    //
    // It would be nice to work out why the vCPU count affects dmesg (the rest
    // of the log is identical and the ring buffer doesn't seem to be full),
    // but the time to do so can't really be justified!
    config.cpus(4);

    let mut guest = ctx.spawn_vm(&config, None).await?;
    guest.launch().await?;
    guest.wait_to_boot().await?;

    guest_detect_hyperv(&guest).await?;
}

// Checks that a guest still detects Hyper-V after a reset and after a
// migration to the default Propolis artifact.
#[phd_testcase]
async fn hyperv_lifecycle_test(ctx: &TestCtx) {
    let hyperv =
        GuestHypervisorInterface::HyperV { features: Default::default() };

    let mut config = ctx.vm_config_builder("hyperv_lifecycle_test");
    config.guest_hv_interface(hyperv);
    // Spooky load-bearing vCPU count to preserve dmesg log lines. See the
    // comment in `hyperv_smoke_test`.
    config.cpus(4);

    let mut guest = ctx.spawn_vm(&config, None).await?;
    guest.launch().await?;
    guest.wait_to_boot().await?;

    let actions = [
        Action::Reset,
        Action::MigrateToPropolis(artifacts::DEFAULT_PROPOLIS_ARTIFACT),
    ];

    ctx.lifecycle_test(guest, &actions, |target: &TestVm| {
        Box::pin(async {
            guest_detect_hyperv(target).await.unwrap();
        })
    })
    .await?;
}

// Checks that a guest offered the Hyper-V reference TSC enlightenment selects
// `hyperv_clocksource_tsc_page` as its clocksource, and that this selection
// survives a reset and a migration. Guests that don't expose the clocksource
// through sysfs still go through the lifecycle actions, but the test reports
// a Skipped result for them.
#[phd_testcase]
async fn hyperv_reference_tsc_clocksource_test(ctx: &TestCtx) {
    // Shell command that prints the guest's currently selected clocksource.
    const QUERY_CLOCKSOURCE: &str =
        "cat /sys/devices/system/clocksource/clocksource0\
        /current_clocksource";

    let mut cfg = ctx.vm_config_builder("hyperv_reference_tsc_test");
    cfg.guest_hv_interface(GuestHypervisorInterface::HyperV {
        features: [HyperVFeatureFlag::ReferenceTsc].into_iter().collect(),
    });
    let mut vm = ctx.spawn_vm(&cfg, None).await?;
    vm.launch().await?;
    vm.wait_to_boot().await?;

    // `cat` exits non-zero when the sysfs node doesn't exist. Ignore the exit
    // status here so that this case reaches the "No such file or directory"
    // check below (and ultimately the skip) instead of failing the test
    // outright.
    let clocksource =
        vm.run_shell_command(QUERY_CLOCKSOURCE).ignore_status().await?;

    let check_clocksource = !clocksource.ends_with("No such file or directory");
    if check_clocksource {
        assert_eq!(clocksource, "hyperv_clocksource_tsc_page");
    }

    // Migrate to a new VM and make sure the clocksource is kept intact. If
    // clocksource queries aren't supported for this guest, poke the serial
    // console anyway just to make sure the guest remains operable.
    ctx.lifecycle_test(
        vm,
        &[
            Action::Reset,
            Action::MigrateToPropolis(artifacts::DEFAULT_PROPOLIS_ARTIFACT),
        ],
        |target: &TestVm| {
            Box::pin(async move {
                if check_clocksource {
                    // The query succeeded before the lifecycle action, so it
                    // is expected to succeed (exit zero) afterwards as well.
                    let clocksource = target
                        .run_shell_command(QUERY_CLOCKSOURCE)
                        .await
                        .unwrap();

                    assert_eq!(clocksource, "hyperv_clocksource_tsc_page");
                } else {
                    target.run_shell_command("").await.unwrap();
                }
            })
        },
    )
    .await?;

    // Only report a Passed result for this test if it actually managed to query
    // the clocksource. Note that if the clocksource can't be queried, but the
    // guest stops responding during the foregoing lifecycle test, the test will
    // fail (and report that result accordingly) before reaching this point.
    if !check_clocksource {
        phd_skip!("guest doesn't support querying clocksource through sysfs");
    }
}

// Checks that, with the Hyper-V reference TSC enabled, guest time advances at
// roughly the same rate as host time, both on the original VM and on the
// target VM after a live migration. Skipped on Windows guests, which have no
// `/proc/timer_list`.
#[phd_testcase]
async fn hyperv_reference_tsc_elapsed_time_test(ctx: &TestCtx) {
    /// A guest timestamp paired with the host time at which it was obtained.
    #[derive(Debug)]
    struct Reading {
        /// Host time captured immediately after the guest reading came back.
        taken_at: Instant,
        /// The guest's "now at" value from `/proc/timer_list`.
        guest_ns: u64,
    }

    impl Reading {
        /// Reads the guest's current time over the serial console and pairs it
        /// with a host timestamp.
        ///
        /// Returns an error if the shell command fails or its output doesn't
        /// parse as a `u64`.
        async fn take_from(vm: &TestVm) -> anyhow::Result<Self> {
            let cmd =
                "cat /proc/timer_list | grep \"now at\" | awk '{ print $3 }'";
            let guest_ns = vm.run_shell_command(cmd).await?.parse::<u64>()?;

            // Ideally, the guest and host readings would be taken
            // simultaneously, but in practice getting a guest timestamp itself
            // requires some work that itself takes time:
            //
            // 1. The framework needs to type the command into the guest
            // 2. The guest itself needs to run the command and print the
            //    result
            // 3. The framework needs to recognize a new command prompt, split
            //    off the result, and return it to the test case
            //
            // Snapshotting the host time here makes a bet that (3) is less
            // expensive than (1) and (2). This seems reasonable given that
            // executing a shell command involves both sending bytes to the
            // guest and waiting for them to be echoed, while waiting for the
            // result of an already-executed command just involves waiting for
            // the guest to print another prompt.
            let taken_at = Instant::now();
            Ok(Self { taken_at, guest_ns })
        }

        /// Compares `self` with an earlier reading, `other`, and returns the
        /// absolute difference between the measured elapsed time on the host
        /// and the measured elapsed time on the guest, expressed as a
        /// fraction of the measured elapsed time on the host (so 0.01 means
        /// a 1% difference).
        ///
        /// Panics if either elapsed time doesn't fit in an `i64`.
        fn compare_with_earlier(&self, other: &Reading) -> f64 {
            let host_delta_ns =
                i64::try_from((self.taken_at - other.taken_at).as_nanos())
                    .expect("host delta is small enough to fit in an i64");

            let guest_delta_ns = i64::try_from(self.guest_ns - other.guest_ns)
                .expect("guest delta is small enough to fit in an i64");

            let diff = (host_delta_ns - guest_delta_ns).unsigned_abs();
            let diff_pct = (diff as f64) / (host_delta_ns as f64);

            info!(
                before = ?other,
                after = ?self,
                host_delta_ns,
                guest_delta_ns,
                diff,
                diff_pct,
                "compared time readings"
            );

            diff_pct
        }
    }

    /// Checks that time is advancing at roughly the correct rate in the guest.
    /// This is done by taking several host and guest time readings, comparing
    /// elapsed time in the host to elapsed time in the guest, and declaring a
    /// "good" result if the relative difference between them is within some
    /// tolerance. The check passes if the number of good results exceeds the
    /// number of bad results.
    async fn check_tsc(vm: &TestVm) -> anyhow::Result<()> {
        // The amount of error that can be tolerated in the guest's elapsed
        // time reading, expressed as a fraction of elapsed time on the host.
        //
        // If the reference TSC is working properly, host and guest time should
        // be very closely synchronized. However, because there is no way to
        // capture host and guest timestamps atomically, it will always appear
        // that more time has advanced in one domain than the other. The time
        // snapshotting logic tries to minimize this delta, but some error is
        // still expected, so a tolerance value is required.
        //
        // A 2.5% tolerance is *extremely* generous, but is necessary to keep
        // this test from flaking in CI runs. Generous as it is, this tolerance
        // value is still enough to catch egregious errors in computing TSC
        // scaling factors: shifting the scaling factor by the wrong number of
        // bits, for example, is liable to produce a much larger error than
        // this.
        const TOLERANCE: f64 = 0.025;

        // Take six readings to get five comparisons of consecutive readings.
        const NUM_READINGS: usize = 6;

        let mut readings = vec![];
        let mut good_diffs = 0;
        let mut bad_diffs = 0;

        for _ in 0..NUM_READINGS {
            readings.push(Reading::take_from(vm).await?);
            tokio::time::sleep(Duration::from_millis(500)).await;
        }

        for window in readings.as_slice().windows(2) {
            let first = &window[0];
            let second = &window[1];
            let diff_pct = second.compare_with_earlier(first);
            if diff_pct < TOLERANCE {
                good_diffs += 1;
            } else {
                bad_diffs += 1;
            }
        }

        assert!(
            bad_diffs < good_diffs,
            "more out-of-tolerance time diffs ({bad_diffs}) than in-tolerance \
            diffs ({good_diffs}); see test log for details",
        );

        info!(good_diffs, bad_diffs, "TSC test results");

        Ok(())
    }

    if ctx.default_guest_os_kind().await?.is_windows() {
        phd_skip!("test requires a guest with /proc/timer_list in procfs");
    }

    let mut cfg = ctx.vm_config_builder("hyperv_reference_tsc_elapsed_test");
    cfg.guest_hv_interface(GuestHypervisorInterface::HyperV {
        features: [HyperVFeatureFlag::ReferenceTsc].into_iter().collect(),
    });
    let mut vm = ctx.spawn_vm(&cfg, None).await?;
    vm.launch().await?;
    vm.wait_to_boot().await?;

    // Baseline: time must advance correctly on the original VM.
    check_tsc(&vm).await?;

    let mut target = ctx
        .spawn_successor_vm("hyperv_reference_tsc_elapsed_target", &vm, None)
        .await?;

    target
        .migrate_from(&vm, Uuid::new_v4(), MigrationTimeout::default())
        .await?;

    // The guest now runs on `target`, so that is the VM to re-check; `vm` has
    // been migrated out.
    check_tsc(&target).await?;
}


================================================
FILE: phd-tests/tests/src/lib.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

pub use phd_testcase;

mod boot_order;
mod cpuid;
mod crucible;
mod disk;
mod framework;
mod hw;
mod hyperv;
mod migrate;
mod server_state_machine;
mod smoke;
mod stats;
mod vsock;


================================================
FILE: phd-tests/tests/src/migrate.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::time::Duration;

use phd_framework::{
    artifacts, lifecycle::Action, test_vm::MigrationTimeout, TestVm,
};
use phd_testcase::*;
use propolis_client::types::{InstanceState, MigrationState};
use tracing::info;
use uuid::Uuid;

// Runs the migration smoke test against a default VM.
#[phd_testcase]
async fn smoke_test(ctx: &TestCtx) {
    let source = ctx.spawn_default_vm("migration_smoke").await?;
    run_smoke_test(ctx, source).await?;
}

// Runs the serial console history migration test against a default VM.
#[phd_testcase]
async fn serial_history(ctx: &TestCtx) {
    let source = ctx.spawn_default_vm("migration_serial_history").await?;
    run_serial_history_test(ctx, source).await?;
}

/// Tests that begin on a "migration base" Propolis revision (e.g. the latest
/// commit to the `master` git branch) and migrate to the revision under test.
mod from_base {
    use super::*;

    #[phd_testcase]
    async fn can_migrate_from_base(ctx: &TestCtx) {
        let base_vm = spawn_base_vm(ctx, "migration_from_base").await?;
        run_smoke_test(ctx, base_vm).await?;
    }

    #[phd_testcase]
    async fn serial_history(ctx: &TestCtx) {
        let base_vm =
            spawn_base_vm(ctx, "migration_serial_history_base").await?;
        run_serial_history_test(ctx, base_vm).await?;
    }

    // Migrates from the "migration base" Propolis artifact to the Propolis
    // version under test, then back to "base", then once more to the version
    // under test, checking after each hop that a file created before the
    // first migration is still present.
    #[phd_testcase]
    async fn migration_from_base_and_back(ctx: &TestCtx) {
        let mut base_vm =
            spawn_base_vm(ctx, "migration_from_base_and_back").await?;
        base_vm.launch().await?;
        base_vm.wait_to_boot().await?;

        // The file doesn't exist yet, and `ls` exits non-zero when it finds
        // nothing, so an error status is the expected outcome here.
        let listing_before = base_vm
            .run_shell_command("ls foo.bar 2> /dev/null")
            .check_err()
            .await?;
        assert_eq!(listing_before, "");

        // Create an empty file on the source VM and sync it.
        base_vm.run_shell_command("touch ./foo.bar").await?;
        base_vm.run_shell_command("sync ./foo.bar").await?;

        let hops = [
            Action::MigrateToPropolis(artifacts::DEFAULT_PROPOLIS_ARTIFACT),
            Action::MigrateToPropolis(artifacts::BASE_PROPOLIS_ARTIFACT),
            Action::MigrateToPropolis(artifacts::DEFAULT_PROPOLIS_ARTIFACT),
        ];

        ctx.lifecycle_test(base_vm, &hops, |target: &TestVm| {
            Box::pin(async {
                // After migrating, the file must still be there.
                let listing_after = target
                    .run_shell_command("ls foo.bar")
                    .ignore_status()
                    .await
                    .expect("can try to run `ls foo.bar`");
                assert_eq!(listing_after, "foo.bar");
            })
        })
        .await?;
    }

    /// Spawns (without launching) a VM named `name` that uses the "migration
    /// base" Propolis artifact, or skips the calling test if no migration base
    /// is enabled.
    async fn spawn_base_vm(ctx: &TestCtx, name: &str) -> Result<TestVm> {
        if !ctx.migration_base_enabled() {
            phd_skip!("No 'migration base' Propolis revision available");
        }

        let mut base_env = ctx.environment_builder();
        base_env.propolis(artifacts::BASE_PROPOLIS_ARTIFACT);

        let mut config = ctx.vm_config_builder(name);
        // TODO: not strictly necessary, but as of #756 PHD began adding a
        // `boot_settings` key by default to new instances. Older Propolis
        // doesn't understand that key, so migration tests would fail because
        // of the test change rather than because of a migration issue.
        //
        // At some point after landing #756, stop clearing the boot order,
        // because a newer base Propolis will understand `boot_settings` just
        // fine.
        config.clear_boot_order();

        ctx.spawn_vm(&config, Some(&base_env)).await
    }
}

/// Tests for migrations while a process is running on the guest.
mod running_process {
    use super::*;

    #[phd_testcase]
    async fn migrate_running_process(ctx: &TestCtx) {
        let mut source =
            ctx.spawn_default_vm("migrate_running_process_source").await?;
        let mut target = ctx
            .spawn_successor_vm("migrate_running_process_target", &source, None)
            .await?;

        source.launch().await?;
        source.wait_to_boot().await?;

        mk_dirt(&source).await?;

        target
            .migrate_from(&source, Uuid::new_v4(), MigrationTimeout::default())
            .await?;

        check_dirt(&target).await?;
    }

    #[phd_testcase]
    async fn import_failure(ctx: &TestCtx) {
        let mut cfg = ctx.vm_config_builder(
            "migrate_running_process::import_failure_source",
        );
        // Ensure the migration failure device is present in the VM config for
        // the source as well as the target, so that the source will offer the
        // device.
        cfg.fail_migration_imports(0);
        let mut source = ctx.spawn_vm(&cfg, None).await?;

        let mut target1 = {
            // Configure the target to fail when it imports the migration
            // failure device.
            cfg.named("migrate_running_process::import_failure_target1")
                .fail_migration_imports(1);

            // N.B. that we don't use `spawn_successor_vm` here, because we must
            // add the `fail_migration_imports` option to the new VM's
            // `VmConfig`. Instead, we use `spawn_vm`, and pass the source VM's
            // environment to ensure it's inherited.
            ctx.spawn_vm(&cfg, Some(&source.environment_spec())).await?
        };

        let mut target2 = ctx
            .spawn_successor_vm(
                "migrate_running_process::import_failure_target2",
                &source,
                None,
            )
            .await?;

        source.launch().await?;
        source.wait_to_boot().await?;

        mk_dirt(&source).await?;

        // first migration should fail.
        let error = target1
            .migrate_from(&source, Uuid::new_v4(), MigrationTimeout::default())
            .await
            .unwrap_err();
        info!(%error, "first migration failed as expected");

        // Also verify that the target reports that it failed.
        let target_migration_state = target1
            .get_migration_state()
            .await?
            .migration_in
            .expect("target should have a migration-in status")
            .state;
        assert_eq!(target_migration_state, MigrationState::Error);

        // Wait for the source to report that it has resumed before requesting
        // another migration.
        source
            .wait_for_state(
                InstanceState::Running,
                std::time::Duration::from_secs(5),
            )
            .await?;

        // try again. this time, it should work!
        target2
            .migrate_from(&source, Uuid::new_v4(), MigrationTimeout::default())
            .await?;

        check_dirt(&target2).await?;
    }

    #[phd_testcase]
    async fn export_failure(ctx: &TestCtx) {
        let mut source = {
            let mut cfg = ctx.vm_config_builder(
                "migrate_running_process::export_failure_source",
            );
            cfg.fail_migration_exports(1);
            ctx.spawn_vm(&cfg, None).await?
        };
        let mut target1 = ctx
            .spawn_successor_vm(
                "migrate_running_process::export_failure_target1",
                &source,
                None,
            )
            .await?;
        let mut target2 = ctx
            .spawn_successor_vm(
                "migrate_running_process::export_failure_target2",
                &source,
                None,
            )
            .await?;

        source.launch().await?;
        source.wait_to_boot().await?;

        mk_dirt(&source).await?;

        // first migration should fail.
        let error = target1
            .migrate_from(&source, Uuid::new_v4(), MigrationTimeout::default())
            .await
            .unwrap_err();
        info!(%error, "first migration failed as expected");

        // Also verify that the target reports that it failed.
        let target_migration_state = target1
            .get_migration_state()
            .await?
            .migration_in
            .expect("target should have a migration-in status")
            .state;
        assert_eq!(target_migration_state, MigrationState::Error);

        // Wait for the source to report that it has resumed before requesting
        // another migration.
        source
            .wait_for_state(
                InstanceState::Running,
                std::time::Duration::from_secs(5),
            )
            .await?;

        // try again. this time, it should work!
        target2
            .migrate_from(&source, Uuid::new_v4(), MigrationTimeout::default())
            .await?;

        check_dirt(&target2).await?;
    }

    /// Starts a process on the guest VM which stores a bunch of strings in
    /// memory and then suspends itself, waiting to be brought to the
    /// foreground. Once the process is resumed, it checks the contents of the
    /// string to ensure that they're still valid.
    ///
    /// Resuming this process after migration allows us to check that the
    /// guest's memory was migrated correctly.
    async fn mk_dirt(vm: &TestVm) -> phd_testcase::Result<()> {
        vm.run_shell_command(concat!(
            "cat >dirt.sh <<'EOF'\n",
            include_str!("../testdata/dirt.sh"),
            "\nEOF"
        ))
        .await?;
        vm.run_shell_command("chmod +x dirt.sh").await?;
        // When dirt.sh suspends itself, the parent shell will report a non-zero
        // status (one example is 148: 128 + SIGTSTP aka 20 on Linux).
        let run_dirt = vm.run_shell_command("./dirt.sh").check_err().await?;
        assert!(run_dirt.contains("made dirt"), "dirt.sh failed: {run_dirt:?}");
        assert!(
            run_dirt.contains("Stopped"),
            "dirt.sh didn't suspend: {run_dirt:?}"
        );

        Ok(())
    }

    async fn check_dirt(vm: &TestVm) -> phd_testcase::Result<()> {
        let output = vm.run_shell_command("fg").await?;
        assert!(output.contains("all good"), "dirt.sh failed: {output:?}");
        Ok(())
    }
}

// Migrates one guest through a chain of three VMs, running a shell command
// after each hop to confirm the guest is still responsive.
#[phd_testcase]
async fn multiple_migrations(ctx: &TestCtx) {
    let mut first = ctx.spawn_default_vm("multiple_migrations_0").await?;
    let mut second =
        ctx.spawn_successor_vm("multiple_migrations_1", &first, None).await?;
    let mut third =
        ctx.spawn_successor_vm("multiple_migrations_2", &second, None).await?;

    first.launch().await?;
    first.wait_to_boot().await?;

    second
        .migrate_from(&first, Uuid::new_v4(), MigrationTimeout::default())
        .await?;
    let after_first_hop = second.run_shell_command("echo Hello world").await?;
    assert_eq!(after_first_hop, "Hello world");

    third
        .migrate_from(&second, Uuid::new_v4(), MigrationTimeout::default())
        .await?;
    let after_second_hop =
        third.run_shell_command("echo I have migrated!").await?;
    assert_eq!(after_second_hop, "I have migrated!");
}

/// Boots `source`, creates a file in the guest, migrates to the default
/// Propolis artifact, and checks that the file is still visible afterwards.
async fn run_smoke_test(ctx: &TestCtx, mut source: TestVm) -> Result<()> {
    source.launch().await?;
    source.wait_to_boot().await?;

    // The file must not exist yet; `check_err` expects the failing status
    // from `ls`.
    let listing_before =
        source.run_shell_command("ls foo.bar 2> /dev/null").check_err().await?;
    assert_eq!(listing_before, "");

    // Create an empty file on the source VM and sync it.
    source.run_shell_command("touch ./foo.bar").await?;
    source.run_shell_command("sync ./foo.bar").await?;

    let actions =
        [Action::MigrateToPropolis(artifacts::DEFAULT_PROPOLIS_ARTIFACT)];

    ctx.lifecycle_test(source, &actions, |target: &TestVm| {
        Box::pin(async {
            // After migrating, the file must still be there.
            let listing_after = target
                .run_shell_command("ls foo.bar")
                .ignore_status()
                .await
                .expect("can try to run `ls foo.bar`");
            assert_eq!(listing_after, "foo.bar");
        })
    })
    .await
}

/// Boots `source`, produces some serial console output, migrates to the
/// default Propolis artifact, and checks that the target's serial console
/// history begins with everything the source had recorded.
async fn run_serial_history_test(
    ctx: &TestCtx,
    mut source: TestVm,
) -> Result<()> {
    source.launch().await?;
    source.wait_to_boot().await?;

    let greeting =
        source.run_shell_command("echo hello from the source VM!").await?;
    assert_eq!(greeting, "hello from the source VM!");

    let history_before = source.get_serial_console_history(0).await?;
    assert!(!history_before.data.is_empty());

    let actions =
        [Action::MigrateToPropolis(artifacts::DEFAULT_PROPOLIS_ARTIFACT)];

    ctx.lifecycle_test(source, &actions, move |target| {
        // Each invocation of the check gets its own copy of the history.
        let history_before = history_before.clone();
        Box::pin(async move {
            let history_after = target
                .get_serial_console_history(0)
                .await
                .expect("should get serial console history from the target");

            // The target's history must start with the source's history...
            let prefix_len = history_before.data.len();
            assert_eq!(history_before.data, history_after.data[..prefix_len]);

            // ...and must not report an earlier last byte offset.
            assert!(
                history_before.last_byte_offset
                    <= history_after.last_byte_offset
            );
        })
    })
    .await
}

// Checks that both the source and the migration target report the instance
// metadata from their own VM specs, and that the two differ in their sled
// identifiers.
#[phd_testcase]
async fn migration_ensures_instance_metadata(ctx: &TestCtx) {
    let mut src = ctx
        .spawn_default_vm("migration_ensures_instance_metadata_source")
        .await?;
    let mut dst = ctx
        .spawn_successor_vm(
            "migration_ensures_instance_metadata_target",
            &src,
            None,
        )
        .await?;

    // Start the source and compare the metadata in its spec with what the
    // running instance reports.
    src.launch().await?;
    src.wait_to_boot().await?;
    let src_expected = src.vm_spec().metadata;
    let source_metadata = src.get_spec().await?.properties.metadata;
    assert_eq!(
        src_expected, source_metadata,
        "Source instance was not populated with the correct instance metadata"
    );

    // Migrate to the new server and repeat the comparison there.
    let migration_id = Uuid::new_v4();
    dst.migrate_from(&src, migration_id, MigrationTimeout::default()).await?;
    let dst_expected = dst.vm_spec().metadata;
    let target_metadata = dst.get_spec().await?.properties.metadata;
    assert_eq!(
        dst_expected, target_metadata,
        "Target instance was not populated with the correct instance metadata"
    );

    // The source and target sled identifiers must not match.
    assert_ne!(
        source_metadata.sled_serial, target_metadata.sled_serial,
        "Source and target serial numbers should be different"
    );
    assert_ne!(
        source_metadata.sled_id, target_metadata.sled_id,
        "Source and target UUIDs should be different"
    );
}

#[phd_testcase]
async fn vm_reaches_destroyed_after_migration_out(ctx: &TestCtx) {
    let mut source = ctx
        .spawn_default_vm("vm_reaches_destroyed_after_migration_out_source")
        .await?;

    let mut target = ctx
        .spawn_successor_vm(
            "vm_reaches_destroyed_after_migration_out_target",
            &source,
            None,
        )
        .await?;

    source.launch().await?;
    source.wait_to_boot().await?;
    target
        .migrate_from(&source, Uuid::new_v4(), MigrationTimeout::default())
        .await?;

    source
        .wait_for_state(InstanceState::Destroyed, Duration::from_secs(60))
        .await?;
}


================================================
FILE: phd-tests/tests/src/server_state_machine.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! Tests verifying the server state machine.

use std::time::Duration;

use phd_framework::{
    disk::{BlockSize, DiskSource},
    test_vm::{DiskBackend, DiskInterface},
};
use phd_testcase::*;
use propolis_client::types::InstanceState;

#[phd_testcase]
async fn instance_start_stop_test(ctx: &TestCtx) {
    let mut vm = ctx.spawn_default_vm("instance_ensure_running_test").await?;

    vm.instance_ensure().await?;
    let instance = vm.get().await?.instance;
    assert_eq!(instance.state, InstanceState::Creating);

    vm.launch().await?;
    vm.wait_for_state(InstanceState::Running, Duration::from_secs(60)).await?;

    vm.stop().await?;
    vm.wait_for_state(InstanceState::Destroyed, Duration::from_secs(60))
        .await?;
}

#[phd_testcase]
async fn instance_stop_unstarted_test(ctx: &TestCtx) {
    let mut vm = ctx.spawn_default_vm("instance_stop_unstarted_test").await?;

    vm.instance_ensure().await?;
    let instance = vm.get().await?.instance;
    assert_eq!(instance.state, InstanceState::Creating);

    // At this point the VM is created and its resources are held as
    // appropriate. Stopping the VM will cause propolis-server to destroy the
    // VM, releasing those resources and getting the server ready for shutdown.
    vm.stop().await?;
    vm.wait_for_state(InstanceState::Destroyed, Duration::from_secs(60))
        .await?;
}

// Verifies that stopping a launched instance destroys it, and that further
// state-change requests against the destroyed instance are rejected with
// HTTP 424 (Failed Dependency).
#[phd_testcase]
async fn instance_stop_causes_destroy_test(ctx: &TestCtx) {
    let mut vm =
        ctx.spawn_default_vm("instance_stop_causes_destroy_test").await?;

    vm.launch().await?;
    vm.stop().await?;
    vm.wait_for_state(InstanceState::Destroyed, Duration::from_secs(60))
        .await?;

    // `assert_eq!` (rather than `assert!(matches!(..))`) reports the status
    // code that was actually returned if one of these checks fails.
    assert_eq!(
        vm.run().await.unwrap_err().status().unwrap(),
        reqwest::StatusCode::FAILED_DEPENDENCY,
        "run request against a destroyed instance returned the wrong status"
    );
    assert_eq!(
        vm.stop().await.unwrap_err().status().unwrap(),
        reqwest::StatusCode::FAILED_DEPENDENCY,
        "stop request against a destroyed instance returned the wrong status"
    );
    assert_eq!(
        vm.reset().await.unwrap_err().status().unwrap(),
        reqwest::StatusCode::FAILED_DEPENDENCY,
        "reset request against a destroyed instance returned the wrong status"
    );
}

// Exercises reset requests before launch, while running (singly and in a
// burst), and after the instance has been destroyed.
#[phd_testcase]
async fn instance_reset_test(ctx: &TestCtx) {
    let deadline = Duration::from_secs(60);
    let mut guest =
        ctx.spawn_default_vm("instance_reset_returns_to_running_test").await?;

    // A reset before the instance has been launched must be rejected.
    let early_reset = guest.reset().await;
    assert!(early_reset.is_err());

    guest.launch().await?;
    guest.wait_for_state(InstanceState::Running, deadline).await?;

    // "Rebooting" is not a steady state: the Propolis state worker may move
    // the instance Running -> Rebooting -> Running before this test can
    // observe anything in between. So only the return to Running is checked,
    // not that Rebooting was ever seen.
    guest.reset().await?;
    guest.wait_for_state(InstanceState::Running, deadline).await?;

    // Fire off a burst of resets. Each request should be accepted even though
    // Propolis only lets one reboot be enqueued at a time; how many reboots
    // actually get queued depends on factors the test does not control.
    for _attempt in 0..10 {
        guest.reset().await?;
    }
    guest.wait_for_state(InstanceState::Running, deadline).await?;

    // After the instance is destroyed, resets must fail again.
    guest.stop().await?;
    guest.wait_for_state(InstanceState::Destroyed, deadline).await?;
    let late_reset = guest.reset().await;
    assert!(late_reset.is_err());
}

// Checks that a reset request is rejected before the instance has been
// launched, and that the instance can still be launched afterwards.
#[phd_testcase]
async fn instance_reset_requires_running_test(ctx: &TestCtx) {
    let mut guest =
        ctx.spawn_default_vm("instance_reset_requires_running_test").await?;

    // Nothing has been launched yet, so this reset must fail.
    let premature_reset = guest.reset().await;
    assert!(premature_reset.is_err());

    guest.launch().await?;
    let deadline = Duration::from_secs(60);
    guest.wait_for_state(InstanceState::Running, deadline).await?;
}

// Verifies that a VM whose startup is blocked (here, on a Crucible disk that
// can never activate) can still be stopped and destroyed.
#[phd_testcase]
async fn stop_while_blocked_on_start_test(ctx: &TestCtx) {
    // This test uses a Crucible disk backend to cause VM startup to block.
    if !ctx.crucible_enabled() {
        phd_skip!("test requires Crucible support");
    }

    let mut config = ctx.vm_config_builder("stop_while_blocked_on_start_test");

    // Create a VM that blocks while starting by attaching a Crucible data disk
    // to it and enabling the black hole address in its volume construction
    // request. The invalid address will keep Crucible from activating and so
    // will block the VM from fully starting.
    const DATA_DISK_NAME: &str = "vcr-replacement-target";
    config.data_disk(
        DATA_DISK_NAME,
        DiskSource::Blank(1024 * 1024 * 1024),
        DiskInterface::Nvme,
        DiskBackend::Crucible {
            min_disk_size_gib: 1,
            block_size: BlockSize::Bytes512,
        },
        5,
    );

    let spec = config.vm_spec(ctx).await?;
    let disk_hdl = spec
        .get_disk_by_device_name(DATA_DISK_NAME)
        .cloned()
        .expect("the data disk was just added to this VM's config");
    let disk = disk_hdl
        .as_crucible()
        .expect("the data disk was configured with a Crucible backend");
    disk.enable_vcr_black_hole();

    // Launch the VM and wait for it to advertise that its components are
    // starting. Failures here are propagated with `?` like every other wait
    // in this file instead of panicking via `unwrap()`.
    let mut vm = ctx.spawn_vm_with_spec(spec, None).await?;
    vm.launch().await?;
    vm.wait_for_state(InstanceState::Starting, Duration::from_secs(15))
        .await?;

    // Send a stop request. This should enqueue successfully, and the VM should
    // shut down even though activation is blocked.
    vm.stop().await?;
    vm.wait_for_state(InstanceState::Destroyed, Duration::from_secs(60))
        .await?;
}


================================================
FILE: phd-tests/tests/src/smoke.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use phd_testcase::*;
use propolis_client::instance_spec::InstanceSpecStatus;

// Boots a guest configured with six vCPUs and checks that `nproc` inside the
// guest reports the same number.
#[phd_testcase]
async fn nproc_test(ctx: &TestCtx) {
    let mut cfg = ctx.vm_config_builder("nproc_test");
    cfg.cpus(6);

    let mut guest = ctx.spawn_vm(&cfg, None).await?;
    guest.launch().await?;
    guest.wait_to_boot().await?;

    // The guest prints its CPU count as a bare decimal number.
    let reported = guest.run_shell_command("nproc").await?;
    let cpu_count = reported.parse::<u8>().unwrap();
    assert_eq!(cpu_count, 6);
}

// Resets a booted guest through the API and waits for it to boot again.
#[phd_testcase]
async fn api_reboot_test(ctx: &TestCtx) {
    let mut guest = ctx.spawn_default_vm("api_reboot_test").await?;

    // First boot.
    guest.launch().await?;
    guest.wait_to_boot().await?;

    // Request a reset, then wait for the guest to boot once more.
    guest.reset().await?;
    guest.wait_to_boot().await?;
}

// Boots a guest and then asks the test framework to reboot it gracefully.
#[phd_testcase]
async fn guest_reboot_test(ctx: &TestCtx) {
    let mut guest = ctx.spawn_default_vm("guest_reboot_test").await?;

    // The guest has to be fully booted before a graceful reboot is requested.
    guest.launch().await?;
    guest.wait_to_boot().await?;

    guest.graceful_reboot().await?;
}

// Launches a VM with an explicit CPU count and memory size and checks that the
// spec returned by the server reflects both values.
#[phd_testcase]
async fn instance_spec_get_test(ctx: &TestCtx) {
    let mut cfg = ctx.vm_config_builder("instance_spec_test");
    cfg.cpus(4).memory_mib(3072);

    let mut guest = ctx.spawn_vm(&cfg, None).await?;
    guest.launch().await?;

    // A launched instance is expected to always report a spec.
    let response = guest.get_spec().await?;
    let spec = match response.spec {
        InstanceSpecStatus::Present(spec) => spec,
        _ => panic!("launched instance should have a spec"),
    };

    assert_eq!(spec.board.cpus, 4);
    assert_eq!(spec.board.memory_mb, 3072);
}


================================================
FILE: phd-tests/tests/src/stats.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::HashMap;
use std::str::FromStr;
use std::time::Duration;

use phd_framework::test_vm::{FakeOximeterSampler, MetricsLocation};
use propolis_client::instance_spec::HyperVFeatureFlag;

use chrono::{DateTime, Utc};
use oximeter::types::{ProducerResults, ProducerResultsItem, Sample};
use oximeter::{Datum, FieldValue};
use phd_testcase::*;
use tracing::{trace, warn};

// For convenience when comparing times below.
const NANOS_PER_SEC: f64 = 1_000_000_000.0;

/// The vCPU microstates reported in the `state` field of
/// `virtual_machine:vcpu_usage` samples.
///
/// The `strum::EnumString` derive (with `serialize_all = "snake_case"`)
/// provides the `FromStr` impl used in this file to parse the field's string
/// value into a variant.
#[derive(Copy, Clone, Debug, Eq, Hash, PartialEq, strum::EnumString)]
#[strum(serialize_all = "snake_case")]
enum VcpuState {
    Emulation,
    Run,
    Idle,
    Waiting,
}

/// The most recent cumulative usage counters recorded for a single vCPU, keyed
/// by vCPU state.
#[derive(Default, Debug)]
struct VcpuUsageMetric {
    /// Cumulative counter value for each state that has been recorded so far.
    /// The vCPU stats test in this file compares differences of these values
    /// against durations expressed in nanoseconds.
    metrics: HashMap<VcpuState, u64>,
}

/// A collection of the stats produced by `propolis-server`'s Oximeter producer.
///
/// Oximeter producers produce a series of lists of samples, where each list
/// of samples is conceptually distinct but may still be interesting to
/// test. In `propolis-server`, the first list of samples will be
/// `virtual_machine:vcpu_usage`, which may be blank if kstats have not been
/// sampled since the last producer poll. The second list of samples
/// will be `virtual_machine:reset`.
///
/// `VirtualMachineMetrics` collects these all back together into a single view
/// to test against. See [`VirtualMachineMetrics::add_producer_result`] as the
/// means to accumulate samples into this struct.
///
/// ### Should this be in `framework`?
///
/// Arguably! It happens to encompass all `propolis-server` metrics today after
/// all. I expect we'll have many more Oximeter-produced stats from
/// `propolis-server` in the future, though, and only a few of them might be
/// relevant to these specific tests. Other tests may not want to wait for fresh
/// vCPU data that they don't care about, as well.
///
/// As-is, this is trying to be a relatively-standalone halfway point before the
/// most comprehensive approach: running Clickhouse alongside PHD and storing
/// Oximeter data there, maybe even querying samples with OxQL.
#[derive(Debug)]
struct VirtualMachineMetrics {
    /// The earliest timestamp among the measurements folded into this
    /// collection (or the collection's creation time, if that is earlier).
    oldest_time: DateTime<Utc>,
    /// The latest `virtual_machine:reset` counter value, if one has been seen.
    reset: Option<u64>,
    /// Per-vCPU usage counters, keyed by the sample's `vcpu_id` field.
    vcpus: HashMap<u32, VcpuUsageMetric>,
}

/// Collect a record of all metrics for a VM as of the time this function is called.
async fn vm_metrics_snapshot(
    sampler: &FakeOximeterSampler,
) -> VirtualMachineMetrics {
    let min_metric_time = Utc::now();

    let metrics_check = move |producer_items| {
        producer_results_as_vm_metrics(producer_items)
            .filter(|metrics| metrics.oldest_time >= min_metric_time)
    };

    sampler.wait_for_propolis_stats(metrics_check).await
}

/// Fold one round of producer results into a `VirtualMachineMetrics`.
///
/// Returns `None` when the results contained no vCPU usage samples at all.
///
/// # Panics
///
/// Panics if any item in `results` is an error.
fn producer_results_as_vm_metrics(
    results: ProducerResults,
) -> Option<VirtualMachineMetrics> {
    let mut collected = VirtualMachineMetrics {
        oldest_time: Utc::now(),
        reset: None,
        vcpus: HashMap::new(),
    };

    for item in results {
        let samples = match item {
            ProducerResultsItem::Ok(samples) => samples,
            ProducerResultsItem::Err(e) => {
                panic!("ProducerResultsItem error: {e}")
            }
        };
        collected.add_producer_result(&samples);
    }

    if !collected.vcpus.is_empty() {
        Some(collected)
    } else {
        trace!("no vcpu metrics yet?");
        None
    }
}

impl VirtualMachineMetrics {
    /// Sum the recorded counter for `state` across every vCPU in this
    /// collection.
    ///
    /// # Panics
    ///
    /// Panics if some recorded vCPU has no entry for `state`.
    fn vcpu_state_total(&self, state: &VcpuState) -> u64 {
        self.vcpus.values().map(|vcpu_usage| vcpu_usage.metrics[state]).sum()
    }

    /// Lower `oldest_time` to `metric_time` if that measurement is older than
    /// anything recorded so far.
    fn update_metric_times(&mut self, metric_time: DateTime<Utc>) {
        self.oldest_time = std::cmp::min(self.oldest_time, metric_time);
    }

    /// Integrate a list of samples into this collection of virtual machine
    /// metrics.
    ///
    /// Samples are grouped by timeseries name and metric fields; only the
    /// newest sample of each group is recorded. Timeseries other than
    /// `virtual_machine:reset` and `virtual_machine:vcpu_usage` are ignored.
    ///
    /// # Panics
    ///
    /// Panics if a second `virtual_machine:reset` value is offered, if a
    /// (vCPU, state) pair is reported twice, or if a datum or field has an
    /// unexpected type or value.
    fn add_producer_result(&mut self, samples: &[Sample]) {
        // Group by reference: only the newest sample per series is needed, so
        // there is no reason to clone every sample.
        let mut samples_by_metric = HashMap::new();

        for sample in samples {
            let name = sample.timeseries_name.to_owned();
            let fields = sample.sorted_metric_fields().to_owned();
            let collection: &mut Vec<&Sample> =
                samples_by_metric.entry((name, fields)).or_default();

            collection.push(sample);
        }

        // Order each group oldest-to-newest so `last()` is the newest sample.
        for v in samples_by_metric.values_mut() {
            v.sort_by_key(|s| s.measurement.timestamp());
        }

        for ((name, fields), samples) in samples_by_metric {
            let last_sample = samples.last().expect("at least one sample");
            if name == "virtual_machine:reset" {
                assert!(
                    self.reset.is_none(),
                    "multiple virtual_machine:reset measurements for a \
                     single Propolis?"
                );

                let datum = last_sample.measurement.datum();
                let Datum::CumulativeU64(amount) = datum else {
                    panic!("unexpected reset datum type: {datum:?}");
                };
                self.reset = Some(amount.value());
                self.update_metric_times(last_sample.measurement.timestamp());
            } else if name == "virtual_machine:vcpu_usage" {
                let datum = last_sample.measurement.datum();
                let Datum::CumulativeU64(amount) = datum else {
                    panic!("unexpected vcpu_usage datum type: {datum:?}");
                };
                let amount = amount.value();

                // Which microstate this counter describes.
                let field = &fields["state"];
                let FieldValue::String(state) = &field.value else {
                    panic!("unknown vcpu state datum type: {field:?}");
                };
                let state: VcpuState = VcpuState::from_str(state.as_ref())
                    .unwrap_or_else(|_| {
                        panic!("unknown Oximeter vcpu state name: {state}");
                    });

                // Which vCPU this counter describes.
                let field = &fields["vcpu_id"];
                let FieldValue::U32(vcpu_id) = field.value else {
                    panic!("unknown vcpu id datum type: {field:?}");
                };

                let vcpu_metrics = self.vcpus.entry(vcpu_id).or_default();
                if vcpu_metrics.metrics.contains_key(&state) {
                    panic!(
                        "vcpu {vcpu_id} state {state:?} has duplicate metric \
                         {last_sample:?}"
                    );
                }
                trace!(
                    "recorded cpu {} state {:?} = {} at {}",
                    vcpu_id,
                    state,
                    amount,
                    last_sample.measurement.timestamp()
                );
                vcpu_metrics.metrics.insert(state, amount);
                self.update_metric_times(last_sample.measurement.timestamp());
            }
        }
    }
}

#[phd_testcase]
async fn instance_vcpu_stats(ctx: &TestCtx) {
    /// Allow as much as 20% measurement error for time comparisons in this
    /// test. When measuring active guest time, some guests (looking at you
    /// Windows) may have services that continue running in the time period
    /// where our test workload completes but we're still waiting for metrics;
    /// this means Oximeter can see more running time than we know we caused.
    /// When measuring guest idle time, these same idle services can result in
    /// the VM being less idle than our intended idling.
    ///
    /// "0.XX" here reflects an expectation that a system with no user-directed
    /// activity will actually be idle for XX% of a given time period. In
    /// practice this may be as low as 5% or less (many Linux guests), and as
    /// high as 12% in practice for Windows guests. Round up to 20% for some
    /// buffer.
    const TOLERANCE: f64 = 0.8;

    let mut env = ctx.environment_builder();
    env.metrics(Some(MetricsLocation::Local));

    let mut vm_config = ctx.vm_config_builder("instance_vcpu_stats");
    vm_config.guest_hv_interface(
        propolis_client::instance_spec::GuestHypervisorInterface::HyperV {
            features: [HyperVFeatureFlag::ReferenceTsc].into_iter().collect(),
        },
    );
    // Having one CPU simplifies the math for time expectations later in the
    // test. One CPU means one second per second of time across all one vCPU's
    // microstates, and if we have caused guest load the guest vCPU should
    // be in "run" basically entirely until the load completes.
    //
    // Using the (configurable!) default "could" work, but this lets us avoid
    // having to for additional probably-idle CPUs.
    vm_config.cpus(1);

    let mut vm = ctx.spawn_vm(&vm_config, Some(&env)).await?;

    let sampler = vm.metrics_sampler().expect("metrics are enabled");
    vm.launch().await?;

    vm.wait_to_boot().await?;

    // From watching Linux guests, some services may be relatively active right
    // at and immediately after login. Wait a few seconds to try counting any
    // post-boot festivities as part of "baseline".
    vm.run_shell_command("sleep 20").await?;

    let start_metrics = vm_metrics_snapshot(&sampler).await;

    // Measure a specific amount of time with guest vCPUs in the "run" state.
    //
    // We measure the "run" state using some fixed-size busywork because we
    // can't simply say "run for 5 seconds please" - if we did, a combination of
    // host OS or guest OS may leave the process timing itself descheduled for
    // some or all of that time, so we could end up with substantially less than
    // 5 seconds of execution and a flaky test as a result.
    //
    // Instead, run some busywork, time how long that took on the host OS, then
    // know the guest OS should have spent around that long running. This still
    // relies us measuring the completion time relatively quickly after the
    // busywork completes, but it's one fewer vms of nondeterminism.

    let run_start = std::time::SystemTime::now();
    vm.run_shell_command("i=0").await?;
    vm.run_shell_command("lim=2000000").await?;
    vm.run_shell_command("while [ $i -lt $lim ]; do i=$((i+1)); done").await?;
    let run_time = run_start.elapsed().expect("time goes forwards");
    trace!("measured run time {:?}", run_time);

    let now_metrics = vm_metrics_snapshot(&sampler).await;
    let total_run_window = run_start.elapsed().expect("time goes forwards");

    let run_delta = (now_metrics.vcpu_state_total(&VcpuState::Run)
        - start_metrics.vcpu_state_total(&VcpuState::Run))
        as u128;

    // The guest should not have run longer than the total time we were
    // measuring it..
    assert!(run_delta < total_run_window.as_nanos());

    // Our measurement of how long the guest took should be pretty close to the
    // guest's measured running time. It won't be exact: the guest may have
    // services that continue in the period between the shell command completing
    // and a final metrics collection - it may be running for more time than we
    // intended.
    //
    // (Anecdotally the actual difference here depends on the guest, with
    // minimal Linux guests like Alpine being quite close with <1% differences
    // here.)
    let min_guest_run_delta = (run_time.as_nanos() as f64 * TOLERANCE) as u128;
    assert!(
        run_delta > min_guest_run_delta,
        "{} > {}",
        run_delta as f64 / NANOS_PER_SEC,
        min_guest_run_delta as f64 / NANOS_PER_SEC
    );

    // VM vCPU stats are sampled roughly every five seconds, which means the
    // minimum granularity of `run + idle + waiting + emul` is also roughly
    // units of 5 seconds. There could be one or two sample intervals between
    // `start_metrics` and `now_metrics` depending on how long it took to get
    // from starting the Oximeter producer to actually sampling.
    //
    // This is to say: there isn't a strong statement that we can make about
    // idle time at this point other than that it is probably around a large
    // enough value to fill the total time out to a mutiple of 5 seconds.
    //
    // The guesswork to validate that doesn't seem great in the face of
    // variable-time CI. We'll validate idle time measurements separately,
    // below.

    // Idle time boundaries are a little differnt than running time boundaries
    // because it's more difficult to stop counting to idle vCPU time than it is
    // to stop counting running vCPU time. Instead, the maximum amount of idling
    // time we might measure is however long it takes to get the initial kstat
    // readings, plus how long the idle time takes, plus however long it takes
    // to get final kstat readings. The miminum amount of idling time is
    // the time elapsed since just after the initial kstat readings.
    let max_idle_start = std::time::SystemTime::now();
    let idle_start_metrics = vm_metrics_snapshot(&sampler).await;
    let idle_start = std::time::SystemTime::now();
    vm.run_shell_command("sleep 10").await?;

    let now_metrics = vm_metrics_snapshot(&sampler).await;

    // The guest VM would continues to exist with its idle vCPU being accounted
    // by the kstats Oximeter samples. This means `vm_metrics_snapshot` could
    // introduce as much as a full Oximeter sample interval of additional idle
    // vCPU, and is we why wait to measure idle time until *after* getting new
    // Oximeter metrics.
    let max_idle_time = max_idle_start.elapsed().expect("time goes forwards");
    let idle_time = idle_start.elapsed().expect("time goes forwards");
    trace!("measured idle time {:?}", idle_time);

    let idle_delta = (now_metrics.vcpu_state_total(&VcpuState::Idle)
        - idle_start_metrics.vcpu_state_total(&VcpuState::Idle))
        as u128;

    // We've idled for at least 20 seconds. The guest may not be fully idle (its
    // OS is still running on its sole CPU, for example), so we test that the
    // guest was just mostly idle for the time period.
    assert!(
        idle_delta < max_idle_time.as_nanos(),
        "{} < {}",
        idle_delta as f64 / NANOS_PER_SEC,
        idle_time.as_nanos() as f64 / NANOS_PER_SEC
    );
    let min_guest_idle_delta =
        (idle_time.as_nanos() as f64 * TOLERANCE) as u128;
    assert!(
        idle_delta > min_guest_idle_delta,
        "{} > {}",
        idle_delta as f64 / NANOS_PER_SEC,
        min_guest_idle_delta as f64 / NANOS_PER_SEC
    );

    // The delta in vCPU `run` time should be negligible. We've run one shell
    // command which in turn just idled. In reality, if the guest has idle
    // processes running even sitting at an empty prompt, assume there is up to
    // THRESHOLD activity happening anyway. This is another threshold that
    // varies based on guest OS type.
    let run_delta = (now_metrics.vcpu_state_total(&VcpuState::Run)
        - idle_start_metrics.vcpu_state_total(&VcpuState::Run))
        as u128;
    let idle_delta = (now_metrics.vcpu_state_total(&VcpuState::Idle)
        - idle_start_metrics.vcpu_state_total(&VcpuState::Idle))
        as u128;
    assert!(run_delta < (idle_delta as f64 * (1.0 - TOLERANCE)) as u128);

    let full_run_delta = (now_metrics.vcpu_state_total(&VcpuState::Run)
        - start_metrics.vcpu_state_total(&VcpuState::Run))
        as u128;

    let full_idle_delta = (now_metrics.vcpu_state_total(&VcpuState::Idle)
        - start_metrics.vcpu_state_total(&VcpuState::Idle))
        as u128;

    let full_waiting_delta = (now_metrics.vcpu_state_total(&VcpuState::Waiting)
        - start_metrics.vcpu_state_total(&VcpuState::Waiting))
        as u128;

    let full_emul_delta = (now_metrics.vcpu_state_total(&VcpuState::Emulation)
        - start_metrics.vcpu_state_total(&VcpuState::Emulation))
        as u128;

    // Theoretically 100ms would be a comically high upper bound for how much
    // time we've spent emulating instructions on the guest's behalf during this
    // test. In reality, the situation is more subtle. Guest OSes can be
    // surprisingly heavy on the APIC, and if they believe the TSC is
    // unreliable, heavy on the ACPI PM timer too. We've at least set up guest
    // enlightments to present a reliable TSC, so as long as the guest picks up
    // that enlightenment and does not fall back to the ACPI PM timer, that
    // source of instruction emulation activity is quashed.
    //
    // So, it's hard to make a universal statement about how much time should be
    // spent emulating instructions here. Instead, only check this if we know
    // the guest is going to result in predictable times. Specifically: expect
    // that Linux doesn't use the APIC much while idle and trusts the HyperV TSC
    // enlightenment, and so is a candidate for reliable assertions on
    // instruction emulation time.
    if vm.guest_os_kind().is_linux() {
        const EMUL_LIMIT: u128 = Duration::from_millis(100).as_nanos();
        // As of writing this test, `full_emul_delta` is around 12-13ms with an
        // Alpine guest. 100ms is hopefully plenty of margin for slower or
        // busier test systems, or reasonable implementation changes.
        assert!(
            full_emul_delta < EMUL_LIMIT,
            "full emul delta was above threshold: \
             {full_emul_delta} > {EMUL_LIMIT}"
        );
    } else {
        warn!(
            "guest OS may cause substantial emulation time due to benign \
               factors outside our control; skipping emulation stat check"
        );
    }

    // Waiting is a similar but more constrained situation as `emul`: time when
    // the vCPU was runnable but not *actually* running. This should be a very
    // short duration, and on my workstation this is around 400 microseconds.
    // Again, test against a significantly larger threshold in case CI is
    // extremely slow.
    const WAIT_LIMIT: u128 = Duration::from_millis(20).as_nanos();
    assert!(
        full_waiting_delta < WAIT_LIMIT,
        "full waiting delta was above threshold: {} > {}",
        full_waiting_delta,
        WAIT_LIMIT
    );

    trace!("run: {}", full_run_delta);
    trace!("idle: {}", full_idle_delta);
    trace!("waiting: {}", full_waiting_delta);
    trace!("emul: {}", full_emul_delta);
}


================================================
FILE: phd-tests/tests/src/vsock.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use phd_testcase::*;

const GUEST_CID: u64 = 16;
const PCI_DEV_NUM: u8 = 26;

// Boots a guest with a vsock device configured and checks that `/dev/vsock`
// exists inside the guest.
#[phd_testcase]
async fn vsock_smoke_test(ctx: &TestCtx) {
    let mut builder = ctx.vm_config_builder("vsock_smoke_test");
    builder.vsock(GUEST_CID, PCI_DEV_NUM);

    let mut guest = ctx.spawn_vm(&builder, None).await?;
    guest.launch().await?;
    guest.wait_to_boot().await?;

    // This check is necessary but not sufficient: Linux will sometimes expose
    // this device node even when the hypervisor has not presented the virtio
    // device. A missing node, however, is always an error.
    guest.run_shell_command("test -e /dev/vsock").await?;
}

// Boots a guest with a vsock device and checks that the CID the guest reports
// (via the `getcid` helper) matches the one that was configured.
#[phd_testcase]
async fn vsock_get_cid(ctx: &TestCtx) {
    const GET_CID: &str = "/usr/local/bin/getcid";

    let mut builder = ctx.vm_config_builder("vsock_get_cid");
    builder.vsock(GUEST_CID, PCI_DEV_NUM);

    let mut guest = ctx.spawn_vm(&builder, None).await?;
    guest.launch().await?;
    guest.wait_to_boot().await?;

    // The helper only exists in a modified alpine image that carries our
    // additional tooling; without it there is nothing to test.
    let probe = format!("test -e {GET_CID}");
    let helper_present = guest.run_shell_command(&probe).await.is_ok();
    if !helper_present {
        phd_skip!("guest doesn't have getcid installed");
    }

    let reported = guest.run_shell_command(GET_CID).await?;
    let cid = reported.parse::<u64>()?;
    assert_eq!(cid, GUEST_CID, "guest cid matches what was configured");
}


================================================
FILE: phd-tests/tests/testdata/dirt.sh
================================================
#!/usr/bin/env sh
# a simple script for testing migration of running process memory.
#
# this script will write a bunch of "hello"s to an in-memory string, then
# suspend to wait for a migration to occur. after the migration happens, the
# script can be foregrounded, and it will then check that the data is still
# there.
dirt="hello"
len=8192 # number of additional "hello" lines to append to $dirt

# build up the in-memory data: the initial "hello" plus $len more lines
i=0
while [ "$i" -lt "$len" ]
do
    dirt="$dirt
hello"
    i=$(( i + 1 ))
done

echo "made dirt"

# suspend this process and wait for it to be resumed before checking that the
# data still exists.
#
# N.B.: posix sh doesn't have a suspend builtin, but we can make our own!
kill -s TSTP $$

# check that the data is still correct.
#
# we do this by writing it out to a file and then looping through the file,
# because i wasn't sure how to loop over a variable line by line in posix sh
# without using a file.
dirtfile="/tmp/dirt.txt"
echo "$dirt" > "$dirtfile"
# read the file on stdin so wc prints only the count, and pass the result
# through arithmetic expansion to strip the padding some wc implementations
# add (which would defeat a `cut -d " " -f 1`).
actual_len=$(( $(wc -l < "$dirtfile") ))
echo "found $actual_len lines of dirt"

# pre-check the file's length
if [ "$actual_len" -lt "$len" ]
then
    echo "not enough dirt: $actual_len < $len"
    exit 1
fi

# verify the first $len lines; the limit comes from $len rather than a
# repeated literal so the two cannot drift apart.
i=0
while read -r line
do
    if [ "$i" -eq "$len" ]; then
        echo "all good"
        exit 0
    fi
    if [ "$line" != "hello" ]
    then
        echo "bad dirt $i: $line"
        exit 1
    fi
    i=$(( i + 1 ))
done < "$dirtfile"

================================================
FILE: rust-toolchain.toml
================================================
[toolchain]
# We choose a specific toolchain (rather than "stable") for repeatability.  The
# intent is to keep this up-to-date with recently-released stable Rust.

channel = "1.90.0"
profile = "default"


================================================
FILE: rustfmt.toml
================================================
# ---------------------------------------------------------------------------
# Stable features that we customize locally
# ---------------------------------------------------------------------------
max_width = 80
use_small_heuristics = "max"
edition = "2021"


================================================
FILE: scripts/README.md
================================================
# Propolis Scripts

This is a collection of assorted scripts which may be useful for certain
development activities, such as observing aspects of propolis performance. See
individual files for details.

## Scripts

- `live-migration-times.d`: Measure the length of individual phases of live
  migration on a running propolis-server.
- `nvme_trace.d`: Measure propolis-emulated NVMe read/write latency.
- `time_adjustments.d`: Observe guest timing data adjustments on the target host
  of a live migration.
- `vm_exit_codes.d`: Measure VM exits and information about them both for
  #VMEXIT events and returns to Propolis.
- `cpuid-queries.d`: Report guest CPUID queries and returned leaves.
- `viona.d`: Report Viona ioctl and notification events in real time.


================================================
FILE: scripts/cpuid-queries.d
================================================
#pragma D option defaultargs
#pragma D option quiet

/*
 * Report guest CPUID queries and returned leaves.
 *
 * Usage: ./cpuid-queries.d <target VM's struct vm*>
 *
 * You probably have a VM name you care about, not its `struct vm*`. How do you
 * go from the name to the pointer required here? And why?
 *
 * To the first question, something like this will get you the `vmm_vm` pointer
 * to use with this script:
 * ```
 * mdb -ke '::walk vmm | ::print struct vmm_softc vmm_vm vmm_name ! grep -B 1 <VM_NAME>'
 * ```
 *
 * Why? Because the VM's name is on vmm_softc, which has a reference to the
 * structure used in CPUID emulation and so on, but there's no back reference.
 * So this is slightly less convoluted than capturing the `struct vm` pointer at
 * some earlier point when we still have a softc. This may well get simplified
 * with a backref in the future.
 */

/*
 * Require the target `struct vm*` as the first argument and remember it in
 * the global `target_vm` for the probe predicates below. A missing argument
 * is detected by `$$1` comparing equal to the empty string.
 */
BEGIN {
	if ($$1 == "") {
		/* End the message with a newline so it is not run together
		 * with whatever is printed next. */
		printf("target `struct vm*` required\n");
		exit(1);
	}

	target_vm = $1;
}

/*
 * On entry to vcpu_emulate_cpuid() for the target VM, save the four register
 * pointers (arg2..arg5) in thread-local variables so the return probe can
 * read the values written through them, and capture the queried leaf and
 * subleaf from the low 32 bits of *rax and *rcx.
 */
fbt::vcpu_emulate_cpuid:entry / arg0 == target_vm / {
	self->rax = (uint64_t*)arg2;
	self->rbx = (uint64_t*)arg3;
	self->rcx = (uint64_t*)arg4;
	self->rdx = (uint64_t*)arg5;
	/* Read the inputs now; the return probe prints the contents of these
	 * same locations as the returned values. */
	self->leaf = (uint32_t)*self->rax;
	self->subleaf = (uint32_t)*self->rcx;
}

/*
 * On return from a vcpu_emulate_cpuid() call that the entry probe recorded
 * (self->rax is set), print the queried leaf/subleaf and the register values
 * now stored behind the saved pointers.
 */
fbt::vcpu_emulate_cpuid:return / self->rax != NULL / {
	printf("CPUID query: leaf/subleaf 0x%08x/0x%08x, returns rax = 0x%08x, rbx = 0x%08x, rcx = 0x%08x, rdx = 0x%08x\n",
		self->leaf,
		self->subleaf,
		*self->rax,
		*self->rbx,
		*self->rcx,
		*self->rdx
	);

	/*
	 * Zero every thread-local, not just rax: assigning zero is what
	 * releases a thread-local variable's storage, and clearing rax also
	 * disarms this clause's predicate until the next matching entry.
	 */
	self->rax = 0;
	self->rbx = 0;
	self->rcx = 0;
	self->rdx = 0;
	self->leaf = 0;
	self->subleaf = 0;
}


================================================
FILE: scripts/live-migration-times.d
================================================
#!/usr/sbin/dtrace -s

/*
 * Measures the length of individual phases of the propolis-server live
 * migration protocol.
 *
 * Usage: ./live-migration-times.d <propolis-server PID> [v]
 *
 * Use "v" for more verbose output.
 *
 *
 * Some implementation notes:
 * - A pid is required because multiple migrations might be running on the same
 *   machine. It's possible to bifurcate the data here based on pid using
 *   aggregations, but the tradeoff is that formatting output is a bit more
 *   difficult. So for now, we require a pid.
 * - This script relies on the fact that each phase in the migration will only
 *   fire the migrate_phase_{begin,end} probes once. If our architecture
 *   changes, this script might break.
 * - We also assume that each phase has a unique name, which is passed as an
 *   argument into the migrate_phase_{begin,end} probes. We use the name as
 *   a key for tracking the phase deltas. If those names change, or phases are
 *   added/removed, this script will break.
 * - If a VM's guest is mostly idle, the post-pause RAM transfer phase will
 *   copy very few pages, which can make the transfer rate abnormally low (the
 *   cost of entering and leaving the phase becomes significant relative to the
 *   cost of transferring RAM). If the post-pause transfer rate for some
 *   migration seems abnormally low, check to make sure there were actually
 *   some pages to transfer!
 */

#pragma D option quiet
#pragma D option defaultargs

/* Nanoseconds per second; used to scale byte counts into per-second rates. */
inline uint64_t NS_PER_SEC = 1000000000;

/*
 * Index values for xfer_pages/xfer_bytes. The END clause reports the
 * VM_UNPAUSED totals as "unpaused" and the VM_PAUSED totals as "paused".
 */
enum vm_paused {
	VM_UNPAUSED = 0,
	VM_PAUSED = 1,
};

/*
 * Running totals accumulated by the migrate_xfer_ram_region probe, indexed by
 * that probe's arg2.
 */
uint64_t xfer_pages[uint8_t];
uint64_t xfer_bytes[uint8_t];

/*
 * Validate the arguments and print the banner. With the "defaultargs" option
 * a missing positional argument expands to an empty string ($$1) or zero ($1)
 * rather than failing compilation, which is what makes the check below work.
 */
dtrace:::BEGIN
{
	if ($$1 == "") {
		printf("ERROR: propolis-server pid required\n");
		exit(1);
	}

	printf("tracing live migration protocol times for pid %d...\n", $1);
	printf("\n");

	/* Column header for the per-phase BEGIN/END lines in verbose mode. */
	if ($$2 == "v") {
		printf("%-12s %-10s %30s\n", "PHASE", "", "TIMESTAMP");
	}

	this->phase = "";
}

/*
 * Add this region's page count (arg0) and byte count (arg1) to the running
 * totals, keyed by arg2 (compared against enum vm_paused in the END clause).
 */
propolis$1:::migrate_xfer_ram_region
{
	xfer_pages[arg2] += arg0;
	xfer_bytes[arg2] += arg1;
}

/*
 * Record the timestamp at which the phase named by arg0 (a user-space string)
 * began, keyed by the phase name.
 */
propolis$1:::migrate_phase_begin
{
	this->phase = copyinstr(arg0);
	start_times[this->phase] = timestamp;

	if ($$2 == "v") {
		printf("%-12s %-10s %30d\n", this->phase, "BEGIN",
		    start_times[this->phase]);
	}
}

/*
 * Compute the elapsed time of the phase named by arg0 and store it in
 * delta[], keyed by the phase name.
 */
propolis$1:::migrate_phase_end
{
	this->phase = copyinstr(arg0);
	this->start = start_times[this->phase];
	this->end = timestamp;

	/*
	 * An unset associative array element reads as zero, so a zero start
	 * time means no matching migrate_phase_begin was observed.
	 */
	if (this->start != 0) {
		delta[this->phase] = this->end - this->start;
	} else {
		printf("WARNING: phase \"%s\" could not be measured\n",
		    this->phase);
	}

	if ($$2 == "v") {
		printf("%-12s %-10s %30d\n", this->phase, "END",
		    this->end);
	}
}

/*
 * Print the summary: the elapsed time of every phase that was measured, the
 * total across phases, and the RAM transfer counters with transfer rates.
 *
 * The `$$1 != ""` checks suppress the report when BEGIN bailed out because no
 * pid was supplied (END still fires after exit()).
 */
dtrace:::END
{
	/* Phase names, as passed to the migrate_phase_{begin,end} probes. */
	this->sync = "Sync";
	this->rpush_pre = "RamPushPrePause";
	this->pause = "Pause";
	this->rpush_post = "RamPushPostPause";
	this->td = "TimeData";
	this->dev = "DeviceState";
	this->rpull = "RamPull";
	this->sstate = "ServerState";
	this->fin = "Finish";

	/* Elapsed nanoseconds per phase; zero if the phase was not measured. */
	this->d_sync = delta[this->sync];
	this->d_rpush_pre = delta[this->rpush_pre];
	this->d_pause = delta[this->pause];
	this->d_rpush_post = delta[this->rpush_post];
	this->d_td = delta[this->td];
	this->d_dev = delta[this->dev];
	this->d_rpull = delta[this->rpull];
	this->d_sstate = delta[this->sstate];
	this->d_fin = delta[this->fin];

	this->total = 0;

	/* Print header */
	if ($$1 != "") {
		printf("\n\n");
		printf("%-15s %30s\n", "PHASE", "TIME ELAPSED (usec)");
	}

	/* Print the values of each phase, if they occurred */
	if (this->d_sync != 0) {
		printf("%-16s %29d\n", this->sync, this->d_sync / 1000);
		this->total += this->d_sync;
	}
	if (this->d_rpush_pre != 0) {
		printf("%-16s %29d\n", this->rpush_pre, this->d_rpush_pre / 1000);
		this->total += this->d_rpush_pre;
	}
	if (this->d_pause != 0) {
		printf("%-16s %29d\n", this->pause, this->d_pause / 1000);
		this->total += this->d_pause;
	}
	if (this->d_rpush_post != 0) {
		printf("%-16s %29d\n", this->rpush_post, this->d_rpush_post / 1000);
		this->total += this->d_rpush_post;
	}
	if (this->d_td != 0) {
		printf("%-15s %30d\n", this->td, this->d_td / 1000);
		this->total += this->d_td;
	}
	if (this->d_dev != 0) {
		printf("%-16s %29d\n", this->dev, this->d_dev / 1000);
		this->total += this->d_dev;
	}
	if (this->d_rpull != 0) {
		printf("%-16s %29d\n", this->rpull, this->d_rpull / 1000);
		this->total += this->d_rpull;
	}
	if (this->d_sstate != 0) {
		printf("%-16s %29d\n", this->sstate, this->d_sstate / 1000);
		this->total += this->d_sstate;
	}
	if (this->d_fin != 0) {
		printf("%-16s %29d\n", this->fin, this->d_fin / 1000);
		this->total += this->d_fin;
	}

	/* Print total elapsed time */
	if ($$1 != "") {
		printf("%-15s %30d\n", "TOTAL", this->total / 1000);
		printf("\n");
	}

	xfer_pages_total = xfer_pages[VM_PAUSED] + xfer_pages[VM_UNPAUSED];
	xfer_bytes_total = xfer_bytes[VM_PAUSED] + xfer_bytes[VM_UNPAUSED];

	/* Print summary of RAM pages transferred */
	if ($$1 != "" && xfer_pages_total != 0) {
		printf("%-25s %20d\n", "NPAGES XFERED (total)", xfer_pages_total);
		printf("%-25s %20d\n", "NBYTES XFERED (total)", xfer_bytes_total);

		printf("%-25s %20d\n",
				"NPAGES XFERED (unpaused)",
				xfer_pages[VM_UNPAUSED]);

		printf("%-25s %20d\n",
				"NBYTES XFERED (unpaused)",
				xfer_bytes[VM_UNPAUSED]);

		/*
		 * Transfer rates are computed directly in KiB: scaling the
		 * raw byte count by NS_PER_SEC would wrap a uint64_t once
		 * roughly 17 GiB had been transferred, whereas scaling the
		 * KiB count leaves headroom for roughly 17 TiB.
		 */
		if (this->d_rpush_pre != 0 && xfer_bytes[VM_UNPAUSED] != 0) {
			this->kib_per_sec =
				(((xfer_bytes[VM_UNPAUSED] / 1024) * NS_PER_SEC) /
				this->d_rpush_pre);

			printf("%-25s %20d\n", "KiB/SEC (unpaused)", this->kib_per_sec);
		}

		printf("%-25s %20d\n",
				"NPAGES XFERED (paused)",
				xfer_pages[VM_PAUSED]);

		printf("%-25s %20d\n",
				"NBYTES XFERED (paused)",
				xfer_bytes[VM_PAUSED]);

		if (this->d_rpush_post != 0 && xfer_bytes[VM_PAUSED] != 0) {
			this->kib_per_sec =
				(((xfer_bytes[VM_PAUSED] / 1024) * NS_PER_SEC) /
				this->d_rpush_post);

			printf("%-25s %20d\n", "KiB/SEC (paused)", this->kib_per_sec);
		}
	}
}


================================================
FILE: scripts/nvme-trace.d
================================================
#!/usr/sbin/dtrace -s

/*
 * nvme-trace.d     Print propolis emulated NVMe read/write latency.
 *
 * USAGE: ./nvme-trace.d -p propolis-pid
 */

#pragma D option quiet

/* Announce which process (the -p target) is being traced. */
dtrace:::BEGIN
{
    printf("Tracing propolis PID %d... Hit Ctrl-C to end.\n", $target);
}

/* State recorded when a command is enqueued, consumed when it completes. */
struct io_info {
    string op;              /* "read" or "write" */
    uint64_t ts;            /* `timestamp` at enqueue */
    uint64_t offset_bytes;  /* args[3] of the enqueue probe */
    uint64_t size_bytes;    /* args[4] of the enqueue probe */
};

/* In-flight commands, keyed by the cid the probes report. */
struct io_info io[uint64_t];

/*
 * Remember when a read or write was enqueued, along with its offset and size,
 * keyed by cid (args[2]). The operation name is derived from the probe name.
 */
propolis$target:::nvme_read_enqueue,
propolis$target:::nvme_write_enqueue
{
    this->cid = args[2];
    this->op = (probename == "nvme_read_enqueue") ? "read" : "write";
    io[this->cid].op = this->op;
    io[this->cid].ts = timestamp;
    io[this->cid].offset_bytes = args[3];
    io[this->cid].size_bytes = args[4];
}

/*
 * On completion, report the latency since the matching enqueue and add it to
 * the per-operation latency distribution. The predicate skips completions
 * for which no enqueue was recorded (for example, commands that were already
 * in flight when tracing started).
 */
propolis$target:::nvme_read_complete,
propolis$target:::nvme_write_complete
/io[args[1]].ts != 0/
{
    this->cid = args[1];
    this->elapsed = timestamp - io[this->cid].ts;
    this->elapsed_us = this->elapsed / 1000;
    @time[strjoin(io[this->cid].op, " (us)")] = quantize(this->elapsed_us);
    printf("%s(cid=%u) %d bytes from offset 0x%x in %uus\n",
           io[this->cid].op,
           this->cid,
           io[this->cid].size_bytes,
           io[this->cid].offset_bytes,
           this->elapsed_us);

    /*
     * Mark the entry as consumed so that a later completion carrying the same
     * cid without a fresh enqueue is filtered by the predicate instead of
     * being measured against this stale timestamp.
     */
    io[this->cid].ts = 0;
}

/*
 * Intentionally empty: nothing is printed explicitly here. Aggregations such
 * as @time are printed by dtrace itself when the script exits.
 */
dtrace:::END
{
}


================================================
FILE: scripts/time-adjustments.d
================================================
#!/usr/sbin/dtrace -s

/*
 * Provides visibility into adjustments made on time-related data on a
 * destination live migration host.
 *
 * Usage: ./time-adjustments.d <propolis-server PID>
 */

#pragma D option defaultargs
#pragma D option quiet

/*
 * Guest frequency, guest TSC and boot hrtime at three points: as reported
 * before propolis adjusts them (start_*), after propolis adjusts them
 * (usr_adj_*), and as observed in the kernel (krn_adj_*).
 */
uint64_t	start_gf;
uint64_t	usr_adj_gf;
uint64_t	krn_adj_gf;

uint64_t	start_gtsc;
uint64_t	usr_adj_gtsc;
uint64_t	krn_adj_gtsc;

int64_t		start_bhrt;
int64_t		usr_adj_bhrt;
int64_t		krn_adj_bhrt;

/*
 * Values reported by the migrate_time_data_after probe. The uptime variable
 * is declared under the name the clauses actually assign and print
 * (vm_uptime), so it has an explicit type instead of an implicit one.
 */
uint64_t	migrate_time;
uint64_t	vm_uptime;

/* Time spent on the adjustment in propolis and in the kernel, respectively. */
uint64_t	usr_time;
uint64_t	krn_time;


/*
 * Require a pid argument. With "defaultargs", a missing argument expands to
 * an empty string ($$1) / zero ($1) instead of failing compilation.
 */
dtrace:::BEGIN
{
	if ($$1 == "") {
		printf("ERROR: propolis-server pid required\n");
		exit(1);
	}

	printf("tracing pid %d...\n", $1);
}

/*
 * Record the time data as it stood before propolis adjusted it, and note when
 * the adjustment started.
 */
propolis$1:::migrate_time_data_before
{
	start_gf = args[0];
	start_gtsc = args[1];
	start_bhrt = args[2];
	/* Holds the start timestamp until migrate_time_data_after fires. */
	usr_time = timestamp;
}


/*
 * Record the time data after propolis adjusted it, and convert usr_time from
 * a start timestamp into the elapsed time of the adjustment.
 */
propolis$1:::migrate_time_data_after
{
	usr_adj_gf = args[0];
	usr_adj_gtsc = args[1];
	usr_adj_bhrt = args[2];

	this->start = usr_time;
	this->end = timestamp;

	usr_time = this->end - this->start;
	/* Reported in END as "UPTIME" and "Migration" duration respectively. */
	vm_uptime = args[3];
	migrate_time = args[4];
}

/*
 * Capture the VM and the time-data request being written into the kernel, and
 * cross-check the request's values against what propolis reported in
 * migrate_time_data_after. Any mismatch is printed as an error.
 */
fbt::vmm_data_write_vmm_time:entry
{
	self->vm = (struct vm *)args[0];
	self->req = (struct vdi_time_info_v1 *)args[1]->vdr_data;
	self->ts = timestamp;

	this->gf = self->req->vt_guest_freq;
	this->gtsc = self->req->vt_guest_tsc;
	this->bhrt = self->req->vt_boot_hrtime;

	if (usr_adj_gtsc != this->gtsc) {
		printf("ERROR: propolis and VMM data guest TSC differ\n");
		printf("propolis_val = %lu, bhyve_val = %lu\n",
		    usr_adj_gtsc, this->gtsc);
	}

	if (usr_adj_gf != this->gf) {
		printf("ERROR: propolis and VMM data guest freq differ\n");
		printf("propolis_val = %lu, bhyve_val = %lu\n",
		    usr_adj_gf, this->gf);
	}

	if (usr_adj_bhrt != this->bhrt) {
		printf("ERROR: propolis and VMM data boot_hrtime differ\n");
		printf("propolis_val = %ld, bhyve_val = %ld\n",
		    usr_adj_bhrt, (int64_t)this->bhrt);
	}
}

/*
 * On return (for threads that passed through the entry clause), record the
 * values the kernel ended up with and how long the call took.
 */
fbt::vmm_data_write_vmm_time:return
/ self->vm /
{
	/* args[1] is the function's return value; zero is treated as success. */
	if (args[1] == 0) {
		krn_adj_bhrt = self->vm->boot_hrtime;
		krn_adj_gf = self->vm->guest_freq;
	} else {
		print(args[1]);
	}

	krn_time = timestamp - self->ts;

	/* Release the thread-local state set up by the entry clause. */
	self->vm = 0;
	self->req = 0;
	self->ts = 0;

	printf("time data imported; press CTRL+C for summary\n");
}

/*
 * Only while this thread is inside vmm_data_write_vmm_time (self->vm is set),
 * record args[1] as the kernel's guest TSC value.
 * NOTE(review): assumes the second argument of calc_tsc_offset is the guest
 * TSC -- confirm against the kernel source.
 */
fbt::calc_tsc_offset:entry
/ self->vm /
{
	krn_adj_gtsc = args[1];
}

/*
 * Print the summary table: one row per stage (migration, propolis, kernel)
 * with its duration and resulting guest TSC / boot hrtime, and "[adjustment]"
 * rows showing the difference between consecutive stages.
 */
dtrace:::END
{
	this->total = migrate_time + usr_time + krn_time;
	this->usr_d_tsc = usr_adj_gtsc - start_gtsc;
	this->usr_d_bhrt = usr_adj_bhrt - start_bhrt;
	this->krn_d_tsc = krn_adj_gtsc - usr_adj_gtsc;
	this->krn_d_bhrt = krn_adj_bhrt - usr_adj_bhrt;

	/* Durations are stored in nanoseconds and printed in microseconds. */
	printf("%15s %20s %20s\n",
	    "GUEST FREQ (Hz)", "UPTIME (usec)", "TOTAL TIME (usec)");
	printf("%15lu %20lu %20lu\n", start_gf,vm_uptime / 1000,
	    this->total / 1000);
	printf("\n");

	printf("%-10s %20s %20s %20s\n",
	    "EVENT", "DURATION (usec)", "GUEST TSC", "BOOT HRTIME");
	printf("%-10s %20lu %20lu %20ld\n", "Migration", migrate_time / 1000,
	    start_gtsc, start_bhrt);
	printf("%-30s  %20lu %20ld\n", "[adjustment]",
	    this->usr_d_tsc, this->usr_d_bhrt);
	printf("%-10s %20lu %20lu %20ld\n", "Propolis",
	    usr_time / 1000, usr_adj_gtsc, usr_adj_bhrt);
	printf("%-30s  %20lu %20ld\n", "[adjustment]",
	    this->krn_d_tsc, this->krn_d_bhrt);
	printf("%-10s %20d %20d %20d\n", "Kernel",
	    krn_time / 1000, krn_adj_gtsc, krn_adj_bhrt);

}


================================================
FILE: scripts/viona.d
================================================
#!/usr/sbin/dtrace -qCs

#pragma D option quiet

/*
 * Dtrace ioctl calls into the viona kernel module as they occur, and show
 * in-kernel ring notifications.
 *
 * This script needs the `viona` module to be loaded. If you want to run it
 * before starting a VM utter: `modload /usr/kernel/drv/amd64/viona`
 */

/*
 * viona ioctl command numbers, matched against arg1 in the viona_ioctl
 * clauses below. These are expanded by the C preprocessor (the -C flag).
 * NOTE(review): presumably these mirror the viona driver's ioctl header;
 * confirm they are still in sync when the driver changes.
 */
#define	VNA_IOC	(('V' << 16)|('C' << 8))
#define	VNA_IOC_CREATE			(VNA_IOC | 0x01)
#define	VNA_IOC_DELETE			(VNA_IOC | 0x02)
#define	VNA_IOC_VERSION			(VNA_IOC | 0x03)
#define	VNA_IOC_DEFAULT_PARAMS		(VNA_IOC | 0x04)

/* Per-ring operations. */
#define	VNA_IOC_RING_INIT		(VNA_IOC | 0x10)
#define	VNA_IOC_RING_RESET		(VNA_IOC | 0x11)
#define	VNA_IOC_RING_KICK		(VNA_IOC | 0x12)
#define	VNA_IOC_RING_SET_MSI		(VNA_IOC | 0x13)
#define	VNA_IOC_RING_INTR_CLR		(VNA_IOC | 0x14)
#define	VNA_IOC_RING_SET_STATE		(VNA_IOC | 0x15)
#define	VNA_IOC_RING_GET_STATE		(VNA_IOC | 0x16)
#define	VNA_IOC_RING_PAUSE		(VNA_IOC | 0x17)
#define	VNA_IOC_RING_INIT_MODERN	(VNA_IOC | 0x18)

/* Device-wide settings and interrupt polling. */
#define	VNA_IOC_INTR_POLL		(VNA_IOC | 0x20)
#define	VNA_IOC_SET_FEATURES		(VNA_IOC | 0x21)
#define	VNA_IOC_GET_FEATURES		(VNA_IOC | 0x22)
#define	VNA_IOC_SET_NOTIFY_IOP		(VNA_IOC | 0x23)
#define	VNA_IOC_SET_PROMISC		(VNA_IOC | 0x24)
#define	VNA_IOC_GET_PARAMS		(VNA_IOC | 0x25)
#define	VNA_IOC_SET_PARAMS		(VNA_IOC | 0x26)
#define	VNA_IOC_GET_MTU			(VNA_IOC | 0x27)
#define	VNA_IOC_SET_MTU			(VNA_IOC | 0x28)
#define	VNA_IOC_SET_NOTIFY_MMIO		(VNA_IOC | 0x29)
#define	VNA_IOC_INTR_POLL_MQ		(VNA_IOC | 0x2a)

/* Queue-pair configuration. */
#define	VNA_IOC_GET_PAIRS		(VNA_IOC | 0x30)
#define	VNA_IOC_SET_PAIRS		(VNA_IOC | 0x31)
#define	VNA_IOC_GET_USEPAIRS		(VNA_IOC | 0x32)
#define	VNA_IOC_SET_USEPAIRS		(VNA_IOC | 0x33)

/* Announce that tracing has started. */
BEGIN {
	printf("Tracing...\n");
}

/* USDT probe in any propolis process: a virtio device was reset. */
propolis*:::virtio_state_reset {
	printf("--- DEVICE RESET ---\n");
}

/* USDT probe in any propolis process: device status set to arg0. */
propolis*:::virtio_set_status {
	printf("--- DEVICE STATUS SET: %02x ---\n", arg0);
}

/*
 * Common entry clause for every viona ioctl: remember the data argument
 * (arg2) and args[5] (dereferenced as the result value in return clauses) so
 * that the return clauses can read what the ioctl wrote back.
 */
viona_ioctl:entry {
	self->dptr = arg2;
	self->rvp = args[5];
}

/* CREATE: copy in the request structure and print the link id. */
viona_ioctl:entry/arg1 == VNA_IOC_CREATE/ {
	self->cmd = "CREATE";
	create = (vioc_create_t *)copyin(arg2, sizeof (vioc_create_t));
	printf("%s (link 0x%x)\n", self->cmd, create->c_linkid);
}

viona_ioctl:entry/arg1 == VNA_IOC_DELETE/ {
	self->cmd = "DELETE";
	printf("%s\n", self->cmd);
}

/* VERSION: the result is only known on return, so mark it as pending. */
viona_ioctl:entry/arg1 == VNA_IOC_VERSION/ {
	self->cmd = "VERSION";
	self->pending = arg1;
}

/*
 * VERSION result: print the value the ioctl stored through its result
 * pointer. The pending marker is cleared first, as in the other pending
 * return clauses; otherwise this clause would fire again on every later
 * viona ioctl return on the same thread.
 */
viona_ioctl:return/self->pending == VNA_IOC_VERSION/ {
	self->pending = 0;
	printf("VERSION 0x%x\n", *self->rvp);
}

viona_ioctl:entry/arg1 == VNA_IOC_DEFAULT_PARAMS/ {
	self->cmd = "DEFAULT_PARAMS";
	printf("%s\n", self->cmd);
}

/* RING_INIT: print the ring index, queue address and queue size. */
viona_ioctl:entry/arg1 == VNA_IOC_RING_INIT/ {
	self->cmd = "RING_INIT";
	init = (vioc_ring_init_t *)copyin(arg2, sizeof (vioc_ring_init_t));
	printf("%s 0x%x %x %x\n", self->cmd,
	    init->ri_index, init->ri_qaddr, init->ri_qsize);
}

/* RING_INIT_MODERN: as above, with separate desc/avail/used addresses. */
viona_ioctl:entry/arg1 == VNA_IOC_RING_INIT_MODERN/ {
	self->cmd = "RING_INIT_MODERN";
	initm = (vioc_ring_init_modern_t *)copyin(arg2,
	    sizeof (vioc_ring_init_modern_t));
	printf("%s 0x%x %x/%x/%x %x\n", self->cmd,
	    initm->rim_index, initm->rim_qaddr_desc, initm->rim_qaddr_avail,
	    initm->rim_qaddr_used, initm->rim_qsize);
}

/* For the next two commands arg2 is printed directly rather than copied in. */
viona_ioctl:entry/arg1 == VNA_IOC_RING_RESET/ {
	self->cmd = "RING_RESET";
	printf("%s 0x%x\n", self->cmd, arg2);
}

viona_ioctl:entry/arg1 == VNA_IOC_RING_KICK/ {
	self->cmd = "RING_KICK";
	printf("%s 0x%x\n", self->cmd, arg2);
}

/* RING_SET_MSI: print the ring index and the MSI address/message. */
viona_ioctl:entry/arg1 == VNA_IOC_RING_SET_MSI/ {
	self->cmd = "RING_SET_MSI";
	msi = (vioc_ring_msi_t *)copyin(arg2, sizeof (vioc_ring_msi_t));
	printf("%s 0x%x addr=%x msg=%x\n", self->cmd,
	    msi->rm_index, msi->rm_addr, msi->rm_msg);
}

viona_ioctl:entry/arg1 == VNA_IOC_RING_INTR_CLR/ {
	self->cmd = "RING_INTR_CLR";
	printf("%s 0x%x\n", self->cmd, arg2);
}

viona_ioctl:entry/arg1 == VNA_IOC_RING_SET_STATE/ {
	self->cmd = "RING_SET_STATE";
	printf("%s\n", self->cmd);
}

/* RING_GET_STATE: the state is filled in by the ioctl, so print on return. */
viona_ioctl:entry/arg1 == VNA_IOC_RING_GET_STATE/ {
	self->cmd = "RING_GET_STATE";
	self->pending = arg1;
}

viona_ioctl:return/self->pending == VNA_IOC_RING_GET_STATE/ {
	self->pending = 0;
	statep = (vioc_ring_state_t *)copyin(self->dptr,
	    sizeof (vioc_ring_state_t));
	printf("GET_STATE %x %x/%x/%x AV %x USED %x\n",
	    statep->vrs_index, statep->vrs_qaddr_desc, statep->vrs_qaddr_avail,
	    statep->vrs_qaddr_used, statep->vrs_avail_idx,
	    statep->vrs_used_idx);
}

viona_ioctl:entry/arg1 == VNA_IOC_RING_PAUSE/ {
	self->cmd = "RING_PAUSE";
	printf("%s 0x%x\n", self->cmd, arg2);
}

/* INTR_POLL_MQ: the poll result is filled in by the ioctl; print on return. */
viona_ioctl:entry/arg1 == VNA_IOC_INTR_POLL_MQ/ {
	self->cmd = "INTR_POLL_MQ";
	self->pending = arg1;
}

/* Prints the ioctl's result value and the first word of the status array. */
viona_ioctl:return/self->pending == VNA_IOC_INTR_POLL_MQ/ {
	self->pending = 0;
	intrp = (vioc_intr_poll_mq_t *)copyin(self->dptr,
	    sizeof (vioc_intr_poll_mq_t));
	printf("INTR_POLL %x set (0x%08x)\n", *self->rvp,
	    intrp->vipm_status[0]);
}

/* SET_FEATURES: arg2 points at a 64-bit feature mask in user memory. */
viona_ioctl:entry/arg1 == VNA_IOC_SET_FEATURES/ {
	self->cmd = "SET_FEATURES";
	featp = (uint64_t *)copyin(arg2, sizeof (uint64_t));
	printf("%s 0x%x\n", self->cmd, *featp);
}

/* GET_FEATURES: the mask is written by the ioctl, so print it on return. */
viona_ioctl:entry/arg1 == VNA_IOC_GET_FEATURES/ {
	self->cmd = "GET_FEATURES";
	self->pending = arg1;
}

viona_ioctl:return/self->pending == VNA_IOC_GET_FEATURES/ {
	self->pending = 0;
	gfeatp = (uint64_t *)copyin(self->dptr, sizeof (uint64_t));
	printf("GET_FEATURES 0x%x\n", *gfeatp);
}

viona_ioctl:entry/arg1 == VNA_IOC_SET_NOTIFY_IOP/ {
	self->cmd = "SET_NOTIFY_IOP";
	printf("%s 0x%x\n", self->cmd, arg2);
}

viona_ioctl:entry/arg1 == VNA_IOC_SET_PROMISC/ {
	self->cmd = "SET_PROMISC";
	printf("%s\n", self->cmd);
}

viona_ioctl:entry/arg1 == VNA_IOC_GET_PARAMS/ {
	self->cmd = "GET_PARAMS";
	printf("%s\n", self->cmd);
}

viona_ioctl:entry/arg1 == VNA_IOC_SET_PARAMS/ {
	self->cmd = "SET_PARAMS";
	printf("%s\n", self->cmd);
}

viona_ioctl:entry/arg1 == VNA_IOC_GET_MTU/ {
	self->cmd = "GET_MTU";
	printf("%s\n", self->cmd);
}

viona_ioctl:entry/arg1 == VNA_IOC_SET_MTU/ {
	self->cmd = "SET_MTU";
	printf("%s %u\n", self->cmd, arg2);
}

/*
 * SET_NOTIFY_MMIO is split into two clauses: a null data pointer is reported
 * as "<NULL>", otherwise the structure is copied in and its address and size
 * are printed.
 */
viona_ioctl:entry/arg1 == VNA_IOC_SET_NOTIFY_MMIO && !arg2/ {
	self->cmd = "SET_NOTIFY_MMIO <NULL>";
	printf("%s\n", self->cmd);
}

viona_ioctl:entry/arg1 == VNA_IOC_SET_NOTIFY_MMIO && arg2/ {
	self->cmd = "SET_NOTIFY_MMIO";
	mmio = (vioc_notify_mmio_t *)copyin(arg2, sizeof (vioc_notify_mmio_t));
	printf("%s %x+%x\n", self->cmd, mmio->vim_address, mmio->vim_size);
}

viona_ioctl:entry/arg1 == VNA_IOC_GET_PAIRS/ {
	self->cmd = "GET_PAIRS";
	printf("%s\n", self->cmd);
}

viona_ioctl:entry/arg1 == VNA_IOC_SET_PAIRS/ {
	self->cmd = "SET_PAIRS";
	printf("%s 0x%x\n", self->cmd, arg2);
}

viona_ioctl:entry/arg1 == VNA_IOC_GET_USEPAIRS/ {
	self->cmd = "GET_USEPAIRS";
	printf("%s\n", self->cmd);
}

viona_ioctl:entry/arg1 == VNA_IOC_SET_USEPAIRS/ {
	self->cmd = "SET_USEPAIRS";
	printf("%s 0x%x\n", self->cmd, arg2);
}

/*
 * In-kernel ring notifications. Reads print the address (arg2) and size
 * (arg3); writes also print the value pointed to by args[4].
 *
 * NOTE(review): the sense of arg1 is opposite in the two pairs below -- a
 * non-zero arg1 is reported as a read for IOP but as a write for MMIO.
 * Confirm this against the viona_notify_iop/viona_notify_mmio signatures.
 */
viona_notify_iop:entry/arg1/ {
	printf("IOP NOTIFY read %x+%x\n", arg2, arg3);
}

viona_notify_iop:entry/!arg1/ {
	printf("IOP NOTIFY write %x+%x = %x\n", arg2, arg3, *args[4]);
}

viona_notify_mmio:entry/!arg1/ {
	printf("MMIO NOTIFY read %x+%x\n", arg2, arg3);
}

viona_notify_mmio:entry/arg1/ {
	printf("MMIO NOTIFY write %x+%x = %x\n", arg2, arg3, *args[4]);
}

/*
 * Report ioctls that return a non-zero value (arg1 of an fbt return probe is
 * the function's return value), naming the command recorded at entry.
 * NOTE(review): self->cmd is only cleared on this non-zero-return path.
 */
viona_ioctl:return/self->cmd != 0 && arg1 != 0/ {
	printf("ioctl(%s) returning 0x%x\n", self->cmd, arg1);
	self->cmd = 0;
}


================================================
FILE: scripts/vm-exit-codes.d
================================================
#pragma D option quiet

/*
 * Report reasons why a VM's vCPU has exited, and exits out to the VMM.
 *
 * A vCPU's exits are often handled entirely in bhyve, with the VMM unaware, and
 * an exit to VMM may not be the direct consequence of a VM exit. An exit to VMM
 * likely will only occur after at least one VMEXIT, though, when other
 * conditions are re-evaluated before resuming the vCPU. This script measures
 * and reports both SVM VMEXIT and exits to VMM because while separate they are
 * likely related in some way.
 */

/*
 * From AMD APM Volume 2 Appendix C: "SVM Intercept Exit Codes"
 *
 * Subset of exit codes this script is particularly interested in.
 */
/*
 * Only VMEXIT_IOIO and VMEXIT_NPF are referenced by the clauses in this
 * script; the other values are listed for reference when reading the
 * per-exit-code counts.
 */
enum svm_exitcode {
	VMEXIT_INTR = 0x60,
	VMEXIT_VINTR = 0x64,
	VMEXIT_CPUID = 0x72,
	VMEXIT_IOIO = 0x7b,
	VMEXIT_NPF = 0x400
};

/*
 * Exit codes that a VMM may receive after running a vCPU, taken from
 * illumos' `intel/sys/vmm.h`.
 */
/*
 * No explicit values are given, so the enumerators number consecutively from
 * zero: the order here (including the deprecated placeholders) must match
 * the header exactly. Only VM_EXITCODE_BOGUS is referenced by this script.
 */
enum vm_exitcode {
	VM_EXITCODE_INOUT,
	VM_EXITCODE_VMX,
	VM_EXITCODE_BOGUS,
	VM_EXITCODE_RDMSR,
	VM_EXITCODE_WRMSR,
	VM_EXITCODE_HLT,
	VM_EXITCODE_MTRAP,
	VM_EXITCODE_PAUSE,
	VM_EXITCODE_PAGING,
	VM_EXITCODE_INST_EMUL,
	VM_EXITCODE_RUN_STATE,
	VM_EXITCODE_MMIO_EMUL,
	VM_EXITCODE_DEPRECATED,	/* formerly RUNBLOCK */
	VM_EXITCODE_IOAPIC_EOI,
	VM_EXITCODE_SUSPENDED,
	VM_EXITCODE_MMIO,
	VM_EXITCODE_TASK_SWITCH,
	VM_EXITCODE_MONITOR,
	VM_EXITCODE_MWAIT,
	VM_EXITCODE_SVM,
	VM_EXITCODE_DEPRECATED2, /* formerly REQIDLE */
	VM_EXITCODE_DEBUG,
	VM_EXITCODE_VMINSN,
	VM_EXITCODE_BPT,
	VM_EXITCODE_HT,
	VM_EXITCODE_MAX
};

/* Counter of vm_run exits this script could not attribute to a known cause. */
BEGIN {
	misunderstood_exits = 0;
}

/* Set up per-thread state for one pass through vm_run. */
fbt::vm_run:entry {
	/*
	 * Some functions of interest here are only interesting when called
	 * under vm_run, but may be called elsewhere as well. Keep track of
	 * if we're in vm_run to scope other probes correspondingly.
	 */
	self->in_vm_run = 1;
	/*
	 * Assuming we'll exit vm_run at some point, presume we don't know
	 * why that exit occurred. We'll flip this to true in cases the script
	 * knows about. Any exits that are not understood are a sign the script
	 * is stale, the kernel has changed, or both.
	 */
	self->exit_understood = 0;
	self->next_exit_reason = "unknown";
}

/* Reset the cached vCPU pointer whenever svm_launch returns. */
fbt::svm_launch:return {
	self->vcpu = (struct svm_vcpu *)NULL;
}

/*
 * Count every SVM #VMEXIT by exit code, and break IOIO and nested page fault
 * exits down further using the exit information in the VMCB control area.
 */
fbt::svm_vmexit:entry {
	/* arg0 is the svm_softc, arg1 the vCPU index, arg2 the vm_exit. */
	self->vcpu = &((struct svm_softc*)arg0)->vcpu[arg1];
	self->ctrl = self->vcpu->vmcb.ctrl;
	self->state = self->vcpu->vmcb.state;
	self->vmexit = (struct vm_exit*)arg2;

	@exits[self->ctrl.exitcode] = count();

	if (self->ctrl.exitcode == VMEXIT_IOIO) {
		/*
		 * EXITINFO1 layout used here: bit 0 is the direction
		 * (0 = out, 1 = in), bits 4-6 are a one-hot operand size,
		 * bits 7-9 a one-hot address size, and bits 16 and up the
		 * port number.
		 */
		this->opsz = (self->ctrl.exitinfo1 >> 4) & 7;
		this->addrsz = (self->ctrl.exitinfo1 >> 7) & 7;
		@io_info[
		  self->ctrl.exitinfo1 >> 16,
		  /*
		   * The parentheses matter: `==` binds tighter than `&`, so
		   * without them this would test `exitinfo1 & (1 == 0)`,
		   * which is always zero, and every access would be
		   * reported as "in".
		   */
		  (self->ctrl.exitinfo1 & 1) == 0 ? "out" : "in",
		  this->opsz == 1 ? "8b" :
		    this->opsz == 2 ? "16b" :
		      this->opsz == 4 ? "32b" : "bogus",
		  this->addrsz == 1 ? "16b" :
		    this->addrsz == 2 ? "32b" :
		      this->addrsz == 4 ? "64b" : "bogus"
		] = count();
	}

	if (self->ctrl.exitcode == VMEXIT_NPF) {
		/* Keyed by faulting guest PA (exitinfo2), guest RIP, flags. */
		@npf_info[
			self->ctrl.exitinfo2,
			self->state.rip,
			/*
			 * Instruction/Data access
			 */
			(self->ctrl.exitinfo1 >> 4) & 1 ? "I" : "D",
			/*
			 * Processor read 1 in a PTE's reserved bits
			 */
			(self->ctrl.exitinfo1 >> 3) & 1 ? "R" : "-",
			/*
			 * User/Supervisor (CPL=3 or not 3)
			 */
			(self->ctrl.exitinfo1 >> 2) & 1 ? "U" : "S",
			/*
			 * Access is write or read
			 */
			(self->ctrl.exitinfo1 >> 1) & 1 ? "W" : "R",
			/*
			 * Page is present or not
			 */
			(self->ctrl.exitinfo1 >> 0) & 1 ? "P" : "-"
		] = count();
	}
}

/*
 * The next two clauses only apply under vm_run. A non-zero return value
 * (arg1) from either function is recorded as the reason for the upcoming
 * exit from vm_run.
 */
fbt::vcpu_entry_bailout_checks:return / self->in_vm_run == 1 /{
	if (arg1 != 0) {
		self->exit_understood = 1;
		self->next_exit_reason = "early_bailout";
	}
}

fbt::vcpu_run_state_pending:return / self->in_vm_run == 1 /{
	if (arg1 != 0) {
		self->exit_understood = 1;
		self->next_exit_reason = "run_state_pending";
	}
}

/*
 * Leaving vm_run: count exits with no recorded reason, and for
 * VM_EXITCODE_BOGUS exits tally the reason that was recorded. Requires that
 * svm_vmexit has stored a vm_exit pointer on this thread.
 */
fbt::vm_run:return / self->in_vm_run == 1 && self->vmexit != NULL / {
	self->in_vm_run = 0;

	if (!self->exit_understood) {
		misunderstood_exits += 1;
	}
	if (self->vmexit->exitcode == VM_EXITCODE_BOGUS) {
		@bogus_reasons[self->next_exit_reason] = count();
	}
}

/*
 * Once per second, print every aggregation collected by the clauses above
 * along with the count of unexplained exits, then reset for the next
 * interval.
 */
tick-1s {
	printf("=== Exit summary, one second ending at %Y ===\n",
	    walltimestamp);
	printf("  %8s  %s\n", "SVM code", "Count");
	printa("  %8x  %@8u\n", @exits);

	printf("IOIO SVM exits:\n");
	printf("  %4s  %3s  %4s  %4s  %5s\n",
	    "Port", "Op", "OpSz", "Addr", "Count");
	printa("  %4x  %3s  %4s  %4s  %@5d\n", @io_info);

	printf("NPF SVM exits:\n");
	printf("  %-16s  %-16s  %5s  %s\n",
	    "Guest PA", "Guest RIP", "#PF flags", "Count");
	/* The five %s conversions are the one-letter I/D, R, U/S, W/R, P keys. */
	printa("  %16x  %16x  %5s%s%s%s%s  %@8u\n", @npf_info);

	printf("vm_run() VM_EXITCODE_BOGUS reasons:\n");
	printa("  %20s %@8u\n", @bogus_reasons);

	if (misunderstood_exits > 0) {
		printf("Exits this script did not understand: %d\n",
		    misunderstood_exits);
		misunderstood_exits = 0;
	}

	printf("\n");

	/*
	 * Clear all accumulated data, but keep the most common keys to churn a
	 * little less if there is relatively little activity and a key flips
	 * from zero to non-zero counts regularly.
	 */
	trunc(@exits, 10);
	clear(@exits);
	trunc(@io_info, 10);
	clear(@io_info);
	trunc(@npf_info, 10);
	clear(@npf_info);
	trunc(@bogus_reasons, 10);
	clear(@bogus_reasons);
}


================================================
FILE: tools/check_headers
================================================
#!/bin/bash
set -e

# The ref that headers should be checked against.
#
# This should almost certainly be `stlouis`, as in the "stlouis" branch in the
# Oxide fork of illumos. You may want to change this for testing or development
# if your changes to Propolis track changes in the OS as well.
#
# As a default this ref should probably not change.
HEADER_CHECK_REF="stlouis"

# Directories with `ctest2`-based `header-check` crates. This list should track
# the similar exclusions in `Cargo.toml`, and are described more there.
HEADER_CHECK_DIRS=(
  crates/bhyve-api/header-check/
  crates/nvpair/header-check/
  crates/viona-api/header-check/
)

# Print a usage summary for both subcommands to stdout.
function usage() {
  # Strip any leading directories so the script is shown by its bare name.
  SCRIPTNAME="${0##*/}"
  cat <<EOF
usage:
  $SCRIPTNAME run <gate_src_dir>
      run Propolis header-check tests against the provided gate checkout
  $SCRIPTNAME gate_ref
      print the illumos-gate ref headers should be checked against
EOF
}

# Run `cargo test` in every header-check crate against a gate checkout.
#
# Args:
#  $1: Path to the illumos-gate source checkout
function run_checks() {
  # Quote the argument so paths containing spaces or glob characters reach
  # readlink intact. `readlink -e` prints nothing if the path does not exist,
  # which leaves GATE_SRC empty and is caught by the directory check below.
  export GATE_SRC="$(readlink -e "$1")"

  if ! [ -d "$GATE_SRC" ]; then
    # Report the path as the caller gave it: the resolved value is empty
    # whenever the path does not exist, which would make the message useless.
    echo "header-check was given non-existent \"$1\" as gate directory"
    exit 1
  fi

  for checkdir in "${HEADER_CHECK_DIRS[@]}"; do
    echo "RUNNING HEADER-CHECK FOR $checkdir"
    # Subshell so the directory change does not leak into later iterations.
    (cd "$checkdir"; GATE_SRC="$GATE_SRC" cargo test)
  done
}

# Subcommand dispatch: the first argument selects the operation.
OP="$1"

# No subcommand at all: show usage and fail.
[ -n "$1" ] || { usage; exit 1; }

case "$OP" in
  gate_ref)
    echo "$HEADER_CHECK_REF"
    ;;
  run)
    GATE_DIR="$2"
    run_checks "$GATE_DIR"
    ;;
  *)
    # Anything else is unrecognized.
    usage
    exit 1
    ;;
esac


================================================
FILE: tools/install_builder_prerequisites.sh
================================================
#!/usr/bin/env bash

# This file is adapted from Omicron's install_builder_prerequisites.sh.

set -eu

MARKER=/etc/opt/oxide/NO_INSTALL
if [[ -f "$MARKER" ]]; then
  echo "This system has the marker file $MARKER, aborting." >&2
  exit 1
fi

# Set the CWD to the root of the Propolis source tree (the parent of this
# script's directory).
SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
cd "${SOURCE_DIR}/.."

# Handler for the ERR trap installed below: tells the user that the script can
# safely be re-run after the underlying problem is fixed.
function on_exit
{
  echo "Something went wrong, but this script is idempotent - If you can fix the issue, try re-running"
}

trap on_exit ERR

# Print the supported command-line options and exit with a failure status.
# Invoked by the option parser for any unrecognized flag.
function usage
{
  echo "Usage: ./install_builder_prerequisites.sh <OPTIONS>"
  echo "  Options: "
  echo "   -y: Assume 'yes' instead of showing confirmation prompts"
  echo "   -p: Skip checking paths"
  # -s is accepted by the getopts loop but was previously undocumented.
  echo "   -s: Omit 'sudo' when installing packages (Linux only)"
  echo "   -r: Number of retries to perform for network operations (default: 3)"
  exit 1
}

# Option defaults, overridden by the flags parsed below.
ASSUME_YES="false"
SKIP_PATH_CHECK="false"
OMIT_SUDO="false"
RETRY_ATTEMPTS=3
# -y, -p and -s are plain flags; -r takes an argument (the trailing colon).
while getopts ypsr: flag
do
  case "${flag}" in
    y) ASSUME_YES="true" ;;
    p) SKIP_PATH_CHECK="true" ;;
    s) OMIT_SUDO="true" ;;
    r) RETRY_ATTEMPTS=${OPTARG} ;;
    # Any unrecognized flag prints usage, which exits.
    *) usage
  esac
done

# Asks the user to confirm an action, unless `-y` was passed, in which case
# the answer is taken to be yes.
#
# Args:
#  $1: Text to be displayed
#
# Returns success only when the answer is a single "y" or "Y".
function confirm
{
  if [[ "${ASSUME_YES}" != "true" ]]; then
    read -r -p "$1 (y/n): " response
  else
    response=y
  fi
  [[ "$response" == [yY] ]]
}

# Runs the given command (all arguments) until it succeeds, giving up after
# ${RETRY_ATTEMPTS} tries. Returns as soon as the command succeeds; if every
# attempt fails, exits the script with the status of the last attempt.
function retry
{
  attempts="${RETRY_ATTEMPTS}"
  # Clamp so that the command always runs at least once.
  attempts=$((attempts < 1 ? 1 : attempts))
  for i in $(seq 1 $attempts); do
    if "$@"; then
      retry_rc=0
      return
    else
      # In the else branch, $? still holds the failed command's status.
      retry_rc=$?
    fi

    # No "will try again" message after the final attempt.
    [[ $i -eq $attempts ]] ||
      echo "Failed to run command -- will try $((attempts - i)) more times"
  done

  exit $retry_rc
}

# Runs an xtask with the given arguments: through the binary named by
# XTASK_BIN when that variable is set (even to an empty value), and through
# `cargo xtask` otherwise.
function xtask
{
  # ${XTASK_BIN+x} expands to "x" only if XTASK_BIN is set.
  if [[ -n "${XTASK_BIN+x}" ]]; then
    "$XTASK_BIN" "$@"
  else
    cargo xtask "$@"
  fi
}

HOST_OS=$(uname -s)

# Installs the build prerequisites for the current host OS (HOST_OS).
# Supports Linux (apt-get) and illumos (pkg); macOS needs nothing; any other
# OS is rejected with a failure exit.
function install_packages {
  if [[ "${HOST_OS}" == "Linux" ]]; then
    # TODO: configure a Nix flake for folks who prefer that

    packages=(
      'build-essential'
      'libclang-dev'
      'pkg-config'
    )
    # `-s` lets callers that already run as root skip sudo.
    if [[ "${OMIT_SUDO}" == "false" ]]; then
      maybe_sudo="sudo"
    else
      maybe_sudo=""
    fi
    $maybe_sudo apt-get update
    # With `-y`, pass it through to apt-get as well so it does not prompt.
    if [[ "${ASSUME_YES}" == "true" ]]; then
        $maybe_sudo apt-get install -y "${packages[@]}"
    else
        confirm "Install (or update) [${packages[*]}]?" && $maybe_sudo apt-get install "${packages[@]}"
    fi
  elif [[ "${HOST_OS}" == "SunOS" ]]; then
    CLANGVER=15
    # NOTE(review): PGVER is not referenced anywhere else in this script.
    PGVER=13
    packages=(
      "pkg:/package/pkg"
      "build-essential"
      "pkg-config"
      # "bindgen leverages libclang to preprocess, parse, and type check C and C++ header files."
      "pkg:/ooce/developer/clang-$CLANGVER"
    )

    # Install/update the set of packages.
    # Explicitly manage the return code using "rc" to observe the result of this
    # command without exiting the script entirely (due to bash's "errexit").
    rc=0
    confirm "Install (or update) [${packages[*]}]?" && { pfexec pkg install -v "${packages[@]}" || rc=$?; }
    # Return codes:
    #  0: Normal Success
    #  4: Failure because we're already up-to-date. Also acceptable.
    if ((rc != 4 && rc != 0)); then
      exit "$rc"
    fi

    # Point the clang and llvm mediators at the version installed above.
    confirm "Set mediators?" && {
      pfexec pkg set-mediator -V $CLANGVER clang llvm
    }

    # Show the resulting package state for the record.
    pkg mediator -a
    pkg publisher
    pkg list -afv "${packages[@]}"
  elif [[ "${HOST_OS}" == "Darwin" ]]; then
    # clang is expected to be installed via the Xcode Command Line Tools.
    echo "Nothing to do on macOS"
  else
    echo "Unsupported OS: ${HOST_OS}"
    exit 1
  fi
}

retry install_packages

# Validate the PATH:
expected_in_path=(
  'pkg-config'
)

# Prints a hint about where a missing command is usually installed, if one is
# known for this platform; prints nothing otherwise.
#
# Args:
#  $1: Name of the command that was not found in PATH
function show_hint
{
  if [[ "$1" == "pkg-config" && "${HOST_OS}" == "SunOS" ]]; then
    echo "On illumos, $1 is typically found in '/usr/bin'"
  fi
}

# Check all paths before returning an error, unless we were told not to.
# `-p` skips the PATH validation entirely.
if [[ "$SKIP_PATH_CHECK" == "true" ]]; then
  echo "All prerequisites installed successfully"
  exit 0
fi

# Report every missing command, not just the first, before failing.
ANY_PATH_ERROR="false"
for command in "${expected_in_path[@]}"; do
  # Capture the status via "rc" so a miss does not trip errexit.
  rc=0
  which "$command" &> /dev/null || rc=$?
  if [[ "$rc" -ne 0 ]]; then
    echo "ERROR: $command seems installed, but was not found in PATH. Please add it."
    show_hint "$command"
    ANY_PATH_ERROR="true"
  fi
done

if [[ "$ANY_PATH_ERROR" == "true" ]]; then
  exit 1
fi

echo "All builder prerequisites installed successfully, and PATH looks valid"


================================================
FILE: xtask/Cargo.toml
================================================
[package]
name = "xtask"
version = "0.0.0"
edition = "2021"

[[bin]]
name = "xtask"
test = false
doctest = false

[dependencies]
anyhow.workspace = true
camino.workspace = true
cargo_metadata.workspace = true
clap = { workspace = true, features = ["derive"] }
escargot.workspace = true
serde = { workspace = true, features = ["derive"] }
owo-colors = { workspace = true, features = ["supports-colors"] }
serde_json = { workspace = true }
serde_yaml = { version = "0.9" }
glob = { version = "0.3.1" }
globset = { version = "0.4.11" }

================================================
FILE: xtask/src/external.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

//! External xtasks. (extasks?)

use std::ffi::OsString;
use std::os::unix::process::CommandExt;
use std::process::Command;

use anyhow::{Context, Result};
use clap::Parser;

/// Argument parser for external xtasks.
///
/// In general we want all developer tasks to be discoverable simply by running
/// `cargo xtask`, but some development tools end up with a particularly
/// large dependency tree. It's not ideal to have to pay the cost of building
/// our release engineering tooling if all the user wants to do is check for
/// workspace dependency issues.
///
/// `External` provides a pattern for creating xtasks that live in other crates.
/// An external xtask is defined on `crate::Cmds` as a tuple variant containing
/// `External`, which captures all arguments and options (even `--help`) as
/// a `Vec<OsString>`. The main function then calls `External::exec` with the
/// appropriate bin target name and any additional Cargo arguments.
#[derive(Parser)]
#[clap(
    disable_help_flag(true),
    disable_help_subcommand(true),
    disable_version_flag(true)
)]
pub struct External {
    // Every remaining command-line token, hyphenated or not, is captured here
    // verbatim so that it can be forwarded to the external binary.
    #[clap(trailing_var_arg(true), allow_hyphen_values(true))]
    args: Vec<OsString>,

    // This stores an in-progress Command builder. `exec_common` appends the
    // `cargo run` arguments and the captured `args` to it, then execs it.
    // Clap does not treat this as a command-line argument (`skip`), but fills
    // in this field by calling `new_command`.
    #[clap(skip = new_command())]
    command: Command,
}

impl External {
    pub fn exec_bin(
        self,
        package: impl AsRef<str>,
        bin_target: impl AsRef<str>,
    ) -> Result<()> {
        self.exec_common(&[
            "--package",
            package.as_ref(),
            "--bin",
            bin_target.as_ref(),
        ])
    }

    fn exec_common(mut self, args: &[&str]) -> Result<()> {
        let error = self.command.args(args).arg("--").args(self.args).exec();
        Err(error).context("failed to exec `cargo run`")
    }
}

/// Builds the initial `cargo run` command that `External` starts from.
fn new_command() -> Command {
    let mut cargo_run = cargo_command(CargoLocation::FromEnv);
    cargo_run.arg("run");
    cargo_run
}

/// Creates a `std::process::Command` for the `cargo` executable, with every
/// inherited environment variable matched by `SANITIZED_ENV_VARS` removed
/// from the child's environment.
pub fn cargo_command(location: CargoLocation) -> Command {
    let mut command = location.resolve();

    // Variable names that are not valid UTF-8 can never match a prefix, so
    // they are skipped.
    std::env::vars_os()
        .filter_map(|(key, _)| key.into_string().ok())
        .filter(|key| SANITIZED_ENV_VARS.matches(key))
        .for_each(|key| {
            command.env_remove(key);
        });

    command
}

/// Strategy for locating the `cargo` executable.
#[derive(Clone, Copy, Debug)]
pub enum CargoLocation {
    /// Take the path from the `CARGO` environment variable; when that is
    /// unset, use plain `"cargo"`.
    FromEnv,
}

impl CargoLocation {
    /// Produces a fresh, argument-less `Command` for the located executable.
    fn resolve(self) -> Command {
        match self {
            CargoLocation::FromEnv => match std::env::var_os("CARGO") {
                Some(from_env) => Command::new(from_env),
                None => Command::new("cargo"),
            },
        }
    }
}

/// Describes which environment variables are stripped before invoking cargo.
#[derive(Debug)]
struct SanitizedEnvVars {
    // Only prefix-based bans exist today; banning variables by exact name is
    // a possible future extension.
    prefixes: &'static [&'static str],
}

impl SanitizedEnvVars {
    /// Builds the fixed table of banned variable-name prefixes.
    const fn new() -> Self {
        // These prefixes cover many of the variables listed in
        // https://doc.rust-lang.org/cargo/reference/environment-variables.html#environment-variables-cargo-sets-for-build-scripts.
        // Dropping them avoids recompilation with crates like ring between
        // `cargo clippy` and `cargo xtask clippy`. (This is really a bug in
        // both ring's build script and in Cargo.)
        //
        // The list was derived from ring's build script, so it is not
        // guaranteed to be exhaustive and may need to grow over time.
        Self { prefixes: &["CARGO_PKG_", "CARGO_MANIFEST_", "CARGO_CFG_"] }
    }

    /// Returns `true` when `key` begins with one of the banned prefixes.
    fn matches(&self, key: &str) -> bool {
        for banned in self.prefixes {
            if key.starts_with(*banned) {
                return true;
            }
        }
        false
    }
}

static SANITIZED_ENV_VARS: SanitizedEnvVars = SanitizedEnvVars::new();


================================================
FILE: xtask/src/main.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use anyhow::Result;
use clap::{Parser, Subcommand};

mod external;
mod task_clippy;
mod task_fmt;
mod task_license;
mod task_phd;
mod task_prepush;
mod task_style;
mod util;

// Top-level command line for `cargo xtask`; `main` parses this and dispatches
// on the selected subcommand in `cmd`.
#[derive(Parser)]
#[command(name = "cargo xtask", about = "Builder tasks for Propolis")]
struct Args {
    #[command(subcommand)]
    cmd: Cmds,
}

// The subcommands of `cargo xtask`. The `///` comments below are picked up by
// clap as each subcommand's help text, so they are user-visible output.
//
// NOTE(review): the `large_enum_variant` allowance is presumably needed
// because `Openapi` embeds an `External` (which holds a `Command`) — confirm.
#[derive(Subcommand)]
#[allow(clippy::large_enum_variant)]
enum Cmds {
    /// Run suite of clippy checks
    Clippy {
        /// Treat warnings as errors
        #[arg(short, long)]
        strict: bool,

        /// Suppress non-essential output
        #[arg(short, long)]
        quiet: bool,
    },
    /// Check style according to `rustfmt`
    Fmt,
    /// (Crudely) Check for appropriate license headers
    License,
    /// Manage OpenAPI documents
    Openapi(external::External),
    /// Perform pre-push checks (clippy, license, fmt, etc)
    Prepush {
        /// Suppress non-essential output
        #[arg(short, long)]
        quiet: bool,
    },
    /// Run the PHD test suite
    Phd {
        #[clap(subcommand)]
        cmd: task_phd::Cmd,
    },
    /// Perform misc style checks
    Style,
}

fn main() -> Result<()> {
    match Args::parse().cmd {
        Cmds::Clippy { strict, quiet } => {
            task_clippy::cmd_clippy(strict, quiet)
        }
        Cmds::Fmt => task_fmt::cmd_fmt(),
        Cmds::License => {
            task_license::cmd_license()?;

            println!("License checks pass");
            Ok(())
        }
        Cmds::Openapi(external) => external
            .exec_bin("propolis-dropshot-apis", "propolis-dropshot-apis"),
        Cmds::Phd { cmd } => cmd.run(),
        Cmds::Prepush { quiet } => {
            task_prepush::cmd_prepush(quiet)?;

            println!("Pre-push checks pass");
            Ok(())
        }
        Cmds::Style => {
            task_style::cmd_style()?;

            println!("Style checks pass");
            Ok(())
        }
    }
}


================================================
FILE: xtask/src/task_clippy.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::process::Command;

use anyhow::{bail, Result};

use crate::util::*;

/// Runs `cargo clippy` over a fixed series of package/feature selections from
/// the workspace root.
///
/// `strict` appends `-Dwarnings` to each invocation and `quiet` appends
/// `--quiet`. All selections are attempted even when an earlier one reports
/// lint failures; an error is returned at the end if any of them did. Failing
/// to launch or wait on `cargo` aborts immediately.
pub(crate) fn cmd_clippy(strict: bool, quiet: bool) -> Result<()> {
    let wroot = workspace_root()?;

    // Performs a single clippy invocation, yielding `true` when it exited
    // unsuccessfully.
    let run_clippy = |selection: &[&str]| -> Result<bool> {
        let mut cmd = Command::new("cargo");
        cmd.current_dir(&wroot).arg("clippy").args(selection);

        if quiet {
            cmd.arg("--quiet");
        }

        // no-deps and subsequent options must follow `--`
        cmd.arg("--").arg("--no-deps");

        // Disable lossless cast warnings until
        // https://github.com/oxidecomputer/usdt/issues/240 is fixed.
        // cmd.args(["--warn", "clippy::cast_lossless"]);

        if strict {
            cmd.arg("-Dwarnings");
        }

        let status = cmd.status()?;
        Ok(!status.success())
    };

    let invocations: [&[&str]; 7] = [
        // Everything in the workspace (including tests, etc)
        &["--workspace", "--all-targets"],
        // Check the server as it is built for production
        &["-p", "propolis-server", "--features", "omicron-build"],
        // Check the Falcon bits
        //
        // TODO(jph): Currently specifying both the propolis-client and
        // propolis-server packages in a single clippy command will cause
        // clippy to fail because cargo finds 2 copies of propolis-client.
        // This is because dice-util depends on sled-agent-client, which
        // depends on a rev of propolis.
        //
        // We should clean this up by making sled-agent-client not re-export
        // propolis-client..
        &["--features", "falcon", "-p", "propolis-server"],
        &["-p", "propolis-client"],
        // Check the mock server
        &["-p", "propolis-mock-server"],
        // Check standalone with crucible enabled
        &["-p", "propolis-standalone", "--features", "crucible"],
        // Check PHD bits
        &["-p", "phd-runner"],
    ];

    let mut failed = false;
    for selection in invocations {
        failed |= run_clippy(selection)?;
    }

    if failed {
        bail!("Clippy failure(s) detected")
    }

    Ok(())
}


================================================
FILE: xtask/src/task_fmt.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::process::Command;

use anyhow::{bail, Result};

use crate::util::*;

/// Runs `cargo fmt --check` from the workspace root and fails if it exits
/// unsuccessfully.
pub(crate) fn cmd_fmt() -> Result<()> {
    let wroot = workspace_root()?;

    let status = Command::new("cargo")
        .args(["fmt", "--check"])
        .current_dir(&wroot)
        .status()?;

    if status.success() {
        Ok(())
    } else {
        bail!("rustfmt failure(s) detected")
    }
}


================================================
FILE: xtask/src/task_license.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::fs::File;
use std::io::{BufRead, BufReader};

use anyhow::{bail, Context, Result};
use serde::Deserialize;

use crate::util::*;

// Root of the `.licenserc.yaml` document as deserialized by `cmd_license`;
// only the `header` section is read.
#[derive(Deserialize, Debug)]
struct LicenseRc {
    header: LicenseHeader,
}
// The `header` section of the licenserc file.
#[derive(Deserialize, Debug)]
struct LicenseHeader {
    // Holds the license text expected at the top of each checked file.
    license: LicenseEnt,
    // Glob patterns, joined onto the workspace root, selecting files to check.
    paths: Vec<String>,
    // Optional glob patterns; files whose workspace-relative path matches one
    // of these are skipped.
    #[serde(rename = "paths-ignore")]
    paths_ignore: Option<Vec<String>>,
}
// The `license` entry of the header section.
#[derive(Deserialize, Debug)]
struct LicenseEnt {
    // Raw license text; `commentify` turns each of its lines into a `// `
    // comment line before comparison.
    content: String,
}

/// Splits `raw` into lines and prefixes each one with `// `, yielding the
/// license header as it is expected to appear in a source file.
fn commentify(raw: String) -> Vec<String> {
    let mut commented = Vec::new();
    for line in raw.lines() {
        commented.push(format!("// {line}"));
    }
    commented
}

/// Compares the leading lines of `fp` against `needle`, line by line.
///
/// Returns `Ok(None)` when every needle line matches the corresponding line
/// of the file. On the first mismatch, or when the file runs out of lines, a
/// message naming the 1-based line number is returned as `Ok(Some(..))`.
/// Errors reading the file are propagated.
fn check_file(fp: File, needle: &[String]) -> Result<Option<String>> {
    let mut actual = BufReader::new(fp).lines();
    for (idx, expected) in needle.iter().enumerate() {
        let matched = match actual.next() {
            Some(line) => line? == *expected,
            // The file ended before the whole header was seen.
            None => false,
        };
        if !matched {
            let line_no = idx + 1;
            return Ok(Some(format!(
                "Expected license header not found at line {}",
                line_no
            )));
        }
    }
    Ok(None)
}

/// Verifies that every file selected by `.licenserc.yaml` starts with the
/// configured license header.
///
/// The config is read from the workspace root. Each `paths` glob is expanded
/// relative to that root, entries matching a `paths-ignore` glob are skipped,
/// and the remaining files are checked with `check_file`. Every offending
/// file is reported on stderr before a single error is returned.
pub(crate) fn cmd_license() -> Result<()> {
    // Find licenserc file
    let ws_root = workspace_root()?;
    let lic_path = ws_root.join(".licenserc.yaml");

    let lic_fp = File::open(&lic_path).context("cannot open licenserc file")?;

    let config: LicenseRc = serde_yaml::from_reader(&lic_fp)
        .context("could not parse licenserc file")?;

    if config.header.paths.is_empty() {
        bail!("No file paths configured")
    }

    // Compile the optional ignore patterns into a single matcher.
    let ignore = config
        .header
        .paths_ignore
        .as_ref()
        .map(|patterns| -> Result<globset::GlobSet> {
            let mut builder = globset::GlobSetBuilder::new();
            for pattern in patterns {
                let glob =
                    globset::Glob::new(pattern).with_context(|| {
                        format!("'{pattern}' is not valid path-ignore glob")
                    })?;
                builder.add(glob);
            }
            Ok(builder.build()?)
        })
        .transpose()?;

    let needle = commentify(config.header.license.content);

    let mut violations_found = false;
    for pattern in &config.header.paths {
        let glob_path = ws_root.join(pattern);

        for entry in glob::glob(glob_path.as_str())? {
            let item_path = entry.context("item path not readable")?;
            let item_short_path = item_path.strip_prefix(&ws_root)?;

            let ignored = ignore
                .as_ref()
                .is_some_and(|set| set.is_match(item_short_path));
            if ignored {
                continue;
            }

            let item_fp = File::open(&item_path)
                .context("could not open item for reading")?;
            if let Some(err) = check_file(item_fp, &needle)? {
                eprintln!("{}: {err}", item_short_path.display());
                violations_found = true;
            }
        }
    }

    if violations_found {
        bail!("License errors detected")
    }

    Ok(())
}


================================================
FILE: xtask/src/task_phd.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use std::{collections::HashMap, fs, process::Command, time};

// Prints a status line to stderr: the literal `$tag` (styled bold green when
// `owo_colors` reports colour support for stderr), formatted with
// `{:>indent$}` using an indent of 12, then a space and the message built
// from the remaining format arguments.
macro_rules! cargo_log {
    ($tag:literal, $($arg:tt)*) => {
        eprintln!(
            "{:>indent$} {}",
            owo_colors::OwoColorize::if_supports_color(
                &$tag,
                owo_colors::Stream::Stderr,
                |tag| owo_colors::Style::new().bold().green().style(tag),
            ),
            format_args!($($arg)*),
            indent = 12
        )
    }
}

// Prints `warning: <message>` to stderr. When `owo_colors` reports colour
// support for stderr, the word `warning` is styled bold yellow and the colon
// bold; the message is built from the macro's format arguments.
macro_rules! cargo_warn {
    ($($arg:tt)*) => {
        eprintln!(
            "{}{} {}",
            owo_colors::OwoColorize::if_supports_color(
                &"warning",
                owo_colors::Stream::Stderr,
                |tag| owo_colors::Style::new().bold().yellow().style(tag),
            ),
            owo_colors::OwoColorize::if_supports_color(
                &":",
                owo_colors::Stream::Stderr,
                |tag| owo_colors::Style::new().bold().style(tag),
            ),
            format_args!($($arg)*),
        )
    }
}

// Subcommands of `cargo xtask phd`, executed by `Cmd::run`. The `///`
// comments below double as clap help text.
//
// NOTE(review): the `large_enum_variant` allowance is presumably for `Run`,
// which carries the full `RunArgs` — confirm.
#[derive(clap::Subcommand, Debug, Clone)]
#[allow(clippy::large_enum_variant)]
pub(crate) enum Cmd {
    /// Run the PHD test suite.
    Run {
        #[clap(flatten)]
        args: RunArgs,
    },

    /// Delete any temporary directories older than one day.
    Tidy,

    /// List PHD tests
    List {
        /// Arguments to pass to `phd-runner list`.
        #[clap(trailing_var_arg = true, allow_hyphen_values = true)]
        phd_args: Vec<String>,
    },

    /// Display `phd-runner`'s help output.
    RunnerHelp {
        /// Arguments to pass to `phd-runner help`.
        #[clap(trailing_var_arg = true, allow_hyphen_values = true)]
        phd_args: Vec<String>,
    },
}

// Arguments accepted by `cargo xtask phd run`. `Cmd::run` destructures this
// struct and converts it into a `phd-runner run` invocation.
#[derive(clap::Parser, Debug, Clone)]
pub(crate) struct RunArgs {
    /// If set, temporary directories older than one day will not be
    /// deleted.
    #[clap(long)]
    no_tidy: bool,

    /// Arguments to pass to `phd-runner run`.
    ///
    /// If the `--propolis-server-cmd`, `--crucible-downstairs-commit`,
    /// `--base-propolis-branch`, `--artifact-toml-path`, or
    /// `--artifact-directory` arguments are passed here, they will override
    /// `cargo xtask phd`'s default values for those arguments.
    ///
    /// Use `cargo xtask phd runner-help` for details on the arguments passed to
    /// `phd-runner`.
    // These are appended last on the `phd-runner` command line, after every
    // argument that `Cmd::run` generates itself.
    #[clap(trailing_var_arg = true, allow_hyphen_values = true)]
    phd_args: Vec<String>,

    #[clap(flatten)]
    propolis_args: PropolisArgs,

    #[clap(flatten)]
    artifact_args: ArtifactStoreArgs,

    #[clap(flatten)]
    base_propolis_args: BasePropolisArgs,

    #[clap(flatten)]
    crucible_args: CrucibleArgs,
}

// Selects which Propolis server binary is tested: either a user-supplied
// command, or one that `Cmd::run` builds itself.
//
// NOTE(review): with `multiple = false` this group appears to make
// `--release` and `--omicron-build` mutually exclusive with each other as
// well as with `--propolis-server-cmd` — confirm that is intended.
#[derive(Debug, Clone, clap::Parser)]
#[group(id = "propolis", required = false, multiple = false)]
#[command(next_help_heading = "Propolis Selection")]
struct PropolisArgs {
    /// The command to use to launch the Propolis server.
    ///
    /// If this is not present, a Propolis server binary will be built automatically.
    #[clap(long = "propolis-server-cmd", value_parser, value_hint = clap::ValueHint::FilePath)]
    server_cmd: Option<Utf8PathBuf>,

    /// If set, build `propolis-server` in release mode.
    #[clap(long, short = 'r')]
    release: bool,

    /// If set, build `propolis-server` with `omicron-build`. This may enable
    /// codepaths that only work on `stlouis`, or otherwise expect specific test
    /// environment configuration (such as having a reservoir large enough to
    /// allocate VMs).
    ///
    /// This can be helpful if you're debugging an issue that occurs in CI,
    /// where `propolis-server` is also tested with `omicron-build`.
    #[clap(long)]
    omicron_build: bool,
}

// Selects the "migration base" Propolis used by migration-from-base tests.
// The group is declared with `multiple = false`; when none of its arguments
// is given, `configure_command` passes `--base-propolis-branch master`.
#[derive(Debug, Clone, clap::Parser)]
#[group(id = "base-propolis", required = false, multiple = false)]
#[command(next_help_heading = "Migration Base Propolis Selection")]
struct BasePropolisArgs {
    /// Git branch name to use for the "migration base" Propolis server artifact
    /// for migration-from-base tests.
    ///
    /// If this argument is provided, PHD will download the latest Propolis
    /// server artifact from Buildomat for the provided branch name, and use it
    /// to test migration from that Propolis version to the Propolis revision
    /// under test.
    ///
    /// This argument conflicts with the `--base-propolis-commit`,
    /// `--base-propolis-cmd`, and `--no-base-propolis` arguments. If none of
    /// these arguments are provided, `cargo xtask phd` will automatically pass
    /// `--base-propolis-branch master` to `phd-runner`.
    #[clap(long, value_parser)]
    base_propolis_branch: Option<String>,

    /// Git commit hash to use for the "migration base" Propolis server artifact for
    /// migration from base tests.
    ///
    /// If this argument is provided, PHD will download the Propolis server
    /// artifact from Buildomat for the provided commit hash, and use it
    /// to test migration from that Propolis version to the Propolis revision
    /// under test.
    ///
    /// This argument conflicts with the `--base-propolis-branch`,
    /// `--base-propolis-cmd`, and `--no-base-propolis` arguments. If none of
    /// these arguments are provided, `cargo xtask phd` will automatically pass
    /// `--base-propolis-branch master` to `phd-runner`.
    #[clap(long, value_parser)]
    base_propolis_commit: Option<String>,

    /// The path of a local command to use as the "migration base" Propolis
    /// server for migration-from-base tests.
    ///
    /// If this argument is provided, PHD will use the provided command to run
    /// to test migration from that Propolis binary to the Propolis revision
    /// under test.
    ///
    /// This argument conflicts with the `--base-propolis-branch`,
    /// `--base-propolis-commit`, and `--no-base-propolis` arguments. If none of
    /// these arguments are provided, `cargo xtask phd` will automatically pass
    /// `--base-propolis-branch master` to `phd-runner`.
    #[clap(
        long,
        value_hint = clap::ValueHint::FilePath,
        value_parser
    )]
    base_propolis_cmd: Option<Utf8PathBuf>,

    /// If set, skip migration-from-base tests.
    ///
    /// If this flag is present, `cargo xtask phd` will not pass
    /// `--base-propolis-branch master` to the `phd-runner` command.
    ///
    /// This flag conflicts with the `--base-propolis-branch`,
    /// `--base-propolis-commit`, and `--base-propolis-cmd` arguments. If none
    /// of these arguments are provided, `cargo xtask phd` will automatically
    /// pass `--base-propolis-branch master` to `phd-runner`.
    #[clap(long)]
    no_base_propolis: bool,
}

// Selects the Crucible downstairs used by the tests. The group is declared
// with `multiple = false`; when none of its arguments is given,
// `configure_command` passes `--crucible-downstairs-commit auto`.
#[derive(Debug, Clone, clap::Parser)]
#[group(id = "crucible", required = false, multiple = false)]
#[command(next_help_heading = "Crucible Downstairs Selection")]
struct CrucibleArgs {
    /// The path of a local command to use to launch Crucible downstairs
    /// servers.
    ///
    /// This argument conflicts with the `--crucible-downstairs-commit` and
    /// `--no-crucible` arguments. If none of the `--crucible-downstairs-cmd`,
    /// `--crucible-downstairs-commit`, and `--no-crucible` arguments are
    /// provided, then `cargo xtask phd` will pass `--crucible-downstairs-commit
    /// auto` to `phd-runner`.
    #[clap(long, value_parser, value_hint = clap::ValueHint::FilePath)]
    crucible_downstairs_cmd: Option<Utf8PathBuf>,

    /// Git revision to use to download Crucible downstairs artifacts from
    /// Buildomat.
    ///
    /// This may either be the string 'auto' or a 40-character Git commit
    /// hash. If this is 'auto', then the Git revision of Crucible is determined
    /// automatically based on the Propolis workspace's Cargo git dependency on
    /// the `crucible` crate (determined when `phd-runner` is built). If an
    /// explicit commit hash is provided, that commit is downloaded from
    /// Buildomat, regardless of which version of the `crucible` crate Propolis
    /// depends on.
    ///
    /// This argument conflicts with the `--crucible-downstairs-cmd` and
    /// `--no-crucible` arguments. If none of the `--crucible-downstairs-cmd`,
    /// `--crucible-downstairs-commit`, and `--no-crucible` arguments are
    /// provided, then `cargo xtask phd` will pass `--crucible-downstairs-commit
    /// auto` to `phd-runner`.
    #[clap(long, value_parser)]
    crucible_downstairs_commit: Option<String>,

    /// If set, skip Crucible tests.
    ///
    /// If this flag is present, `cargo xtask phd` will not pass
    /// `--crucible-downstairs-commit auto` to the `phd-runner` command.
    ///
    /// This flag conflicts with the `--crucible-downstairs-cmd` and
    /// `--crucible-downstairs-commit` arguments. If none of the
    /// `--crucible-downstairs-cmd`, `--crucible-downstairs-commit`, and
    /// `--no-crucible` arguments are provided, then `cargo xtask phd` will pass
    /// `--crucible-downstairs-commit auto` to `phd-runner`.
    #[clap(long)]
    no_crucible: bool,
}

// Overrides for where PHD finds its artifact description and stores its
// artifacts; `Cmd::run` supplies a default for whichever field is left unset.
#[derive(Debug, Clone, clap::Parser)]
#[command(next_help_heading = "Artifact Store Options")]
struct ArtifactStoreArgs {
    /// The path to a TOML file describing the artifact store to use for this
    /// run.
    // When unset, `Cmd::run` uses `phd-tests/artifacts.toml` under the
    // workspace root.
    #[clap(long, value_parser, value_hint = clap::ValueHint::FilePath)]
    artifact_toml_path: Option<Utf8PathBuf>,

    /// The directory in which artifacts (guest OS images, bootroms, etc.)
    /// are to be stored.
    ///
    /// If this argument is not provided, the default artifact store directory
    /// will be created in `target/phd/artifacts`.
    #[clap(long, value_parser)]
    artifact_directory: Option<Utf8PathBuf>,
}

impl Cmd {
    /// Executes the selected `cargo xtask phd` subcommand.
    ///
    /// `Tidy` deletes old temporary directories and returns. `List` and
    /// `RunnerHelp` build `phd-runner`, run it with the given arguments and
    /// exit the process with its exit code. `Run` builds (or locates) the
    /// Propolis server, prepares the artifact and temp directories, builds
    /// `phd-runner`, runs it, and likewise exits with its exit code, so on
    /// success those paths never return to the caller.
    ///
    /// # Errors
    ///
    /// Returns an error if `cargo metadata` fails, a build fails, a directory
    /// cannot be created or tidied, the artifacts TOML file is missing, or a
    /// child command cannot be executed or exits without a status code.
    pub(crate) fn run(self) -> anyhow::Result<()> {
        let meta = cargo_metadata::MetadataCommand::new()
            .no_deps()
            .exec()
            .context("Failed to run cargo metadata")?;
        // All PHD state lives under `<target dir>/phd`, expressed relative to
        // the current directory when possible.
        let phd_dir = relativize(&meta.target_directory).join("phd");

        let mut tmp_dir = phd_dir.join("tmp");
        let now = time::SystemTime::now();

        // Only `Run` falls through this match; every other subcommand either
        // returns or exits the process inside its arm.
        let RunArgs {
            no_tidy,
            propolis_args,
            phd_args,
            artifact_args,
            base_propolis_args,
            crucible_args,
        } = match self {
            Self::Run { args } => args,
            Self::Tidy => {
                cargo_log!("Tidying up", "old temporary directories...");
                delete_old_tmps(tmp_dir, now)?;
                return Ok(());
            }
            Self::List { phd_args } => {
                let phd_runner = build_bin("phd-runner", false, None, None)?;
                let status = run_exit_code(
                    phd_runner.command().arg("list").args(phd_args),
                )?;
                std::process::exit(status);
            }

            Self::RunnerHelp { phd_args } => {
                let phd_runner = build_bin("phd-runner", false, None, None)?;
                let status = run_exit_code(
                    phd_runner.command().arg("help").args(phd_args),
                )?;
                std::process::exit(status);
            }
        };

        // Use the user-supplied server command if there is one; otherwise
        // build `propolis-server` here.
        let propolis_local_path = match propolis_args.server_cmd {
            Some(cmd) => {
                cargo_log!("Using", "local Propolis server command {cmd}");
                cmd
            }
            None => {
                // `PHD_BUILD=true` is set in the environment of the server
                // build.
                let mut server_build_env = HashMap::new();
                server_build_env
                    .insert("PHD_BUILD".to_string(), "true".to_string());

                // Some PHD tests specifically cover cases where a component in
                // the system has encountered an error, so enable
                // failure-injection. Do not enable `omicron-build` by default
                // because it configures `propolis-server` to expect an Omicron-
                // or stlouis-specific environment.
                let mut propolis_features = vec!["failure-injection"];
                if propolis_args.omicron_build {
                    // If you know your environment looks like we'd expect in
                    // Omicron, have at it!
                    propolis_features.push("omicron-build");
                }
                let bin = build_bin(
                    "propolis-server",
                    propolis_args.release,
                    Some(&propolis_features.join(",")),
                    Some(server_build_env),
                )?;
                let path = bin
                    .path()
                    .try_into()
                    .context("Propolis server path is not UTF-8")?;
                relativize(path).to_path_buf()
            }
        };

        let artifact_dir =
            artifact_args.artifact_directory.unwrap_or_else(|| {
                // if there's no explicitly overridden `artifact_dir` path, use
                // `target/phd/artifacts`.
                phd_dir.join("artifacts")
            });

        mkdir(&artifact_dir, "artifact directory")?;

        // Tidy the shared temp root (unless disabled), then pick this run's
        // own subdirectory, named after the current Unix timestamp in seconds.
        let tmp_dir = {
            if no_tidy {
                cargo_log!(
                    "Skipping",
                    "temp directory cleanup; disabled by `--no-tidy`"
                );
            } else {
                delete_old_tmps(&tmp_dir, now)?;
            }
            // Panics if the system clock reports a time before the Unix
            // epoch.
            tmp_dir.push(
                now.duration_since(time::UNIX_EPOCH)
                    .unwrap()
                    .as_secs()
                    .to_string(),
            );
            tmp_dir
        };

        mkdir(&tmp_dir, "temp directory")?;

        let artifacts_toml =
            artifact_args.artifact_toml_path.unwrap_or_else(|| {
                // if there's no explicitly overridden `artifacts.toml` path,
                // determine the default one from the workspace path.
                relativize(&meta.workspace_root)
                    .join("phd-tests")
                    .join("artifacts.toml")
            });

        // Fail before building the runner if the artifact description is
        // missing.
        if artifacts_toml.exists() {
            cargo_log!("Found", "artifacts.toml at `{artifacts_toml}`")
        } else {
            anyhow::bail!("Missing artifacts config `{artifacts_toml}`!");
        }

        let phd_runner = build_bin("phd-runner", false, None, None)?;
        // On illumos the runner is launched through `pfexec`.
        let mut cmd = if cfg!(target_os = "illumos") {
            let mut cmd = Command::new("pfexec");
            cmd.arg(phd_runner.path());
            cmd
        } else {
            // If we're not on Illumos, running the tests probably won't
            // actually work, because there's almost certainly no Bhyve. But,
            // we'll build and run the command anyway, because being able to run
            // on other systems may still be useful for PHD development (e.g.
            // testing changes to artifact management, etc).
            Command::new(phd_runner.path())
        };
        cmd.arg("run")
            .arg("--propolis-server-cmd")
            .arg(&propolis_local_path)
            .arg("--artifact-toml-path")
            .arg(&artifacts_toml)
            .arg("--artifact-directory")
            .arg(&artifact_dir)
            .arg("--tmp-directory")
            .arg(&tmp_dir);
        crucible_args.configure_command(&mut cmd);
        base_propolis_args.configure_command(&mut cmd);
        // User-supplied runner arguments go last, after the generated ones.
        cmd.args(phd_args);

        let status = run_exit_code(&mut cmd)?;

        // Exit with the runner's own exit code.
        std::process::exit(status);
    }
}

impl CrucibleArgs {
    /// Appends the Crucible downstairs selection to the `phd-runner` command.
    ///
    /// Precedence is: local command, then explicit commit, then
    /// `--no-crucible` (adds nothing), and finally the default of
    /// `--crucible-downstairs-commit auto`.
    fn configure_command(&self, cmd: &mut Command) {
        let selection = (
            self.crucible_downstairs_cmd.as_ref(),
            self.crucible_downstairs_commit.as_ref(),
            self.no_crucible,
        );
        match selection {
            (Some(path), _, _) => {
                cargo_log!("Using", "local Crucible downstairs: {path}");
                cmd.arg("--crucible-downstairs-cmd").arg(path);
            }
            (None, Some(commit), _) => {
                cargo_log!(
                    "Using",
                    "Crucible downstairs from commit: {commit}"
                );
                cmd.arg("--crucible-downstairs-commit").arg(commit);
            }
            (None, None, true) => {
                cargo_log!("Skipping", "Crucible tests");
            }
            (None, None, false) => {
                cmd.arg("--crucible-downstairs-commit").arg("auto");
            }
        }
    }
}

impl BasePropolisArgs {
    /// Appends the migration-base Propolis selection to the `phd-runner`
    /// command.
    ///
    /// Precedence is: local command, then commit, then branch, then
    /// `--no-base-propolis` (adds nothing), and finally the default of
    /// `--base-propolis-branch master`.
    fn configure_command(&self, cmd: &mut Command) {
        let selection = (
            self.base_propolis_cmd.as_ref(),
            self.base_propolis_commit.as_ref(),
            self.base_propolis_branch.as_ref(),
            self.no_base_propolis,
        );
        match selection {
            (Some(path), ..) => {
                cargo_log!("Using", "local migration-base Propolis: {path}");
                cmd.arg("--base-propolis-cmd").arg(path);
            }
            (None, Some(commit), ..) => {
                cargo_log!(
                    "Using",
                    "migration-base Propolis from commit: {commit}"
                );
                cmd.arg("--base-propolis-commit").arg(commit);
            }
            (None, None, Some(branch), _) => {
                cargo_log!(
                    "Using",
                    "migration-base Propolis from branch: {branch}"
                );
                cmd.arg("--base-propolis-branch").arg(branch);
            }
            (None, None, None, true) => {
                cargo_log!("Skipping", "migration-from-base tests");
            }
            (None, None, None, false) => {
                cmd.arg("--base-propolis-branch").arg("master");
            }
        }
    }
}

/// Builds the binary target `name` of the package `name`, in debug or release
/// mode, with optional Cargo features and optional extra build environment
/// variables.
///
/// `features` is handed to Cargo unchanged, so it must be a space- or
/// comma-separated list of features to activate.
///
/// Progress is logged to stderr before and after the build, including the
/// elapsed build time.
fn build_bin(
    name: impl AsRef<str>,
    release: bool,
    features: Option<&str>,
    build_env: Option<HashMap<String, String>>,
) -> anyhow::Result<escargot::CargoRun> {
    let name = name.as_ref();
    cargo_log!("Compiling", "{name}");

    let mut build =
        escargot::CargoBuild::new().package(name).bin(name).current_target();
    if let Some(features) = features {
        build = build.features(features);
    }
    for (key, value) in build_env.into_iter().flatten() {
        build = build.env(key, value);
    }
    if release {
        build = build.release();
    }
    // Profile description used only in the "Finished" log line.
    let profile = if release {
        "release [optimized]"
    } else {
        "dev [unoptimized + debuginfo]"
    };

    let started = time::Instant::now();
    let bin =
        build.run().with_context(|| format!("Failed to build {name}"))?;
    let elapsed = started.elapsed().as_secs_f64();
    cargo_log!("Finished", "{name} {profile} in {elapsed:0.2}s");
    Ok(bin)
}

/// Ensures `path` exists, creating it and any missing parents if necessary.
///
/// `kind` is a human-readable description of the directory, used in the log
/// line and in the error context.
fn mkdir(
    path: impl AsRef<Utf8Path>,
    kind: impl std::fmt::Display,
) -> anyhow::Result<()> {
    let path = path.as_ref();

    if path.exists() {
        cargo_log!("Found", "existing {kind} `{path}`");
        return Ok(());
    }

    std::fs::create_dir_all(path)
        .with_context(|| format!("Failed to create {kind} `{path}`"))?;
    cargo_log!("Created", "{kind} `{path}`");
    Ok(())
}

fn run_exit_code(cmd: &mut Command) -> anyhow::Result<i32> {
    cargo_log!("Running", "{:#?}", PrettyCmd(cmd));
    cmd.status()
        .with_context(|| {
            format!("Failed to execute command {:?}", PrettyCmd(cmd))
        })?
        .code()
        .ok_or_else(|| {
            anyhow::anyhow!(
                "Command {:?} exited without a status code",
                PrettyCmd(cmd)
            )
        })
}

/// Removes every entry of `tmp_dir` whose modification time is more than one
/// day before `now`.
///
/// A missing `tmp_dir` is not an error. Per-entry problems (unreadable entry,
/// failed stat, unknown modification time, failed removal) are printed as
/// warnings and counted; after the whole directory has been scanned, a
/// non-zero count is returned as an error. Entries whose modification time
/// lies after `now` are left alone.
fn delete_old_tmps(
    tmp_dir: impl AsRef<Utf8Path>,
    now: time::SystemTime,
) -> anyhow::Result<()> {
    /// Entries must be older than this many seconds to be removed.
    const DAY_SECS: u64 = 60 * 60 * 24;

    /// Suffix that completes "director" for a count of `n`.
    fn pluralize_dir(n: u64) -> &'static str {
        match n {
            1 => "y",
            _ => "ies",
        }
    }

    let tmp_dir = tmp_dir.as_ref();
    if !tmp_dir.exists() {
        return Ok(());
    }

    let listing = fs::read_dir(tmp_dir)
        .with_context(|| format!("Failed to read `{tmp_dir}`"))?;

    let mut removed: u64 = 0;
    let mut bytes: u64 = 0;
    let mut failures: u64 = 0;

    for entry in listing {
        let entry = match entry {
            Ok(entry) => entry,
            Err(e) => {
                failures += 1;
                cargo_warn!("bad dir entry: {e}");
                continue;
            }
        };
        let path = entry.path();

        let meta = match entry.metadata() {
            Ok(meta) => meta,
            Err(e) => {
                failures += 1;
                cargo_warn!("failed to stat `{}`: {e}", path.display());
                continue;
            }
        };

        let modified = match meta.modified() {
            Ok(stamp) => stamp,
            Err(e) => {
                failures += 1;
                cargo_warn!(
                    "couldn't get last modified time for `{}`: {e}",
                    path.display(),
                );
                continue;
            }
        };

        // A modification time later than `now` yields `Err`; such entries
        // are skipped without being counted as failures.
        let Ok(age) = now.duration_since(modified) else {
            continue;
        };
        if age.as_secs() <= DAY_SECS {
            continue;
        }

        if let Err(e) = fs::remove_dir_all(&path) {
            failures += 1;
            cargo_warn!("failed to remove `{}`: {e}", path.display(),);
        } else {
            removed += 1;
            // Size reported by the entry's own metadata.
            bytes += meta.len();
        }
    }

    if removed > 0 {
        cargo_log!(
            "Tidied up",
            "{removed} old temp director{}, {bytes}B total",
            pluralize_dir(removed)
        );
    }

    anyhow::ensure!(
        failures == 0,
        "{failures} temp director{} could not be tidied up!",
        pluralize_dir(failures)
    );

    Ok(())
}

/// Borrowing wrapper around a [`Command`] whose `Debug` output is the
/// command line itself (program followed by its arguments).
struct PrettyCmd<'a>(&'a Command);

impl std::fmt::Debug for PrettyCmd<'_> {
    /// Writes the program name and then each argument.
    ///
    /// A program path that is valid UTF-8 is passed through `relativize`
    /// before printing; otherwise it is printed lossily. Arguments are
    /// separated by a single space, except that in alternate (`{:#?}`) mode
    /// every argument starting with `--` is placed on its own
    /// backslash-continued, tab-indented line.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let command = self.0;

        let program = command.get_program();
        match Utf8Path::from_path(std::path::Path::new(program)) {
            Some(utf8) => write!(f, "{}", relativize(utf8))?,
            None => write!(f, "{}", program.to_string_lossy())?,
        }

        // The formatter's flags cannot change while we are writing, so the
        // alternate-mode check only needs to be made once.
        let multiline = f.alternate();
        for raw in command.get_args() {
            let arg = raw.to_string_lossy();
            let separator = if multiline && arg.starts_with("--") {
                " \\\n\t"
            } else {
                " "
            };
            write!(f, "{separator}{arg}")?;
        }

        Ok(())
    }
}

fn relativize(path: &Utf8Path) -> &Utf8Path {
    use std::sync::OnceLock;

    static PWD: OnceLock<Utf8PathBuf> = OnceLock::new();
    let pwd = PWD.get_or_init(|| {
        std::env::current_dir()
            .expect("Failed to get current dir")
            .try_into()
            .expect("Current dir is not UTF-8")
    });
    path.strip_prefix(pwd).unwrap_or(path)
}


================================================
FILE: xtask/src/task_prepush.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use anyhow::{bail, Result};

use crate::{task_clippy, task_fmt, task_license, task_style};

pub(crate) fn cmd_prepush(quiet: bool) -> Result<()> {
    let mut errs = Vec::new();
    let checks: [(&str, &dyn Fn() -> bool); 4] = [
        ("clippy", &|| task_clippy::cmd_clippy(true, quiet).is_err()),
        ("fmt", &|| task_fmt::cmd_fmt().is_err()),
        ("license", &|| task_license::cmd_license().is_err()),
        ("style", &|| task_style::cmd_style().is_err()),
    ];

    for (name, func) in checks {
        if !quiet {
            println!("Checking {name}...");
        }
        if func() {
            errs.push(name);
        }
    }

    if !errs.is_empty() {
        bail!("Pre-push error(s) in: {}", errs.join(", "))
    }
    Ok(())
}


================================================
FILE: xtask/src/task_style.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use std::collections::BTreeSet;
use std::io::{BufRead, BufReader};
use std::process::{Command, Stdio};

use anyhow::Result;

use crate::util::*;

/// Checks that test modules in the workspace are named `mod test` rather
/// than `mod tests`.
///
/// Runs `cargo test --workspace --all-targets -- --list --format=terse` from
/// the workspace root and inspects every `<path>: test` line of its output.
/// A test whose enclosing module (the path component right before the test
/// function name) is `tests` is considered non-conforming.
///
/// # Errors
///
/// Returns an error if:
/// - the workspace root cannot be determined,
/// - `cargo` cannot be spawned or waited on,
/// - `cargo` exits unsuccessfully (the listing would be missing or
///   incomplete, so the check could otherwise pass vacuously), or
/// - at least one non-conforming module is found (the offending module
///   paths are printed to stderr first).
fn check_test_names() -> Result<()> {
    let wroot = workspace_root()?;

    // Get listing of all tests (excluding doctests)
    let mut child = Command::new("cargo")
        .args([
            "test",
            "--workspace",
            "--all-targets",
            "--",
            "--list",
            "--format=terse",
        ])
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::null())
        .current_dir(&wroot)
        .spawn()?;

    // `take()` the pipe instead of moving it out of `child`, so that the
    // child handle stays whole and can be waited on afterwards.
    let stdout = child.stdout.take().expect("stdout is present");

    let problem_mods = BufReader::new(stdout)
        .lines()
        .map_while(std::result::Result::ok)
        .filter_map(|line| {
            // Look for "<test name>: test"
            let test_name = match line.rsplit_once(": ") {
                Some((name, "test")) => name,
                _ => return None,
            };

            // Split the test function name off the end; what remains is the
            // path of the module containing the test.
            let (mod_path, _fn_name) = test_name.rsplit_once("::")?;

            // Flag `mod tests` (instead of `mod test`) as the innermost
            // module of that path.
            let is_tests_mod = mod_path.rsplit("::").next() == Some("tests");
            is_tests_mod.then(|| mod_path.to_owned())
        })
        .collect::<BTreeSet<_>>();

    // The reader (and with it our end of the pipe) has been dropped by now,
    // so waiting cannot deadlock on a child blocked writing to stdout.
    // Waiting reaps the child and gives us its exit status: since stderr is
    // discarded, a failed listing would otherwise go completely unnoticed.
    let status = child.wait()?;
    anyhow::ensure!(
        status.success(),
        "Listing tests via `cargo test` failed ({status})"
    );

    if !problem_mods.is_empty() {
        eprintln!("The following test module paths should use `mod test` instead of `mod tests`:");
        for path in problem_mods {
            eprintln!("\t{path}");
        }
        Err(anyhow::anyhow!("Unconforming test module names"))
    } else {
        Ok(())
    }
}

/// Entry point for the style checks.
///
/// Currently the only check performed is `check_test_names`; its result is
/// returned unchanged.
pub(crate) fn cmd_style() -> Result<()> {
    check_test_names()
}


================================================
FILE: xtask/src/util.rs
================================================
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.
use anyhow::{Context, Result};

/// Returns the workspace root directory as reported by `cargo metadata`.
///
/// The metadata command is run with `no_deps()`, as only the workspace root
/// field of its output is used.
///
/// # Errors
///
/// Returns an error (with added context) if `cargo metadata` fails.
pub(crate) fn workspace_root() -> Result<camino::Utf8PathBuf> {
    let metadata = cargo_metadata::MetadataCommand::new()
        .no_deps()
        .exec()
        .context("Failed to run cargo metadata")?;

    Ok(metadata.workspace_root)
}