From a0a7164708c43de2b0b03d2570e3d54f3ff41c46 Mon Sep 17 00:00:00 2001
From: Drew Newberry
Date: Thu, 7 May 2026 10:06:13 -0700
Subject: [PATCH] refactor(vm): remove legacy openshell-vm crate

Signed-off-by: Drew Newberry
---
 AGENTS.md | 1 -
 Cargo.toml | 1 -
 architecture/build.md | 2 +-
 crates/openshell-bootstrap/src/paths.rs | 9 +-
 crates/openshell-driver-vm/README.md | 4 -
 crates/openshell-driver-vm/build.rs | 3 +-
 crates/openshell-driver-vm/runtime/README.md | 4 +-
 crates/openshell-driver-vm/runtime/pins.env | 13 +-
 crates/openshell-driver-vm/src/driver.rs | 5 -
 crates/openshell-driver-vm/src/rootfs.rs | 22 +-
 crates/openshell-vm/Cargo.toml | 50 -
 crates/openshell-vm/README.md | 204 --
 crates/openshell-vm/build.rs | 142 --
 crates/openshell-vm/entitlements.plist | 8 -
 crates/openshell-vm/scripts/build-rootfs.sh | 849 -------
 .../scripts/check-vm-capabilities.sh | 234 --
 .../scripts/openshell-vm-exec-agent.py | 322 ---
 .../openshell-vm/scripts/openshell-vm-init.sh | 833 -------
 crates/openshell-vm/src/embedded.rs | 454 ----
 crates/openshell-vm/src/exec.rs | 1176 ----------
 crates/openshell-vm/src/ffi.rs | 340 ---
 crates/openshell-vm/src/health.rs | 204 --
 crates/openshell-vm/src/lib.rs | 2069 -----------------
 crates/openshell-vm/src/main.rs | 279 ---
 .../openshell-vm/tests/gateway_integration.rs | 155 --
 e2e/rust/e2e-vm.sh | 2 +-
 rfc/0001-core-architecture/README.md | 4 +-
 tasks/scripts/vm/build-libkrun.sh | 2 +-
 28 files changed, 13 insertions(+), 7378 deletions(-)
 delete mode 100644 crates/openshell-vm/Cargo.toml
 delete mode 100644 crates/openshell-vm/README.md
 delete mode 100644 crates/openshell-vm/build.rs
 delete mode 100644 crates/openshell-vm/entitlements.plist
 delete mode 100755 crates/openshell-vm/scripts/build-rootfs.sh
 delete mode 100755 crates/openshell-vm/scripts/check-vm-capabilities.sh
 delete mode 100644 crates/openshell-vm/scripts/openshell-vm-exec-agent.py
 delete mode 100755 crates/openshell-vm/scripts/openshell-vm-init.sh
 delete mode 100644 crates/openshell-vm/src/embedded.rs
 delete mode 100644 crates/openshell-vm/src/exec.rs
 delete mode 100644 crates/openshell-vm/src/ffi.rs
 delete mode 100644 crates/openshell-vm/src/health.rs
 delete mode 100644 crates/openshell-vm/src/lib.rs
 delete mode 100644 crates/openshell-vm/src/main.rs
 delete mode 100644 crates/openshell-vm/tests/gateway_integration.rs

diff --git a/AGENTS.md b/AGENTS.md
index feabeddc4..2395b8176 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -39,7 +39,6 @@ These pipelines connect skills into end-to-end workflows.
 Individual skill files |
 | `crates/openshell-core/` | Shared core | Common types, configuration, error handling |
 | `crates/openshell-providers/` | Provider management | Credential provider backends |
 | `crates/openshell-tui/` | Terminal UI | Ratatui-based dashboard for monitoring |
-| `crates/openshell-vm/` | MicroVM runtime | Experimental, work-in-progress libkrun-based VM execution |
 | `crates/openshell-driver-kubernetes/` | Kubernetes compute driver | In-process `ComputeDriver` backend for K8s sandbox pods |
 | `crates/openshell-driver-docker/` | Docker compute driver | In-process `ComputeDriver` backend for local Docker sandbox containers |
 | `crates/openshell-driver-vm/` | VM compute driver | Standalone libkrun-backed `ComputeDriver` subprocess (embeds its own rootfs + runtime) |
diff --git a/Cargo.toml b/Cargo.toml
index c9bfe6c91..9bc3f9ea2 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,7 +4,6 @@
 [workspace]
 resolver = "2"
 members = ["crates/*"]
-exclude = ["crates/openshell-vm"]
 
 [workspace.package]
 version = "0.0.0"
diff --git a/architecture/build.md b/architecture/build.md
index 2567285b5..baf44eba9 100644
--- a/architecture/build.md
+++ b/architecture/build.md
@@ -14,7 +14,7 @@ OpenShell builds these main artifacts:
 | CLI package and Python SDK | `python/openshell` plus Rust binaries where packaged |
 | Gateway container image | `deploy/docker/Dockerfile.images` |
 | Helm chart | `deploy/helm/openshell` |
-| VM driver/runtime assets | `crates/openshell-driver-vm` and `crates/openshell-vm` |
+| VM driver/runtime assets | `crates/openshell-driver-vm` |
 | Published docs site | `docs/` rendered by Fern config in `fern/` |
 
 Sandbox community images are built outside this repository.
diff --git a/crates/openshell-bootstrap/src/paths.rs b/crates/openshell-bootstrap/src/paths.rs
index 1c514f370..cd3cb7693 100644
--- a/crates/openshell-bootstrap/src/paths.rs
+++ b/crates/openshell-bootstrap/src/paths.rs
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use miette::Result;
-use openshell_core::paths::{xdg_config_dir, xdg_data_dir};
+use openshell_core::paths::xdg_config_dir;
 use std::path::PathBuf;
 
 /// Path to the file that stores the active gateway name.
@@ -26,13 +26,6 @@ pub fn last_sandbox_path(gateway: &str) -> Result<PathBuf> {
     Ok(gateways_dir()?.join(gateway).join("last_sandbox"))
 }
 
-/// Base directory for openshell-vm data (without version).
-///
-/// Location: `$XDG_DATA_HOME/openshell/openshell-vm/`
-pub fn openshell_vm_base_dir() -> Result<PathBuf> {
-    Ok(xdg_data_dir()?.join("openshell").join("openshell-vm"))
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/crates/openshell-driver-vm/README.md b/crates/openshell-driver-vm/README.md
index 6cf982a7d..a3bdf9822 100644
--- a/crates/openshell-driver-vm/README.md
+++ b/crates/openshell-driver-vm/README.md
@@ -194,10 +194,6 @@ in `post_install`, and owns the `brew services` gateway lifecycle.
 The service also leaves `OPENSHELL_DRIVERS` unset so driver choice remains
 automatic unless the user explicitly overrides it.
 
-## Relationship to `openshell-vm`
-
-`openshell-vm` is a separate, legacy crate that runs the **whole OpenShell gateway inside a single VM**. It remains in the repository for later deprecation or removal, but is excluded from normal workspace builds and release paths. `openshell-driver-vm` is the active compute driver called by a host-resident gateway to spawn **per-sandbox VMs**. The driver vendors its own rootfs handling and runtime loader so `openshell-server` never has to link libkrun.
-
 ## TODOs
 
 - The gateway still configures the driver via CLI args; this will move to a gRPC bootstrap call so the driver interface is uniform across backends. See the `TODO(driver-abstraction)` notes in `crates/openshell-server/src/lib.rs` and `crates/openshell-server/src/compute/vm.rs`.
diff --git a/crates/openshell-driver-vm/build.rs b/crates/openshell-driver-vm/build.rs
index ea4c4d2e0..6ea845dc9 100644
--- a/crates/openshell-driver-vm/build.rs
+++ b/crates/openshell-driver-vm/build.rs
@@ -4,8 +4,7 @@
 //! Build script for openshell-driver-vm.
 //!
 //! This crate embeds the sandbox supervisor plus the minimal libkrun runtime
-//! artifacts it needs to boot VMs without depending on the openshell-vm binary
-//! or crate.
+//! artifacts it needs to boot VMs without a separate VM runtime binary.
 
 use std::path::{Path, PathBuf};
 use std::{env, fs};
diff --git a/crates/openshell-driver-vm/runtime/README.md b/crates/openshell-driver-vm/runtime/README.md
index 74afeb2be..17dc8dab7 100644
--- a/crates/openshell-driver-vm/runtime/README.md
+++ b/crates/openshell-driver-vm/runtime/README.md
@@ -12,9 +12,7 @@ runtime/
 ```
 
 `openshell-driver-vm` embeds libkrun, libkrunfw, gvproxy, and the bundled
-`openshell-sandbox` supervisor. The legacy `crates/openshell-vm` crate remains
-in the repository, but normal workspace builds and release workflows do not use
-it.
+`openshell-sandbox` supervisor.
 
 ## Why
 
diff --git a/crates/openshell-driver-vm/runtime/pins.env b/crates/openshell-driver-vm/runtime/pins.env
index 4a60c0225..b526947df 100644
--- a/crates/openshell-driver-vm/runtime/pins.env
+++ b/crates/openshell-driver-vm/runtime/pins.env
@@ -3,23 +3,18 @@
 # Pinned dependency versions for the openshell-driver-vm runtime.
 #
-# This file is sourced by build-rootfs.sh and
-# build-libkrun.sh. It centralises version pins and content-addressed
-# digests so that builds are reproducible and auditable.
+# This file is sourced by VM runtime packaging scripts. It centralises version
+# pins and content-addressed digests so that builds are reproducible and
+# auditable.
 #
 # Environment variables override these defaults — CI and local dev workflows
-# can still set IMAGE_TAG, K3S_VERSION, etc. as before.
+# can still set the runtime dependency versions below.
 #
 # To update a dependency:
 # 1. Change the version/digest below.
 # 2. Run the relevant build script to verify.
 # 3. Commit pins.env alongside any script changes.
-# ── k3s binary ───────────────────────────────────────────────────────── -K3S_VERSION="${K3S_VERSION:-v1.35.2+k3s1}" -K3S_ARM64_SHA256="${K3S_ARM64_SHA256:-228809a7ef47d25c1bdbe746944931ec2fd2edf842b9cf50f1dd4f9ec2505b0e}" -K3S_AMD64_SHA256="${K3S_AMD64_SHA256:-3ae8e35a62ac83e8e197c117858a564134057a7b8703cf73e67ce60d19f4a22b}" - # ── Base Docker image (digest-pinned) ────────────────────────────────── # Tag: nvcr.io/nvidia/base/ubuntu:noble-20251013 VM_BASE_IMAGE="${VM_BASE_IMAGE:-nvcr.io/nvidia/base/ubuntu@sha256:43fa5063e80fbbc533892af3ccca190868ce48db5a8928b19d7815c40436af8e}" diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index d79e5d922..92cab23af 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -3019,14 +3019,11 @@ mod tests { "bin/sed", "sbin/ip", "opt/openshell/bin/openshell-sandbox", - "usr/local/bin/k3s", ] { let path = source_rootfs.join(path); fs::create_dir_all(path.parent().unwrap()).unwrap(); fs::write(path, "").unwrap(); } - fs::create_dir_all(source_rootfs.join("opt/openshell/manifests")).unwrap(); - fs::write(source_rootfs.join("opt/openshell/manifests/old.yaml"), "").unwrap(); create_rootfs_archive_from_dir(&source_rootfs, &exported_rootfs).unwrap(); prepare_exported_rootfs_archive( @@ -3045,8 +3042,6 @@ mod tests { .join("opt/openshell/bin/openshell-sandbox") .is_file() ); - assert!(!extracted.join("usr/local/bin/k3s").exists()); - assert!(!extracted.join("opt/openshell/manifests").exists()); assert_eq!( fs::read_to_string(extracted.join("opt/openshell/.rootfs-type")).unwrap(), "sandbox\n" diff --git a/crates/openshell-driver-vm/src/rootfs.rs b/crates/openshell-driver-vm/src/rootfs.rs index 5ea687d15..e498bd779 100644 --- a/crates/openshell-driver-vm/src/rootfs.rs +++ b/crates/openshell-driver-vm/src/rootfs.rs @@ -137,16 +137,7 @@ fn append_symlink_to_archive( } fn prepare_sandbox_rootfs(rootfs: &Path) -> Result<(), String> { - for relative in [ - "usr/local/bin/k3s", - "usr/local/bin/kubectl", - "var/lib/rancher", - "etc/rancher", - "opt/openshell/charts", - "opt/openshell/manifests", - "opt/openshell/.initialized", - "opt/openshell/.rootfs-type", - ] { + for relative in ["opt/openshell/.initialized", "opt/openshell/.rootfs-type"] { remove_rootfs_path(rootfs, relative)?; } @@ -326,14 +317,8 @@ mod tests { let dir = unique_temp_dir(); let rootfs = dir.join("rootfs"); - fs::create_dir_all(rootfs.join("usr/local/bin")).expect("create usr/local/bin"); fs::create_dir_all(rootfs.join("etc")).expect("create etc"); - fs::create_dir_all(rootfs.join("var/lib/rancher")).expect("create var/lib/rancher"); - fs::create_dir_all(rootfs.join("opt/openshell/charts")).expect("create charts"); - fs::create_dir_all(rootfs.join("opt/openshell/manifests")).expect("create manifests"); fs::create_dir_all(rootfs.join("opt/openshell/bin")).expect("create openshell bin"); - fs::write(rootfs.join("usr/local/bin/k3s"), b"k3s").expect("write k3s"); - fs::write(rootfs.join("usr/local/bin/kubectl"), b"kubectl").expect("write kubectl"); fs::write(rootfs.join("opt/openshell/.initialized"), b"yes").expect("write initialized"); fs::write( rootfs.join("opt/openshell/bin/openshell-sandbox"), @@ -357,11 +342,6 @@ mod tests { prepare_sandbox_rootfs(&rootfs).expect("prepare sandbox rootfs"); validate_sandbox_rootfs(&rootfs).expect("validate sandbox rootfs"); - assert!(!rootfs.join("usr/local/bin/k3s").exists()); - assert!(!rootfs.join("usr/local/bin/kubectl").exists()); - 
assert!(!rootfs.join("var/lib/rancher").exists()); - assert!(!rootfs.join("opt/openshell/charts").exists()); - assert!(!rootfs.join("opt/openshell/manifests").exists()); assert!(rootfs.join("srv/openshell-vm-sandbox-init.sh").is_file()); assert!(!rootfs.join("sandbox").exists()); assert!( diff --git a/crates/openshell-vm/Cargo.toml b/crates/openshell-vm/Cargo.toml deleted file mode 100644 index 7d74b3139..000000000 --- a/crates/openshell-vm/Cargo.toml +++ /dev/null @@ -1,50 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -[package] -name = "openshell-vm" -version.workspace = true -edition.workspace = true -rust-version.workspace = true -license.workspace = true -repository.workspace = true -description = "MicroVM runtime using libkrun for hardware-isolated execution" - -[lib] -name = "openshell_vm" -path = "src/lib.rs" - -[[bin]] -name = "openshell-vm" -path = "src/main.rs" - -[dependencies] -base64 = "0.22" -clap = { workspace = true } -indicatif = "0.17" -libc = "0.2" -libloading = "0.8" -miette = { workspace = true } -nix = { workspace = true } -openshell-bootstrap = { path = "../openshell-bootstrap" } -openshell-core = { path = "../openshell-core" } -serde = { workspace = true } -serde_json = "1" -tar = "0.4" -thiserror = { workspace = true } -tracing = { workspace = true } -tracing-subscriber = { workspace = true } -zstd = "0.13" - -# Async runtime and gRPC for health check -tokio = { workspace = true } -tonic = { workspace = true, features = ["tls", "tls-native-roots"] } -rustls = { workspace = true } -rustls-pemfile = { workspace = true } -tokio-rustls = { workspace = true } - -[build-dependencies] -zstd = "0.13" - -[lints] -workspace = true diff --git a/crates/openshell-vm/README.md b/crates/openshell-vm/README.md deleted file mode 100644 index 266818741..000000000 --- a/crates/openshell-vm/README.md +++ /dev/null @@ -1,204 +0,0 @@ -# openshell-vm - -> Status: Legacy. This crate remains in the repository for later deprecation or -> removal, but it is excluded from normal workspace builds, CI, and release -> paths. Active VM sandbox work lives in `crates/openshell-driver-vm`. - -MicroVM runtime for OpenShell, powered by [libkrun](https://github.com/containers/libkrun). Boots a lightweight ARM64 Linux VM on macOS (Apple Hypervisor.framework) or Linux (KVM) running a single-node k3s cluster with the OpenShell control plane. - -## Current Path - -Use `mise run gateway:vm` for the supported per-sandbox VM driver workflow. The -standalone `openshell-vm` tasks and wrappers are intentionally not part of the -normal task surface. - -## Prerequisites - -- **macOS (Apple Silicon)** or **Linux (aarch64 or x86_64 with KVM)** -- Rust toolchain -- Guest-supervisor cross-compile toolchain (needed on macOS, and on Linux when host arch ≠ guest arch): - - Matching rustup target: `rustup target add aarch64-unknown-linux-gnu` (or `x86_64-unknown-linux-gnu` for an amd64 guest) - - `cargo install --locked cargo-zigbuild` and `brew install zig` (or distro equivalent). `build-rootfs.sh` uses `cargo zigbuild` to cross-compile the in-VM `openshell-sandbox` supervisor binary. -- [mise](https://mise.jdx.dev/) task runner -- Docker (for rootfs builds) -- `gh` CLI (for downloading pre-built runtime) - -### macOS-Specific - -The binary must be codesigned with the Hypervisor.framework entitlement. 
To -codesign manually: - -```bash -codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - target/debug/openshell-vm -``` - -## Setup - -### Download Pre-Built Runtime (Default) - -Downloads libkrun, libkrunfw, and gvproxy from the `vm-runtime` GitHub Release for -the active VM driver runtime: - -```bash -mise run vm:setup -``` - -### Build from Source - -Compiles the runtime from source (15-45 minutes, needed for custom kernel work): - -```bash -FROM_SOURCE=1 mise run vm:setup -``` - -On macOS this builds a custom libkrunfw (kernel firmware with bridge/netfilter support) via `krunvm`, then builds a portable libkrun. On Linux it builds both natively. - -## Build - -There is no first-class `mise` build task for the standalone binary. This crate -is no longer part of normal CI or release builds. - -## Rootfs - -The legacy rootfs scripts are kept with this crate for historical reference. -They are not used by `openshell-driver-vm`, which derives each sandbox guest -rootfs from a container image at create time. - -## Run - -### Default (Gateway Mode) - -Boots the full OpenShell gateway -- k3s + openshell-server + openshell-sandbox: - -Run the binary directly after manually building and signing it: - -```bash -./target/debug/openshell-vm -``` - -### Custom Process - -Run an arbitrary process inside a fresh VM instead of k3s: - -```bash -./target/debug/openshell-vm --exec /bin/sh --vcpus 2 --mem 2048 -``` - -### Execute in a Running VM - -Attach to a running VM and run a command: - -```bash -./target/debug/openshell-vm exec -- ls / -./target/debug/openshell-vm exec -- sh # interactive shell -``` - -### Named Instances - -Run multiple isolated VM instances side-by-side: - -```bash -./target/debug/openshell-vm --name dev -./target/debug/openshell-vm --name staging -``` - -Each instance gets its own extracted rootfs under `~/.local/share/openshell/openshell-vm//instances//rootfs`. - -## CLI Reference - -```text -openshell-vm [OPTIONS] [COMMAND] - -Options: - --rootfs Path to aarch64 Linux rootfs directory - --name Named VM instance (auto-clones rootfs) - --exec Run a custom process instead of k3s - --args ... Arguments to the executable - --env ... Environment variables - --workdir Working directory inside the VM [default: /] - -p, --port ... Port mappings (host_port:guest_port) - --vcpus Virtual CPUs [default: 4 gateway, 2 exec] - --mem RAM in MiB [default: 8192 gateway, 2048 exec] - --krun-log-level <0-5> libkrun log level [default: 1] - --net Networking: gvproxy, tsi, none [default: gvproxy] - --reset Wipe runtime state before booting - -Subcommands: - prepare-rootfs Ensure the target rootfs exists - exec Execute a command inside a running VM -``` - -## Tasks - -Standalone `openshell-vm` tasks have been removed from the normal task surface. -The remaining VM tasks (`vm:setup`, `vm:supervisor`, `gateway:vm`, `e2e:vm`, -and `vm:smoke:orphan-cleanup`) support `openshell-driver-vm`. 
- -## Architecture - -```text -Host (macOS / Linux) - openshell-vm binary - |-- Embedded runtime (libkrun, libkrunfw, gvproxy, rootfs.tar.zst) - |-- FFI: loads libkrun at runtime via dlopen - |-- gvproxy: virtio-net networking (real eth0 + DHCP) - |-- virtio-fs: shares rootfs with guest - \-- vsock: host-to-guest command execution (port 10777) - -Guest VM (aarch64 Linux) - PID 1: openshell-vm-init.sh - |-- Mounts filesystems, configures networking - |-- Sets up bridge CNI, generates PKI - \-- Execs k3s server - |-- openshell-server (gateway control plane) - \-- openshell-sandbox (pod supervisor) -``` - -## Environment Variables - -| Variable | When | Purpose | -|----------|------|---------| -| `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` | Build time | Path to compressed runtime artifacts | -| `OPENSHELL_VM_RUNTIME_DIR` | Runtime | Override the runtime bundle directory | -| `OPENSHELL_VM_DIAG=1` | Runtime | Enable diagnostic output inside the VM | -| `FROM_SOURCE=1` | `vm:setup` | Build runtime from source instead of downloading | - -## Custom Kernel (libkrunfw) - -The stock libkrunfw (e.g. from Homebrew) lacks bridge, netfilter, and conntrack support needed for pod networking. OpenShell builds a custom libkrunfw with these enabled. - -Build it via the setup command: - -```bash -FROM_SOURCE=1 mise run vm:setup -``` - -See [`runtime/README.md`](runtime/README.md) for details on the kernel config and troubleshooting. - -## Testing - -Integration tests require a built rootfs and macOS ARM64 with libkrun: - -```bash -cargo test -p openshell-vm -- --ignored -``` - -Individual tests: - -```bash -# Full gateway boot test (boots VM, waits for gRPC on port 30051) -cargo test -p openshell-vm gateway_boots -- --ignored - -# Run a command inside the VM -cargo test -p openshell-vm gateway_exec_runs -- --ignored - -# Exec into a running VM -cargo test -p openshell-vm gateway_exec_attaches -- --ignored -``` - -Verify kernel capabilities inside a running VM: - -```bash -./target/debug/openshell-vm exec -- /srv/check-vm-capabilities.sh -./target/debug/openshell-vm exec -- /srv/check-vm-capabilities.sh --json -``` diff --git a/crates/openshell-vm/build.rs b/crates/openshell-vm/build.rs deleted file mode 100644 index 6351be6e8..000000000 --- a/crates/openshell-vm/build.rs +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Build script for openshell-vm. -//! -//! This script copies pre-compressed VM runtime artifacts (libkrun, libkrunfw, -//! gvproxy) to `OUT_DIR` for embedding via `include_bytes!()`. -//! -//! The compressed artifacts are expected to be prepared by: -//! `mise run vm:setup` (one-time) then `mise run vm:build` -//! -//! Environment: -//! `OPENSHELL_VM_RUNTIME_COMPRESSED_DIR` - Path to compressed artifacts - -use std::path::{Path, PathBuf}; -use std::{env, fs}; - -fn main() { - println!("cargo:rerun-if-env-changed=OPENSHELL_VM_RUNTIME_COMPRESSED_DIR"); - - // Re-run if any compressed artifact changes. 
- if let Ok(dir) = env::var("OPENSHELL_VM_RUNTIME_COMPRESSED_DIR") { - println!("cargo:rerun-if-changed={dir}"); - for name in &[ - "libkrun.so.zst", - "libkrunfw.so.5.zst", - "libkrun.dylib.zst", - "libkrunfw.5.dylib.zst", - "gvproxy.zst", - "rootfs.tar.zst", - ] { - println!("cargo:rerun-if-changed={dir}/{name}"); - } - } - - let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR not set")); - let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); - let target_arch = env::var("CARGO_CFG_TARGET_ARCH").unwrap_or_default(); - - // Determine platform-specific file names - let (libkrun_name, libkrunfw_name) = match target_os.as_str() { - "macos" => ("libkrun.dylib", "libkrunfw.5.dylib"), - "linux" => ("libkrun.so", "libkrunfw.so.5"), - _ => { - println!("cargo:warning=VM runtime not available for {target_os}-{target_arch}"); - generate_stub_resources(&out_dir); - return; - } - }; - - // Check for pre-compressed artifacts from mise task - let compressed_dir = if let Ok(dir) = env::var("OPENSHELL_VM_RUNTIME_COMPRESSED_DIR") { - PathBuf::from(dir) - } else { - println!("cargo:warning=OPENSHELL_VM_RUNTIME_COMPRESSED_DIR not set"); - println!("cargo:warning=Run: mise run vm:setup"); - generate_stub_resources(&out_dir); - return; - }; - - if !compressed_dir.is_dir() { - println!( - "cargo:warning=Compressed runtime dir not found: {}", - compressed_dir.display() - ); - println!("cargo:warning=Run: mise run vm:setup"); - generate_stub_resources(&out_dir); - return; - } - - // Copy compressed files to OUT_DIR - let files = [ - (format!("{libkrun_name}.zst"), format!("{libkrun_name}.zst")), - ( - format!("{libkrunfw_name}.zst"), - format!("{libkrunfw_name}.zst"), - ), - ("gvproxy.zst".to_string(), "gvproxy.zst".to_string()), - ("rootfs.tar.zst".to_string(), "rootfs.tar.zst".to_string()), - ]; - - let mut all_found = true; - for (src_name, dst_name) in &files { - let src_path = compressed_dir.join(src_name); - let dst_path = out_dir.join(dst_name); - - if src_path.exists() { - // Remove existing file first (may be read-only from previous build) - if dst_path.exists() { - let _ = fs::remove_file(&dst_path); - } - fs::copy(&src_path, &dst_path).unwrap_or_else(|e| { - panic!( - "Failed to copy {} to {}: {}", - src_path.display(), - dst_path.display(), - e - ) - }); - let size = fs::metadata(&dst_path).map_or(0, |m| m.len()); - println!("cargo:warning=Embedded {src_name}: {size} bytes"); - } else { - println!( - "cargo:warning=Missing compressed artifact: {}", - src_path.display() - ); - all_found = false; - } - } - - if !all_found { - println!("cargo:warning=Some artifacts missing. Run: mise run vm:setup"); - generate_stub_resources(&out_dir); - } -} - -/// Generate stub (empty) resource files so the build can complete. -/// The embedded module will fail at runtime if these stubs are used. 
-fn generate_stub_resources(out_dir: &Path) { - let target_os = env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); - - let (libkrun_name, libkrunfw_name) = match target_os.as_str() { - "macos" => ("libkrun.dylib", "libkrunfw.5.dylib"), - _ => ("libkrun.so", "libkrunfw.so.5"), - }; - - let stubs = [ - format!("{libkrun_name}.zst"), - format!("{libkrunfw_name}.zst"), - "gvproxy.zst".to_string(), - "rootfs.tar.zst".to_string(), - ]; - - for name in &stubs { - let path = out_dir.join(name); - if !path.exists() { - // Write an empty file as a stub - fs::write(&path, b"") - .unwrap_or_else(|e| panic!("Failed to write stub {}: {}", path.display(), e)); - } - } -} diff --git a/crates/openshell-vm/entitlements.plist b/crates/openshell-vm/entitlements.plist deleted file mode 100644 index 154f3308e..000000000 --- a/crates/openshell-vm/entitlements.plist +++ /dev/null @@ -1,8 +0,0 @@ - - - - - com.apple.security.hypervisor - - - diff --git a/crates/openshell-vm/scripts/build-rootfs.sh b/crates/openshell-vm/scripts/build-rootfs.sh deleted file mode 100755 index bfafe8c85..000000000 --- a/crates/openshell-vm/scripts/build-rootfs.sh +++ /dev/null @@ -1,849 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Build a Ubuntu rootfs for the openshell-vm microVM. -# -# By default, produces a fully pre-initialized rootfs with k3s pre-installed, -# the OpenShell helm chart and manifests baked in, container images pre-loaded, -# AND a fully initialized k3s cluster state (database, TLS, images imported, -# all services deployed). On first VM boot, k3s resumes from this pre-baked -# state instead of cold-starting, achieving ~3-5s startup times. -# -# With --base, produces a lightweight rootfs (~200-300MB) with: -# - Base Ubuntu with k3s binary -# - OpenShell supervisor binary -# - Helm charts and Kubernetes manifests -# - NO pre-loaded container images (pulled on demand) -# - NO pre-initialized k3s state (cold start on first boot) -# First boot will be slower (~30-60s) as k3s initializes and pulls images. -# -# Supports aarch64 and x86_64 guest architectures. The target architecture -# is auto-detected from the host but can be overridden with --arch. -# -# Usage: -# ./build-rootfs.sh [--base] [--arch aarch64|x86_64] [output_dir] -# -# If output_dir is omitted, the rootfs is built under target/rootfs-build. -# -# Requires: Docker (or compatible container runtime), curl, helm -# Full mode (default) also requires: zstd, sqlite3, a built openshell-vm binary - -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Source container engine abstraction (provides ce, ce_build, etc.) -_CE_SEARCH="${SCRIPT_DIR}/../../../tasks/scripts/container-engine.sh" -if [ -f "${_CE_SEARCH}" ]; then - source "${_CE_SEARCH}" -else - # Fallback: if run from a different working directory, try repo root. - source "$(cd "${SCRIPT_DIR}/../../.." && pwd)/tasks/scripts/container-engine.sh" -fi - -# Source pinned dependency versions (digests, checksums, commit SHAs). -# Environment variables override pins. 
-PINS_FILE="${SCRIPT_DIR}/../../openshell-driver-vm/runtime/pins.env" -if [ -f "$PINS_FILE" ]; then - # shellcheck source=../../openshell-driver-vm/runtime/pins.env - source "$PINS_FILE" -fi - -# ── Argument parsing ─────────────────────────────────────────────────── -BASE_ONLY=false -GUEST_ARCH="" -POSITIONAL_ARGS=() -while [[ $# -gt 0 ]]; do - case "$1" in - --base) - BASE_ONLY=true; shift ;; - --arch) - GUEST_ARCH="$2"; shift 2 ;; - *) - POSITIONAL_ARGS+=("$1"); shift ;; - esac -done - -# ── Architecture detection ───────────────────────────────────────────── -# Allow override via --arch flag; default to host architecture. -if [ -z "$GUEST_ARCH" ]; then - case "$(uname -m)" in - aarch64|arm64) GUEST_ARCH="aarch64" ;; - x86_64) GUEST_ARCH="x86_64" ;; - *) - echo "ERROR: Unsupported host architecture: $(uname -m)" >&2 - echo " Use --arch aarch64 or --arch x86_64 to override." >&2 - exit 1 - ;; - esac -fi - -case "$GUEST_ARCH" in - aarch64) - DOCKER_PLATFORM="linux/arm64" - K3S_BINARY_SUFFIX="-arm64" - K3S_CHECKSUM_VAR="K3S_ARM64_SHA256" - RUST_TARGET="aarch64-unknown-linux-gnu" - ;; - x86_64) - DOCKER_PLATFORM="linux/amd64" - K3S_BINARY_SUFFIX="" # x86_64 binary has no suffix - K3S_CHECKSUM_VAR="K3S_AMD64_SHA256" - RUST_TARGET="x86_64-unknown-linux-gnu" - ;; - *) - echo "ERROR: Unsupported guest architecture: ${GUEST_ARCH}" >&2 - echo " Supported: aarch64, x86_64" >&2 - exit 1 - ;; -esac - -# Project root (two levels up from crates/openshell-vm/scripts/) -PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)" -DEFAULT_ROOTFS="${PROJECT_ROOT}/target/rootfs-build" -ROOTFS_DIR="${POSITIONAL_ARGS[0]:-${DEFAULT_ROOTFS}}" -CONTAINER_NAME="krun-rootfs-builder" -BASE_IMAGE_TAG="krun-rootfs:openshell-vm" -# K3S_VERSION uses the semver "+" form for GitHub releases. -# The mise env may provide the Docker-tag form with "-" instead of "+"; -# normalise to "+" so the GitHub download URL works. -K3S_VERSION="${K3S_VERSION:-v1.35.2+k3s1}" -K3S_VERSION="${K3S_VERSION//-k3s/+k3s}" - -# Container images to pre-load into k3s (full mode only). -# AGENT_SANDBOX_IMAGE and COMMUNITY_SANDBOX_IMAGE are digest-pinned in pins.env. -# SERVER_IMAGE is intentionally unpinned (local dev artifact). 
-IMAGE_REPO_BASE="${IMAGE_REPO_BASE:-openshell}" -IMAGE_TAG="${IMAGE_TAG:-dev}" -SERVER_IMAGE="${IMAGE_REPO_BASE}/gateway:${IMAGE_TAG}" - -# Cross-platform checksum helper -verify_checksum() { - local expected="$1" file="$2" - if command -v sha256sum &>/dev/null; then - echo "${expected} ${file}" | sha256sum -c - - else - echo "${expected} ${file}" | shasum -a 256 -c - - fi -} - -ensure_build_nofile_limit() { - local desired="${OPENSHELL_VM_BUILD_NOFILE_LIMIT:-8192}" - local minimum=1024 - local current="" - local hard="" - local target="" - - [ "$(uname -s)" = "Darwin" ] || return 0 - command -v cargo-zigbuild >/dev/null 2>&1 || return 0 - - current="$(ulimit -n 2>/dev/null || echo "")" - case "${current}" in - ''|*[!0-9]*) - return 0 - ;; - esac - - if [ "${current}" -ge "${desired}" ]; then - return 0 - fi - - hard="$(ulimit -Hn 2>/dev/null || echo "")" - target="${desired}" - case "${hard}" in - ''|unlimited|infinity) - ;; - *[!0-9]*) - ;; - *) - if [ "${hard}" -lt "${target}" ]; then - target="${hard}" - fi - ;; - esac - - if [ "${target}" -gt "${current}" ] && ulimit -n "${target}" 2>/dev/null; then - echo "==> Raised open file limit for cargo-zigbuild: ${current} -> $(ulimit -n)" - fi - - current="$(ulimit -n 2>/dev/null || echo "${current}")" - case "${current}" in - ''|*[!0-9]*) - return 0 - ;; - esac - - if [ "${current}" -lt "${desired}" ]; then - echo "WARNING: Open file limit is ${current}; cargo-zigbuild is more reliable at ${desired}+ on macOS." - fi - - if [ "${current}" -lt "${minimum}" ]; then - echo "ERROR: Open file limit (${current}) is too low for cargo-zigbuild on macOS." - echo " Zig 0.14+ can fail with ProcessFdQuotaExceeded while linking large binaries." - echo " Run: ulimit -n ${desired}" - echo " Then re-run this script." - exit 1 - fi -} - -if [ "$BASE_ONLY" = true ]; then - echo "==> Building base openshell-vm rootfs" - echo " Guest arch: ${GUEST_ARCH}" - echo " k3s version: ${K3S_VERSION}" - echo " Output: ${ROOTFS_DIR}" - echo " Mode: base (no pre-loaded images, cold start)" -else - echo "==> Building openshell-vm rootfs" - echo " Guest arch: ${GUEST_ARCH}" - echo " k3s version: ${K3S_VERSION}" - echo " Images: ${SERVER_IMAGE}, ${COMMUNITY_SANDBOX_IMAGE}" - echo " Output: ${ROOTFS_DIR}" - echo " Mode: full (pre-loaded images, pre-initialized)" -fi -echo "" - -# cargo-zigbuild on macOS can exhaust the default per-process file descriptor -# limit while linking larger targets with Zig 0.14+. -ensure_build_nofile_limit - -# ── Check for running VM ──────────────────────────────────────────────── -# If an openshell-vm is using this rootfs via virtio-fs, wiping the rootfs -# corrupts the VM's filesystem (e.g. /var disappears) causing cascading -# k3s failures. We use two checks: -# -# 1. flock: The Rust openshell-vm process holds an exclusive flock on the lock -# file for its entire lifetime. This is the primary guard — it works -# even if the state file was deleted, and the OS releases the lock -# automatically when the process dies (including SIGKILL). -# -# 2. State file: Fallback check for the PID in the state file. This -# catches VMs launched before the flock guard was added. - -VM_LOCK_FILE="$(dirname "${ROOTFS_DIR}")/$(basename "${ROOTFS_DIR}")-vm.lock" -if [ -f "${VM_LOCK_FILE}" ]; then - # Try to acquire the lock non-blocking. Use Python's fcntl.flock() - # because the `flock` CLI tool is not available on macOS. - if ! 
python3 -c " -import fcntl, os, sys -fd = os.open(sys.argv[1], os.O_RDONLY) -try: - fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) - fcntl.flock(fd, fcntl.LOCK_UN) -except BlockingIOError: - sys.exit(1) -finally: - os.close(fd) -" "${VM_LOCK_FILE}" 2>/dev/null; then - HOLDER_PID=$(cat "${VM_LOCK_FILE}" 2>/dev/null | tr -d '[:space:]') - echo "" - echo "ERROR: An openshell-vm (pid ${HOLDER_PID:-unknown}) holds a lock on this rootfs." - echo " Wiping the rootfs while the VM is running will corrupt its" - echo " filesystem and cause k3s failures." - echo "" - echo " Stop the VM first: kill ${HOLDER_PID:-}" - echo " Then re-run this script." - echo "" - exit 1 - fi -fi - -VM_STATE_FILE="$(dirname "${ROOTFS_DIR}")/$(basename "${ROOTFS_DIR}")-vm-state.json" -if [ -f "${VM_STATE_FILE}" ]; then - VM_PID=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['pid'])" "${VM_STATE_FILE}" 2>/dev/null || echo "") - if [ -n "${VM_PID}" ] && kill -0 "${VM_PID}" 2>/dev/null; then - echo "" - echo "ERROR: An openshell-vm is running (pid ${VM_PID}) using this rootfs." - echo " Wiping the rootfs while the VM is running will corrupt its" - echo " filesystem and cause k3s failures." - echo "" - echo " Stop the VM first: kill ${VM_PID}" - echo " Then re-run this script." - echo "" - exit 1 - else - # Stale state file — VM is no longer running. Clean it up. - rm -f "${VM_STATE_FILE}" - fi -fi - -# ── Download k3s binary (outside Docker — much faster) ───────────────── - -K3S_BIN="/tmp/k3s-${GUEST_ARCH}-${K3S_VERSION}" -if [ -f "${K3S_BIN}" ]; then - echo "==> Using cached k3s binary: ${K3S_BIN}" -else - echo "==> Downloading k3s ${K3S_VERSION} for ${GUEST_ARCH}..." - curl -fSL "https://github.com/k3s-io/k3s/releases/download/${K3S_VERSION}/k3s${K3S_BINARY_SUFFIX}" \ - -o "${K3S_BIN}" - chmod +x "${K3S_BIN}" -fi - -# Verify k3s binary integrity. -K3S_CHECKSUM="${!K3S_CHECKSUM_VAR:-}" -if [ -n "${K3S_CHECKSUM}" ]; then - echo "==> Verifying k3s binary checksum..." - verify_checksum "${K3S_CHECKSUM}" "${K3S_BIN}" -else - echo "WARNING: ${K3S_CHECKSUM_VAR} not set, skipping checksum verification" -fi - -# ── Build base image with dependencies ───────────────────────────────── - -# Clean up any previous run -ce rm -f "${CONTAINER_NAME}" 2>/dev/null || true - -echo "==> Building base image..." -ce build --platform "${DOCKER_PLATFORM}" -t "${BASE_IMAGE_TAG}" \ - --build-arg "BASE_IMAGE=${VM_BASE_IMAGE}" -f - . <<'DOCKERFILE' -ARG BASE_IMAGE -FROM ${BASE_IMAGE} -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - ca-certificates \ - e2fsprogs \ - iptables \ - iproute2 \ - python3 \ - busybox-static \ - sqlite3 \ - util-linux \ - zstd \ - && rm -rf /var/lib/apt/lists/* -# busybox-static provides udhcpc for DHCP inside the VM. -RUN mkdir -p /usr/share/udhcpc && \ - ln -sf /bin/busybox /sbin/udhcpc -RUN mkdir -p /var/lib/rancher/k3s /etc/rancher/k3s -DOCKERFILE - -# Create a container and export the filesystem -echo "==> Creating container..." -ce create --platform "${DOCKER_PLATFORM}" --name "${CONTAINER_NAME}" "${BASE_IMAGE_TAG}" /bin/true - -echo "==> Exporting filesystem..." -# Previous builds may leave overlayfs work/ dirs with permissions that -# prevent rm on macOS. Force-fix permissions before removing. 
-if [ -d "${ROOTFS_DIR}" ]; then - chmod -R u+rwx "${ROOTFS_DIR}" 2>/dev/null || true - rm -rf "${ROOTFS_DIR}" -fi -mkdir -p "${ROOTFS_DIR}" -ce export "${CONTAINER_NAME}" | tar -C "${ROOTFS_DIR}" -xf - - -ce rm "${CONTAINER_NAME}" - -# ── Inject k3s binary ──────────────────────────────────────────────── - -echo "==> Injecting k3s binary..." -cp "${K3S_BIN}" "${ROOTFS_DIR}/usr/local/bin/k3s" -chmod +x "${ROOTFS_DIR}/usr/local/bin/k3s" -ln -sf /usr/local/bin/k3s "${ROOTFS_DIR}/usr/local/bin/kubectl" - -# k3s self-extracts runtime binaries (containerd, runc, CNI plugins, -# coreutils, etc.) into a versioned data directory the first time it -# runs. On the pre-initialized rootfs these were extracted during the -# Docker build or VM pre-init phase. docker export and macOS virtio-fs -# can strip execute bits from Linux ELF binaries, so fix them here. -echo " Fixing execute permissions on k3s data binaries..." -chmod +x "${ROOTFS_DIR}"/var/lib/rancher/k3s/data/*/bin/* 2>/dev/null || true -chmod +x "${ROOTFS_DIR}"/var/lib/rancher/k3s/data/*/bin/aux/* 2>/dev/null || true - -# ── Inject scripts ──────────────────────────────────────────────────── - -echo "==> Injecting scripts..." -mkdir -p "${ROOTFS_DIR}/srv" -cp "${SCRIPT_DIR}/openshell-vm-init.sh" "${ROOTFS_DIR}/srv/openshell-vm-init.sh" -chmod +x "${ROOTFS_DIR}/srv/openshell-vm-init.sh" - -# Inject VM capability checker for runtime diagnostics. -cp "${SCRIPT_DIR}/check-vm-capabilities.sh" "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" -chmod +x "${ROOTFS_DIR}/srv/check-vm-capabilities.sh" - -# Inject the openshell-vm exec agent used by `openshell-vm exec`. -cp "${SCRIPT_DIR}/openshell-vm-exec-agent.py" "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" -chmod +x "${ROOTFS_DIR}/srv/openshell-vm-exec-agent.py" - -# ── Build and inject openshell-sandbox supervisor binary ───────────── -# The supervisor binary runs inside every sandbox pod. It is side-loaded -# from the node filesystem via a read-only hostPath volume mount at -# /opt/openshell/bin. Container images consume a prebuilt supervisor -# binary; here we cross-compile from the host using cargo-zigbuild. - -SUPERVISOR_TARGET="${RUST_TARGET}" -SUPERVISOR_BIN="${PROJECT_ROOT}/target/${SUPERVISOR_TARGET}/release/openshell-sandbox" - -echo "==> Building openshell-sandbox supervisor binary (${SUPERVISOR_TARGET})..." -SUPERVISOR_BUILD_LOG="$(mktemp -t openshell-supervisor-build.XXXXXX.log)" -run_supervisor_build() { - if command -v cargo-zigbuild >/dev/null 2>&1; then - cargo zigbuild --release -p openshell-sandbox --target "${SUPERVISOR_TARGET}" \ - --manifest-path "${PROJECT_ROOT}/Cargo.toml" - else - # Fallback: use plain cargo build when cargo-zigbuild is not available. - # This works for native builds (e.g. building x86_64 on x86_64) but - # will fail for true cross-compilation without a cross toolchain. - echo " cargo-zigbuild not found, falling back to cargo build..." - cargo build --release -p openshell-sandbox --target "${SUPERVISOR_TARGET}" \ - --manifest-path "${PROJECT_ROOT}/Cargo.toml" - fi -} -if run_supervisor_build >"${SUPERVISOR_BUILD_LOG}" 2>&1; then - tail -5 "${SUPERVISOR_BUILD_LOG}" - rm -f "${SUPERVISOR_BUILD_LOG}" -else - status=$? - echo "ERROR: supervisor build failed. Full output:" >&2 - cat "${SUPERVISOR_BUILD_LOG}" >&2 - echo " (log saved at ${SUPERVISOR_BUILD_LOG})" >&2 - exit "${status}" -fi - -if [ ! -f "${SUPERVISOR_BIN}" ]; then - echo "ERROR: supervisor binary not found at ${SUPERVISOR_BIN}" - exit 1 -fi - -echo " Injecting supervisor binary into rootfs..." 
-mkdir -p "${ROOTFS_DIR}/opt/openshell/bin" -cp "${SUPERVISOR_BIN}" "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" -chmod +x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" -echo " Size: $(du -h "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" | cut -f1)" - -# ── Package and inject helm chart ──────────────────────────────────── - -HELM_CHART_DIR="${PROJECT_ROOT}/deploy/helm/openshell" -CHART_DEST="${ROOTFS_DIR}/var/lib/rancher/k3s/server/static/charts" - -if [ -d "${HELM_CHART_DIR}" ]; then - echo "==> Packaging helm chart..." - mkdir -p "${CHART_DEST}" - helm package "${HELM_CHART_DIR}" -d "${CHART_DEST}" - echo " $(ls "${CHART_DEST}"/*.tgz 2>/dev/null | xargs -I{} basename {})" - # Also stage to /opt/openshell/charts/ so the init script can - # restore them after a --reset wipes server/static/charts/. - mkdir -p "${ROOTFS_DIR}/opt/openshell/charts" - cp "${CHART_DEST}"/*.tgz "${ROOTFS_DIR}/opt/openshell/charts/" -else - echo "WARNING: Helm chart not found at ${HELM_CHART_DIR}, skipping" -fi - -# ── Inject Kubernetes manifests ────────────────────────────────────── -# These are copied to /opt/openshell/manifests/ (staging). openshell-vm-init.sh -# moves them to /var/lib/rancher/k3s/server/manifests/ at boot so the -# k3s Helm Controller auto-deploys them. - -MANIFEST_SRC="${PROJECT_ROOT}/deploy/kube/manifests" -MANIFEST_DEST="${ROOTFS_DIR}/opt/openshell/manifests" - -echo "==> Injecting Kubernetes manifests..." -mkdir -p "${MANIFEST_DEST}" - -for manifest in openshell-helmchart.yaml agent-sandbox.yaml; do - if [ -f "${MANIFEST_SRC}/${manifest}" ]; then - cp "${MANIFEST_SRC}/${manifest}" "${MANIFEST_DEST}/" - echo " ${manifest}" - else - echo "WARNING: ${manifest} not found in ${MANIFEST_SRC}" - fi -done - -# ── Base mode: mark rootfs type and skip pre-loading ─────────────────── - -if [ "$BASE_ONLY" = true ]; then - # k3s expects this directory to exist for airgap image loading. - mkdir -p "${ROOTFS_DIR}/var/lib/rancher/k3s/agent/images" - - # Mark as base (not pre-initialized). The init script checks for - # this file to determine if cold start is expected. - echo "base" > "${ROOTFS_DIR}/opt/openshell/.rootfs-type" - - # ── Verify ───────────────────────────────────────────────────────── - if [ ! -f "${ROOTFS_DIR}/usr/local/bin/k3s" ]; then - echo "ERROR: k3s binary not found in rootfs." - exit 1 - fi - - if [ ! -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then - echo "ERROR: openshell-sandbox supervisor binary not found in rootfs." - exit 1 - fi - - echo "" - echo "==> Base rootfs ready at: ${ROOTFS_DIR}" - echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" - echo " Type: base (cold start, images pulled on demand)" - echo "" - echo "Note: First boot will take ~30-60s as k3s initializes." - echo " Container images will be pulled from registries on first use." - exit 0 -fi - -# ══════════════════════════════════════════════════════════════════════════ -# Full mode: pre-load images and pre-initialize k3s cluster state -# ══════════════════════════════════════════════════════════════════════════ - -# ── Pre-load container images ──────────────────────────────────────── -# Pull images for the target architecture and save as tarballs in the -# k3s airgap images directory. k3s auto-imports from -# /var/lib/rancher/k3s/agent/images/ on startup, so no internet access -# is needed at boot time. -# -# Tarballs are cached in a persistent directory outside the rootfs so -# they survive rebuilds. This avoids re-pulling and re-saving ~1 GiB -# of images each time. 
- -IMAGES_DIR="${ROOTFS_DIR}/var/lib/rancher/k3s/agent/images" -IMAGE_CACHE_DIR="${XDG_CACHE_HOME:-${HOME}/.cache}/openshell/openshell-vm/images" -mkdir -p "${IMAGES_DIR}" "${IMAGE_CACHE_DIR}" - -echo "==> Pre-loading container images (${GUEST_ARCH})..." - -pull_and_save() { - local image="$1" - local output="$2" - local cache="${IMAGE_CACHE_DIR}/$(basename "${output}")" - - # Use cached tarball if available. - if [ -f "${cache}" ]; then - echo " cached: $(basename "${output}")" - cp "${cache}" "${output}" - return 0 - fi - - # Try to pull; if the registry is unavailable, fall back to the - # local Docker image cache (image may exist from a previous pull). - echo " pulling: ${image}..." - if ! ce pull --platform "${DOCKER_PLATFORM}" "${image}" --quiet 2>/dev/null; then - echo " pull failed, checking local image cache..." - if ! ce image inspect "${image}" >/dev/null 2>&1; then - echo "ERROR: image ${image} not available locally or from registry" - exit 1 - fi - echo " using locally cached image" - fi - - echo " saving: $(basename "${output}")..." - # Pipe through zstd for faster decompression and smaller tarballs. - # k3s auto-imports .tar.zst files from the airgap images directory. - # -T0 uses all CPU cores; -3 is a good speed/ratio tradeoff. - ce save "${image}" | zstd -T0 -3 -o "${output}" - # Cache for next rebuild. - cp "${output}" "${cache}" -} - -pull_and_save "${SERVER_IMAGE}" "${IMAGES_DIR}/openshell-server.tar.zst" -pull_and_save "${AGENT_SANDBOX_IMAGE}" "${IMAGES_DIR}/agent-sandbox-controller.tar.zst" -pull_and_save "${COMMUNITY_SANDBOX_IMAGE}" "${IMAGES_DIR}/community-sandbox-base.tar.zst" - -# ── Pre-initialize k3s cluster state ───────────────────────────────── -# Boot k3s inside a Docker container using the rootfs we just built. -# Wait for it to fully initialize (import images, deploy manifests, -# create database), then capture the state back into the rootfs. -# -# This eliminates cold-start latency: on VM boot, k3s finds existing -# state and resumes in ~3-5 seconds instead of 30-60s. - -echo "" -echo "==> Pre-initializing k3s cluster state..." -echo " This boots k3s in a container, waits for full readiness," -echo " then captures the initialized state into the rootfs." - -# Patch the HelmChart manifest for the init container (same patches -# openshell-vm-init.sh applies at runtime). -INIT_MANIFESTS="${ROOTFS_DIR}/var/lib/rancher/k3s/server/manifests" -mkdir -p "${INIT_MANIFESTS}" - -# Copy manifests from staging to the k3s manifest directory. -for manifest in "${MANIFEST_DEST}"/*.yaml; do - [ -f "$manifest" ] || continue - cp "$manifest" "${INIT_MANIFESTS}/" -done - -# Patch HelmChart for local images and VM settings. -HELMCHART="${INIT_MANIFESTS}/openshell-helmchart.yaml" -if [ -f "$HELMCHART" ]; then - # Use local images — explicitly imported into containerd. - sed -i '' 's|__IMAGE_PULL_POLICY__|IfNotPresent|g' "$HELMCHART" 2>/dev/null \ - || sed -i 's|__IMAGE_PULL_POLICY__|IfNotPresent|g' "$HELMCHART" - sed -i '' 's|__SANDBOX_IMAGE_PULL_POLICY__|"IfNotPresent"|g' "$HELMCHART" 2>/dev/null \ - || sed -i 's|__SANDBOX_IMAGE_PULL_POLICY__|"IfNotPresent"|g' "$HELMCHART" - sed -i '' 's|__DB_URL__|"sqlite:/tmp/openshell.db"|g' "$HELMCHART" 2>/dev/null \ - || sed -i 's|__DB_URL__|"sqlite:/tmp/openshell.db"|g' "$HELMCHART" - # Use the locally imported image references. 
- sed -i '' -E "s|repository:[[:space:]]*[^[:space:]]+|repository: ${SERVER_IMAGE%:*}|" "$HELMCHART" 2>/dev/null \ - || sed -i -E "s|repository:[[:space:]]*[^[:space:]]+|repository: ${SERVER_IMAGE%:*}|" "$HELMCHART" - sed -i '' -E "s|tag:[[:space:]]*\"?[^\"[:space:]]+\"?|tag: \"${IMAGE_TAG}\"|" "$HELMCHART" 2>/dev/null \ - || sed -i -E "s|tag:[[:space:]]*\"?[^\"[:space:]]+\"?|tag: \"${IMAGE_TAG}\"|" "$HELMCHART" - # Clear SSH gateway placeholders. - sed -i '' 's|sshGatewayHost: __SSH_GATEWAY_HOST__|sshGatewayHost: ""|g' "$HELMCHART" 2>/dev/null \ - || sed -i 's|sshGatewayHost: __SSH_GATEWAY_HOST__|sshGatewayHost: ""|g' "$HELMCHART" - sed -i '' 's|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g' "$HELMCHART" 2>/dev/null \ - || sed -i 's|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g' "$HELMCHART" - sed -i '' 's|__DISABLE_GATEWAY_AUTH__|false|g' "$HELMCHART" 2>/dev/null \ - || sed -i 's|__DISABLE_GATEWAY_AUTH__|false|g' "$HELMCHART" - sed -i '' 's|__DISABLE_TLS__|false|g' "$HELMCHART" 2>/dev/null \ - || sed -i 's|__DISABLE_TLS__|false|g' "$HELMCHART" - sed -i '' 's|hostGatewayIP: __HOST_GATEWAY_IP__|hostGatewayIP: ""|g' "$HELMCHART" 2>/dev/null \ - || sed -i 's|hostGatewayIP: __HOST_GATEWAY_IP__|hostGatewayIP: ""|g' "$HELMCHART" - sed -i '' '/__CHART_CHECKSUM__/d' "$HELMCHART" 2>/dev/null \ - || sed -i '/__CHART_CHECKSUM__/d' "$HELMCHART" -fi - -# Patch agent-sandbox manifest for VM networking constraints. -AGENT_MANIFEST="${INIT_MANIFESTS}/agent-sandbox.yaml" -if [ -f "$AGENT_MANIFEST" ]; then - # Keep agent-sandbox on pod networking to avoid host port clashes. - # Point in-cluster client traffic at the API server node IP because - # kube-proxy is disabled in VM mode. - sed -i '' '/hostNetwork: true/d' "$AGENT_MANIFEST" 2>/dev/null \ - || sed -i '/hostNetwork: true/d' "$AGENT_MANIFEST" - sed -i '' '/dnsPolicy: ClusterFirstWithHostNet/d' "$AGENT_MANIFEST" 2>/dev/null \ - || sed -i '/dnsPolicy: ClusterFirstWithHostNet/d' "$AGENT_MANIFEST" - sed -i '' 's|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0\ - args:\ - - -metrics-bind-address=:8082\ - env:\ - - name: KUBERNETES_SERVICE_HOST\ - value: 192.168.127.2\ - - name: KUBERNETES_SERVICE_PORT\ - value: "6443"|g' "$AGENT_MANIFEST" 2>/dev/null \ - || sed -i 's|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0|image: registry.k8s.io/agent-sandbox/agent-sandbox-controller:v0.1.0\ - args:\ - - -metrics-bind-address=:8082\ - env:\ - - name: KUBERNETES_SERVICE_HOST\ - value: 192.168.127.2\ - - name: KUBERNETES_SERVICE_PORT\ - value: "6443"|g' "$AGENT_MANIFEST" - if grep -q 'hostNetwork: true' "$AGENT_MANIFEST" \ - || grep -q 'ClusterFirstWithHostNet' "$AGENT_MANIFEST" \ - || ! grep -q 'KUBERNETES_SERVICE_HOST' "$AGENT_MANIFEST" \ - || ! grep -q 'metrics-bind-address=:8082' "$AGENT_MANIFEST"; then - echo "ERROR: failed to patch agent-sandbox manifest for VM networking constraints: $AGENT_MANIFEST" >&2 - exit 1 - fi -fi - -# local-path-provisioner (deployed by k3s from local-storage.yaml) provides -# PVC storage for sandbox workspace volumes. It requires CNI bridge -# networking, which is now available in the VM kernel. - -# ── Pre-initialize using the actual libkrun VM ────────────────────────── -# Boot the real VM with the rootfs we just built. This uses the same -# kernel, networking, and kube-proxy config as production — eliminating -# Docker IP mismatches, snapshotter mismatches, and the Docker volume -# copy-back dance. 
The VM writes state directly into the rootfs via -# virtio-fs. -# -# Requirements: the openshell-vm binary must be built and codesigned. -# mise run vm:build handles this. - -GATEWAY_BIN="${PROJECT_ROOT}/target/debug/openshell-vm" -RUNTIME_DIR="${PROJECT_ROOT}/target/debug/openshell-vm.runtime" - -if [ ! -x "${GATEWAY_BIN}" ]; then - echo "ERROR: openshell-vm binary not found at ${GATEWAY_BIN}" - echo " Run: mise run vm:build" - exit 1 -fi - -if [ ! -d "${RUNTIME_DIR}" ]; then - echo "ERROR: VM runtime bundle not found at ${RUNTIME_DIR}" - echo " Run: mise run vm:build" - exit 1 -fi - -# Helper: run a command inside the VM via the exec agent. -vm_exec() { - if [ "$(uname -s)" = "Darwin" ]; then - DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" \ - "${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" exec -- "$@" 2>&1 - else - LD_LIBRARY_PATH="${RUNTIME_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" \ - "${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" exec -- "$@" 2>&1 - fi -} - -# Ensure no stale VM is using this rootfs. -echo " Starting VM for pre-initialization..." -if [ "$(uname -s)" = "Darwin" ]; then - export DYLD_FALLBACK_LIBRARY_PATH="${RUNTIME_DIR}${DYLD_FALLBACK_LIBRARY_PATH:+:${DYLD_FALLBACK_LIBRARY_PATH}}" -else - export LD_LIBRARY_PATH="${RUNTIME_DIR}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" -fi -# Pre-initialize directly on virtio-fs. Runtime boots attach a separate -# block-backed state disk and seed it from the rootfs on first launch. -OPENSHELL_VM_DISABLE_STATE_DISK=1 "${GATEWAY_BIN}" --rootfs "${ROOTFS_DIR}" --reset & -VM_PID=$! - -# Ensure the VM is cleaned up on script exit. -cleanup_vm() { - if kill -0 "${VM_PID}" 2>/dev/null; then - echo " Stopping VM (pid ${VM_PID})..." - kill "${VM_PID}" 2>/dev/null || true - wait "${VM_PID}" 2>/dev/null || true - fi -} -trap cleanup_vm EXIT - -# Wait for the exec agent to become reachable. -echo " Waiting for VM exec agent..." -for i in $(seq 1 120); do - if vm_exec true >/dev/null 2>&1; then - echo " Exec agent ready (${i}s)" - break - fi - if [ "$i" -eq 120 ]; then - echo "ERROR: VM exec agent did not become reachable in 120s" - exit 1 - fi - sleep 1 -done - -# Wait for containerd to be ready. -echo " Waiting for containerd..." -for i in $(seq 1 60); do - if vm_exec k3s ctr version >/dev/null 2>&1; then - echo " Containerd ready (${i}s)" - break - fi - if [ "$i" -eq 60 ]; then - echo "ERROR: containerd did not become ready in 60s" - exit 1 - fi - sleep 1 -done - -# Wait for the openshell namespace (Helm controller creates it). -echo " Waiting for openshell namespace..." -for i in $(seq 1 180); do - if vm_exec kubectl get namespace openshell -o name 2>/dev/null | grep -q openshell; then - echo " Namespace ready (${i}s)" - break - fi - if [ "$i" -eq 180 ]; then - echo "ERROR: openshell namespace did not appear in 180s" - exit 1 - fi - sleep 1 -done - -# Wait for the openshell StatefulSet to have a ready replica. -# The VM init script generates PKI and writes TLS secrets manifests -# automatically — no host-side PKI generation needed. -echo " Waiting for openshell pod to be ready..." 
-for i in $(seq 1 180); do - ready=$(vm_exec kubectl -n openshell get statefulset openshell \ - -o jsonpath='{.status.readyReplicas}' 2>/dev/null || echo "0") - if [ "$ready" = "1" ]; then - echo " OpenShell pod ready (${i}s)" - break - fi - if [ "$i" -eq 180 ]; then - echo "WARNING: openshell pod not ready after 180s, continuing anyway" - vm_exec kubectl -n openshell get pods 2>/dev/null | sed 's/^/ /' || true - break - fi - sleep 1 -done - -# Pre-unpack container images so the overlayfs snapshotter has ready-to-use -# snapshots on first boot. The snapshotter now runs directly on virtio-fs, -# so these unpacked layers persist across VM restarts — eliminating the -# per-boot layer extraction that previously added ~3-5s per container. -echo " Pre-unpacking container images..." -for img in \ - "ghcr.io/nvidia/openshell-community/sandboxes/base:latest" \ - "ghcr.io/nvidia/openshell/gateway:latest"; do - if vm_exec k3s ctr -n k8s.io images ls -q 2>/dev/null | grep -qF "$img"; then - echo " unpacking: $img" - vm_exec k3s ctr -n k8s.io run --rm "$img" "pre-unpack-$(date +%s)" true 2>/dev/null || true - fi -done -echo " Image pre-unpack complete." - -# Stop the VM so the kine SQLite DB is flushed. -echo " Stopping VM..." -kill "${VM_PID}" 2>/dev/null || true -wait "${VM_PID}" 2>/dev/null || true - -# Surgically clean the kine SQLite DB. Runtime objects (pods, events, -# leases) created during pre-initialization would cause the VM's kubelet -# to reconcile against an empty containerd on first real boot. -# -# NOTE: This is build-time cleanup only — it produces a clean rootfs -# image. At runtime, state.db is preserved across VM restarts so that -# pods and other cluster objects persist. The init script -# (openshell-vm-init.sh) handles stale bootstrap lock cleanup via -# sqlite3, and the host-side Rust code (exec.rs) handles actual DB -# corruption by removing the file. -echo " Cleaning runtime objects from kine DB..." -DB="${ROOTFS_DIR}/var/lib/rancher/k3s/server/db/state.db" -if [ -f "$DB" ]; then - echo " Before: $(sqlite3 "$DB" "SELECT COUNT(*) FROM kine;") kine records" - sqlite3 "$DB" <<'EOSQL' -DELETE FROM kine WHERE name LIKE '/registry/pods/%'; -DELETE FROM kine WHERE name LIKE '/registry/events/%'; -DELETE FROM kine WHERE name LIKE '/registry/leases/%'; -DELETE FROM kine WHERE name LIKE '/registry/endpointslices/%'; -DELETE FROM kine WHERE name LIKE '/registry/masterleases/%'; -PRAGMA wal_checkpoint(TRUNCATE); -VACUUM; -EOSQL - echo " After: $(sqlite3 "$DB" "SELECT COUNT(*) FROM kine;") kine records" -else - echo "WARNING: state.db not found at ${DB}" -fi - -# Clean up runtime artifacts that shouldn't persist. -echo " Cleaning runtime artifacts..." -rm -rf "${ROOTFS_DIR}/var/lib/rancher/k3s/server/tls/temporary-certs" 2>/dev/null || true -rm -f "${ROOTFS_DIR}/var/lib/rancher/k3s/server/kine.sock" 2>/dev/null || true -find "${ROOTFS_DIR}/var/lib/rancher/k3s" -name '*.sock' -delete 2>/dev/null || true -find "${ROOTFS_DIR}/run" -name '*.sock' -delete 2>/dev/null || true - -# Write sentinel file so openshell-vm-init.sh and the host-side bootstrap -# know this rootfs has pre-initialized state. -echo "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "${ROOTFS_DIR}/opt/openshell/.initialized" - -echo " Pre-initialization complete." - -# ── Verify ──────────────────────────────────────────────────────────── - -if [ ! -f "${ROOTFS_DIR}/usr/local/bin/k3s" ]; then - echo "ERROR: k3s binary not found in rootfs. Something went wrong." - exit 1 -fi - -if [ ! 
-f "${ROOTFS_DIR}/opt/openshell/.initialized" ]; then - echo "WARNING: Pre-initialization sentinel not found. Cold starts will be slow." -fi - -if [ ! -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then - echo "ERROR: openshell-sandbox supervisor binary not found in rootfs." - echo " Sandbox pods will fail with CreateContainerError." - exit 1 -fi - -echo "" -echo "==> Rootfs ready at: ${ROOTFS_DIR}" -echo " Size: $(du -sh "${ROOTFS_DIR}" | cut -f1)" -echo " Pre-initialized: $(cat "${ROOTFS_DIR}/opt/openshell/.initialized" 2>/dev/null || echo 'no')" - -# Show k3s data size -K3S_DATA="${ROOTFS_DIR}/var/lib/rancher/k3s" -if [ -d "${K3S_DATA}" ]; then - echo " k3s state: $(du -sh "${K3S_DATA}" | cut -f1)" -fi - -# PKI is generated at first VM boot by the init script — not baked. - -# Show supervisor binary -if [ -x "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" ]; then - echo " Supervisor: $(du -h "${ROOTFS_DIR}/opt/openshell/bin/openshell-sandbox" | cut -f1)" -fi - -echo "" -echo "Next steps:" -echo " 1. Run: openshell-vm" -echo " Expected startup time: ~3-5 seconds (pre-initialized)" diff --git a/crates/openshell-vm/scripts/check-vm-capabilities.sh b/crates/openshell-vm/scripts/check-vm-capabilities.sh deleted file mode 100755 index f88a1340c..000000000 --- a/crates/openshell-vm/scripts/check-vm-capabilities.sh +++ /dev/null @@ -1,234 +0,0 @@ -#!/usr/bin/env bash -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# VM Kernel Capability Checker -# -# Runs inside the guest VM (or a container with the same rootfs) to -# verify that the kernel has the capabilities required for bridge CNI -# networking, kube-proxy, and Kubernetes pod networking. 
-# -# Usage: -# ./check-vm-capabilities.sh [--json] -# -# Exit codes: -# 0 = all required capabilities present -# 1 = one or more required capabilities missing -# 2 = script error - -set -euo pipefail - -JSON_OUTPUT=false -if [ "${1:-}" = "--json" ]; then - JSON_OUTPUT=true -fi - -PASS=0 -FAIL=0 -WARN=0 -RESULTS=() - -# ── Helpers ───────────────────────────────────────────────────────────── - -check() { - local name="$1" - local category="$2" - local required="$3" # "required" or "optional" - local description="$4" - shift 4 - local cmd=("$@") - - if eval "${cmd[@]}" >/dev/null 2>&1; then - RESULTS+=("{\"name\":\"$name\",\"category\":\"$category\",\"status\":\"pass\",\"required\":\"$required\",\"description\":\"$description\"}") - PASS=$((PASS + 1)) - if [ "$JSON_OUTPUT" = false ]; then - printf " ✓ %-40s %s\n" "$name" "$description" - fi - else - if [ "$required" = "required" ]; then - RESULTS+=("{\"name\":\"$name\",\"category\":\"$category\",\"status\":\"fail\",\"required\":\"$required\",\"description\":\"$description\"}") - FAIL=$((FAIL + 1)) - if [ "$JSON_OUTPUT" = false ]; then - printf " ✗ %-40s %s (REQUIRED)\n" "$name" "$description" - fi - else - RESULTS+=("{\"name\":\"$name\",\"category\":\"$category\",\"status\":\"warn\",\"required\":\"$required\",\"description\":\"$description\"}") - WARN=$((WARN + 1)) - if [ "$JSON_OUTPUT" = false ]; then - printf " ~ %-40s %s (optional)\n" "$name" "$description" - fi - fi - fi -} - -check_module() { - local module="$1" - # Check /proc/modules (loaded), /proc/config.gz (builtin), or /sys/module - if [ -d "/sys/module/$module" ]; then - return 0 - fi - if grep -q "^${module} " /proc/modules 2>/dev/null; then - return 0 - fi - # Check if compiled in via /proc/config.gz or /boot/config - local config_key - config_key="CONFIG_$(echo "$module" | tr '[:lower:]-' '[:upper:]_')" - if [ -f /proc/config.gz ]; then - if zcat /proc/config.gz 2>/dev/null | grep -q "^${config_key}=[ym]"; then - return 0 - fi - fi - return 1 -} - -# ── Capability Checks ────────────────────────────────────────────────── - -if [ "$JSON_OUTPUT" = false ]; then - echo "VM Kernel Capability Check" - echo "==========================" - echo "" - echo "Kernel: $(uname -r)" - echo "" -fi - -# --- Network Namespaces --- -if [ "$JSON_OUTPUT" = false ]; then echo "[Network Namespaces]"; fi - -check "net_namespace" "netns" "required" \ - "network namespace support (CONFIG_NET_NS)" \ - "test -d /proc/self/ns && ls /proc/self/ns/net" - -check "veth_pair" "netns" "required" \ - "veth pair creation (CONFIG_VETH)" \ - "ip link add _chk0 type veth peer name _chk1 && ip link del _chk0" - -# --- Linux Bridge --- -if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[Linux Bridge]"; fi - -check "bridge_module" "bridge" "required" \ - "bridge device support (CONFIG_BRIDGE)" \ - "ip link add _chkbr0 type bridge && ip link del _chkbr0" - -check "bridge_nf_call" "bridge" "required" \ - "bridge netfilter (CONFIG_BRIDGE_NETFILTER)" \ - "check_module bridge && test -f /proc/sys/net/bridge/bridge-nf-call-iptables 2>/dev/null || check_module br_netfilter" - -# --- Netfilter / iptables --- -if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[Netfilter / iptables]"; fi - -check "netfilter" "netfilter" "required" \ - "netfilter framework (CONFIG_NETFILTER)" \ - "check_module nf_conntrack || check_module ip_tables || test -d /proc/sys/net/netfilter" - -check "nf_conntrack" "netfilter" "required" \ - "connection tracking (CONFIG_NF_CONNTRACK)" \ - "check_module nf_conntrack" - -check "nf_nat" "netfilter" 
"required" \ - "NAT support (CONFIG_NF_NAT)" \ - "check_module nf_nat" - -check "iptables_filter" "netfilter" "required" \ - "iptables filter (CONFIG_IP_NF_FILTER)" \ - "check_module ip_tables || iptables -L -n >/dev/null 2>&1" - -check "iptables_nat" "netfilter" "required" \ - "iptables NAT (CONFIG_IP_NF_NAT)" \ - "check_module iptable_nat || iptables -t nat -L -n >/dev/null 2>&1" - -check "iptables_mangle" "netfilter" "optional" \ - "iptables mangle (CONFIG_IP_NF_MANGLE)" \ - "check_module iptable_mangle || iptables -t mangle -L -n >/dev/null 2>&1" - -check "nf_conntrack_netlink" "netfilter" "optional" \ - "conntrack netlink (CONFIG_NF_CT_NETLINK)" \ - "check_module nf_conntrack_netlink" - -check "nftables" "netfilter" "optional" \ - "nftables (CONFIG_NF_TABLES)" \ - "check_module nf_tables || nft list ruleset >/dev/null 2>&1" - -# --- IP Forwarding / Routing --- -if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[IP Forwarding]"; fi - -check "ip_forward" "routing" "required" \ - "IP forwarding (sysctl)" \ - "test -f /proc/sys/net/ipv4/ip_forward" - -check "ip_route" "routing" "required" \ - "IP routing" \ - "ip route show >/dev/null 2>&1" - -# --- CNI Plugin Dependencies --- -if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[CNI Plugins]"; fi - -check "cni_bridge_bin" "cni" "required" \ - "bridge CNI plugin binary" \ - "test -x /opt/cni/bin/bridge || find /var/lib/rancher/k3s/data -name bridge -type f 2>/dev/null | head -1 | grep -q ." - -check "cni_host_local_bin" "cni" "required" \ - "host-local IPAM plugin binary" \ - "test -x /opt/cni/bin/host-local || find /var/lib/rancher/k3s/data -name host-local -type f 2>/dev/null | head -1 | grep -q ." - -check "cni_loopback_bin" "cni" "required" \ - "loopback CNI plugin binary" \ - "test -x /opt/cni/bin/loopback || find /var/lib/rancher/k3s/data -name loopback -type f 2>/dev/null | head -1 | grep -q ." - -check "cni_portmap_bin" "cni" "optional" \ - "portmap CNI plugin binary (needs iptables)" \ - "test -x /opt/cni/bin/portmap || find /var/lib/rancher/k3s/data -name portmap -type f 2>/dev/null | head -1 | grep -q ." - -# --- Userspace Tools --- -if [ "$JSON_OUTPUT" = false ]; then echo ""; echo "[Userspace Tools]"; fi - -check "iptables_bin" "userspace" "required" \ - "iptables binary" \ - "command -v iptables" - -check "conntrack_bin" "userspace" "optional" \ - "conntrack binary" \ - "command -v conntrack" - -check "ip_bin" "userspace" "required" \ - "iproute2 (ip command)" \ - "command -v ip" - -# ── Summary ──────────────────────────────────────────────────────────── - -if [ "$JSON_OUTPUT" = true ]; then - echo "{" - echo " \"kernel\": \"$(uname -r)\"," - echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"," - echo " \"pass\": $PASS," - echo " \"fail\": $FAIL," - echo " \"warn\": $WARN," - echo " \"results\": [" - local_first=true - for r in "${RESULTS[@]}"; do - if [ "$local_first" = true ]; then - local_first=false - else - echo "," - fi - printf " %s" "$r" - done - echo "" - echo " ]" - echo "}" -else - echo "" - echo "─────────────────────────────────────────" - printf "Results: %d passed, %d failed, %d warnings\n" "$PASS" "$FAIL" "$WARN" - - if [ "$FAIL" -gt 0 ]; then - echo "" - echo "FAIL: $FAIL required capabilities missing." - echo "The VM kernel needs to be rebuilt with the missing features." - echo "See: crates/openshell-driver-vm/runtime/README.md" - exit 1 - else - echo "" - echo "PASS: All required capabilities present." 
- exit 0 - fi -fi diff --git a/crates/openshell-vm/scripts/openshell-vm-exec-agent.py b/crates/openshell-vm/scripts/openshell-vm-exec-agent.py deleted file mode 100644 index f2b384cf9..000000000 --- a/crates/openshell-vm/scripts/openshell-vm-exec-agent.py +++ /dev/null @@ -1,322 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -import base64 -import fcntl -import json -import os -import pty -import socket -import struct -import subprocess -import sys -import termios -import threading - - -PORT = 10777 - - -def recv_line(sock_file): - line = sock_file.readline() - if not line: - return None - return json.loads(line.decode("utf-8")) - - -def send_frame(sock_file, lock, frame): - data = (json.dumps(frame, separators=(",", ":")) + "\n").encode("utf-8") - with lock: - sock_file.write(data) - sock_file.flush() - - -def validate_env(env_items): - env = {} - for item in env_items: - if "=" not in item: - raise ValueError(f"invalid env item: {item}") - key, value = item.split("=", 1) - if not key or not (key[0] == "_" or key[0].isalpha()): - raise ValueError(f"invalid env key: {key}") - if not all(ch == "_" or ch.isalnum() for ch in key): - raise ValueError(f"invalid env key: {key}") - env[key] = value - return env - - -def set_winsize(fd, cols, rows): - winsize = struct.pack("HHHH", rows, cols, 0, 0) - fcntl.ioctl(fd, termios.TIOCSWINSZ, winsize) - - -def stream_reader(pipe, frame_type, sock_file, lock): - try: - while True: - chunk = pipe.read(8192) - if not chunk: - break - send_frame( - sock_file, - lock, - {"type": frame_type, "data": base64.b64encode(chunk).decode("ascii")}, - ) - finally: - pipe.close() - - -def stdin_writer(proc, sock_file, sock, lock): - """Forward stdin frames from the client to the subprocess. - - When the client sends ``stdin_close`` (or the connection drops), we - close the subprocess's stdin pipe so it sees EOF. We must NOT - terminate the subprocess or shut down the socket here — the main - thread needs the process to finish naturally and the stdout/stderr - reader threads still need to flush their data back to the client. 
- """ - try: - while True: - frame = recv_line(sock_file) - if frame is None: - break - kind = frame.get("type") - if kind == "stdin": - payload = base64.b64decode(frame.get("data", "")) - if proc.stdin is not None: - proc.stdin.write(payload) - proc.stdin.flush() - elif kind == "stdin_close": - break - elif kind == "resize": - pass - else: - send_frame( - sock_file, - lock, - {"type": "error", "message": f"unknown frame type: {kind}"}, - ) - break - except BrokenPipeError: - pass - finally: - try: - if proc.stdin is not None: - proc.stdin.close() - except OSError: - pass - - -def handle_client_pipe(conn, request, sock_file): - """Handle a client connection using pipes (non-TTY mode).""" - lock = threading.Lock() - try: - argv = request.get("argv") or ["sh"] - cwd = request.get("cwd") - env = os.environ.copy() - env.update(validate_env(request.get("env") or [])) - - proc = subprocess.Popen( - argv, - cwd=cwd or "/", - env=env, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - ) - - stdout_thread = threading.Thread( - target=stream_reader, - args=(proc.stdout, "stdout", sock_file, lock), - daemon=True, - ) - stderr_thread = threading.Thread( - target=stream_reader, - args=(proc.stderr, "stderr", sock_file, lock), - daemon=True, - ) - stdin_thread = threading.Thread( - target=stdin_writer, args=(proc, sock_file, conn, lock), daemon=True - ) - - stdout_thread.start() - stderr_thread.start() - stdin_thread.start() - - code = proc.wait() - stdout_thread.join() - stderr_thread.join() - send_frame(sock_file, lock, {"type": "exit", "code": code}) - except Exception as exc: - try: - send_frame(sock_file, lock, {"type": "error", "message": str(exc)}) - except Exception: - pass - finally: - try: - sock_file.close() - except Exception: - pass - conn.close() - - -def handle_client_tty(conn, request, sock_file): - """Handle a client connection with PTY allocation.""" - lock = threading.Lock() - master_fd = -1 - try: - argv = request.get("argv") or ["sh"] - cwd = request.get("cwd") - env = os.environ.copy() - env.update(validate_env(request.get("env") or [])) - env.setdefault("TERM", "xterm-256color") - - master_fd, slave_fd = pty.openpty() - - # Consume any resize frame sent right after the ExecRequest. - # The host sends it before starting the stdin pump, so it - # should arrive quickly. Use a short socket timeout. 
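        # For reference, the newline-delimited JSON frames exchanged on this
        # vsock connection have the shapes handled throughout this file; the
        # concrete values below are illustrative examples only, not taken
        # from the original sources:
        #
        #   {"argv": ["sh"], "env": [], "cwd": null, "tty": true}   # ExecRequest (first line)
        #   {"type": "resize", "cols": 120, "rows": 32}             # client -> agent
        #   {"type": "stdin", "data": "<base64>"}                   # client -> agent
        #   {"type": "stdout", "data": "<base64>"}                  # agent -> client
        #   {"type": "exit", "code": 0}                             # final agent -> client frame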
- conn.settimeout(0.5) - try: - pending = sock_file.readline() - if pending: - frame = json.loads(pending.decode("utf-8")) - if frame.get("type") == "resize": - set_winsize( - slave_fd, - frame.get("cols", 80), - frame.get("rows", 24), - ) - except (socket.timeout, ValueError, OSError): - pass - finally: - conn.settimeout(None) - - proc = subprocess.Popen( - argv, - cwd=cwd or "/", - env=env, - stdin=slave_fd, - stdout=slave_fd, - stderr=slave_fd, - preexec_fn=os.setsid, - ) - os.close(slave_fd) - - def pty_reader(): - try: - while True: - try: - chunk = os.read(master_fd, 8192) - except OSError: - break - if not chunk: - break - send_frame( - sock_file, - lock, - { - "type": "stdout", - "data": base64.b64encode(chunk).decode("ascii"), - }, - ) - except Exception: - pass - - def pty_stdin_writer(): - try: - while True: - frame = recv_line(sock_file) - if frame is None: - break - kind = frame.get("type") - if kind == "stdin": - payload = base64.b64decode(frame.get("data", "")) - try: - os.write(master_fd, payload) - except OSError: - break - elif kind == "resize": - try: - set_winsize( - master_fd, - frame.get("cols", 80), - frame.get("rows", 24), - ) - except OSError: - pass - elif kind == "stdin_close": - break - else: - send_frame( - sock_file, - lock, - {"type": "error", "message": f"unknown frame type: {kind}"}, - ) - break - except (BrokenPipeError, OSError): - pass - - reader_thread = threading.Thread(target=pty_reader, daemon=True) - stdin_thread = threading.Thread(target=pty_stdin_writer, daemon=True) - reader_thread.start() - stdin_thread.start() - - code = proc.wait() - reader_thread.join(timeout=2) - send_frame(sock_file, lock, {"type": "exit", "code": code}) - except Exception as exc: - try: - send_frame(sock_file, lock, {"type": "error", "message": str(exc)}) - except Exception: - pass - finally: - if master_fd >= 0: - try: - os.close(master_fd) - except OSError: - pass - try: - sock_file.close() - except Exception: - pass - conn.close() - - -def handle_client(conn): - sock_file = conn.makefile("rwb", buffering=0) - try: - request = recv_line(sock_file) - if request is None: - sock_file.close() - conn.close() - return - except Exception: - sock_file.close() - conn.close() - return - - if request.get("tty"): - handle_client_tty(conn, request, sock_file) - else: - handle_client_pipe(conn, request, sock_file) - - -def main(): - if not hasattr(socket, "AF_VSOCK"): - print("AF_VSOCK is not available", file=sys.stderr) - return 1 - - server = socket.socket(socket.AF_VSOCK, socket.SOCK_STREAM) - server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind((socket.VMADDR_CID_ANY, PORT)) - server.listen(16) - - while True: - conn, _addr = server.accept() - thread = threading.Thread(target=handle_client, args=(conn,), daemon=True) - thread.start() - - -if __name__ == "__main__": - raise SystemExit(main()) diff --git a/crates/openshell-vm/scripts/openshell-vm-init.sh b/crates/openshell-vm/scripts/openshell-vm-init.sh deleted file mode 100755 index 1cb686a31..000000000 --- a/crates/openshell-vm/scripts/openshell-vm-init.sh +++ /dev/null @@ -1,833 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -# Init script for the openshell-vm microVM. Runs as PID 1 inside the libkrun VM. -# -# Mounts essential virtual filesystems, configures networking, then execs -# k3s server. 
If the rootfs was pre-initialized by build-rootfs.sh (sentinel -# at /opt/openshell/.initialized), the full manifest setup is skipped and -# k3s resumes from its persisted state (~3-5s startup). - -set -euo pipefail - -BOOT_START=$(date +%s%3N 2>/dev/null || date +%s) - -ts() { - local now - now=$(date +%s%3N 2>/dev/null || date +%s) - local elapsed=$(( (now - BOOT_START) )) - printf "[%d.%03ds] %s\n" $((elapsed / 1000)) $((elapsed % 1000)) "$*" -} - -PRE_INITIALIZED=false -if [ -f /opt/openshell/.initialized ]; then - PRE_INITIALIZED=true - ts "pre-initialized rootfs detected (fast path)" -fi - -# ── Mount essential filesystems (parallel) ────────────────────────────── -# These are independent; mount them concurrently. - -mount -t proc proc /proc 2>/dev/null & -mount -t sysfs sysfs /sys 2>/dev/null & -mount -t tmpfs tmpfs /tmp 2>/dev/null & -mount -t tmpfs tmpfs /run 2>/dev/null & -mount -t devtmpfs devtmpfs /dev 2>/dev/null & -wait - -# These depend on /dev being mounted. -mkdir -p /dev/pts /dev/shm -mount -t devpts devpts /dev/pts 2>/dev/null & -mount -t tmpfs tmpfs /dev/shm 2>/dev/null & - -# cgroup2 (unified hierarchy) — required by k3s/containerd. -mkdir -p /sys/fs/cgroup -mount -t cgroup2 cgroup2 /sys/fs/cgroup 2>/dev/null & -wait - -ts "filesystems mounted" - -# ── Networking ────────────────────────────────────────────────────────── - -# Non-critical: hostname is cosmetic. -hostname openshell-vm 2>/dev/null || true - -# Ensure loopback is up (k3s binds to 127.0.0.1). -ip link set lo up 2>/dev/null || true - -# Detect whether we have a real network interface (gvproxy) or need a -# dummy interface (TSI / no networking). -if ip link show eth0 >/dev/null 2>&1; then - # gvproxy networking — bring up eth0 and get an IP via DHCP. - # gvproxy has a built-in DHCP server that assigns 192.168.127.2/24 - # with gateway 192.168.127.1 and configures ARP properly. - ts "detected eth0 (gvproxy networking)" - ip link set eth0 up 2>/dev/null || true - - # Use DHCP to get IP and configure routes. gvproxy's DHCP server - # handles ARP resolution which static config does not. - if command -v udhcpc >/dev/null 2>&1; then - # udhcpc needs a script to apply the lease. Use the busybox - # default script if available, otherwise write a minimal one. - UDHCPC_SCRIPT="/usr/share/udhcpc/default.script" - if [ ! -f "$UDHCPC_SCRIPT" ]; then - mkdir -p /usr/share/udhcpc - cat > "$UDHCPC_SCRIPT" << 'DHCP_SCRIPT' -#!/bin/sh -case "$1" in - bound|renew) - ip addr flush dev "$interface" - ip addr add "$ip/$mask" dev "$interface" - if [ -n "$router" ]; then - ip route add default via $router dev "$interface" - fi - if [ -n "$dns" ]; then - echo -n > /etc/resolv.conf - for d in $dns; do - echo "nameserver $d" >> /etc/resolv.conf - done - fi - ;; -esac -DHCP_SCRIPT - chmod +x "$UDHCPC_SCRIPT" - fi - # -f: stay in foreground, -q: quit after obtaining lease, - # -n: exit if no lease, -T 1: 1s between retries, -t 3: 3 retries - # -A 1: wait 1s before first retry (aggressive for local gvproxy) - if ! udhcpc -i eth0 -f -q -n -T 1 -t 3 -A 1 -s "$UDHCPC_SCRIPT" 2>&1; then - ts "WARNING: DHCP failed, falling back to static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true - fi - else - # Fallback to static config if no DHCP client available. - ts "no DHCP client, using static config" - ip addr add 192.168.127.2/24 dev eth0 2>/dev/null || true - ip route add default via 192.168.127.1 2>/dev/null || true - fi - - # Ensure DNS is configured. 
DHCP should have set /etc/resolv.conf, - # but if it didn't (or static fallback was used), provide a default. - if [ ! -s /etc/resolv.conf ]; then - echo "nameserver 8.8.8.8" > /etc/resolv.conf - echo "nameserver 8.8.4.4" >> /etc/resolv.conf - fi - - # Read back the IP we got (from DHCP or static). - NODE_IP=$(ip -4 addr show eth0 2>/dev/null | awk '/inet / {split($2,a,"/"); print a[1]; exit}') - NODE_IP="${NODE_IP:-192.168.127.2}" - ts "eth0 IP: $NODE_IP" -else - # TSI or no networking — create a dummy interface for k3s. - ts "no eth0 found, using dummy interface (TSI mode)" - ip link add dummy0 type dummy 2>/dev/null || true - ip addr add 10.0.2.15/24 dev dummy0 2>/dev/null || true - ip link set dummy0 up 2>/dev/null || true - ip route add default dev dummy0 2>/dev/null || true - - NODE_IP="10.0.2.15" -fi - -# ── k3s data directories ─────────────────────────────────────────────── - -mkdir -p /var/lib/rancher/k3s -mkdir -p /etc/rancher/k3s - -ROOTFS_CONTAINERD_DIR="/var/lib/rancher/k3s/agent/containerd" -CONTAINERD_DIR="$ROOTFS_CONTAINERD_DIR" - -# ── State disk: mount ALL mutable runtime state on the block device ──── -# -# The virtio-fs share is the immutable OS image (read-only at runtime). -# All state that changes after first boot lives on an ext4 virtio-blk -# disk (/dev/vda). This gives full filesystem semantics (chown, hard -# links, fsync) and keeps every writable path off the host filesystem. -# -# Directories on the state disk: -# containerd/ → k3s/agent/containerd (overlayfs snapshotter) -# k3s-agent/ → k3s/agent (kubelet certs, kubeconfigs) -# k3s-server-db/ → k3s/server/db (kine SQLite) -# k3s-server-tls/ → k3s/server/tls (cluster TLS certs) -# k3s-server-cred/ → k3s/server/cred (bootstrap credentials) -# k3s-server-etc/ → k3s/server/etc (k3s-generated config) -# local-path-storage/ → k3s/storage (PVC data) -# pki/ → opt/openshell/pki (mTLS CA + server/client certs) -# -# Directories that stay on virtio-fs (read-only seeds from build-rootfs.sh): -# k3s/server/manifests (k3s auto-deploy manifests, written by init script) -# k3s/server/static (k3s bundled charts) -# k3s/agent/images (airgap image tarballs, seeded once then on disk) - -STATE_DISK_DEVICE="${OPENSHELL_VM_STATE_DISK_DEVICE:-/dev/vda}" -STATE_MOUNT_DIR="/mnt/openshell-state" -STATE_DISK_ACTIVE=false -mkdir -p "$STATE_MOUNT_DIR" - -if [ -b "$STATE_DISK_DEVICE" ]; then - ts "configuring block-backed runtime state on ${STATE_DISK_DEVICE}" - if ! blkid "$STATE_DISK_DEVICE" >/dev/null 2>&1; then - mkfs.ext4 -F -L openshell-state "$STATE_DISK_DEVICE" >/dev/null 2>&1 - ts "formatted state disk" - fi - mount -t ext4 -o noatime "$STATE_DISK_DEVICE" "$STATE_MOUNT_DIR" - - # ── k3s agent: seed images once, then bind entire agent dir ────────── - # agent/images contains airgap image tarballs baked into the rootfs. - # Seed them to the block device on first use so containerd can import - # them; after that they live on the block device alongside everything else. - STATE_K3S_AGENT_DIR="${STATE_MOUNT_DIR}/k3s-agent" - mkdir -p "$STATE_K3S_AGENT_DIR" - if [ ! -f "${STATE_MOUNT_DIR}/.seeded-agent-images" ]; then - VIRTIOFS_AGENT_IMAGES="/var/lib/rancher/k3s/agent/images" - if [ -d "$VIRTIOFS_AGENT_IMAGES" ] && [ -n "$(ls -A "$VIRTIOFS_AGENT_IMAGES" 2>/dev/null)" ]; then - ts "seeding agent images to block device" - mkdir -p "${STATE_K3S_AGENT_DIR}/images" - tar -C "$VIRTIOFS_AGENT_IMAGES" -cf - . 
| tar -C "${STATE_K3S_AGENT_DIR}/images" -xf - - fi - date -u +%Y-%m-%dT%H:%M:%SZ > "${STATE_MOUNT_DIR}/.seeded-agent-images" - fi - mkdir -p /var/lib/rancher/k3s/agent - mount --bind "$STATE_K3S_AGENT_DIR" /var/lib/rancher/k3s/agent - - # ── containerd: bind on top of agent ───────────────────────────────── - # Seeded from the virtiofs rootfs on first use (overlayfs snapshots, - # content store, meta.db pre-populated by build-rootfs.sh). - STATE_CONTAINERD_DIR="${STATE_MOUNT_DIR}/containerd" - mkdir -p "$STATE_CONTAINERD_DIR" - if [ ! -f "${STATE_MOUNT_DIR}/.seeded-containerd" ]; then - if [ -d "$ROOTFS_CONTAINERD_DIR" ] && [ -n "$(ls -A "$ROOTFS_CONTAINERD_DIR" 2>/dev/null)" ]; then - ts "seeding containerd state to block device" - tar -C "$ROOTFS_CONTAINERD_DIR" -cf - . | tar -C "$STATE_CONTAINERD_DIR" -xf - - else - ts "containerd state is empty; starting fresh" - fi - date -u +%Y-%m-%dT%H:%M:%SZ > "${STATE_MOUNT_DIR}/.seeded-containerd" - fi - mkdir -p "$ROOTFS_CONTAINERD_DIR" - mount --bind "$STATE_CONTAINERD_DIR" "$ROOTFS_CONTAINERD_DIR" - - # ── k3s server runtime state ────────────────────────────────────────── - # server/manifests and server/static stay on virtiofs (written by init - # script each boot from /opt/openshell/manifests; read-only after that). - for pair in \ - "k3s-server-db:/var/lib/rancher/k3s/server/db" \ - "k3s-server-tls:/var/lib/rancher/k3s/server/tls" \ - "k3s-server-cred:/var/lib/rancher/k3s/server/cred" \ - "k3s-server-etc:/var/lib/rancher/k3s/server/etc" - do - src="${STATE_MOUNT_DIR}/${pair%%:*}" - dst="${pair#*:}" - mkdir -p "$src" "$dst" - mount --bind "$src" "$dst" - done - - # ── local-path PVC storage ───────────────────────────────────────────── - mkdir -p "${STATE_MOUNT_DIR}/local-path-storage" /var/lib/rancher/k3s/storage - mount --bind "${STATE_MOUNT_DIR}/local-path-storage" /var/lib/rancher/k3s/storage - - # ── PKI ──────────────────────────────────────────────────────────────── - # Certs live on the block device; the host reads them via the exec - # agent (vsock port 10777) instead of polling the virtiofs rootfs path. - mkdir -p "${STATE_MOUNT_DIR}/pki" /opt/openshell/pki - mount --bind "${STATE_MOUNT_DIR}/pki" /opt/openshell/pki - - STATE_DISK_ACTIVE=true - ts "all runtime state mounted from block device" -else - ts "no block device found; using virtiofs-backed runtime state" -fi - -# Clean stale sockets from previous boots. Sockets live in /run (tmpfs) -# and /var/lib/rancher/k3s — they're stale on every boot regardless of -# whether state is on virtiofs or the block device. -find /var/lib/rancher/k3s -name '*.sock' -delete 2>/dev/null || true -find /run -name '*.sock' -delete 2>/dev/null || true -# On the block-device path, node-passwd is regenerated by k3s on each -# start; clear it so k3s doesn't fail node re-registration validation. -rm -f /var/lib/rancher/k3s/server/cred/node-passwd 2>/dev/null || true - -# Clean stale containerd runtime state from previous boots. -# -# The rootfs persists across VM restarts via virtio-fs. The overlayfs -# snapshotter now lives on the host-backed state disk when present, so -# snapshot data and meta.db persist across boots. We only clean runtime -# state (shim PIDs, sockets) that becomes stale when the VM restarts. -if [ -d "$CONTAINERD_DIR" ]; then - # Remove runtime task state (stale shim PIDs, sockets from dead processes). - rm -rf "${CONTAINERD_DIR}/io.containerd.runtime.v2.task" 2>/dev/null || true - # Remove sandbox controller shim state. 
Stale sandbox records cause - # containerd to reuse network namespaces from previous boots, which - # already have routes configured. The CNI bridge plugin then fails - # with "file exists" when adding the default route on retry. - rm -rf "${CONTAINERD_DIR}/io.containerd.sandbox.controller.v1.shim" 2>/dev/null || true - # Clean stale ingest temp files from the content store. - rm -rf "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" 2>/dev/null || true - mkdir -p "${CONTAINERD_DIR}/io.containerd.content.v1.content/ingest" - # meta.db and overlayfs snapshots persist across boots on virtio-fs. - # No need to delete meta.db — snapshot metadata remains valid since - # the snapshotter directory is no longer backed by volatile tmpfs. - ts "cleaned containerd runtime state (meta.db + snapshots preserved)" -fi -rm -rf /run/k3s 2>/dev/null || true - -# Ensure the overlayfs snapshotter directory exists. The snapshotter -# runs directly on virtio-fs, so layer data and snapshot metadata -# persist across VM restarts. This eliminates the need to re-import -# image tarballs and re-extract layers on every boot, significantly -# reducing sandbox creation time. -OVERLAYFS_DIR="${CONTAINERD_DIR}/io.containerd.snapshotter.v1.overlayfs" -mkdir -p "$OVERLAYFS_DIR" -if [ "$STATE_DISK_ACTIVE" = true ]; then - ts "overlayfs snapshotter on block-backed containerd state" -else - ts "overlayfs snapshotter on virtio-fs (persistent)" -fi - -ts "stale artifacts cleaned" - -# ── Clean stale CNI / pod networking state ────────────────────────────── -# The rootfs persists across VM restarts via virtio-fs. Previous pod -# sandboxes leave behind veth pairs, bridge routes, host-local IPAM -# allocations, and network namespaces. If not cleaned, the bridge CNI -# plugin fails with: -# "failed to add route ... file exists" -# because the default route via cni0 already exists from the prior boot, -# or a stale network namespace already has the route configured. - -# Tear down the CNI bridge and its associated routes. -if ip link show cni0 >/dev/null 2>&1; then - ip link set cni0 down 2>/dev/null || true - ip link delete cni0 2>/dev/null || true - ts "deleted stale cni0 bridge" -fi - -# Remove any leftover veth pairs (CNI bridge plugin creates vethXXXX). -veths=$(ip -o link show type veth 2>/dev/null | awk -F': ' '{print $2}' | cut -d'@' -f1 || true) -for veth in $veths; do - ip link delete "$veth" 2>/dev/null || true -done - -# Flush host-local IPAM allocations so IPs can be reassigned cleanly. -rm -rf /var/lib/cni/networks 2>/dev/null || true -rm -rf /var/lib/cni/results 2>/dev/null || true - -# Flush any stale CNI-added routes for the pod CIDR. These can conflict -# with routes the bridge plugin tries to add on the next boot. -ip route flush 10.42.0.0/24 2>/dev/null || true - -# Clean up stale pod network namespaces from previous boots. Containerd -# creates named netns under /var/run/netns/ for each pod sandbox. If -# these persist across VM restarts, the CNI bridge plugin fails when -# adding routes because the stale netns already has the default route -# configured from the prior boot. Removing all named network namespaces -# forces containerd to create fresh ones. -if [ -d /var/run/netns ]; then - netns_list=$(ip netns list 2>/dev/null | awk '{print $1}' || true) - for ns in $netns_list; do - ip netns delete "$ns" 2>/dev/null || true - done -fi -# Also clean the netns bind-mount directory used by containerd/CRI. -# Containerd may use /run/netns/ or /var/run/netns/ (same via tmpfs). 
-rm -rf /run/netns/* 2>/dev/null || true -rm -rf /var/run/netns/* 2>/dev/null || true - -ts "stale CNI networking state cleaned" - -# ── Network profile detection ─────────────────────────────────────────── -# Detect early so manifest patching and k3s flags both use the same value. -# -# "bridge" is the only supported profile. It requires a custom libkrunfw -# with CONFIG_BRIDGE, CONFIG_NETFILTER, CONFIG_NF_NAT built in. If the -# kernel lacks these capabilities the VM cannot run pod networking and we -# fail fast with an actionable error. - -NET_PROFILE="bridge" - -ts "network profile: ${NET_PROFILE}" - -# Validate that the kernel actually has the required capabilities. -_caps_ok=true -if ! ip link add _cap_br0 type bridge 2>/dev/null; then - echo "ERROR: kernel lacks bridge support (CONFIG_BRIDGE). Use a custom libkrunfw." >&2 - _caps_ok=false -else - ip link del _cap_br0 2>/dev/null || true -fi -if [ ! -d /proc/sys/net/netfilter ] && [ ! -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then - echo "ERROR: kernel lacks netfilter support (CONFIG_NETFILTER). Use a custom libkrunfw." >&2 - _caps_ok=false -fi -if [ "$_caps_ok" = false ]; then - echo "FATAL: required kernel capabilities missing — cannot configure pod networking." >&2 - echo "See: architecture/custom-vm-runtime.md for build instructions." >&2 - exit 1 -fi - -# ── Deploy bundled manifests (cold boot only) ─────────────────────────── -# On pre-initialized rootfs, manifests are already in place from the -# build-time k3s boot. Skip this entirely for fast startup. - -K3S_MANIFESTS="/var/lib/rancher/k3s/server/manifests" -BUNDLED_MANIFESTS="/opt/openshell/manifests" - -if [ "$PRE_INITIALIZED" = false ]; then - - mkdir -p "$K3S_MANIFESTS" - - if [ -d "$BUNDLED_MANIFESTS" ]; then - ts "deploying bundled manifests (cold boot)..." - for manifest in "$BUNDLED_MANIFESTS"/*.yaml; do - [ ! -f "$manifest" ] && continue - cp "$manifest" "$K3S_MANIFESTS/" - done - - # Remove stale OpenShell-managed manifests from previous boots. - for existing in "$K3S_MANIFESTS"/openshell-*.yaml \ - "$K3S_MANIFESTS"/agent-*.yaml; do - [ ! -f "$existing" ] && continue - basename=$(basename "$existing") - if [ ! -f "$BUNDLED_MANIFESTS/$basename" ]; then - rm -f "$existing" - fi - done - fi - - # Restore helm chart tarballs from staging. A --reset wipes - # server/static/charts/ but the bundled charts survive in - # /opt/openshell/charts/. - BUNDLED_CHARTS="/opt/openshell/charts" - K3S_CHARTS="/var/lib/rancher/k3s/server/static/charts" - if [ -d "$BUNDLED_CHARTS" ]; then - mkdir -p "$K3S_CHARTS" - cp "$BUNDLED_CHARTS"/*.tgz "$K3S_CHARTS/" 2>/dev/null || true - ts "helm charts restored from staging" - fi - - ts "manifests deployed" -else - ts "skipping manifest deploy (pre-initialized)" -fi - -# Patch manifests for VM deployment constraints. -HELMCHART="$K3S_MANIFESTS/openshell-helmchart.yaml" -if [ -f "$HELMCHART" ]; then - # Use pre-loaded images and a tmp-backed database in the VM. - sed -i 's|__IMAGE_PULL_POLICY__|IfNotPresent|g' "$HELMCHART" - sed -i 's|__SANDBOX_IMAGE_PULL_POLICY__|"IfNotPresent"|g' "$HELMCHART" - sed -i 's|__DB_URL__|"sqlite:/tmp/openshell.db"|g' "$HELMCHART" - # Clear SSH gateway placeholders (default 127.0.0.1 is correct for local VM). 
- sed -i 's|sshGatewayHost: __SSH_GATEWAY_HOST__|sshGatewayHost: ""|g' "$HELMCHART" - sed -i 's|sshGatewayPort: __SSH_GATEWAY_PORT__|sshGatewayPort: 0|g' "$HELMCHART" - sed -i 's|__DISABLE_GATEWAY_AUTH__|false|g' "$HELMCHART" - sed -i 's|__DISABLE_TLS__|false|g' "$HELMCHART" - sed -i 's|hostGatewayIP: __HOST_GATEWAY_IP__|hostGatewayIP: ""|g' "$HELMCHART" - sed -i '/__CHART_CHECKSUM__/d' "$HELMCHART" -fi - -AGENT_MANIFEST="$K3S_MANIFESTS/agent-sandbox.yaml" -if [ -f "$AGENT_MANIFEST" ]; then - # Bridge CNI: agent-sandbox uses normal pod networking. - # kube-proxy is enabled so kubernetes.default.svc is reachable - # via ClusterIP — no need for KUBERNETES_SERVICE_HOST override. - sed -i '/hostNetwork: true/d' "$AGENT_MANIFEST" - sed -i '/dnsPolicy: ClusterFirstWithHostNet/d' "$AGENT_MANIFEST" - ts "agent-sandbox: using pod networking (bridge profile)" -fi - -# ── CNI configuration (bridge) ────────────────────────────────────────── -# Uses the bridge CNI plugin with iptables masquerade. Requires -# CONFIG_BRIDGE, CONFIG_NETFILTER, CONFIG_NF_NAT in the VM kernel -# (validated above at boot). kube-proxy uses nftables mode for service -# VIP routing. - -CNI_CONF_DIR="/etc/cni/net.d" -CNI_BIN_DIR="/opt/cni/bin" -mkdir -p "$CNI_CONF_DIR" "$CNI_BIN_DIR" - -# Enable IP forwarding (required for masquerade). -if ! echo 1 > /proc/sys/net/ipv4/ip_forward 2>/dev/null; then - echo "FATAL: failed to enable IP forwarding — pod networking will not work" >&2 - exit 1 -fi - -# Enable bridge netfilter call (required for CNI bridge masquerade to -# see bridged traffic). -if [ -f /proc/sys/net/bridge/bridge-nf-call-iptables ]; then - if ! echo 1 > /proc/sys/net/bridge/bridge-nf-call-iptables 2>/dev/null; then - ts "WARNING: failed to enable bridge-nf-call-iptables — CNI masquerade may not work" - fi -fi - -cat > "$CNI_CONF_DIR/10-bridge.conflist" << 'CNICFG' -{ - "cniVersion": "1.0.0", - "name": "bridge", - "plugins": [ - { - "type": "bridge", - "bridge": "cni0", - "isGateway": true, - "isDefaultGateway": true, - "ipMasq": true, - "hairpinMode": true, - "ipam": { - "type": "host-local", - "ranges": [[{ "subnet": "10.42.0.0/24" }]] - } - }, - { - "type": "portmap", - "capabilities": { "portMappings": true }, - "snat": true - }, - { - "type": "loopback" - } - ] -} -CNICFG - -# Remove any stale legacy ptp config. -rm -f "$CNI_CONF_DIR/10-ptp.conflist" 2>/dev/null || true - -ts "bridge CNI configured (cni0 + iptables masquerade)" - -# Start the local exec agent before k3s so `openshell-vm exec` works as soon as -# the VM has booted. It only listens on vsock, not on the guest network. -if command -v python3 >/dev/null 2>&1; then - ts "starting openshell-vm exec agent" - mkdir -p /run/openshell - setsid python3 /srv/openshell-vm-exec-agent.py >/run/openshell/openshell-vm-exec-agent.log 2>&1 & -else - ts "WARNING: python3 missing, openshell-vm exec agent disabled" -fi - -# Symlink k3s-bundled CNI binaries to the default containerd bin path. -# k3s extracts its tools to /var/lib/rancher/k3s/data//bin/ at startup. -# On cold boot this directory doesn't exist yet (k3s hasn't run), so we -# first try synchronously, then fall back to a background watcher that -# polls until k3s extracts the binaries and creates the symlinks before -# any pods can schedule. -link_cni_binaries() { - local data_bin="$1" - # Ensure execute permissions on all binaries. The rootfs may have - # been built on macOS where virtio-fs or docker export can strip - # execute bits from Linux ELF binaries. 
- chmod +x "$data_bin"/* 2>/dev/null || true - if [ -d "$data_bin/aux" ]; then - chmod +x "$data_bin/aux"/* 2>/dev/null || true - fi - for plugin in bridge host-local loopback bandwidth portmap; do - [ -e "$data_bin/$plugin" ] && ln -sf "$data_bin/$plugin" "$CNI_BIN_DIR/$plugin" - done -} - -# Find the k3s data bin dir, excluding temporary extraction directories -# (k3s extracts to -tmp/ then renames to /). -find_k3s_data_bin() { - find /var/lib/rancher/k3s/data -maxdepth 2 -name bin -type d 2>/dev/null \ - | grep -v '\-tmp/' | head -1 || true -} - -K3S_DATA_BIN=$(find_k3s_data_bin) -if [ -n "$K3S_DATA_BIN" ]; then - link_cni_binaries "$K3S_DATA_BIN" - ts "CNI binaries linked from $K3S_DATA_BIN" -else - # Cold boot: k3s hasn't extracted binaries yet. Launch a background - # watcher that polls until the data dir appears (k3s creates it in - # the first ~2s of startup) and then symlinks the CNI plugins. - # We exclude -tmp directories to avoid symlinking to the transient - # extraction path that k3s renames once extraction completes. - ts "CNI binaries not yet available, starting background watcher" - setsid sh -c ' - CNI_BIN_DIR="/opt/cni/bin" - for i in $(seq 1 60); do - K3S_DATA_BIN=$(find /var/lib/rancher/k3s/data -maxdepth 2 -name bin -type d 2>/dev/null \ - | grep -v "\-tmp/" | head -1) - if [ -n "$K3S_DATA_BIN" ]; then - chmod +x "$K3S_DATA_BIN"/* 2>/dev/null || true - if [ -d "$K3S_DATA_BIN/aux" ]; then - chmod +x "$K3S_DATA_BIN/aux"/* 2>/dev/null || true - fi - for plugin in bridge host-local loopback bandwidth portmap; do - [ -e "$K3S_DATA_BIN/$plugin" ] && ln -sf "$K3S_DATA_BIN/$plugin" "$CNI_BIN_DIR/$plugin" - done - echo "[cni-watcher] CNI binaries linked from $K3S_DATA_BIN after ${i}s" - exit 0 - fi - sleep 1 - done - echo "[cni-watcher] ERROR: k3s data bin dir not found after 60s" - ' & -fi - -# Also clean up any flannel config from the k3s-specific CNI directory -# (pre-baked state from the Docker build used host-gw flannel). -rm -f "/var/lib/rancher/k3s/agent/etc/cni/net.d/10-flannel.conflist" 2>/dev/null || true - -# ── PKI: generate once, read via exec agent ─────────────────────────── -# Certs are generated on first boot and stored at /opt/openshell/pki/. -# With the block-device layout this path is on the state disk, fully -# isolated from the virtiofs host filesystem. -# The host-side bootstrap reads certs via the exec agent (vsock port -# 10777) by running `cat` on each PEM file. - -PKI_DIR="/opt/openshell/pki" -if [ ! -f "$PKI_DIR/ca.crt" ]; then - ts "generating PKI (first boot)..." 
- mkdir -p "$PKI_DIR" - - # CA - openssl req -x509 -newkey ec -pkeyopt ec_paramgen_curve:prime256v1 \ - -keyout "$PKI_DIR/ca.key" -out "$PKI_DIR/ca.crt" \ - -days 3650 -nodes -subj "/O=openshell/CN=openshell-ca" 2>/dev/null - - # Server cert with SANs - cat > "$PKI_DIR/server.cnf" </dev/null - openssl x509 -req -in "$PKI_DIR/server.csr" \ - -CA "$PKI_DIR/ca.crt" -CAkey "$PKI_DIR/ca.key" -CAcreateserial \ - -out "$PKI_DIR/server.crt" -days 3650 \ - -extensions v3_req -extfile "$PKI_DIR/server.cnf" 2>/dev/null - - # Client cert (must be v3 — rustls rejects v1) - cat > "$PKI_DIR/client.cnf" </dev/null - openssl x509 -req -in "$PKI_DIR/client.csr" \ - -CA "$PKI_DIR/ca.crt" -CAkey "$PKI_DIR/ca.key" -CAcreateserial \ - -out "$PKI_DIR/client.crt" -days 3650 \ - -extensions v3_client -extfile "$PKI_DIR/client.cnf" 2>/dev/null - - # Clean up CSRs - rm -f "$PKI_DIR"/*.csr "$PKI_DIR"/*.cnf "$PKI_DIR"/*.srl - - ts "PKI generated" -else - ts "existing PKI found, skipping generation" -fi - -SSH_HANDSHAKE_SECRET_FILE="${PKI_DIR}/ssh-handshake-secret" -if [ ! -f "$SSH_HANDSHAKE_SECRET_FILE" ]; then - ts "generating SSH handshake secret (first boot)..." - head -c 32 /dev/urandom | od -A n -t x1 | tr -d ' \n' > "$SSH_HANDSHAKE_SECRET_FILE" - chmod 600 "$SSH_HANDSHAKE_SECRET_FILE" -else - ts "existing SSH handshake secret found, reusing" -fi - -# Write TLS secrets as a k3s auto-deploy manifest. k3s applies any YAML -# in server/manifests/ on startup. We write this on every boot so that: -# - A --reset (which wipes the kine DB and server/ tree) gets secrets re-applied. -# - A corrupt kine DB (removed by the host-side corruption check) gets secrets -# re-applied on the fresh database. -# This is idempotent — k3s checksums manifests and only re-applies on change. -ts "writing TLS secrets manifest..." -mkdir -p "$K3S_MANIFESTS" -CA_CRT_B64=$(base64 -w0 < "$PKI_DIR/ca.crt") -SERVER_CRT_B64=$(base64 -w0 < "$PKI_DIR/server.crt") -SERVER_KEY_B64=$(base64 -w0 < "$PKI_DIR/server.key") -CLIENT_CRT_B64=$(base64 -w0 < "$PKI_DIR/client.crt") -CLIENT_KEY_B64=$(base64 -w0 < "$PKI_DIR/client.key") -SSH_HANDSHAKE_SECRET_B64=$(base64 -w0 < "$SSH_HANDSHAKE_SECRET_FILE") - -cat > "$K3S_MANIFESTS/openshell-tls-secrets.yaml" < "$DIAG" - exit 1 - fi - { - echo "=== [DIAG $(date +%s)] nft binary: $NFT ===" - echo "=== [DIAG] nft list tables ===" - "$NFT" list tables 2>&1 - echo "=== [DIAG] nft list ruleset (kube-proxy) ===" - "$NFT" list ruleset 2>&1 - echo "=== [DIAG] ss -tlnp ===" - ss -tlnp 2>&1 || busybox netstat -tlnp 2>&1 || echo "ss/netstat not available" - echo "=== [DIAG] ip addr ===" - ip addr 2>&1 - echo "=== [DIAG] ip route ===" - ip route 2>&1 - echo "=== [DIAG] iptables -t nat -L -n -v ===" - iptables -t nat -L -n -v 2>&1 - echo "=== [DIAG] kube-proxy healthz ===" - wget -q -O - http://127.0.0.1:10256/healthz 2>&1 || echo "healthz failed" - echo "=== [DIAG] conntrack -L ===" - conntrack -L 2>&1 || echo "conntrack not available" - echo "=== [DIAG] done ===" - } > "$DIAG" 2>&1 -' & -fi - -# ── Clear stale kine bootstrap lock ───────────────────────────────────── -# k3s uses kine with a SQLite backend at state.db. When k3s starts, kine -# sets a bootstrap lock row; if k3s is killed before completing bootstrap -# (SIGKILL, host crash, power loss), the lock persists and the next k3s -# instance hangs forever on: -# "Bootstrap key already locked — waiting for data to be populated by -# another server" -# -# We clear the lock row before starting k3s so that a warm boot with -# persistent state.db succeeds. 
If state.db doesn't exist (first boot or -# --reset), this is a harmless no-op. If state.db is corrupt, sqlite3 -# fails silently (|| true) and the host-side corruption check in exec.rs -# will have already removed the file. -KINE_DB="/var/lib/rancher/k3s/server/db/state.db" -if [ -f "$KINE_DB" ]; then - ts "clearing stale kine bootstrap lock (if any)" - # If sqlite3 fails (corrupt DB, missing binary), log the failure. - # The host-side corruption check in exec.rs handles the corrupt case, - # but we should still know about it. - if ! sqlite3 "$KINE_DB" "DELETE FROM kine WHERE name LIKE '/bootstrap/%';" 2>/dev/null; then - ts "WARNING: failed to clear kine bootstrap lock — k3s may hang if DB is corrupt" - fi - if ! sqlite3 "$KINE_DB" "PRAGMA wal_checkpoint(TRUNCATE);" 2>/dev/null; then - ts "WARNING: failed to checkpoint kine WAL" - fi -fi - -exec /usr/local/bin/k3s server "${K3S_ARGS[@]}" diff --git a/crates/openshell-vm/src/embedded.rs b/crates/openshell-vm/src/embedded.rs deleted file mode 100644 index 537e7d725..000000000 --- a/crates/openshell-vm/src/embedded.rs +++ /dev/null @@ -1,454 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Embedded VM runtime resources. -//! -//! Native libraries (libkrun, libkrunfw, gvproxy) and the rootfs are embedded as -//! zstd-compressed byte arrays and extracted to XDG cache directories on first use. -//! -//! Cache locations: -//! - Runtime: `~/.local/share/openshell/vm-runtime/{version}/` -//! - Rootfs: `~/.local/share/openshell/openshell-vm/{version}/instances//rootfs/` - -use std::fs; -use std::io::{Read, Write}; -use std::path::{Path, PathBuf}; - -use indicatif::{ProgressBar, ProgressStyle}; - -use crate::VmError; - -// ── Platform-specific embedded resources ─────────────────────────────────── - -#[cfg(all(target_os = "macos", target_arch = "aarch64"))] -mod resources { - pub const LIBKRUN: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrun.dylib.zst")); - pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.5.dylib.zst")); - pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); - pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); - pub const LIBKRUN_NAME: &str = "libkrun.dylib"; - pub const LIBKRUNFW_NAME: &str = "libkrunfw.5.dylib"; -} - -#[cfg(all(target_os = "linux", target_arch = "aarch64"))] -mod resources { - pub const LIBKRUN: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrun.so.zst")); - pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.so.5.zst")); - pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); - pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); - pub const LIBKRUN_NAME: &str = "libkrun.so"; - pub const LIBKRUNFW_NAME: &str = "libkrunfw.so.5"; -} - -#[cfg(all(target_os = "linux", target_arch = "x86_64"))] -mod resources { - pub const LIBKRUN: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrun.so.zst")); - pub const LIBKRUNFW: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/libkrunfw.so.5.zst")); - pub const GVPROXY: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/gvproxy.zst")); - pub const ROOTFS: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/rootfs.tar.zst")); - pub const LIBKRUN_NAME: &str = "libkrun.so"; - pub const LIBKRUNFW_NAME: &str = "libkrunfw.so.5"; -} - -// Fallback for unsupported 
platforms (will fail at runtime) -#[cfg(not(any( - all(target_os = "macos", target_arch = "aarch64"), - all(target_os = "linux", target_arch = "aarch64"), - all(target_os = "linux", target_arch = "x86_64"), -)))] -mod resources { - pub const LIBKRUN: &[u8] = &[]; - pub const LIBKRUNFW: &[u8] = &[]; - pub const GVPROXY: &[u8] = &[]; - pub const ROOTFS: &[u8] = &[]; - pub const LIBKRUN_NAME: &str = "libkrun"; - pub const LIBKRUNFW_NAME: &str = "libkrunfw"; -} - -const VERSION: &str = env!("CARGO_PKG_VERSION"); - -// ── Public API ───────────────────────────────────────────────────────────── - -/// Ensures the embedded VM runtime is extracted to the cache directory. -/// -/// Returns the path to the runtime directory containing: -/// - libkrun.{dylib,so} -/// - libkrunfw.{5.dylib,.so.5} -/// - gvproxy -/// -/// On first call, this extracts the compressed embedded resources to the cache. -/// Subsequent calls return the cached path if valid. -pub fn ensure_runtime_extracted() -> Result { - // Check if embedded resources are available (non-empty) - if resources::LIBKRUN.is_empty() { - return Err(VmError::HostSetup( - "VM runtime not embedded for this platform. \ - Supported: macOS ARM64, Linux ARM64, Linux x86_64" - .to_string(), - )); - } - - let cache_dir = runtime_cache_dir()?; - let version_marker = cache_dir.join(".version"); - - // Cache key: version + content fingerprint (so dev builds at 0.0.0 - // still invalidate when the embedded libraries change). - let cache_key = runtime_cache_key(); - - // Check if already extracted with the correct cache key - if version_marker.exists() - && let Ok(cached_key) = fs::read_to_string(&version_marker) - && cached_key.trim() == cache_key - { - // Validate files exist - if validate_runtime_dir(&cache_dir).is_ok() { - tracing::debug!( - path = %cache_dir.display(), - "Using cached VM runtime" - ); - return Ok(cache_dir); - } - } - - // Clean up old versions before extracting new one - cleanup_old_versions(&cache_dir)?; - - // Create fresh directory - if cache_dir.exists() { - fs::remove_dir_all(&cache_dir) - .map_err(|e| VmError::HostSetup(format!("remove old cache: {e}")))?; - } - fs::create_dir_all(&cache_dir) - .map_err(|e| VmError::HostSetup(format!("create cache dir: {e}")))?; - - tracing::info!( - path = %cache_dir.display(), - version = VERSION, - "Extracting embedded VM runtime" - ); - - // Extract all resources - extract_resource(resources::LIBKRUN, &cache_dir.join(resources::LIBKRUN_NAME))?; - extract_resource( - resources::LIBKRUNFW, - &cache_dir.join(resources::LIBKRUNFW_NAME), - )?; - extract_resource(resources::GVPROXY, &cache_dir.join("gvproxy"))?; - - // On macOS, libkrun.dylib references libkrunfw via @loader_path/libkrunfw.dylib - // (the unversioned name), but we embed as libkrunfw.5.dylib. Create the - // unversioned name so dyld can resolve the dependency. 
- #[cfg(target_os = "macos")] - { - let unversioned = cache_dir.join("libkrunfw.dylib"); - if !unversioned.exists() { - std::os::unix::fs::symlink(resources::LIBKRUNFW_NAME, &unversioned) - .map_err(|e| VmError::HostSetup(format!("symlink libkrunfw.dylib: {e}")))?; - } - } - - // Make gvproxy executable - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt; - fs::set_permissions(cache_dir.join("gvproxy"), fs::Permissions::from_mode(0o755)) - .map_err(|e| VmError::HostSetup(format!("chmod gvproxy: {e}")))?; - } - - // Write version marker (includes content fingerprint for cache invalidation) - fs::write(&version_marker, runtime_cache_key()) - .map_err(|e| VmError::HostSetup(format!("write version marker: {e}")))?; - - tracing::info!( - path = %cache_dir.display(), - "VM runtime extracted successfully" - ); - - Ok(cache_dir) -} - -/// Returns the path where the runtime would be cached (without extracting). -pub fn runtime_cache_path() -> Result { - runtime_cache_dir() -} - -/// Extract the embedded rootfs to the given destination directory. -/// -/// If the destination already exists, it is returned as-is (no re-extraction). -/// Otherwise the embedded `rootfs.tar.zst` is decompressed and unpacked into `dest`. -/// -/// A `.version` marker is written after successful extraction so that -/// version-mismatched rootfs directories are detected and rebuilt. -pub fn extract_rootfs_to(dest: &Path) -> Result<(), VmError> { - if resources::ROOTFS.is_empty() { - return Err(VmError::HostSetup( - "Rootfs not embedded. Build with: mise run vm:build:embedded".to_string(), - )); - } - - let version_marker = dest.join(".version"); - - // Already extracted with the correct version — nothing to do. - if version_marker.exists() - && let Ok(cached_version) = fs::read_to_string(&version_marker) - && cached_version.trim() == VERSION - { - tracing::debug!( - path = %dest.display(), - "Using cached rootfs" - ); - return Ok(()); - } - - // Remove existing if present (version mismatch or incomplete extraction). - if dest.exists() { - eprintln!("Removing outdated rootfs at {}...", dest.display()); - fs::remove_dir_all(dest) - .map_err(|e| VmError::HostSetup(format!("remove old rootfs: {e}")))?; - } - - // Extract with progress bar. - extract_rootfs_with_progress(resources::ROOTFS, dest)?; - - // Write version marker. - fs::write(&version_marker, VERSION) - .map_err(|e| VmError::HostSetup(format!("write rootfs version marker: {e}")))?; - - Ok(()) -} - -/// Clean up rootfs directories from older versions. -/// -/// Call this periodically (e.g. at startup) to reclaim disk from previous -/// releases. Removes all version directories under the openshell-vm base -/// except the current version. -pub fn cleanup_old_rootfs() -> Result<(), VmError> { - let base = rootfs_cache_base()?; - if !base.exists() { - return Ok(()); - } - - let current_version_dir = base.join(VERSION); - cleanup_old_versions_in_base(&base, ¤t_version_dir); - Ok(()) -} - -/// Check if the rootfs is embedded (non-empty). -pub fn has_embedded_rootfs() -> bool { - !resources::ROOTFS.is_empty() -} - -// ── Internal helpers ─────────────────────────────────────────────────────── - -/// Build a cache key that combines the version string with a short content -/// fingerprint of the embedded runtime bytes. -/// -/// Using the version alone is insufficient for dev builds (all `0.0.0`) -/// because the embedded libraries can change between compiles without the -/// version changing. 
The fingerprint is a simple XOR-fold of the first few -/// bytes of each embedded resource — cheap to compute at startup without -/// pulling in a hash dependency. -fn runtime_cache_key() -> String { - // XOR-fold the first 64 bytes of each resource to get a cheap fingerprint. - let mut fp: u64 = 0; - for (i, chunk) in [resources::LIBKRUN, resources::LIBKRUNFW, resources::GVPROXY] - .iter() - .enumerate() - { - let sample = &chunk[..chunk.len().min(64)]; - let mut word: u64 = 0; - for (j, &b) in sample.iter().enumerate() { - word ^= u64::from(b) << ((j % 8) * 8); - } - // Mix in resource index so identical resources don't cancel out. - let i_u32 = u32::try_from(i).unwrap_or(u32::MAX); - fp ^= word.rotate_left(i_u32 * 13 + 7); - // Also mix in the total length so size changes are detected. - fp ^= (chunk.len() as u64).rotate_left(i_u32 * 17 + 3); - } - format!("{VERSION}-{fp:016x}") -} - -fn runtime_cache_dir() -> Result { - let base = openshell_core::paths::xdg_data_dir() - .map_err(|e| VmError::HostSetup(format!("resolve XDG data dir: {e}")))?; - Ok(base.join("openshell").join("vm-runtime").join(VERSION)) -} - -fn runtime_cache_base() -> Result { - let base = openshell_core::paths::xdg_data_dir() - .map_err(|e| VmError::HostSetup(format!("resolve XDG data dir: {e}")))?; - Ok(base.join("openshell").join("vm-runtime")) -} - -fn rootfs_cache_base() -> Result { - let base = openshell_core::paths::xdg_data_dir() - .map_err(|e| VmError::HostSetup(format!("resolve XDG data dir: {e}")))?; - Ok(base.join("openshell").join("openshell-vm")) -} - -fn cleanup_old_versions(current_dir: &Path) -> Result<(), VmError> { - cleanup_old_versions_in_base(&runtime_cache_base()?, current_dir); - Ok(()) -} - -fn cleanup_old_versions_in_base(base: &Path, current_dir: &Path) { - if !base.exists() { - return; - } - - let Ok(entries) = fs::read_dir(base) else { - return; // Can't read, skip cleanup - }; - - for entry in entries.filter_map(Result::ok) { - let path = entry.path(); - // Skip if this is the current version directory or a parent of it - if path.is_dir() && !current_dir.starts_with(&path) && path != current_dir { - tracing::debug!( - path = %path.display(), - "Cleaning up old version" - ); - if let Err(e) = fs::remove_dir_all(&path) { - tracing::warn!( - path = %path.display(), - error = %e, - "Failed to clean up old version" - ); - } - } - } -} - -fn extract_resource(compressed: &[u8], dest: &Path) -> Result<(), VmError> { - if compressed.is_empty() { - return Err(VmError::HostSetup(format!( - "embedded resource is empty: {}", - dest.display() - ))); - } - - let decompressed = zstd::decode_all(compressed) - .map_err(|e| VmError::HostSetup(format!("decompress {}: {e}", dest.display())))?; - - let mut file = fs::File::create(dest) - .map_err(|e| VmError::HostSetup(format!("create {}: {e}", dest.display())))?; - - file.write_all(&decompressed) - .map_err(|e| VmError::HostSetup(format!("write {}: {e}", dest.display())))?; - - tracing::debug!( - path = %dest.display(), - compressed_size = compressed.len(), - decompressed_size = decompressed.len(), - "Extracted resource" - ); - - Ok(()) -} - -fn extract_rootfs_with_progress(compressed: &[u8], dest: &Path) -> Result<(), VmError> { - eprintln!("Extracting VM environment (first run)..."); - - // Create progress bar for decompression - let pb = ProgressBar::new(compressed.len() as u64); - pb.set_style( - ProgressStyle::default_bar() - .template(" Decompressing [{bar:40.cyan/blue}] {bytes}/{total_bytes}") - .unwrap() - .progress_chars("=>-"), - ); - - // Wrap the 
compressed data in a progress reader - let reader = ProgressReader::new(std::io::Cursor::new(compressed), pb.clone()); - - // Decompress zstd stream - let decoder = zstd::Decoder::new(reader) - .map_err(|e| VmError::HostSetup(format!("create zstd decoder: {e}")))?; - - pb.finish_and_clear(); - - // Create destination directory - fs::create_dir_all(dest).map_err(|e| VmError::HostSetup(format!("create rootfs dir: {e}")))?; - - // Extract tar archive with progress - eprintln!(" Extracting rootfs..."); - let mut archive = tar::Archive::new(decoder); - archive - .unpack(dest) - .map_err(|e| VmError::HostSetup(format!("extract rootfs tarball: {e}")))?; - - eprintln!(" Rootfs extracted to {}", dest.display()); - - Ok(()) -} - -/// A reader wrapper that updates a progress bar as data is read. -struct ProgressReader { - inner: R, - progress: ProgressBar, -} - -impl ProgressReader { - fn new(inner: R, progress: ProgressBar) -> Self { - Self { inner, progress } - } -} - -impl Read for ProgressReader { - fn read(&mut self, buf: &mut [u8]) -> std::io::Result { - let n = self.inner.read(buf)?; - self.progress.inc(n as u64); - Ok(n) - } -} - -fn validate_runtime_dir(dir: &Path) -> Result<(), VmError> { - let libkrun = dir.join(resources::LIBKRUN_NAME); - let libkrunfw = dir.join(resources::LIBKRUNFW_NAME); - let gvproxy = dir.join("gvproxy"); - - for path in [&libkrun, &libkrunfw, &gvproxy] { - if !path.exists() { - return Err(VmError::HostSetup(format!( - "missing runtime file: {}", - path.display() - ))); - } - - // Check file is not empty (would indicate a stub) - let size = fs::metadata(path).map_or(0, |m| m.len()); - if size == 0 { - return Err(VmError::HostSetup(format!( - "runtime file is empty (stub): {}", - path.display() - ))); - } - } - - Ok(()) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_resources_not_empty() { - // On supported platforms, resources should be non-empty - #[cfg(any( - all(target_os = "macos", target_arch = "aarch64"), - all(target_os = "linux", target_arch = "aarch64"), - all(target_os = "linux", target_arch = "x86_64"), - ))] - { - // Note: This test only passes if `mise run vm:setup` was run - // before building. In CI without compressed artifacts, resources will be - // empty stubs. - if !resources::LIBKRUN.is_empty() { - assert!(!resources::LIBKRUNFW.is_empty()); - assert!(!resources::GVPROXY.is_empty()); - } - } - } -} diff --git a/crates/openshell-vm/src/exec.rs b/crates/openshell-vm/src/exec.rs deleted file mode 100644 index 63771509e..000000000 --- a/crates/openshell-vm/src/exec.rs +++ /dev/null @@ -1,1176 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -use std::fs::{self, File}; -use std::io::{BufRead, BufReader, Read, Write}; -use std::os::fd::{AsFd, BorrowedFd}; -use std::os::unix::net::UnixStream; -use std::path::{Path, PathBuf}; -use std::thread; -use std::time::{SystemTime, UNIX_EPOCH}; - -use base64::Engine as _; -use nix::sys::termios::{self, SetArg, Termios}; -use serde::{Deserialize, Serialize}; - -use crate::VmError; - -/// Remove a directory, safely handling symlinks. -/// -/// Uses `symlink_metadata` (lstat) to detect symlinks. If the path is a -/// symlink (e.g. `var/run -> /run` in a Linux rootfs), the symlink itself -/// is removed without following it — preventing traversal attacks where a -/// symlink could redirect `remove_dir_all` to an arbitrary host path. 
-/// If the path is a real directory, it is removed recursively.
-fn safe_remove_dir_all(path: &Path) -> Result<bool, VmError> {
-    match fs::symlink_metadata(path) {
-        Ok(meta) => {
-            if meta.file_type().is_symlink() {
-                // Remove the symlink itself, not the target it points to.
-                fs::remove_file(path).map_err(|e| {
-                    VmError::RuntimeState(format!("reset: remove symlink {}: {e}", path.display()))
-                })?;
-                return Ok(true);
-            }
-            if !meta.is_dir() {
-                return Ok(false); // Not a directory — nothing to remove.
-            }
-            fs::remove_dir_all(path).map_err(|e| {
-                VmError::RuntimeState(format!("reset: remove {}: {e}", path.display()))
-            })?;
-            Ok(true)
-        }
-        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(false),
-        Err(e) => Err(VmError::RuntimeState(format!(
-            "stat {}: {e}",
-            path.display()
-        ))),
-    }
-}
-
-pub const VM_EXEC_VSOCK_PORT: u32 = 10_777;
-
-const VM_STATE_NAME: &str = "vm-state.json";
-const VM_LOCK_NAME: &str = "vm.lock";
-const KUBECONFIG_ENV: &str = "KUBECONFIG=/etc/rancher/k3s/k3s.yaml";
-
-#[derive(Debug, Clone)]
-pub struct VmExecOptions {
-    pub rootfs: Option<PathBuf>,
-    pub command: Vec<String>,
-    pub workdir: Option<String>,
-    pub env: Vec<String>,
-    pub tty: bool,
-}
-
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct VmRuntimeState {
-    pub pid: i32,
-    pub exec_vsock_port: u32,
-    pub socket_path: PathBuf,
-    pub rootfs: PathBuf,
-    pub console_log: PathBuf,
-    pub started_at_ms: u128,
-    /// PID of the gvproxy process (if networking uses gvproxy).
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub gvproxy_pid: Option<i32>,
-}
-
-#[derive(Debug, Serialize)]
-struct ExecRequest {
-    argv: Vec<String>,
-    env: Vec<String>,
-    cwd: Option<String>,
-    tty: bool,
-}
-
-#[derive(Debug, Serialize)]
-#[serde(tag = "type", rename_all = "snake_case")]
-enum ClientFrame {
-    Stdin { data: String },
-    StdinClose,
-    Resize { cols: u16, rows: u16 },
-}
-
-#[derive(Debug, Deserialize)]
-#[serde(tag = "type", rename_all = "snake_case")]
-enum ServerFrame {
-    Stdout { data: String },
-    Stderr { data: String },
-    Exit { code: i32 },
-    Error { message: String },
-}
-
-struct RawModeGuard {
-    raw_fd: i32,
-    original: Termios,
-}
-
-impl RawModeGuard {
-    fn enter() -> Result<Self, VmError> {
-        let stdin = std::io::stdin();
-        let fd = stdin.as_fd();
-        let original =
-            termios::tcgetattr(fd).map_err(|e| VmError::Exec(format!("tcgetattr: {e}")))?;
-        let mut raw = original.clone();
-        termios::cfmakeraw(&mut raw);
-        termios::tcsetattr(fd, SetArg::TCSANOW, &raw)
-            .map_err(|e| VmError::Exec(format!("tcsetattr: {e}")))?;
-        Ok(Self {
-            raw_fd: std::os::unix::io::AsRawFd::as_raw_fd(&stdin),
-            original,
-        })
-    }
-}
-
-impl Drop for RawModeGuard {
-    fn drop(&mut self) {
-        let fd = unsafe { BorrowedFd::borrow_raw(self.raw_fd) };
-        let _ = termios::tcsetattr(fd, SetArg::TCSANOW, &self.original);
-    }
-}
-
-fn get_terminal_size() -> Option<(u16, u16)> {
-    let fd = std::os::unix::io::AsRawFd::as_raw_fd(&std::io::stdout());
-    let mut ws: libc::winsize = unsafe { std::mem::zeroed() };
-    let rc = unsafe { libc::ioctl(fd, libc::TIOCGWINSZ, &mut ws) };
-    if rc == 0 && ws.ws_col > 0 && ws.ws_row > 0 {
-        Some((ws.ws_col, ws.ws_row))
-    } else {
-        None
-    }
-}
-
-pub fn vm_exec_socket_path(rootfs: &Path) -> PathBuf {
-    // Prefer XDG_RUNTIME_DIR (per-user, restricted permissions on Linux),
-    // fall back to /tmp. Ownership/symlink validation happens in
-    // secure_socket_base() when the gvproxy socket dir is created; here
-    // we just compute the path. The parent directory is created (with
-    // permission checks) at launch time via create_dir_all.
- let base = std::env::var_os("XDG_RUNTIME_DIR").map_or_else( - || { - let mut base = PathBuf::from("/tmp"); - if !base.is_dir() { - base = std::env::temp_dir(); - } - base - }, - PathBuf::from, - ); - let dir = base.join("ovm-exec"); - let id = hash_path_id(rootfs); - dir.join(format!("{id}.sock")) -} - -fn hash_path_id(path: &Path) -> String { - let mut hash: u64 = 0xcbf2_9ce4_8422_2325; - for byte in path.to_string_lossy().as_bytes() { - hash ^= u64::from(*byte); - hash = hash.wrapping_mul(0x0100_0000_01b3); - } - format!("{:012x}", hash & 0x0000_ffff_ffff_ffff) -} - -pub fn write_vm_runtime_state( - rootfs: &Path, - pid: i32, - console_log: &Path, - gvproxy_pid: Option, -) -> Result<(), VmError> { - let state = VmRuntimeState { - pid, - exec_vsock_port: VM_EXEC_VSOCK_PORT, - socket_path: vm_exec_socket_path(rootfs), - rootfs: rootfs.to_path_buf(), - console_log: console_log.to_path_buf(), - started_at_ms: now_ms()?, - gvproxy_pid, - }; - let path = vm_state_path(rootfs); - let bytes = serde_json::to_vec_pretty(&state) - .map_err(|e| VmError::RuntimeState(format!("serialize VM runtime state: {e}")))?; - fs::create_dir_all(vm_run_dir(rootfs)) - .map_err(|e| VmError::RuntimeState(format!("create VM runtime dir: {e}")))?; - fs::write(&path, bytes) - .map_err(|e| VmError::RuntimeState(format!("write {}: {e}", path.display())))?; - Ok(()) -} - -pub fn clear_vm_runtime_state(rootfs: &Path) { - let state_path = vm_state_path(rootfs); - let socket_path = vm_exec_socket_path(rootfs); - let _ = fs::remove_file(state_path); - let _ = fs::remove_file(socket_path); -} - -/// Wipe stale container runtime state from the rootfs. -/// -/// After a crash or unclean shutdown, containerd and kubelet can retain -/// references to pod sandboxes and containers that no longer exist. This -/// causes `ContainerCreating` → `context deadline exceeded` loops because -/// containerd blocks trying to clean up orphaned resources. -/// -/// This function removes: -/// - containerd runtime task state (running container metadata) -/// - containerd sandbox controller shim state -/// - containerd CRI plugin state (pod/container tracking) -/// - containerd tmp mounts -/// - kubelet pod state (volume mounts, pod status) -/// -/// It preserves: -/// - containerd images and content (no re-pull needed) -/// - containerd snapshots (no re-extract needed) -/// - containerd metadata database (meta.db — image/snapshot tracking) -/// -/// **Note:** This is the only path that wipes the kine/SQLite database. -/// Normal boots preserve `state.db` (and all cluster objects) across -/// restarts. The init script clears stale bootstrap locks via `sqlite3`, -/// and `recover_corrupt_kine_db` handles actual file corruption. -pub fn reset_runtime_state(rootfs: &Path, gateway_name: &str) -> Result<(), VmError> { - // Full reset: wipe all runtime state so the VM cold-starts from scratch. - // - // With the block-device layout, k3s server/agent state, containerd, PVCs, - // and PKI all live on the state disk — the caller in lib.rs deletes the - // entire state disk image file, which achieves a complete wipe in one - // operation without touching the virtiofs rootfs. - // - // We still clean the virtiofs rootfs for paths that are NOT on the state - // disk: kubelet pod volumes, CNI state, and the pre-init sentinel. These - // paths are present in the rootfs regardless of the storage layout. 
- let dirs_to_remove = [ - // Stale pod volume mounts and projected secrets - rootfs.join("var/lib/kubelet/pods"), - // CNI state: stale network namespace references from dead pods - rootfs.join("var/lib/cni"), - // Runtime state (PIDs, sockets) — on virtiofs, not block device - rootfs.join("var/run"), - ]; - - let mut cleaned = 0usize; - for dir in &dirs_to_remove { - if safe_remove_dir_all(dir)? { - cleaned += 1; - } - } - - // Remove the pre-initialized sentinel so the init script knows - // this is a cold start and deploys manifests from staging. - // We write a marker file so ensure-vm-rootfs.sh still sees the - // rootfs as built (avoiding a full rebuild) while the init script - // detects the cold start via the missing .initialized sentinel. - let sentinel = rootfs.join("opt/openshell/.initialized"); - let reset_marker = rootfs.join("opt/openshell/.reset"); - if sentinel.exists() { - fs::remove_file(&sentinel).map_err(|e| { - VmError::RuntimeState(format!( - "reset: remove sentinel {}: {e}", - sentinel.display() - )) - })?; - fs::write(&reset_marker, "").map_err(|e| { - VmError::RuntimeState(format!( - "reset: write marker {}: {e}", - reset_marker.display() - )) - })?; - cleaned += 1; - } - - // PKI lives on the state disk; deleting the state disk image (done by - // the caller) rotates it automatically. Just note it for the log. - eprintln!("Reset: PKI will be regenerated on next boot (state disk wiped)"); - - // Wipe host-side mTLS credentials so bootstrap_gateway() takes the - // first-boot path and fetches new certs from the VM via the exec agent. - if let Ok(home) = std::env::var("HOME") { - let config_base = - std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); - let mtls_dir = PathBuf::from(&config_base) - .join("openshell/gateways") - .join(gateway_name) - .join("mtls"); - if mtls_dir.is_dir() { - fs::remove_dir_all(&mtls_dir).map_err(|e| { - VmError::RuntimeState(format!( - "reset: remove mTLS dir {}: {e}", - mtls_dir.display() - )) - })?; - } - // Also remove metadata so is_warm_boot() returns false. - let metadata = PathBuf::from(&config_base) - .join("openshell/gateways") - .join(gateway_name) - .join("metadata.json"); - if metadata.is_file() { - fs::remove_file(&metadata).map_err(|e| { - VmError::RuntimeState(format!( - "reset: remove metadata {}: {e}", - metadata.display() - )) - })?; - } - } - - eprintln!("Reset: cleaned {cleaned} state directories (full reset)"); - Ok(()) -} - -/// Remove a corrupt kine (`SQLite`) database so k3s can recreate it on boot. -/// -/// k3s uses kine with a `SQLite` backend at `var/lib/rancher/k3s/server/db/state.db`. -/// If the VM is killed mid-write (SIGKILL, host crash, power loss), the database -/// file may be left in a corrupt state — the `SQLite` header magic is missing or the -/// file is truncated. k3s would open the DB, get `SQLITE_NOTADB` / -/// `SQLITE_CORRUPT`, and crash at startup. -/// -/// This function checks the `SQLite` file header (first 100 bytes only) and removes -/// the database plus its WAL/SHM sidecar files if the header is invalid. k3s will -/// create a fresh database on startup and cluster state will be re-applied from -/// the auto-deploy manifests in `server/manifests/`. 
-/// -/// **Stale bootstrap locks** (a kine application-level issue where a killed k3s -/// server leaves a lock row that causes the next instance to hang) are handled -/// separately by the init script (`openshell-vm-init.sh`), which runs -/// `sqlite3 state.db "DELETE FROM kine WHERE name LIKE '/bootstrap/%'"` before -/// starting k3s. This allows the database — and all persistent cluster state — to -/// survive normal restarts. -/// -/// **What is lost on corruption:** all cluster object records (Pods, Deployments, -/// Secrets, `ConfigMaps`, CRDs, etc.) and the bootstrap token. These are re-created -/// from manifests on the next boot. -/// -/// **What is always preserved:** container images and snapshots (under -/// `k3s/agent/`), PKI, and the `.initialized` sentinel. -/// -/// This function is a no-op if `state.db` does not exist (e.g. first boot or -/// after a full `--reset`). -pub fn recover_corrupt_kine_db(rootfs: &Path) -> Result<(), VmError> { - // The SQLite file format begins with a 16-byte magic string. - // Reference: https://www.sqlite.org/fileformat.html#the_database_header - const SQLITE_MAGIC: &[u8] = b"SQLite format 3\x00"; - - let db_path = rootfs.join("var/lib/rancher/k3s/server/db/state.db"); - if !db_path.exists() { - return Ok(()); // Nothing to check — first boot or post-reset. - } - - // Read only the first 100 bytes (the minimum valid SQLite header size) - // instead of loading the entire database into memory. - let has_invalid_header = match File::open(&db_path).and_then(|mut f| { - let mut buf = [0u8; 100]; - let n = f.read(&mut buf)?; - Ok((n, buf)) - }) { - Err(_) => true, // Can't read → treat as corrupt. - Ok((n, _)) if n < 100 => true, // Too short to be a valid DB. - Ok((_, buf)) => !buf.starts_with(SQLITE_MAGIC), - }; - - if !has_invalid_header { - return Ok(()); // Valid database — preserve it for warm boot. - } - - eprintln!( - "Warning: kine database is corrupt ({}), removing for clean boot", - db_path.display() - ); - - remove_kine_db_files(&db_path)?; - - Ok(()) -} - -/// Remove the kine `SQLite` database and its WAL/SHM sidecar files. -fn remove_kine_db_files(db_path: &Path) -> Result<(), VmError> { - if let Err(e) = fs::remove_file(db_path) { - return Err(VmError::RuntimeState(format!( - "failed to remove kine database {}: {e}", - db_path.display() - ))); - } - // Also remove any WAL/SHM sidecar files left by an interrupted write. - let _ = fs::remove_file(db_path.with_extension("db-wal")); - let _ = fs::remove_file(db_path.with_extension("db-shm")); - Ok(()) -} - -/// Acquire an exclusive lock on the rootfs lock file. -/// -/// The lock is held for the lifetime of the returned `File` handle. When -/// the process exits (even via SIGKILL), the OS releases the lock -/// automatically. This provides a reliable guard against two VM processes -/// sharing the same rootfs — even if the state file is deleted. -/// -/// Returns `Ok(File)` on success. The caller must keep the `File` alive -/// for as long as the VM is running. -pub fn acquire_rootfs_lock(rootfs: &Path) -> Result { - let lock_path = vm_lock_path(rootfs); - fs::create_dir_all(vm_run_dir(rootfs)) - .map_err(|e| VmError::RuntimeState(format!("create VM runtime dir: {e}")))?; - - // Open (or create) the lock file without truncating so we can read - // the holder's PID for the error message if the lock is held. 
- let file = fs::OpenOptions::new() - .read(true) - .write(true) - .create(true) - .truncate(false) - .open(&lock_path) - .map_err(|e| { - VmError::RuntimeState(format!("open lock file {}: {e}", lock_path.display())) - })?; - - // Try non-blocking exclusive lock. - let fd = std::os::unix::io::AsRawFd::as_raw_fd(&file); - let rc = unsafe { libc::flock(fd, libc::LOCK_EX | libc::LOCK_NB) }; - if rc != 0 { - let err = std::io::Error::last_os_error(); - if err.raw_os_error() == Some(libc::EWOULDBLOCK) { - // Another process holds the lock — read its PID for diagnostics. - let holder_pid = fs::read_to_string(&lock_path).unwrap_or_default(); - let holder_pid = holder_pid.trim(); - return Err(VmError::RuntimeState(format!( - "another process (pid {holder_pid}) is using rootfs {}. \ - Stop the running VM first", - rootfs.display() - ))); - } - return Err(VmError::RuntimeState(format!( - "lock rootfs {}: {err}", - lock_path.display() - ))); - } - - // Lock acquired — write our PID (truncate first, then write). - // This is informational only; the flock is the real guard. - let _ = file.set_len(0); - { - let mut f = &file; - let _ = write!(f, "{}", std::process::id()); - } - - Ok(file) -} - -/// Check whether the rootfs lock file is currently held by another process. -/// -/// Returns `Ok(())` if the lock is free (or can be acquired), and an -/// `Err` if another process holds it. Does NOT acquire the lock — use -/// [`acquire_rootfs_lock`] for that. -fn check_rootfs_lock_free(rootfs: &Path) -> Result<(), VmError> { - let lock_path = vm_lock_path(rootfs); - if !lock_path.exists() { - return Ok(()); - } - - let Ok(file) = File::open(&lock_path) else { - return Ok(()); // Can't open → treat as free - }; - - let fd = std::os::unix::io::AsRawFd::as_raw_fd(&file); - let rc = unsafe { libc::flock(fd, libc::LOCK_EX | libc::LOCK_NB) }; - if rc != 0 { - let err = std::io::Error::last_os_error(); - if err.raw_os_error() == Some(libc::EWOULDBLOCK) { - let holder_pid = fs::read_to_string(&lock_path).unwrap_or_default(); - let holder_pid = holder_pid.trim(); - return Err(VmError::RuntimeState(format!( - "another process (pid {holder_pid}) is using rootfs {}. \ - Stop the running VM first", - rootfs.display() - ))); - } - } else { - // We acquired the lock — release it immediately since we're only probing. - unsafe { libc::flock(fd, libc::LOCK_UN) }; - } - - Ok(()) -} - -pub fn ensure_vm_not_running(rootfs: &Path) -> Result<(), VmError> { - // Primary guard: check the flock. This works even if the state file - // has been deleted, because the kernel holds the lock until the - // owning process exits. - check_rootfs_lock_free(rootfs)?; - - // Secondary guard: check the state file for any stale state. 
- match load_vm_runtime_state(Some(rootfs)) { - Ok(state) => Err(VmError::RuntimeState(format!( - "VM is already running (pid {}) with exec socket {}", - state.pid, - state.socket_path.display() - ))), - Err(VmError::RuntimeState(message)) - if message.starts_with("read VM runtime state") - || message.starts_with("VM is not running") => - { - clear_vm_runtime_state(rootfs); - Ok(()) - } - Err(err) => Err(err), - } -} - -pub fn exec_running_vm(options: VmExecOptions) -> Result { - let state = load_vm_runtime_state(options.rootfs.as_deref())?; - let mut stream = UnixStream::connect(&state.socket_path).map_err(|e| { - VmError::Exec(format!( - "connect to VM exec socket {}: {e}", - state.socket_path.display() - )) - })?; - let mut writer = stream - .try_clone() - .map_err(|e| VmError::Exec(format!("clone VM exec socket: {e}")))?; - - let mut env = options.env; - validate_env_vars(&env)?; - if !env.iter().any(|item| item.starts_with("KUBECONFIG=")) { - env.push(KUBECONFIG_ENV.to_string()); - } - - let request = ExecRequest { - argv: options.command, - env, - cwd: options.workdir, - tty: options.tty, - }; - send_json_line(&mut writer, &request)?; - - let tty = options.tty; - let _raw_guard = if tty { - if let Some((cols, rows)) = get_terminal_size() { - send_json_line(&mut writer, &ClientFrame::Resize { cols, rows })?; - } - Some(RawModeGuard::enter()?) - } else { - None - }; - - let stdin_writer = writer; - thread::spawn(move || { - let _ = pump_stdin(stdin_writer, tty); - }); - - let mut reader = BufReader::new(&mut stream); - let mut line = String::new(); - let stdout = std::io::stdout(); - let stderr = std::io::stderr(); - let mut stdout = stdout.lock(); - let mut stderr = stderr.lock(); - let mut exit_code = None; - - loop { - line.clear(); - let bytes = reader - .read_line(&mut line) - .map_err(|e| VmError::Exec(format!("read VM exec response from guest agent: {e}")))?; - if bytes == 0 { - break; - } - - let frame: ServerFrame = serde_json::from_str(line.trim_end()) - .map_err(|e| VmError::Exec(format!("decode VM exec response frame: {e}")))?; - - match frame { - ServerFrame::Stdout { data } => { - let bytes = decode_payload(&data)?; - stdout - .write_all(&bytes) - .map_err(|e| VmError::Exec(format!("write guest stdout: {e}")))?; - stdout - .flush() - .map_err(|e| VmError::Exec(format!("flush guest stdout: {e}")))?; - } - ServerFrame::Stderr { data } => { - let bytes = decode_payload(&data)?; - stderr - .write_all(&bytes) - .map_err(|e| VmError::Exec(format!("write guest stderr: {e}")))?; - stderr - .flush() - .map_err(|e| VmError::Exec(format!("flush guest stderr: {e}")))?; - } - ServerFrame::Exit { code } => { - exit_code = Some(code); - break; - } - ServerFrame::Error { message } => { - return Err(VmError::Exec(message)); - } - } - } - - exit_code.ok_or_else(|| { - VmError::Exec("VM exec agent disconnected before returning an exit code".to_string()) - }) -} - -/// Run a command inside the guest via the exec agent and capture its stdout. -/// -/// Unlike [`exec_running_vm`], this function does not pump host stdin or write -/// to the terminal. It collects all stdout frames into a `Vec` and returns -/// them on success (exit code 0). Stderr output is discarded. -/// -/// This is the building block for internal host→guest queries (e.g. reading -/// files from the guest filesystem) without requiring a dedicated vsock server. 
-pub fn exec_capture(socket_path: &Path, argv: Vec) -> Result, VmError> { - let mut stream = UnixStream::connect(socket_path).map_err(|e| { - VmError::Exec(format!( - "connect to VM exec socket {}: {e}", - socket_path.display() - )) - })?; - let mut writer = stream - .try_clone() - .map_err(|e| VmError::Exec(format!("clone VM exec socket: {e}")))?; - - let request = ExecRequest { - argv, - env: vec![], - cwd: None, - tty: false, - }; - send_json_line(&mut writer, &request)?; - - // Close stdin immediately — we have no input to send. - send_json_line(&mut writer, &ClientFrame::StdinClose)?; - - let mut reader = BufReader::new(&mut stream); - let mut line = String::new(); - let mut stdout_buf = Vec::new(); - - loop { - line.clear(); - let bytes = reader - .read_line(&mut line) - .map_err(|e| VmError::Exec(format!("read VM exec response: {e}")))?; - if bytes == 0 { - break; - } - - let frame: ServerFrame = serde_json::from_str(line.trim_end()) - .map_err(|e| VmError::Exec(format!("decode VM exec response frame: {e}")))?; - - match frame { - ServerFrame::Stdout { data } => { - stdout_buf.extend_from_slice(&decode_payload(&data)?); - } - ServerFrame::Stderr { .. } => { - // Discard stderr for capture mode. - } - ServerFrame::Exit { code } => { - if code != 0 { - return Err(VmError::Exec(format!( - "guest command exited with code {code}" - ))); - } - return Ok(stdout_buf); - } - ServerFrame::Error { message } => { - return Err(VmError::Exec(message)); - } - } - } - - Err(VmError::Exec( - "VM exec agent disconnected before returning an exit code".to_string(), - )) -} - -fn vm_run_dir(rootfs: &Path) -> PathBuf { - rootfs.parent().unwrap_or(rootfs).to_path_buf() -} - -pub fn vm_state_path(rootfs: &Path) -> PathBuf { - vm_run_dir(rootfs).join(format!("{}-{}", rootfs_key(rootfs), VM_STATE_NAME)) -} - -fn vm_lock_path(rootfs: &Path) -> PathBuf { - vm_run_dir(rootfs).join(format!("{}-{}", rootfs_key(rootfs), VM_LOCK_NAME)) -} - -fn rootfs_key(rootfs: &Path) -> String { - let name = rootfs - .file_name() - .and_then(|part| part.to_str()) - .unwrap_or("openshell-vm"); - let mut out = String::with_capacity(name.len()); - for ch in name.chars() { - if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' { - out.push(ch); - } else { - out.push('_'); - } - } - if out.is_empty() { - "openshell-vm".to_string() - } else { - out - } -} - -fn default_rootfs() -> Result { - crate::named_rootfs_dir("default") -} - -fn load_vm_runtime_state(rootfs: Option<&Path>) -> Result { - let rootfs = match rootfs { - Some(rootfs) => rootfs.to_path_buf(), - None => default_rootfs()?, - }; - let path = vm_state_path(&rootfs); - let bytes = fs::read(&path).map_err(|e| { - VmError::RuntimeState(format!( - "read VM runtime state {}: {e}. 
Start the VM with `openshell-vm` first", - path.display() - )) - })?; - let state: VmRuntimeState = serde_json::from_slice(&bytes) - .map_err(|e| VmError::RuntimeState(format!("decode VM runtime state: {e}")))?; - - if !process_alive(state.pid) { - clear_vm_runtime_state(&state.rootfs); - return Err(VmError::RuntimeState(format!( - "VM is not running (stale pid {})", - state.pid - ))); - } - - if !state.socket_path.exists() { - return Err(VmError::RuntimeState(format!( - "VM exec socket is not ready: {}", - state.socket_path.display() - ))); - } - - Ok(state) -} - -fn validate_env_vars(items: &[String]) -> Result<(), VmError> { - for item in items { - let (key, _value) = item.split_once('=').ok_or_else(|| { - VmError::Exec(format!( - "invalid environment variable `{item}`; expected KEY=VALUE" - )) - })?; - if key.is_empty() - || !key.chars().enumerate().all(|(idx, ch)| { - ch == '_' || (ch.is_ascii_alphanumeric() && (idx > 0 || !ch.is_ascii_digit())) - }) - { - return Err(VmError::Exec(format!( - "invalid environment variable name `{key}`" - ))); - } - } - Ok(()) -} - -fn send_json_line(writer: &mut UnixStream, value: &T) -> Result<(), VmError> { - let mut bytes = serde_json::to_vec(value) - .map_err(|e| VmError::Exec(format!("encode VM exec request: {e}")))?; - bytes.push(b'\n'); - writer - .write_all(&bytes) - .map_err(|e| VmError::Exec(format!("write VM exec request: {e}"))) -} - -fn pump_stdin(mut writer: UnixStream, tty: bool) -> Result<(), VmError> { - let stdin = std::io::stdin(); - let mut stdin = stdin.lock(); - let mut buf = [0u8; 8192]; - let mut last_size: Option<(u16, u16)> = None; - - loop { - let read = stdin - .read(&mut buf) - .map_err(|e| VmError::Exec(format!("read local stdin: {e}")))?; - if read == 0 { - break; - } - - if tty - && let Some(size) = get_terminal_size() - && last_size != Some(size) - { - last_size = Some(size); - let _ = send_json_line( - &mut writer, - &ClientFrame::Resize { - cols: size.0, - rows: size.1, - }, - ); - } - - let frame = ClientFrame::Stdin { - data: base64::engine::general_purpose::STANDARD.encode(&buf[..read]), - }; - send_json_line(&mut writer, &frame)?; - } - - send_json_line(&mut writer, &ClientFrame::StdinClose) -} - -fn decode_payload(data: &str) -> Result, VmError> { - base64::engine::general_purpose::STANDARD - .decode(data) - .map_err(|e| VmError::Exec(format!("decode VM exec payload: {e}"))) -} - -fn process_alive(pid: i32) -> bool { - let rc = unsafe { libc::kill(pid, 0) }; - if rc == 0 { - return true; - } - std::io::Error::last_os_error().raw_os_error() == Some(libc::EPERM) -} - -fn now_ms() -> Result { - let duration = SystemTime::now() - .duration_since(UNIX_EPOCH) - .map_err(|e| VmError::RuntimeState(format!("read system clock: {e}")))?; - Ok(duration.as_millis()) -} - -#[cfg(test)] -mod tests { - use super::*; - - // ── ExecRequest serialization ──────────────────────────────────── - - #[test] - fn exec_request_serializes_with_tty() { - let req = ExecRequest { - argv: vec!["sh".into()], - env: vec!["TERM=xterm".into()], - cwd: None, - tty: true, - }; - let json: serde_json::Value = serde_json::to_value(&req).unwrap(); - assert_eq!(json["argv"], serde_json::json!(["sh"])); - assert_eq!(json["tty"], true); - assert_eq!(json["cwd"], serde_json::Value::Null); - } - - #[test] - fn exec_request_serializes_without_tty() { - let req = ExecRequest { - argv: vec!["echo".into(), "hello".into()], - env: vec![], - cwd: Some("/tmp".into()), - tty: false, - }; - let json: serde_json::Value = serde_json::to_value(&req).unwrap(); - 
assert_eq!(json["tty"], false); - assert_eq!(json["cwd"], "/tmp"); - } - - // ── ClientFrame serialization ──────────────────────────────────── - - #[test] - fn client_frame_stdin_serializes() { - let frame = ClientFrame::Stdin { - data: "aGVsbG8=".into(), - }; - let json: serde_json::Value = serde_json::to_value(&frame).unwrap(); - assert_eq!(json["type"], "stdin"); - assert_eq!(json["data"], "aGVsbG8="); - } - - #[test] - fn client_frame_stdin_close_serializes() { - let frame = ClientFrame::StdinClose; - let json: serde_json::Value = serde_json::to_value(&frame).unwrap(); - assert_eq!(json["type"], "stdin_close"); - } - - #[test] - fn client_frame_resize_serializes() { - let frame = ClientFrame::Resize { - cols: 120, - rows: 40, - }; - let json: serde_json::Value = serde_json::to_value(&frame).unwrap(); - assert_eq!(json["type"], "resize"); - assert_eq!(json["cols"], 120); - assert_eq!(json["rows"], 40); - } - - // ── ServerFrame deserialization ─────────────────────────────────── - - #[test] - fn server_frame_stdout_deserializes() { - let json = r#"{"type":"stdout","data":"aGVsbG8="}"#; - let frame: ServerFrame = serde_json::from_str(json).unwrap(); - assert!(matches!(frame, ServerFrame::Stdout { data } if data == "aGVsbG8=")); - } - - #[test] - fn server_frame_stderr_deserializes() { - let json = r#"{"type":"stderr","data":"ZXJy"}"#; - let frame: ServerFrame = serde_json::from_str(json).unwrap(); - assert!(matches!(frame, ServerFrame::Stderr { data } if data == "ZXJy")); - } - - #[test] - fn server_frame_exit_deserializes() { - let json = r#"{"type":"exit","code":42}"#; - let frame: ServerFrame = serde_json::from_str(json).unwrap(); - assert!(matches!(frame, ServerFrame::Exit { code: 42 })); - } - - #[test] - fn server_frame_error_deserializes() { - let json = r#"{"type":"error","message":"boom"}"#; - let frame: ServerFrame = serde_json::from_str(json).unwrap(); - assert!(matches!(frame, ServerFrame::Error { message } if message == "boom")); - } - - #[test] - fn server_frame_unknown_type_fails() { - let json = r#"{"type":"unknown","data":"x"}"#; - assert!(serde_json::from_str::(json).is_err()); - } - - // ── ClientFrame ↔ ServerFrame round-trip compatibility ─────────── - // Verify that what the Rust host serializes can be parsed by the - // Python agent (same JSON shape), and vice versa. 
- - #[test] - fn resize_frame_has_expected_json_shape() { - let frame = ClientFrame::Resize { cols: 80, rows: 24 }; - let s = serde_json::to_string(&frame).unwrap(); - let v: serde_json::Value = serde_json::from_str(&s).unwrap(); - assert_eq!(v["type"].as_str().unwrap(), "resize"); - assert!(v["cols"].is_u64()); - assert!(v["rows"].is_u64()); - } - - // ── validate_env_vars ──────────────────────────────────────────── - - #[test] - fn validate_env_vars_accepts_valid() { - let items = vec![ - "HOME=/root".to_string(), - "PATH=/usr/bin".to_string(), - "_UNDERSCORE=1".to_string(), - "A1B2=val".to_string(), - ]; - assert!(validate_env_vars(&items).is_ok()); - } - - #[test] - fn validate_env_vars_rejects_missing_equals() { - let items = vec!["NOEQUALS".to_string()]; - assert!(validate_env_vars(&items).is_err()); - } - - #[test] - fn validate_env_vars_rejects_empty_key() { - let items = vec!["=value".to_string()]; - assert!(validate_env_vars(&items).is_err()); - } - - #[test] - fn validate_env_vars_rejects_leading_digit() { - let items = vec!["1BAD=val".to_string()]; - assert!(validate_env_vars(&items).is_err()); - } - - #[test] - fn validate_env_vars_rejects_special_chars() { - let items = vec!["BAD-KEY=val".to_string()]; - assert!(validate_env_vars(&items).is_err()); - } - - // ── decode_payload ─────────────────────────────────────────────── - - #[test] - fn decode_payload_valid_base64() { - let decoded = decode_payload("aGVsbG8=").unwrap(); - assert_eq!(decoded, b"hello"); - } - - #[test] - fn decode_payload_empty() { - let decoded = decode_payload("").unwrap(); - assert!(decoded.is_empty()); - } - - #[test] - fn decode_payload_invalid_base64() { - assert!(decode_payload("!!!not-base64!!!").is_err()); - } - - // ── Resize frame edge cases ────────────────────────────────────── - - #[test] - fn resize_frame_max_dimensions() { - let frame = ClientFrame::Resize { - cols: u16::MAX, - rows: u16::MAX, - }; - let json: serde_json::Value = serde_json::to_value(&frame).unwrap(); - assert_eq!(json["cols"], u64::from(u16::MAX)); - assert_eq!(json["rows"], u64::from(u16::MAX)); - } - - #[test] - fn resize_frame_minimum_dimensions() { - let frame = ClientFrame::Resize { cols: 1, rows: 1 }; - let json: serde_json::Value = serde_json::to_value(&frame).unwrap(); - assert_eq!(json["cols"], 1); - assert_eq!(json["rows"], 1); - } - - // ── Wire format: newline-delimited JSON ────────────────────────── - // The protocol sends one JSON object per line. Verify that - // serialized frames produce valid single-line JSON that the - // Python agent can split on '\n' and json.loads(). 
- - #[test] - fn client_frames_serialize_to_single_line_json() { - let frames: Vec = vec![ - ClientFrame::Stdin { - data: "dGVzdA==".into(), - }, - ClientFrame::StdinClose, - ClientFrame::Resize { cols: 80, rows: 24 }, - ]; - for frame in &frames { - let s = serde_json::to_string(frame).unwrap(); - assert!(!s.contains('\n'), "frame should be single-line: {s}"); - let _: serde_json::Value = serde_json::from_str(&s).unwrap(); - } - } - - #[test] - fn exec_request_serializes_to_single_line_json() { - let req = ExecRequest { - argv: vec!["bash".into(), "-c".into(), "echo 'hello world'".into()], - env: vec!["HOME=/root".into(), "TERM=xterm-256color".into()], - cwd: Some("/home/user".into()), - tty: true, - }; - let s = serde_json::to_string(&req).unwrap(); - assert!(!s.contains('\n')); - let _: serde_json::Value = serde_json::from_str(&s).unwrap(); - } - - // ── Stdin data encode → decode round-trip ──────────────────────── - // Mirrors the flow: host encodes payload as base64 in a Stdin - // frame, guest decodes with decode_payload(). - - #[test] - fn stdin_payload_round_trip() { - let original = b"echo hello\n"; - let encoded = base64::engine::general_purpose::STANDARD.encode(original); - let frame = ClientFrame::Stdin { data: encoded }; - let json = serde_json::to_string(&frame).unwrap(); - let parsed: serde_json::Value = serde_json::from_str(&json).unwrap(); - let decoded = decode_payload(parsed["data"].as_str().unwrap()).unwrap(); - assert_eq!(decoded, original); - } - - #[test] - fn stdin_payload_round_trip_binary() { - let original: Vec = (0..=255).collect(); - let encoded = base64::engine::general_purpose::STANDARD.encode(&original); - let decoded = decode_payload(&encoded).unwrap(); - assert_eq!(decoded, original); - } - - // ── Python agent compatibility ─────────────────────────────────── - // The Python agent parses frames with json.loads() and dispatches - // on frame["type"]. These tests verify the exact field names and - // values match what the Python code expects. 
- - #[test] - fn exec_request_tty_field_matches_python_dispatch() { - // Python: request.get("tty") — must be a JSON boolean - let req = ExecRequest { - argv: vec!["sh".into()], - env: vec![], - cwd: None, - tty: true, - }; - let v: serde_json::Value = serde_json::to_value(&req).unwrap(); - assert!(v["tty"].is_boolean()); - assert!(v["tty"].as_bool().unwrap()); - - let req_no_tty = ExecRequest { - argv: vec!["echo".into()], - env: vec![], - cwd: None, - tty: false, - }; - let v: serde_json::Value = serde_json::to_value(&req_no_tty).unwrap(); - assert!(!v["tty"].as_bool().unwrap()); - } - - #[test] - fn resize_type_tag_is_snake_case() { - // Python: kind == "resize" — must be lowercase snake_case - let frame = ClientFrame::Resize { cols: 80, rows: 24 }; - let v: serde_json::Value = serde_json::to_value(&frame).unwrap(); - assert_eq!(v["type"].as_str().unwrap(), "resize"); - } - - #[test] - fn stdin_close_type_tag_is_snake_case() { - // Python: kind == "stdin_close" - let frame = ClientFrame::StdinClose; - let v: serde_json::Value = serde_json::to_value(&frame).unwrap(); - assert_eq!(v["type"].as_str().unwrap(), "stdin_close"); - } - - #[test] - fn resize_fields_are_integers_not_strings() { - // Python: frame.get("cols", 80) — expects int, not string - let frame = ClientFrame::Resize { - cols: 200, - rows: 50, - }; - let v: serde_json::Value = serde_json::to_value(&frame).unwrap(); - assert!(v["cols"].is_u64()); - assert!(v["rows"].is_u64()); - } - - // ── ServerFrame: Python agent output ───────────────────────────── - // These mirror the exact JSON the Python agent produces with - // json.dumps(frame, separators=(",", ":")) - - #[test] - fn server_frame_parses_compact_json() { - // Python uses separators=(",", ":") — no spaces - let compact = r#"{"type":"stdout","data":"aGk="}"#; - let frame: ServerFrame = serde_json::from_str(compact).unwrap(); - assert!(matches!(frame, ServerFrame::Stdout { data } if data == "aGk=")); - } - - #[test] - fn server_frame_exit_code_zero() { - let json = r#"{"type":"exit","code":0}"#; - let frame: ServerFrame = serde_json::from_str(json).unwrap(); - assert!(matches!(frame, ServerFrame::Exit { code: 0 })); - } - - #[test] - fn server_frame_exit_code_negative() { - let json = r#"{"type":"exit","code":-1}"#; - let frame: ServerFrame = serde_json::from_str(json).unwrap(); - assert!(matches!(frame, ServerFrame::Exit { code: -1 })); - } - - #[test] - fn server_frame_tolerates_extra_fields() { - // Future-proofing: agent may add fields we don't know about - let json = r#"{"type":"exit","code":0,"extra":"ignored"}"#; - let frame: ServerFrame = serde_json::from_str(json).unwrap(); - assert!(matches!(frame, ServerFrame::Exit { code: 0 })); - } -} diff --git a/crates/openshell-vm/src/ffi.rs b/crates/openshell-vm/src/ffi.rs deleted file mode 100644 index 66213c624..000000000 --- a/crates/openshell-vm/src/ffi.rs +++ /dev/null @@ -1,340 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Minimal runtime-loaded bindings for the libkrun C API. -//! -//! We intentionally do not link libkrun at build time. Instead, the -//! `openshell-vm` binary loads `libkrun` from the staged `openshell-vm.runtime/` -//! sidecar bundle on first use. - -use std::fs; -use std::path::{Path, PathBuf}; -use std::sync::OnceLock; - -use libc::c_char; -use libloading::Library; - -use crate::VmError; - -/// Runtime provenance information extracted from the bundle. 
-#[derive(Debug, Clone)]
-pub struct RuntimeProvenance {
-    /// Path to the libkrun library that was loaded.
-    pub libkrun_path: PathBuf,
-    /// Paths to all libkrunfw libraries that were preloaded.
-    pub libkrunfw_paths: Vec<PathBuf>,
-    /// SHA-256 hash of the primary libkrunfw artifact (if computable).
-    pub libkrunfw_sha256: Option<String>,
-    /// Contents of provenance.json if present in the runtime bundle.
-    pub provenance_json: Option<String>,
-    /// Whether this is a custom (OpenShell-built) runtime.
-    pub is_custom: bool,
-}
-
-pub const KRUN_LOG_TARGET_DEFAULT: i32 = -1;
-pub const KRUN_LOG_LEVEL_OFF: u32 = 0;
-pub const KRUN_LOG_LEVEL_ERROR: u32 = 1;
-pub const KRUN_LOG_LEVEL_WARN: u32 = 2;
-pub const KRUN_LOG_LEVEL_INFO: u32 = 3;
-pub const KRUN_LOG_LEVEL_DEBUG: u32 = 4;
-pub const KRUN_LOG_LEVEL_TRACE: u32 = 5;
-pub const KRUN_LOG_STYLE_AUTO: u32 = 0;
-pub const KRUN_LOG_OPTION_NO_ENV: u32 = 1;
-pub const KRUN_DISK_FORMAT_RAW: u32 = 0;
-#[allow(dead_code)] // Used only on macOS (cfg-gated in state_disk_sync_mode)
-pub const KRUN_SYNC_RELAXED: u32 = 1;
-#[allow(dead_code)] // Used only on Linux (cfg-gated in state_disk_sync_mode)
-pub const KRUN_SYNC_FULL: u32 = 2;
-
-type KrunInitLog =
-    unsafe extern "C" fn(target_fd: i32, level: u32, style: u32, options: u32) -> i32;
-type KrunCreateCtx = unsafe extern "C" fn() -> i32;
-type KrunFreeCtx = unsafe extern "C" fn(ctx_id: u32) -> i32;
-type KrunSetVmConfig = unsafe extern "C" fn(ctx_id: u32, num_vcpus: u8, ram_mib: u32) -> i32;
-type KrunSetRoot = unsafe extern "C" fn(ctx_id: u32, root_path: *const c_char) -> i32;
-type KrunSetWorkdir = unsafe extern "C" fn(ctx_id: u32, workdir_path: *const c_char) -> i32;
-type KrunSetExec = unsafe extern "C" fn(
-    ctx_id: u32,
-    exec_path: *const c_char,
-    argv: *const *const c_char,
-    envp: *const *const c_char,
-) -> i32;
-type KrunSetPortMap = unsafe extern "C" fn(ctx_id: u32, port_map: *const *const c_char) -> i32;
-type KrunSetConsoleOutput = unsafe extern "C" fn(ctx_id: u32, filepath: *const c_char) -> i32;
-type KrunAddDisk3 = unsafe extern "C" fn(
-    ctx_id: u32,
-    block_id: *const c_char,
-    disk_path: *const c_char,
-    disk_format: u32,
-    read_only: bool,
-    direct_io: bool,
-    sync_mode: u32,
-) -> i32;
-type KrunAddVsockPort2 =
-    unsafe extern "C" fn(ctx_id: u32, port: u32, c_filepath: *const c_char, listen: bool) -> i32;
-type KrunStartEnter = unsafe extern "C" fn(ctx_id: u32) -> i32;
-type KrunDisableImplicitVsock = unsafe extern "C" fn(ctx_id: u32) -> i32;
-type KrunAddVsock = unsafe extern "C" fn(ctx_id: u32, tsi_features: u32) -> i32;
-#[cfg(target_os = "macos")]
-type KrunAddNetUnixgram = unsafe extern "C" fn(
-    ctx_id: u32,
-    c_path: *const c_char,
-    fd: i32,
-    c_mac: *const u8,
-    features: u32,
-    flags: u32,
-) -> i32;
-type KrunAddNetUnixstream = unsafe extern "C" fn(
-    ctx_id: u32,
-    c_path: *const c_char,
-    fd: i32,
-    c_mac: *const u8,
-    features: u32,
-    flags: u32,
-) -> i32;
-
-#[allow(clippy::struct_field_names)] // FFI struct mirrors libkrun's symbol naming
-pub struct LibKrun {
-    pub krun_init_log: KrunInitLog,
-    pub krun_create_ctx: KrunCreateCtx,
-    pub krun_free_ctx: KrunFreeCtx,
-    pub krun_set_vm_config: KrunSetVmConfig,
-    pub krun_set_root: KrunSetRoot,
-    pub krun_set_workdir: KrunSetWorkdir,
-    pub krun_set_exec: KrunSetExec,
-    pub krun_set_port_map: KrunSetPortMap,
-    pub krun_set_console_output: KrunSetConsoleOutput,
-    pub krun_add_disk3: Option<KrunAddDisk3>,
-    pub krun_add_vsock_port2: KrunAddVsockPort2,
-    pub krun_start_enter: KrunStartEnter,
-    pub krun_disable_implicit_vsock: KrunDisableImplicitVsock,
-
pub krun_add_vsock: KrunAddVsock, - #[cfg(target_os = "macos")] - pub krun_add_net_unixgram: KrunAddNetUnixgram, - #[allow(dead_code)] // FFI symbol loaded for future use - pub krun_add_net_unixstream: KrunAddNetUnixstream, -} - -static LIBKRUN: OnceLock = OnceLock::new(); -static RUNTIME_PROVENANCE: OnceLock = OnceLock::new(); - -pub fn libkrun() -> Result<&'static LibKrun, VmError> { - if let Some(lib) = LIBKRUN.get() { - return Ok(lib); - } - - let loaded = LibKrun::load()?; - let _ = LIBKRUN.set(loaded); - Ok(LIBKRUN.get().expect("libkrun should be initialized")) -} - -/// Return the provenance information for the loaded runtime. -/// -/// Only available after [`libkrun()`] has been called successfully. -pub fn runtime_provenance() -> Option<&'static RuntimeProvenance> { - RUNTIME_PROVENANCE.get() -} - -impl LibKrun { - fn load() -> Result { - let path = runtime_libkrun_path()?; - let runtime_dir = path.parent().ok_or_else(|| { - VmError::HostSetup(format!("libkrun has no parent dir: {}", path.display())) - })?; - let krunfw_paths = preload_runtime_support_libraries(runtime_dir)?; - - // Build and store provenance information. - let provenance_json_path = runtime_dir.join("provenance.json"); - let provenance_json = fs::read_to_string(&provenance_json_path).ok(); - let is_custom = provenance_json.is_some(); - - let libkrunfw_sha256 = krunfw_paths.first().and_then(|p| compute_sha256(p).ok()); - - let provenance = RuntimeProvenance { - libkrun_path: path.clone(), - libkrunfw_paths: krunfw_paths, - libkrunfw_sha256, - provenance_json, - is_custom, - }; - let _ = RUNTIME_PROVENANCE.set(provenance); - - let library = Box::leak(Box::new(unsafe { - Library::new(&path).map_err(|e| { - VmError::HostSetup(format!("load libkrun from {}: {e}", path.display())) - })? 
- })); - - Ok(Self { - krun_init_log: load_symbol(library, b"krun_init_log\0", &path)?, - krun_create_ctx: load_symbol(library, b"krun_create_ctx\0", &path)?, - krun_free_ctx: load_symbol(library, b"krun_free_ctx\0", &path)?, - krun_set_vm_config: load_symbol(library, b"krun_set_vm_config\0", &path)?, - krun_set_root: load_symbol(library, b"krun_set_root\0", &path)?, - krun_set_workdir: load_symbol(library, b"krun_set_workdir\0", &path)?, - krun_set_exec: load_symbol(library, b"krun_set_exec\0", &path)?, - krun_set_port_map: load_symbol(library, b"krun_set_port_map\0", &path)?, - krun_set_console_output: load_symbol(library, b"krun_set_console_output\0", &path)?, - krun_add_disk3: load_optional_symbol(library, b"krun_add_disk3\0"), - krun_add_vsock_port2: load_symbol(library, b"krun_add_vsock_port2\0", &path)?, - krun_start_enter: load_symbol(library, b"krun_start_enter\0", &path)?, - krun_disable_implicit_vsock: load_symbol( - library, - b"krun_disable_implicit_vsock\0", - &path, - )?, - krun_add_vsock: load_symbol(library, b"krun_add_vsock\0", &path)?, - #[cfg(target_os = "macos")] - krun_add_net_unixgram: load_symbol(library, b"krun_add_net_unixgram\0", &path)?, - krun_add_net_unixstream: load_symbol(library, b"krun_add_net_unixstream\0", &path)?, - }) - } -} - -fn runtime_libkrun_path() -> Result { - Ok(crate::configured_runtime_dir()?.join(required_runtime_lib_name())) -} - -fn preload_runtime_support_libraries(runtime_dir: &Path) -> Result, VmError> { - let entries = fs::read_dir(runtime_dir) - .map_err(|e| VmError::HostSetup(format!("read {}: {e}", runtime_dir.display())))?; - - let mut support_libs: Vec = entries - .filter_map(Result::ok) - .map(|entry| entry.path()) - .filter(|path| { - path.file_name() - .and_then(|name| name.to_str()) - .is_some_and(|name| { - #[cfg(target_os = "macos")] - { - name.starts_with("libkrunfw") - && Path::new(name) - .extension() - .is_some_and(|ext| ext.eq_ignore_ascii_case("dylib")) - } - #[cfg(not(target_os = "macos"))] - { - name.starts_with("libkrunfw") && name.contains(".so") - } - }) - }) - .collect(); - - support_libs.sort(); - - for path in &support_libs { - let path_cstr = std::ffi::CString::new(path.to_string_lossy().as_bytes()).map_err(|e| { - VmError::HostSetup(format!( - "invalid support library path {}: {e}", - path.display() - )) - })?; - let handle = - unsafe { libc::dlopen(path_cstr.as_ptr(), libc::RTLD_NOW | libc::RTLD_GLOBAL) }; - if handle.is_null() { - let error = unsafe { - let err = libc::dlerror(); - if err.is_null() { - "unknown dlopen error".to_string() - } else { - std::ffi::CStr::from_ptr(err).to_string_lossy().into_owned() - } - }; - return Err(VmError::HostSetup(format!( - "preload runtime support library {}: {error}", - path.display() - ))); - } - } - - Ok(support_libs) -} - -pub fn required_runtime_lib_name() -> &'static str { - #[cfg(target_os = "macos")] - { - "libkrun.dylib" - } - #[cfg(not(target_os = "macos"))] - { - "libkrun.so" - } -} - -/// Compute SHA-256 hash of a file, returning hex string. -/// -/// Streams the file contents directly to `shasum -a 256` via a pipe, -/// avoiding buffering the entire file in memory. -fn compute_sha256(path: &Path) -> Result { - use std::io::{Read, Write}; - use std::process::{Command, Stdio}; - - let mut file = fs::File::open(path)?; - - // sha256sum is standard on Linux; shasum ships with macOS/Perl. 
-    let mut child = Command::new("sha256sum")
-        .stdin(Stdio::piped())
-        .stdout(Stdio::piped())
-        .stderr(Stdio::null())
-        .spawn()
-        .or_else(|_| {
-            Command::new("shasum")
-                .args(["-a", "256"])
-                .stdin(Stdio::piped())
-                .stdout(Stdio::piped())
-                .stderr(Stdio::null())
-                .spawn()
-        })?;
-
-    // Stream file contents directly to shasum's stdin in 8KB chunks.
-    {
-        let mut stdin = child
-            .stdin
-            .take()
-            .ok_or_else(|| std::io::Error::other("failed to open shasum stdin"))?;
-        let mut buf = [0u8; 8192];
-        loop {
-            let n = file.read(&mut buf)?;
-            if n == 0 {
-                break;
-            }
-            stdin.write_all(&buf[..n])?;
-        }
-        // stdin is dropped here, closing the pipe so shasum can finish.
-    }
-
-    let output = child.wait_with_output()?;
-    if output.status.success() {
-        let stdout = String::from_utf8_lossy(&output.stdout);
-        Ok(stdout
-            .split_whitespace()
-            .next()
-            .unwrap_or("unknown")
-            .to_string())
-    } else {
-        Ok("unknown".to_string())
-    }
-}
-
-fn load_symbol<T: Copy>(
-    library: &'static Library,
-    symbol: &[u8],
-    path: &Path,
-) -> Result<T, VmError> {
-    let loaded = unsafe {
-        library.get::<T>(symbol).map_err(|e| {
-            VmError::HostSetup(format!(
-                "resolve {} from {}: {e}",
-                String::from_utf8_lossy(symbol).trim_end_matches('\0'),
-                path.display()
-            ))
-        })?
-    };
-    Ok(*loaded)
-}
-
-fn load_optional_symbol<T: Copy>(library: &'static Library, symbol: &[u8]) -> Option<T> {
-    let loaded = unsafe { library.get::<T>(symbol).ok()? };
-    Some(*loaded)
-}
diff --git a/crates/openshell-vm/src/health.rs b/crates/openshell-vm/src/health.rs
deleted file mode 100644
index c24015bf1..000000000
--- a/crates/openshell-vm/src/health.rs
+++ /dev/null
@@ -1,204 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-//! gRPC health check for verifying the gateway is fully ready.
-//!
-//! This module provides a proper gRPC health check that verifies the gateway
-//! service is not just accepting TCP connections, but is actually responding
-//! to gRPC requests. This ensures we don't mark the server as ready before
-//! it has fully booted.
-
-use crate::VmError;
-use openshell_core::proto::{HealthRequest, ServiceStatus, open_shell_client::OpenShellClient};
-use std::path::PathBuf;
-use std::time::Duration;
-use tonic::transport::{Certificate, ClientTlsConfig, Endpoint, Identity};
-
-/// CA certificate, client certificate, and client key bytes for mTLS.
-type MtlsMaterials = (Vec<u8>, Vec<u8>, Vec<u8>);
-
-/// Load mTLS materials from the gateway's cert directory.
-fn load_mtls_materials(gateway_name: &str) -> Result<MtlsMaterials, String> {
-    let home = std::env::var("HOME").map_err(|_| "HOME not set")?;
-    let mtls_dir = PathBuf::from(home)
-        .join(".config/openshell/gateways")
-        .join(gateway_name)
-        .join("mtls");
-
-    let ca = std::fs::read(mtls_dir.join("ca.crt"))
-        .map_err(|e| format!("failed to read ca.crt: {e}"))?;
-    let cert = std::fs::read(mtls_dir.join("tls.crt"))
-        .map_err(|e| format!("failed to read tls.crt: {e}"))?;
-    let key = std::fs::read(mtls_dir.join("tls.key"))
-        .map_err(|e| format!("failed to read tls.key: {e}"))?;
-
-    Ok((ca, cert, key))
-}
-
-/// Build a tonic TLS config from mTLS materials.
-fn build_tls_config(ca: Vec<u8>, cert: Vec<u8>, key: Vec<u8>) -> ClientTlsConfig {
-    let ca_cert = Certificate::from_pem(ca);
-    let identity = Identity::from_pem(cert, key);
-    ClientTlsConfig::new()
-        .ca_certificate(ca_cert)
-        .identity(identity)
-}
-
-/// Perform a gRPC health check against the gateway.
-/// -/// Returns `Ok(())` if the health check succeeds (service reports healthy), -/// or an error describing why the check failed. -async fn grpc_health_check(gateway_port: u16, gateway_name: &str) -> Result<(), String> { - // Load mTLS materials - let (ca, cert, key) = load_mtls_materials(gateway_name)?; - let tls_config = build_tls_config(ca, cert, key); - - // Build the channel with TLS - let endpoint = format!("https://127.0.0.1:{gateway_port}"); - let channel = Endpoint::from_shared(endpoint.clone()) - .map_err(|e| format!("invalid endpoint: {e}"))? - .connect_timeout(Duration::from_secs(5)) - .tls_config(tls_config) - .map_err(|e| format!("TLS config error: {e}"))? - .connect() - .await - .map_err(|e| format!("connection failed: {e}"))?; - - // Create client and call health - let mut client = OpenShellClient::new(channel); - let response = client - .health(HealthRequest {}) - .await - .map_err(|e| format!("health RPC failed: {e}"))?; - - let health = response.into_inner(); - if health.status == ServiceStatus::Healthy as i32 { - Ok(()) - } else { - Err(format!("service not healthy: status={}", health.status)) - } -} - -/// Wait for the gateway service to be fully ready by polling the gRPC health endpoint. -/// -/// This replaces the TCP-only probe with a proper gRPC health check that verifies -/// the service is actually responding to requests, not just accepting connections. -/// -/// Returns `Ok(())` when the gateway is confirmed healthy, or `Err` if the health -/// check fails or times out. Falls back to TCP probe if mTLS materials aren't -/// available yet. -pub fn wait_for_gateway_ready(gateway_port: u16, gateway_name: &str) -> Result<(), VmError> { - let start = std::time::Instant::now(); - let timeout = Duration::from_secs(90); - let poll_interval = Duration::from_secs(1); - - eprintln!("Waiting for gateway gRPC health check..."); - - // Create a runtime for async health checks - let rt = match tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - { - Ok(rt) => rt, - Err(e) => { - eprintln!(" failed to create tokio runtime: {e}, falling back to TCP probe"); - return wait_for_tcp_only(gateway_port, timeout, poll_interval); - } - }; - - loop { - // Try gRPC health check - let result = rt.block_on(async { - tokio::time::timeout( - Duration::from_secs(5), - grpc_health_check(gateway_port, gateway_name), - ) - .await - }); - - match result { - Ok(Ok(())) => { - eprintln!("Gateway healthy [{:.1}s]", start.elapsed().as_secs_f64()); - return Ok(()); - } - Ok(Err(e)) => { - // gRPC call completed but failed - if start.elapsed() >= timeout { - return Err(VmError::Bootstrap(format!( - "gateway health check failed after {:.0}s: {e}", - timeout.as_secs_f64() - ))); - } - } - Err(_) => { - // Timeout on the health check itself - if start.elapsed() >= timeout { - return Err(VmError::Bootstrap(format!( - "gateway health check timed out after {:.0}s", - timeout.as_secs_f64() - ))); - } - } - } - - std::thread::sleep(poll_interval); - } -} - -/// Fallback TCP-only probe when gRPC health check can't be performed. 
-fn wait_for_tcp_only( - gateway_port: u16, - timeout: Duration, - poll_interval: Duration, -) -> Result<(), VmError> { - let start = std::time::Instant::now(); - - loop { - if host_tcp_probe(gateway_port) { - eprintln!( - "Service reachable (TCP) [{:.1}s]", - start.elapsed().as_secs_f64() - ); - return Ok(()); - } - - if start.elapsed() >= timeout { - return Err(VmError::Bootstrap(format!( - "gateway TCP probe failed after {:.0}s", - timeout.as_secs_f64() - ))); - } - - std::thread::sleep(poll_interval); - } -} - -/// Probe `127.0.0.1:port` from the host to verify the TCP path is working. -/// -/// This is a fallback when gRPC health check isn't available. -fn host_tcp_probe(gateway_port: u16) -> bool { - use std::io::Read; - use std::net::{SocketAddr, TcpStream}; - - let addr: SocketAddr = ([127, 0, 0, 1], gateway_port).into(); - let Ok(mut stream) = TcpStream::connect_timeout(&addr, Duration::from_secs(2)) else { - return false; - }; - - // A short read timeout: if the server is alive it will wait for us - // to send a TLS ClientHello, so the read will time out (= good). - // If the connection resets or closes, the server is dead. - stream - .set_read_timeout(Some(Duration::from_millis(200))) - .ok(); - let mut buf = [0u8; 1]; - match stream.read(&mut buf) { - Err(e) - if e.kind() == std::io::ErrorKind::WouldBlock - || e.kind() == std::io::ErrorKind::TimedOut => - { - true // Timeout = server alive, waiting for ClientHello. - } - _ => false, // Reset, EOF, or unexpected data = not healthy. - } -} diff --git a/crates/openshell-vm/src/lib.rs b/crates/openshell-vm/src/lib.rs deleted file mode 100644 index ba5d64663..000000000 --- a/crates/openshell-vm/src/lib.rs +++ /dev/null @@ -1,2069 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! `MicroVM` runtime using libkrun for hardware-isolated execution. -//! -//! This crate provides a thin wrapper around the libkrun C API to boot -//! lightweight VMs backed by virtio-fs root filesystems. On macOS ARM64, -//! it uses Apple's Hypervisor.framework; on Linux it uses KVM. -//! -//! # Codesigning (macOS) -//! -//! The calling binary must be codesigned with the -//! `com.apple.security.hypervisor` entitlement. See `entitlements.plist`. - -#![allow(unsafe_code)] - -mod embedded; -mod exec; -mod ffi; -mod health; - -use std::ffi::CString; -use std::path::{Path, PathBuf}; -use std::ptr; -use std::time::Instant; - -pub use exec::{ - VM_EXEC_VSOCK_PORT, VmExecOptions, VmRuntimeState, acquire_rootfs_lock, clear_vm_runtime_state, - ensure_vm_not_running, exec_capture, exec_running_vm, recover_corrupt_kine_db, - reset_runtime_state, vm_exec_socket_path, vm_state_path, write_vm_runtime_state, -}; - -// ── Error type ───────────────────────────────────────────────────────── - -/// Errors that can occur when configuring or launching a microVM. -#[derive(Debug, thiserror::Error, miette::Diagnostic)] -pub enum VmError { - /// A libkrun FFI call returned a negative error code. - #[error("{func} failed with error code {code}")] - Krun { func: &'static str, code: i32 }, - - /// The rootfs directory does not exist. - #[error( - "rootfs directory not found: {path}\nRun `openshell-vm prepare-rootfs` or build one with ./crates/openshell-vm/scripts/build-rootfs.sh " - )] - RootfsNotFound { path: String }, - - /// A path contained invalid UTF-8. - #[error("path is not valid UTF-8: {0}")] - InvalidPath(String), - - /// `CString::new` failed (embedded NUL byte). 
- #[error("invalid C string: {0}")] - CString(#[from] std::ffi::NulError), - - /// A required host binary was not found. - #[error("required binary not found: {path}\n{hint}")] - BinaryNotFound { path: String, hint: String }, - - /// Host-side VM setup failed before boot. - #[error("host setup failed: {0}")] - HostSetup(String), - - /// `/dev/kvm` is not accessible (Linux only). - #[error( - "cannot open /dev/kvm: {reason}\n\ - KVM access is required to run microVMs on Linux.\n\ - Fix: sudo usermod -aG kvm $USER then log out and back in\n\ - (or run: newgrp kvm)" - )] - KvmAccess { reason: String }, - - /// `fork()` failed. - #[error("fork() failed: {0}")] - Fork(String), - - /// Post-boot bootstrap failed. - #[error("bootstrap failed: {0}")] - Bootstrap(String), - - /// Local VM runtime state could not be read or written. - #[error("VM runtime state error: {0}")] - RuntimeState(String), - - /// Exec operation against a running VM failed. - #[error("VM exec failed: {0}")] - Exec(String), -} - -/// Check a libkrun return code; negative values are errors. -fn check(ret: i32, func: &'static str) -> Result<(), VmError> { - if ret < 0 { - Err(VmError::Krun { func, code: ret }) - } else { - Ok(()) - } -} - -// ── Configuration ────────────────────────────────────────────────────── - -/// Networking backend for the microVM. -#[derive(Debug, Clone)] -pub enum NetBackend { - /// TSI (Transparent Socket Impersonation) — default libkrun networking. - /// Simple but intercepts guest loopback connections, breaking k3s. - Tsi, - - /// No networking — disable vsock/TSI entirely. For debugging only. - None, - - /// gvproxy (vfkit mode) — real `eth0` interface via virtio-net. - /// Requires gvproxy binary on the host. Port forwarding is done - /// through gvproxy's HTTP API. - Gvproxy { - /// Path to the gvproxy binary. - binary: PathBuf, - }, -} - -/// Host Unix socket bridged into the guest as a vsock port. -#[derive(Debug, Clone)] -pub struct VsockPort { - pub port: u32, - pub socket_path: PathBuf, - pub listen: bool, -} - -/// Host-backed raw block image attached to the VM for mutable guest state. -#[derive(Debug, Clone)] -pub struct StateDiskConfig { - /// Path to the sparse raw image on the host. - pub path: PathBuf, - - /// Size of the raw image in bytes. - pub size_bytes: u64, - - /// Guest-visible libkrun block ID. - pub block_id: String, - - /// Guest device path used by the init script. - pub guest_device: String, -} - -impl StateDiskConfig { - fn for_rootfs(rootfs: &Path) -> Self { - Self { - path: default_state_disk_path(rootfs), - size_bytes: DEFAULT_STATE_DISK_SIZE_BYTES, - block_id: DEFAULT_STATE_DISK_BLOCK_ID.to_string(), - guest_device: DEFAULT_STATE_DISK_GUEST_DEVICE.to_string(), - } - } -} - -/// Configuration for a libkrun microVM. -pub struct VmConfig { - /// Path to the extracted rootfs directory (aarch64 Linux). - pub rootfs: PathBuf, - - /// Number of virtual CPUs. - pub vcpus: u8, - - /// RAM in MiB. - pub mem_mib: u32, - - /// Executable path inside the VM. - pub exec_path: String, - - /// Arguments to the executable (argv, excluding argv\[0\]). - pub args: Vec, - - /// Environment variables in `KEY=VALUE` form. - /// If empty, a minimal default set is used. - pub env: Vec, - - /// Working directory inside the VM. - pub workdir: String, - - /// TCP port mappings in `"host_port:guest_port"` form. - /// Only used with TSI networking. - pub port_map: Vec, - - /// Optional host Unix sockets exposed to the guest over vsock. - pub vsock_ports: Vec, - - /// libkrun log level (0=Off .. 
5=Trace). - pub log_level: u32, - - /// Optional file path for VM console output. If `None`, console output - /// goes to the parent directory of the rootfs as `console.log`. - pub console_output: Option, - - /// Networking backend. - pub net: NetBackend, - - /// Wipe all runtime state (containerd tasks/sandboxes, kubelet pods) - /// before booting. Recovers from corrupted state after a crash. - pub reset: bool, - - /// Gateway metadata name used for host-side config and mTLS material. - pub gateway_name: String, - - /// Optional host-backed raw block image for mutable guest state. - pub state_disk: Option, -} - -impl VmConfig { - /// Default gateway configuration: boots k3s server inside the VM. - /// - /// Runs `/srv/openshell-vm-init.sh` which mounts essential filesystems, - /// deploys the `OpenShell` helm chart, and execs `k3s server`. - /// Exposes the `OpenShell` gateway on port 30051. - pub fn gateway(rootfs: PathBuf) -> Self { - let state_disk = StateDiskConfig::for_rootfs(&rootfs); - Self { - vsock_ports: vec![VsockPort { - port: VM_EXEC_VSOCK_PORT, - socket_path: vm_exec_socket_path(&rootfs), - listen: true, - }], - rootfs, - vcpus: 4, - mem_mib: 8192, - exec_path: "/srv/openshell-vm-init.sh".to_string(), - args: vec![], - env: vec![ - "HOME=/root".to_string(), - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(), - "TERM=xterm".to_string(), - ], - workdir: "/".to_string(), - port_map: vec![ - // OpenShell server — with bridge CNI the pod listens on - // 8080 inside its own network namespace (10.42.0.x), not - // on the VM's root namespace. The NodePort service - // (kube-proxy nftables) forwards VM:30051 → pod:8080. - // gvproxy maps host:30051 → VM:30051 to complete the path. - "30051:30051".to_string(), - ], - log_level: 3, // Info — for debugging - console_output: None, - net: NetBackend::Gvproxy { - binary: default_runtime_gvproxy_path(), - }, - reset: false, - gateway_name: format!("{GATEWAY_NAME_PREFIX}-default"), - state_disk: Some(state_disk), - } - } -} - -/// Base prefix for gateway metadata names. -const GATEWAY_NAME_PREFIX: &str = "openshell-vm"; -const DEFAULT_STATE_DISK_SIZE_BYTES: u64 = 32 * 1024 * 1024 * 1024; -const DEFAULT_STATE_DISK_BLOCK_ID: &str = "openshell-state"; -const DEFAULT_STATE_DISK_GUEST_DEVICE: &str = "/dev/vda"; - -/// Resolve the gateway metadata name for an instance name. -pub fn gateway_name(instance_name: &str) -> Result { - Ok(format!( - "{GATEWAY_NAME_PREFIX}-{}", - sanitize_instance_name(instance_name)? - )) -} - -/// Resolve the rootfs path for a named instance (including the default gateway). -/// -/// Layout: `$XDG_DATA_HOME/openshell/openshell-vm/{version}/instances/{name}/rootfs` -pub fn named_rootfs_dir(instance_name: &str) -> Result { - let name = sanitize_instance_name(instance_name)?; - let base = openshell_bootstrap::paths::openshell_vm_base_dir() - .map_err(|e| VmError::RuntimeState(format!("resolve openshell-vm base dir: {e}")))?; - Ok(base - .join(env!("CARGO_PKG_VERSION")) - .join("instances") - .join(name) - .join("rootfs")) -} - -/// Ensure a named instance rootfs exists, extracting from the embedded -/// rootfs tarball on first use. -/// -/// The default (unnamed) gateway should be routed here as `"default"`. -pub fn ensure_named_rootfs(instance_name: &str) -> Result { - let instance_rootfs = named_rootfs_dir(instance_name)?; - if instance_rootfs.is_dir() { - return Ok(instance_rootfs); - } - - if embedded::has_embedded_rootfs() { - // Clean up rootfs directories left by older binary versions. 
- embedded::cleanup_old_rootfs()?; - - embedded::extract_rootfs_to(&instance_rootfs)?; - return Ok(instance_rootfs); - } - - Err(VmError::RootfsNotFound { - path: instance_rootfs.display().to_string(), - }) -} - -/// Ensure the requested rootfs exists, extracting the embedded rootfs when needed. -/// -/// When `rootfs` is `None`, this uses the named-instance layout under -/// `$XDG_DATA_HOME/openshell/openshell-vm/{version}/instances//rootfs`. -/// When `force_recreate` is true and the target exists, it is removed first. -pub fn prepare_rootfs( - rootfs: Option, - instance_name: &str, - force_recreate: bool, -) -> Result { - let target = match rootfs { - Some(path) => path, - None => named_rootfs_dir(instance_name)?, - }; - - if force_recreate && target.exists() { - std::fs::remove_dir_all(&target).map_err(|e| { - VmError::HostSetup(format!("remove existing rootfs {}: {e}", target.display())) - })?; - } - - if target.is_dir() { - return Ok(target); - } - - if embedded::has_embedded_rootfs() { - if target == named_rootfs_dir(instance_name)? { - embedded::cleanup_old_rootfs()?; - } - embedded::extract_rootfs_to(&target)?; - return Ok(target); - } - - Err(VmError::RootfsNotFound { - path: target.display().to_string(), - }) -} - -fn sanitize_instance_name(name: &str) -> Result { - let trimmed = name.trim(); - if trimmed.is_empty() { - return Err(VmError::RuntimeState( - "instance name cannot be empty".to_string(), - )); - } - - let mut out = String::with_capacity(trimmed.len()); - for ch in trimmed.chars() { - if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' { - out.push(ch); - } else { - return Err(VmError::RuntimeState(format!( - "invalid instance name '{trimmed}': only [A-Za-z0-9_-] are allowed" - ))); - } - } - - Ok(out) -} - -// ── Helpers ───────────────────────────────────────────────────────────── - -/// Build a null-terminated C string array from a slice of strings. -/// -/// Returns both the `CString` owners (to keep them alive) and the pointer array. -fn c_string_array(strings: &[&str]) -> Result<(Vec, Vec<*const libc::c_char>), VmError> { - let owned: Vec = strings - .iter() - .map(|s| CString::new(*s)) - .collect::, _>>()?; - let mut ptrs: Vec<*const libc::c_char> = owned.iter().map(|c| c.as_ptr()).collect(); - ptrs.push(ptr::null()); // null terminator - Ok((owned, ptrs)) -} - -const VM_RUNTIME_DIR_ENV: &str = "OPENSHELL_VM_RUNTIME_DIR"; - -pub fn configured_runtime_dir() -> Result { - // Allow override for development - if let Some(path) = std::env::var_os(VM_RUNTIME_DIR_ENV) { - let path = PathBuf::from(path); - tracing::debug!( - path = %path.display(), - "Using runtime from OPENSHELL_VM_RUNTIME_DIR" - ); - return Ok(path); - } - - // Use embedded runtime (extracts on first use) - embedded::ensure_runtime_extracted() -} - -fn validate_runtime_dir(dir: &Path) -> Result<(), VmError> { - if !dir.is_dir() { - return Err(VmError::BinaryNotFound { - path: dir.display().to_string(), - hint: format!( - "VM runtime not found. Run `mise run vm:build:embedded` or set {VM_RUNTIME_DIR_ENV}" - ), - }); - } - - let libkrun = dir.join(ffi::required_runtime_lib_name()); - if !libkrun.is_file() { - return Err(VmError::BinaryNotFound { - path: libkrun.display().to_string(), - hint: "runtime is incomplete: missing libkrun".to_string(), - }); - } - - let has_krunfw = std::fs::read_dir(dir) - .map_err(|e| VmError::HostSetup(format!("read {}: {e}", dir.display())))? 
- .filter_map(Result::ok) - .any(|entry| { - entry - .file_name() - .to_string_lossy() - .starts_with("libkrunfw.") - }); - if !has_krunfw { - return Err(VmError::BinaryNotFound { - path: dir.display().to_string(), - hint: "runtime is incomplete: missing libkrunfw".to_string(), - }); - } - - let gvproxy = dir.join("gvproxy"); - if !gvproxy.is_file() { - return Err(VmError::BinaryNotFound { - path: gvproxy.display().to_string(), - hint: "runtime is incomplete: missing gvproxy".to_string(), - }); - } - - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt as _; - - let mode = std::fs::metadata(&gvproxy) - .map_err(|e| VmError::HostSetup(format!("stat {}: {e}", gvproxy.display())))? - .permissions() - .mode(); - if mode & 0o111 == 0 { - return Err(VmError::HostSetup(format!( - "gvproxy is not executable: {}", - gvproxy.display() - ))); - } - } - - Ok(()) -} - -fn resolve_runtime_bundle() -> Result { - let runtime_dir = configured_runtime_dir()?; - // Validate the directory has required files - validate_runtime_dir(&runtime_dir)?; - Ok(runtime_dir.join("gvproxy")) -} - -pub fn default_runtime_gvproxy_path() -> PathBuf { - configured_runtime_dir() - .or_else(|_| embedded::runtime_cache_path()) - .unwrap_or_else(|_| PathBuf::from("gvproxy")) - .join("gvproxy") -} - -/// Check if the given path looks like an openshell-vm instance rootfs. -fn is_instance_rootfs_path(path: &Path) -> bool { - // Matches: .../openshell/openshell-vm/.../instances/.../rootfs - let s = path.to_string_lossy(); - s.contains("openshell/openshell-vm") && s.contains("instances") && path.ends_with("rootfs") -} - -#[cfg(target_os = "macos")] -fn configure_runtime_loader_env(runtime_dir: &Path) -> Result<(), VmError> { - let existing = std::env::var_os("DYLD_FALLBACK_LIBRARY_PATH"); - let mut paths = vec![runtime_dir.to_path_buf()]; - if let Some(existing) = existing { - paths.extend(std::env::split_paths(&existing)); - } - let joined = std::env::join_paths(paths) - .map_err(|e| VmError::HostSetup(format!("join DYLD_FALLBACK_LIBRARY_PATH: {e}")))?; - unsafe { - std::env::set_var("DYLD_FALLBACK_LIBRARY_PATH", joined); - } - Ok(()) -} - -#[cfg(target_os = "linux")] -fn configure_runtime_loader_env(runtime_dir: &Path) -> Result<(), VmError> { - // On Linux, libkrun.so has a DT_NEEDED for libkrunfw.so. Even though we - // preload libkrunfw with RTLD_GLOBAL, the ELF dynamic linker still resolves - // DT_NEEDED entries through LD_LIBRARY_PATH / system paths. Without this, - // dlopen("libkrun.so") fails if libkrunfw.so is only in the runtime bundle. - let existing = std::env::var_os("LD_LIBRARY_PATH"); - let mut paths = vec![runtime_dir.to_path_buf()]; - if let Some(existing) = existing { - paths.extend(std::env::split_paths(&existing)); - } - let joined = std::env::join_paths(paths) - .map_err(|e| VmError::HostSetup(format!("join LD_LIBRARY_PATH: {e}")))?; - unsafe { - std::env::set_var("LD_LIBRARY_PATH", joined); - } - Ok(()) -} - -#[cfg(not(any(target_os = "macos", target_os = "linux")))] -fn configure_runtime_loader_env(_runtime_dir: &Path) -> Result<(), VmError> { - Ok(()) -} - -fn raise_nofile_limit() { - #[cfg(unix)] - unsafe { - let mut rlim = libc::rlimit { - rlim_cur: 0, - rlim_max: 0, - }; - if libc::getrlimit(libc::RLIMIT_NOFILE, &raw mut rlim) == 0 { - rlim.rlim_cur = rlim.rlim_max; - let _ = libc::setrlimit(libc::RLIMIT_NOFILE, &raw const rlim); - } - } -} - -/// Log runtime provenance information for diagnostics. 
-/// -/// Prints the libkrun/libkrunfw versions, artifact hashes, and whether -/// a custom runtime is in use. This makes it easy to correlate VM issues -/// with the specific runtime bundle. -fn log_runtime_provenance(runtime_dir: &Path) { - if let Some(prov) = ffi::runtime_provenance() { - eprintln!("runtime: {}", runtime_dir.display()); - eprintln!(" libkrun: {}", prov.libkrun_path.display()); - for krunfw in &prov.libkrunfw_paths { - let name = krunfw.file_name().map_or_else( - || "unknown".to_string(), - |n| n.to_string_lossy().to_string(), - ); - eprintln!(" libkrunfw: {name}"); - } - if let Some(ref sha) = prov.libkrunfw_sha256 { - let short = if sha.len() > 12 { &sha[..12] } else { sha }; - eprintln!(" sha256: {short}..."); - } - if prov.is_custom { - eprintln!(" type: custom (OpenShell-built)"); - // Parse provenance.json for additional details. - if let Some(ref json) = prov.provenance_json { - // Extract key fields from provenance metadata. - for key in &["libkrunfw_commit", "kernel_version", "build_timestamp"] { - if let Some(val) = extract_json_string(json, key) { - eprintln!(" {}: {}", key.replace('_', "-"), val); - } - } - } - } else { - eprintln!(" type: stock (system/homebrew)"); - } - } -} - -/// Extract a string value from a JSON object by key. -fn extract_json_string(json: &str, key: &str) -> Option { - let map: serde_json::Map = serde_json::from_str(json).ok()?; - map.get(key)?.as_str().map(ToOwned::to_owned) -} - -fn clamp_log_level(level: u32) -> u32 { - match level { - 0 => ffi::KRUN_LOG_LEVEL_OFF, - 1 => ffi::KRUN_LOG_LEVEL_ERROR, - 2 => ffi::KRUN_LOG_LEVEL_WARN, - 3 => ffi::KRUN_LOG_LEVEL_INFO, - 4 => ffi::KRUN_LOG_LEVEL_DEBUG, - _ => ffi::KRUN_LOG_LEVEL_TRACE, - } -} - -struct VmContext { - krun: &'static ffi::LibKrun, - ctx_id: u32, -} - -impl VmContext { - fn create(log_level: u32) -> Result { - let krun = ffi::libkrun()?; - unsafe { - check( - (krun.krun_init_log)( - ffi::KRUN_LOG_TARGET_DEFAULT, - clamp_log_level(log_level), - ffi::KRUN_LOG_STYLE_AUTO, - ffi::KRUN_LOG_OPTION_NO_ENV, - ), - "krun_init_log", - )?; - } - - let ctx_id = unsafe { (krun.krun_create_ctx)() }; - if ctx_id < 0 { - return Err(VmError::Krun { - func: "krun_create_ctx", - code: ctx_id, - }); - } - - Ok(Self { - krun, - ctx_id: ctx_id.cast_unsigned(), - }) - } - - fn set_vm_config(&self, vcpus: u8, mem_mib: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_set_vm_config)(self.ctx_id, vcpus, mem_mib), - "krun_set_vm_config", - ) - } - } - - fn set_root(&self, rootfs: &Path) -> Result<(), VmError> { - let rootfs_c = path_to_cstring(rootfs)?; - unsafe { - check( - (self.krun.krun_set_root)(self.ctx_id, rootfs_c.as_ptr()), - "krun_set_root", - ) - } - } - - fn add_state_disk(&self, state_disk: &StateDiskConfig) -> Result<(), VmError> { - let Some(add_disk3) = self.krun.krun_add_disk3 else { - return Err(VmError::HostSetup( - "libkrun runtime does not expose krun_add_disk3; rebuild the VM runtime with block support" - .to_string(), - )); - }; - - let block_id_c = CString::new(state_disk.block_id.as_str())?; - let disk_path_c = path_to_cstring(&state_disk.path)?; - unsafe { - check( - add_disk3( - self.ctx_id, - block_id_c.as_ptr(), - disk_path_c.as_ptr(), - ffi::KRUN_DISK_FORMAT_RAW, - false, - false, - state_disk_sync_mode(), - ), - "krun_add_disk3", - ) - } - } - - fn set_workdir(&self, workdir: &str) -> Result<(), VmError> { - let workdir_c = CString::new(workdir)?; - unsafe { - check( - (self.krun.krun_set_workdir)(self.ctx_id, workdir_c.as_ptr()), - "krun_set_workdir", - 
) - } - } - - fn disable_implicit_vsock(&self) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_disable_implicit_vsock)(self.ctx_id), - "krun_disable_implicit_vsock", - ) - } - } - - fn add_vsock(&self, tsi_features: u32) -> Result<(), VmError> { - unsafe { - check( - (self.krun.krun_add_vsock)(self.ctx_id, tsi_features), - "krun_add_vsock", - ) - } - } - - #[cfg(target_os = "macos")] - fn add_net_unixgram( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - flags: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixgram)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - flags, - ), - "krun_add_net_unixgram", - ) - } - } - - #[allow(dead_code)] // FFI binding for future use (e.g. Linux networking) - fn add_net_unixstream( - &self, - socket_path: &Path, - mac: &[u8; 6], - features: u32, - ) -> Result<(), VmError> { - let sock_c = path_to_cstring(socket_path)?; - unsafe { - check( - (self.krun.krun_add_net_unixstream)( - self.ctx_id, - sock_c.as_ptr(), - -1, - mac.as_ptr(), - features, - 0, - ), - "krun_add_net_unixstream", - ) - } - } - - fn set_port_map(&self, port_map: &[String]) -> Result<(), VmError> { - let port_refs: Vec<&str> = port_map.iter().map(String::as_str).collect(); - let (_port_owners, port_ptrs) = c_string_array(&port_refs)?; - unsafe { - check( - (self.krun.krun_set_port_map)(self.ctx_id, port_ptrs.as_ptr()), - "krun_set_port_map", - ) - } - } - - fn add_vsock_port(&self, port: &VsockPort) -> Result<(), VmError> { - let socket_c = path_to_cstring(&port.socket_path)?; - unsafe { - check( - (self.krun.krun_add_vsock_port2)( - self.ctx_id, - port.port, - socket_c.as_ptr(), - port.listen, - ), - "krun_add_vsock_port2", - ) - } - } - - fn set_console_output(&self, path: &Path) -> Result<(), VmError> { - let console_c = path_to_cstring(path)?; - unsafe { - check( - (self.krun.krun_set_console_output)(self.ctx_id, console_c.as_ptr()), - "krun_set_console_output", - ) - } - } - - fn set_exec(&self, exec_path: &str, args: &[String], env: &[String]) -> Result<(), VmError> { - let exec_c = CString::new(exec_path)?; - let argv_refs: Vec<&str> = args.iter().map(String::as_str).collect(); - let (_argv_owners, argv_ptrs) = c_string_array(&argv_refs)?; - let env_refs: Vec<&str> = env.iter().map(String::as_str).collect(); - let (_env_owners, env_ptrs) = c_string_array(&env_refs)?; - - unsafe { - check( - (self.krun.krun_set_exec)( - self.ctx_id, - exec_c.as_ptr(), - argv_ptrs.as_ptr(), - env_ptrs.as_ptr(), - ), - "krun_set_exec", - ) - } - } - - fn start_enter(&self) -> i32 { - unsafe { (self.krun.krun_start_enter)(self.ctx_id) } - } -} - -impl Drop for VmContext { - fn drop(&mut self) { - unsafe { - let ret = (self.krun.krun_free_ctx)(self.ctx_id); - if ret < 0 { - eprintln!( - "warning: krun_free_ctx({}) failed with code {ret}", - self.ctx_id - ); - } - } - } -} - -/// RAII guard that kills and waits on a gvproxy child process when dropped. -/// -/// This prevents orphaned gvproxy processes when early `?` returns in the -/// launch function cause the child to be dropped before cleanup code runs. -/// Call [`GvproxyGuard::disarm`] to take ownership of the child when it -/// should outlive the guard (i.e., after a successful fork). -struct GvproxyGuard { - child: Option, -} - -impl GvproxyGuard { - fn new(child: std::process::Child) -> Self { - Self { child: Some(child) } - } - - /// Take the child out of the guard, preventing it from being killed on drop. 
- /// Use this after the launch is successful and the parent will manage cleanup. - fn disarm(&mut self) -> Option { - self.child.take() - } - - /// Get the child's PID without disarming. - fn id(&self) -> Option { - self.child.as_ref().map(std::process::Child::id) - } -} - -impl Drop for GvproxyGuard { - fn drop(&mut self) { - if let Some(mut child) = self.child.take() { - let pid = child.id(); - let _ = child.kill(); - let _ = child.wait(); - eprintln!("gvproxy cleaned up (pid {pid})"); - } - } -} - -/// Issue a gvproxy expose call via its HTTP API (unix socket). -/// -/// Sends a raw HTTP/1.1 POST request over the unix socket to avoid -/// depending on `curl` being installed on the host. -fn gvproxy_expose(api_sock: &Path, body: &str) -> Result<(), String> { - use std::io::{Read, Write}; - use std::os::unix::net::UnixStream; - - let mut stream = - UnixStream::connect(api_sock).map_err(|e| format!("connect to gvproxy API socket: {e}"))?; - - let request = format!( - "POST /services/forwarder/expose HTTP/1.1\r\n\ - Host: localhost\r\n\ - Content-Type: application/json\r\n\ - Content-Length: {}\r\n\ - Connection: close\r\n\ - \r\n\ - {}", - body.len(), - body, - ); - - stream - .write_all(request.as_bytes()) - .map_err(|e| format!("write to gvproxy API: {e}"))?; - - // Read just enough of the response to get the status line. - let mut buf = [0u8; 1024]; - let n = stream - .read(&mut buf) - .map_err(|e| format!("read from gvproxy API: {e}"))?; - let response = String::from_utf8_lossy(&buf[..n]); - - // Parse the HTTP status code from the first line (e.g. "HTTP/1.1 200 OK"). - let status = response - .lines() - .next() - .and_then(|line| line.split_whitespace().nth(1)) - .unwrap_or("0"); - - match status { - "200" | "204" => Ok(()), - _ => { - let first_line = response.lines().next().unwrap_or(""); - Err(format!("gvproxy API: {first_line}")) - } - } -} - -/// Kill a stale gvproxy process from a previous openshell-vm run. -/// -/// If the CLI crashes or is killed before cleanup, gvproxy keeps running -/// and holds its ports. A new gvproxy instance then fails with -/// "bind: address already in use" when trying to forward ports. -/// -/// We first try to kill the specific gvproxy PID recorded in the VM -/// runtime state. If the state file was deleted (e.g. the user ran -/// `rm -rf` on the data directory), we fall back to killing any gvproxy -/// process holding the target ports. -fn kill_stale_gvproxy(rootfs: &Path) { - kill_stale_gvproxy_by_state(rootfs); -} - -/// Kill stale gvproxy using the PID from the VM state file. -fn kill_stale_gvproxy_by_state(rootfs: &Path) { - let state_path = vm_state_path(rootfs); - let pid = std::fs::read(&state_path) - .ok() - .and_then(|bytes| serde_json::from_slice::(&bytes).ok()) - .and_then(|state| state.gvproxy_pid); - - if let Some(gvproxy_pid) = pid { - kill_gvproxy_pid(gvproxy_pid); - } -} - -/// Kill any gvproxy process holding a specific TCP port. -/// -/// Used as a fallback when the VM state file is missing (e.g. after the -/// user deleted the data directory while a VM was running). -fn kill_stale_gvproxy_by_port(port: u16) { - // Use lsof to find PIDs listening on the target port. 
- let output = std::process::Command::new("lsof") - .args(["-ti", &format!(":{port}")]) - .output(); - - let pids = match output { - Ok(o) if o.status.success() => String::from_utf8_lossy(&o.stdout).to_string(), - _ => return, - }; - - for line in pids.lines() { - if let Ok(pid) = line.trim().parse::() { - let pid_i32 = pid.cast_signed(); - if is_process_named(pid_i32, "gvproxy") { - kill_gvproxy_pid(pid); - } - } - } -} - -fn kill_gvproxy_pid(gvproxy_pid: u32) { - let pid_i32 = gvproxy_pid.cast_signed(); - let is_alive = unsafe { libc::kill(pid_i32, 0) } == 0; - if is_alive { - // Verify the process is actually gvproxy before killing. - // Without this check, PID reuse could cause us to kill an - // unrelated process. - if !is_process_named(pid_i32, "gvproxy") { - eprintln!( - "Stale gvproxy pid {gvproxy_pid} is no longer gvproxy (PID reused), skipping kill" - ); - return; - } - unsafe { - libc::kill(pid_i32, libc::SIGTERM); - } - eprintln!("Killed stale gvproxy process (pid {gvproxy_pid})"); - // Brief pause for the port to be released. - std::thread::sleep(std::time::Duration::from_millis(200)); - } -} - -/// Check whether a process with the given PID has the expected name. -/// -/// On macOS, shells out to `ps` to query the process name. On Linux, reads -/// `/proc//comm`. Returns `false` if the process name cannot be -/// determined (fail-safe: don't kill if we can't verify). -#[cfg(target_os = "macos")] -fn is_process_named(pid: libc::pid_t, expected: &str) -> bool { - // Use `ps -p -o comm=` to get just the process name. - // This avoids depending on libc kinfo_proc struct layout. - std::process::Command::new("ps") - .args(["-p", &pid.to_string(), "-o", "comm="]) - .output() - .ok() - .and_then(|output| { - if output.status.success() { - String::from_utf8(output.stdout).ok() - } else { - None - } - }) - .is_some_and(|name| name.trim().contains(expected)) -} - -#[cfg(target_os = "linux")] -fn is_process_named(pid: libc::pid_t, expected: &str) -> bool { - let comm_path = format!("/proc/{pid}/comm"); - std::fs::read_to_string(comm_path).is_ok_and(|name| name.trim().contains(expected)) -} - -#[cfg(not(any(target_os = "macos", target_os = "linux")))] -fn is_process_named(_pid: libc::pid_t, _expected: &str) -> bool { - // Cannot verify on this platform — fail-safe: don't kill. 
- false -} - -fn vm_rootfs_key(rootfs: &Path) -> String { - let name = rootfs - .file_name() - .and_then(|part| part.to_str()) - .unwrap_or("openshell-vm"); - let mut out = String::with_capacity(name.len()); - for ch in name.chars() { - if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' { - out.push(ch); - } else { - out.push('_'); - } - } - if out.is_empty() { - "openshell-vm".to_string() - } else { - out - } -} - -fn default_state_disk_path(rootfs: &Path) -> PathBuf { - rootfs - .parent() - .unwrap_or(rootfs) - .join(format!("{}-state.raw", vm_rootfs_key(rootfs))) -} - -fn ensure_state_disk_image(state_disk: &StateDiskConfig) -> Result<(), VmError> { - if let Some(parent) = state_disk.path.parent() { - std::fs::create_dir_all(parent).map_err(|e| { - VmError::HostSetup(format!("create state disk dir {}: {e}", parent.display())) - })?; - } - - let file = std::fs::OpenOptions::new() - .read(true) - .write(true) - .create(true) - .truncate(false) - .open(&state_disk.path) - .map_err(|e| { - VmError::HostSetup(format!( - "open state disk {}: {e}", - state_disk.path.display() - )) - })?; - - let current_len = file - .metadata() - .map_err(|e| { - VmError::HostSetup(format!( - "stat state disk {}: {e}", - state_disk.path.display() - )) - })? - .len(); - if current_len < state_disk.size_bytes { - file.set_len(state_disk.size_bytes).map_err(|e| { - VmError::HostSetup(format!( - "resize state disk {} to {} bytes: {e}", - state_disk.path.display(), - state_disk.size_bytes - )) - })?; - } - - Ok(()) -} - -fn state_disk_sync_mode() -> u32 { - #[cfg(target_os = "macos")] - { - ffi::KRUN_SYNC_RELAXED - } - #[cfg(not(target_os = "macos"))] - { - ffi::KRUN_SYNC_FULL - } -} - -fn hash_path_id(path: &Path) -> String { - let mut hash: u64 = 0xcbf2_9ce4_8422_2325; - for byte in path.to_string_lossy().as_bytes() { - hash ^= u64::from(*byte); - hash = hash.wrapping_mul(0x0100_0000_01b3); - } - format!("{:012x}", hash & 0x0000_ffff_ffff_ffff) -} - -/// Return a secure base directory for temporary socket files. -/// -/// Prefers `XDG_RUNTIME_DIR` (per-user, restricted permissions on Linux), -/// falls back to `/tmp`. After `create_dir_all`, validates the directory -/// is not a symlink and is owned by the current user. -fn secure_socket_base(subdir: &str) -> Result { - let base = std::env::var_os("XDG_RUNTIME_DIR").map_or_else( - || { - let mut base = PathBuf::from("/tmp"); - if !base.is_dir() { - base = std::env::temp_dir(); - } - base - }, - PathBuf::from, - ); - let dir = base.join(subdir); - - // If the path exists, verify it is not a symlink before using it. - if dir.exists() { - let meta = dir - .symlink_metadata() - .map_err(|e| VmError::HostSetup(format!("lstat {}: {e}", dir.display())))?; - if meta.file_type().is_symlink() { - return Err(VmError::HostSetup(format!( - "socket directory {} is a symlink — refusing to use it", - dir.display() - ))); - } - // Verify ownership matches current user. - #[cfg(unix)] - { - use std::os::unix::fs::MetadataExt as _; - let uid = unsafe { libc::getuid() }; - if meta.uid() != uid { - return Err(VmError::HostSetup(format!( - "socket directory {} is owned by uid {} but we are uid {} — refusing to use it", - dir.display(), - meta.uid(), - uid - ))); - } - } - } else { - std::fs::create_dir_all(&dir) - .map_err(|e| VmError::HostSetup(format!("create socket dir {}: {e}", dir.display())))?; - // Set restrictive permissions on the newly created directory. 
- #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt as _; - let _ = std::fs::set_permissions(&dir, std::fs::Permissions::from_mode(0o700)); - } - } - - Ok(dir) -} - -fn gvproxy_socket_dir(rootfs: &Path) -> Result { - let dir = secure_socket_base("ovm-gv")?; - - // macOS unix socket path limit is tight (~104 bytes). Keep paths very short. - let id = hash_path_id(rootfs); - Ok(dir.join(id)) -} - -fn gateway_host_port(config: &VmConfig) -> u16 { - config - .port_map - .first() - .and_then(|pm| pm.split(':').next()) - .and_then(|port| port.parse::().ok()) - .unwrap_or(DEFAULT_GATEWAY_PORT) -} - -fn pick_gvproxy_ssh_port() -> Result { - let listener = std::net::TcpListener::bind(("127.0.0.1", 0)) - .map_err(|e| VmError::HostSetup(format!("allocate gvproxy ssh port on localhost: {e}")))?; - let port = listener - .local_addr() - .map_err(|e| VmError::HostSetup(format!("read gvproxy ssh port: {e}")))? - .port(); - drop(listener); - Ok(port) -} - -fn path_to_cstring(path: &Path) -> Result { - let s = path - .to_str() - .ok_or_else(|| VmError::InvalidPath(path.display().to_string()))?; - Ok(CString::new(s)?) -} - -/// Check that `/dev/kvm` is readable before attempting to boot. -/// -/// libkrun panics with an opaque Rust panic (instead of returning an error -/// code) when `/dev/kvm` is inaccessible. This pre-check turns that into a -/// clear, actionable error message. -#[cfg(target_os = "linux")] -fn check_kvm_access() -> Result<(), VmError> { - use std::fs::OpenOptions; - match OpenOptions::new().read(true).open("/dev/kvm") { - Ok(_) => Ok(()), - Err(e) => Err(VmError::KvmAccess { - reason: e.to_string(), - }), - } -} - -// ── Launch ────────────────────────────────────────────────────────────── - -/// Configure and launch a libkrun microVM. -/// -/// This forks the process. The child enters the VM (never returns); the -/// parent blocks until the VM exits or a signal is received. -/// -/// Returns the VM exit code (from `waitpid`). -#[allow(clippy::similar_names)] -pub fn launch(config: &VmConfig) -> Result { - // Auto-extract embedded rootfs if using an instance path and it doesn't exist - if !config.rootfs.is_dir() - && is_instance_rootfs_path(&config.rootfs) - && embedded::has_embedded_rootfs() - { - embedded::extract_rootfs_to(&config.rootfs)?; - } - - // Validate rootfs - if !config.rootfs.is_dir() { - return Err(VmError::RootfsNotFound { - path: config.rootfs.display().to_string(), - }); - } - - // On Linux, libkrun uses KVM for hardware virtualization. Check access - // before starting so a missing kvm group membership produces a clear - // error instead of a cryptic panic inside krun_start_enter. - #[cfg(target_os = "linux")] - check_kvm_access()?; - - if config.exec_path == "/srv/openshell-vm-init.sh" { - ensure_vm_not_running(&config.rootfs)?; - } - - // Acquire an exclusive flock on the rootfs lock file. This is held - // by the parent process for the VM's entire lifetime. If this process - // is killed (even SIGKILL), the OS releases the lock automatically. - // This prevents a second launch or rootfs rebuild from corrupting a - // running VM's filesystem via virtio-fs. - let _rootfs_lock = if config.exec_path == "/srv/openshell-vm-init.sh" { - Some(acquire_rootfs_lock(&config.rootfs)?) - } else { - None - }; - - // Check for a corrupt kine (SQLite) database and remove it if the - // header is invalid. Stale bootstrap locks are handled inside the VM - // by the init script (sqlite3 DELETE before k3s starts). 
This runs on - // every normal boot (not --reset, which wipes k3s/server/ entirely). - // Must happen after the lock so we know no other VM process is using - // the rootfs. - if !config.reset && config.exec_path == "/srv/openshell-vm-init.sh" { - recover_corrupt_kine_db(&config.rootfs)?; - } - - // Wipe stale containerd/kubelet runtime state if requested. - // This must happen after the lock (to confirm no other VM is using - // the rootfs) but before booting (so the new VM starts clean). - if config.reset { - reset_runtime_state(&config.rootfs, &config.gateway_name)?; - } - if config.reset - && let Some(state_disk) = &config.state_disk - && let Err(err) = std::fs::remove_file(&state_disk.path) - && err.kind() != std::io::ErrorKind::NotFound - { - return Err(VmError::HostSetup(format!( - "remove state disk {}: {err}", - state_disk.path.display() - ))); - } - if let Some(state_disk) = &config.state_disk { - ensure_state_disk_image(state_disk)?; - } - - let launch_start = Instant::now(); - eprintln!("rootfs: {}", config.rootfs.display()); - if let Some(state_disk) = &config.state_disk { - eprintln!( - "state disk: {} ({} GiB)", - state_disk.path.display(), - state_disk.size_bytes / 1024 / 1024 / 1024 - ); - } - eprintln!("vm: {} vCPU(s), {} MiB RAM", config.vcpus, config.mem_mib); - - // The runtime is embedded in the binary and extracted on first use. - // Can be overridden via OPENSHELL_VM_RUNTIME_DIR for development. - let runtime_gvproxy = resolve_runtime_bundle()?; - let runtime_dir = runtime_gvproxy.parent().ok_or_else(|| { - VmError::HostSetup(format!( - "runtime bundle file has no parent directory: {}", - runtime_gvproxy.display() - )) - })?; - configure_runtime_loader_env(runtime_dir)?; - raise_nofile_limit(); - - // ── Log runtime provenance ───────────────────────────────────── - // After configuring the loader, trigger library loading so that - // provenance is captured before we proceed with VM configuration. - let _ = ffi::libkrun()?; - log_runtime_provenance(runtime_dir); - - // ── Configure the microVM ────────────────────────────────────── - - let vm = VmContext::create(config.log_level)?; - vm.set_vm_config(config.vcpus, config.mem_mib)?; - vm.set_root(&config.rootfs)?; - if let Some(state_disk) = &config.state_disk { - vm.add_state_disk(state_disk)?; - } - vm.set_workdir(&config.workdir)?; - - // Networking setup — use a drop guard so gvproxy is killed if we - // return early via `?` before reaching the parent's cleanup code. - let mut gvproxy_guard: Option = None; - let mut gvproxy_api_sock: Option = None; - - match &config.net { - NetBackend::Tsi => { - // Default TSI — no special setup needed. - } - NetBackend::None => { - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - eprintln!("Networking: disabled (no TSI, no virtio-net)"); - } - NetBackend::Gvproxy { binary } => { - if !binary.exists() { - return Err(VmError::BinaryNotFound { - path: binary.display().to_string(), - hint: "Install Podman Desktop or place gvproxy in PATH".to_string(), - }); - } - - // Create temp socket paths - let run_dir = config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .to_path_buf(); - let rootfs_key = vm_rootfs_key(&config.rootfs); - let sock_base = gvproxy_socket_dir(&config.rootfs)?; - let net_sock = sock_base.with_extension("v"); - let api_sock = sock_base.with_extension("a"); - - // Kill any stale gvproxy process from a previous run. 
- // First try via the saved PID in the state file, then fall - // back to killing any gvproxy holding our target ports (covers - // the case where the state file was deleted). - kill_stale_gvproxy(&config.rootfs); - for pm in &config.port_map { - if let Some(host_port) = pm.split(':').next().and_then(|p| p.parse::().ok()) { - kill_stale_gvproxy_by_port(host_port); - } - } - - // Clean stale sockets (including the -krun.sock file that - // libkrun creates as its datagram endpoint on macOS). - let _ = std::fs::remove_file(&net_sock); - let _ = std::fs::remove_file(&api_sock); - let krun_sock = sock_base.with_extension("v-krun.sock"); - let _ = std::fs::remove_file(&krun_sock); - - // Start gvproxy - eprintln!("Starting gvproxy: {}", binary.display()); - let ssh_port = pick_gvproxy_ssh_port()?; - let gvproxy_log = run_dir.join(format!("{rootfs_key}-gvproxy.log")); - let gvproxy_log_file = std::fs::File::create(&gvproxy_log) - .map_err(|e| VmError::Fork(format!("failed to create gvproxy log: {e}")))?; - - // On Linux, gvproxy uses QEMU mode (SOCK_STREAM) since the vfkit - // unixgram scheme is macOS/vfkit-specific. On macOS, use vfkit mode. - #[cfg(target_os = "linux")] - let (gvproxy_net_flag, gvproxy_net_url) = - ("-listen-qemu", format!("unix://{}", net_sock.display())); - #[cfg(target_os = "macos")] - let (gvproxy_net_flag, gvproxy_net_url) = ( - "-listen-vfkit", - format!("unixgram://{}", net_sock.display()), - ); - - let child = std::process::Command::new(binary) - .arg(gvproxy_net_flag) - .arg(&gvproxy_net_url) - .arg("-listen") - .arg(format!("unix://{}", api_sock.display())) - .arg("-ssh-port") - .arg(ssh_port.to_string()) - .stdout(std::process::Stdio::null()) - .stderr(gvproxy_log_file) - .spawn() - .map_err(|e| VmError::Fork(format!("failed to start gvproxy: {e}")))?; - - eprintln!( - "gvproxy started (pid {}, ssh port {}) [{:.1}s]", - child.id(), - ssh_port, - launch_start.elapsed().as_secs_f64() - ); - - // Wait for the socket to appear (exponential backoff: 5ms → 100ms). - { - let deadline = Instant::now() + std::time::Duration::from_secs(5); - let mut interval = std::time::Duration::from_millis(5); - while !net_sock.exists() { - if Instant::now() >= deadline { - return Err(VmError::Fork( - "gvproxy socket did not appear within 5s".to_string(), - )); - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(100)); - } - } - - // Disable implicit TSI and add virtio-net via gvproxy - vm.disable_implicit_vsock()?; - vm.add_vsock(0)?; - // This MAC matches gvproxy's default static DHCP lease for - // 192.168.127.2. Using a different MAC can cause the gVisor - // network stack to misroute or drop packets. - let mac: [u8; 6] = [0x5a, 0x94, 0xef, 0xe4, 0x0c, 0xee]; - - // COMPAT_NET_FEATURES from libkrun.h: - // NET_FEATURE_CSUM (1 << 0) | NET_FEATURE_GUEST_CSUM (1 << 1) - // | NET_FEATURE_GUEST_TSO4 (1 << 7) | NET_FEATURE_GUEST_UFO (1 << 10) - // | NET_FEATURE_HOST_TSO4 (1 << 11) | NET_FEATURE_HOST_UFO (1 << 14). - let compat_net_features: u32 = - (1 << 0) | (1 << 1) | (1 << 7) | (1 << 10) | (1 << 11) | (1 << 14); - - // On Linux use unixstream (SOCK_STREAM) to connect to gvproxy's - // QEMU listener. On macOS use unixgram (SOCK_DGRAM) with the vfkit - // magic byte for the vfkit listener. 
- #[cfg(target_os = "linux")] - vm.add_net_unixstream(&net_sock, &mac, compat_net_features)?; - #[cfg(target_os = "macos")] - { - // NET_FLAG_VFKIT = 1 << 0 - let net_flag_vfkit: u32 = 1 << 0; - vm.add_net_unixgram(&net_sock, &mac, compat_net_features, net_flag_vfkit)?; - } - - eprintln!( - "Networking: gvproxy (virtio-net) [{:.1}s]", - launch_start.elapsed().as_secs_f64() - ); - gvproxy_guard = Some(GvproxyGuard::new(child)); - gvproxy_api_sock = Some(api_sock); - } - } - - // Port mapping (TSI only) - if !config.port_map.is_empty() && matches!(config.net, NetBackend::Tsi) { - vm.set_port_map(&config.port_map)?; - } - - for vsock_port in &config.vsock_ports { - if let Some(parent) = vsock_port.socket_path.parent() { - std::fs::create_dir_all(parent).map_err(|e| { - VmError::RuntimeState(format!("create vsock socket dir {}: {e}", parent.display())) - })?; - } - // libkrun returns EEXIST if the socket file is already present from a - // previous run. Remove any stale socket before registering the port. - let _ = std::fs::remove_file(&vsock_port.socket_path); - vm.add_vsock_port(vsock_port)?; - } - - // Console output - let console_log = config.console_output.clone().unwrap_or_else(|| { - config - .rootfs - .parent() - .unwrap_or(&config.rootfs) - .join(format!("{}-console.log", vm_rootfs_key(&config.rootfs))) - }); - vm.set_console_output(&console_log)?; - - // envp: use provided env or minimal defaults - let mut env: Vec = if config.env.is_empty() { - vec![ - "HOME=/root", - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm", - ] - .into_iter() - .map(ToOwned::to_owned) - .collect() - } else { - config.env.clone() - }; - if let Some(state_disk) = &config.state_disk - && !env - .iter() - .any(|entry| entry.starts_with("OPENSHELL_VM_STATE_DISK_DEVICE=")) - { - env.push(format!( - "OPENSHELL_VM_STATE_DISK_DEVICE={}", - state_disk.guest_device - )); - } - vm.set_exec(&config.exec_path, &config.args, &env)?; - - // ── Fork and enter the VM ────────────────────────────────────── - // - // krun_start_enter() never returns — it calls exit() when the guest - // process exits. We fork so the parent can monitor and report. - - let boot_start = Instant::now(); - eprintln!("Booting microVM..."); - - let pid = unsafe { libc::fork() }; - match pid { - -1 => Err(VmError::Fork(std::io::Error::last_os_error().to_string())), - 0 => { - // Child process: enter the VM (never returns on success) - let ret = vm.start_enter(); - eprintln!("krun_start_enter failed: {ret}"); - std::process::exit(1); - } - _ => { - // Parent: wait for child - if config.exec_path == "/srv/openshell-vm-init.sh" { - let gvproxy_pid = gvproxy_guard.as_ref().and_then(GvproxyGuard::id); - if let Err(err) = - write_vm_runtime_state(&config.rootfs, pid, &console_log, gvproxy_pid) - { - unsafe { - libc::kill(pid, libc::SIGTERM); - } - // Guard drop will kill gvproxy automatically - drop(gvproxy_guard); - clear_vm_runtime_state(&config.rootfs); - return Err(err); - } - } - eprintln!( - "VM started (child pid {pid}) [{:.1}s]", - boot_start.elapsed().as_secs_f64() - ); - for pm in &config.port_map { - let host_port = pm.split(':').next().unwrap_or(pm); - eprintln!(" port {pm} -> http://localhost:{host_port}"); - } - eprintln!("Console output: {}", console_log.display()); - - // Set up gvproxy port forwarding via its HTTP API. - // The port_map entries use the same "host:guest" format - // as TSI, but here we translate them into gvproxy expose - // calls targeting the guest IP (192.168.127.2). 
- // - // Instead of a fixed 500ms sleep, poll the API socket with - // exponential backoff (5ms → 200ms, ~1s total budget). - if let Some(ref api_sock) = gvproxy_api_sock { - let fwd_start = Instant::now(); - // Wait for the API socket to appear (it lags slightly - // behind the vfkit data socket). - { - let deadline = Instant::now() + std::time::Duration::from_secs(2); - let mut interval = std::time::Duration::from_millis(5); - while !api_sock.exists() { - if Instant::now() >= deadline { - eprintln!( - "warning: gvproxy API socket not ready after 2s, attempting anyway" - ); - break; - } - std::thread::sleep(interval); - interval = (interval * 2).min(std::time::Duration::from_millis(200)); - } - } - - let guest_ip = "192.168.127.2"; - - for pm in &config.port_map { - let parts: Vec<&str> = pm.split(':').collect(); - let (host_port, guest_port) = match parts.len() { - 2 => (parts[0], parts[1]), - 1 => (parts[0], parts[0]), - _ => { - eprintln!(" skipping invalid port mapping: {pm}"); - continue; - } - }; - - let expose_body = format!( - r#"{{"local":":{host_port}","remote":"{guest_ip}:{guest_port}","protocol":"tcp"}}"# - ); - - // Retry with exponential backoff — gvproxy's internal - // netstack may not be ready immediately after socket creation. - let mut expose_ok = false; - let mut retry_interval = std::time::Duration::from_millis(100); - let expose_deadline = Instant::now() + std::time::Duration::from_secs(10); - loop { - match gvproxy_expose(api_sock, &expose_body) { - Ok(()) => { - eprintln!(" port {host_port} -> {guest_ip}:{guest_port}"); - expose_ok = true; - break; - } - Err(e) => { - if Instant::now() >= expose_deadline { - eprintln!(" port {host_port}: {e} (retries exhausted)"); - break; - } - std::thread::sleep(retry_interval); - retry_interval = - (retry_interval * 2).min(std::time::Duration::from_secs(1)); - } - } - } - if !expose_ok { - return Err(VmError::HostSetup(format!( - "failed to forward port {host_port} via gvproxy" - ))); - } - } - eprintln!( - "Port forwarding ready [{:.1}s]", - fwd_start.elapsed().as_secs_f64() - ); - } - - // Bootstrap the OpenShell control plane and wait for the - // service to be reachable. Only for the gateway preset, and - // only when port forwarding is configured (i.e. the gateway - // is reachable from the host). During rootfs pre-init builds, - // no --port is specified so there is nothing to health-check - // — the build script has its own kubectl-based readiness - // checks inside the VM. - if config.exec_path == "/srv/openshell-vm-init.sh" && !config.port_map.is_empty() { - // Bootstrap stores host-side metadata and mTLS creds. - // With pre-baked rootfs (Path 1) this reads PKI directly - // from virtio-fs — no kubectl or port forwarding needed. - // Cold boot (Path 2) writes secret manifests into the - // k3s auto-deploy directory via virtio-fs. - let gateway_port = gateway_host_port(config); - bootstrap_gateway(&config.rootfs, &config.gateway_name, gateway_port)?; - - // Wait for the gRPC health check to pass. This ensures - // the service is fully operational, not just accepting - // TCP connections. The health check confirms the full - // path (gvproxy → kube-proxy nftables → pod:8080) and - // that the gRPC service is responding to requests. 
- health::wait_for_gateway_ready(gateway_port, &config.gateway_name)?; - } - - eprintln!("Ready [{:.1}s total]", boot_start.elapsed().as_secs_f64()); - eprintln!("Press Ctrl+C to stop."); - - // Forward signals to child - unsafe { - libc::signal( - libc::SIGINT, - forward_signal as *const () as libc::sighandler_t, - ); - libc::signal( - libc::SIGTERM, - forward_signal as *const () as libc::sighandler_t, - ); - CHILD_PID.store(pid, std::sync::atomic::Ordering::Relaxed); - } - - let mut status: libc::c_int = 0; - unsafe { - libc::waitpid(pid, &raw mut status, 0); - } - - // Clean up gvproxy — disarm the guard and do explicit cleanup - // so we can print the "stopped" message. - if config.exec_path == "/srv/openshell-vm-init.sh" { - clear_vm_runtime_state(&config.rootfs); - } - if let Some(mut guard) = gvproxy_guard - && let Some(mut child) = guard.disarm() - { - let _ = child.kill(); - let _ = child.wait(); - eprintln!("gvproxy stopped"); - } - - if libc::WIFEXITED(status) { - let code = libc::WEXITSTATUS(status); - eprintln!("VM exited with code {code}"); - return Ok(code); - } else if libc::WIFSIGNALED(status) { - let sig = libc::WTERMSIG(status); - eprintln!("VM killed by signal {sig}"); - return Ok(128 + sig); - } - - Ok(status) - } - } -} - -// ── Post-boot bootstrap ──────────────────────────────────────────────── - -/// Default gateway port: host port mapped to the `OpenShell` `NodePort` (30051). -const DEFAULT_GATEWAY_PORT: u16 = 30051; - -/// Bootstrap the `OpenShell` control plane after k3s is ready. -/// -/// Two paths: -/// -/// 1. **Warm boot**: host-side metadata and mTLS certs already exist from a -/// previous run. Fetch PKI via the exec agent to detect cert drift (e.g. -/// after a `--reset`), re-sync if needed, then proceed to the health check. -/// -/// 2. **First boot / post-reset**: poll the exec agent to `cat` each PEM file -/// from `/opt/openshell/pki/` until the files exist (PKI generation has -/// finished), then store them in `~/.config/openshell/gateways//mtls/`. -fn bootstrap_gateway(rootfs: &Path, gateway_name: &str, gateway_port: u16) -> Result<(), VmError> { - let bootstrap_start = Instant::now(); - - let metadata = openshell_bootstrap::GatewayMetadata { - name: gateway_name.to_string(), - gateway_endpoint: format!("https://127.0.0.1:{gateway_port}"), - gateway_port, - ..Default::default() - }; - - let exec_socket = vm_exec_socket_path(rootfs); - - // ── Warm boot: host already has certs ────────────────────────── - if is_warm_boot(gateway_name) { - // Always (re-)store metadata so port/endpoint changes are picked up. - openshell_bootstrap::store_gateway_metadata(gateway_name, &metadata) - .map_err(|e| VmError::Bootstrap(format!("failed to store metadata: {e}")))?; - openshell_bootstrap::save_active_gateway(gateway_name) - .map_err(|e| VmError::Bootstrap(format!("failed to set active cluster: {e}")))?; - - // Verify host certs match the VM's PKI. If they diverge (e.g. - // PKI was regenerated after a --reset, or the state disk was - // replaced), re-sync the host certs from the VM via the exec agent. - // - // On warm boot the exec agent may not be ready yet (the VM is - // still booting). Use a short timeout — this is a non-critical - // drift check and the host already has valid certs. If the agent - // isn't reachable we skip silently rather than blocking boot for - // 30s. - // Expected on warm boot — exec agent not ready yet. 
- if let Ok(bundle) = fetch_pki_over_exec(&exec_socket, std::time::Duration::from_secs(5)) - && let Err(e) = sync_host_certs_if_stale(gateway_name, &bundle) - { - eprintln!("Warning: cert sync check failed: {e}"); - } - - eprintln!( - "Warm boot [{:.1}s]", - bootstrap_start.elapsed().as_secs_f64() - ); - eprintln!(" Cluster: {gateway_name}"); - eprintln!(" Gateway: https://127.0.0.1:{gateway_port}"); - eprintln!(" mTLS: ~/.config/openshell/gateways/{gateway_name}/mtls/"); - return Ok(()); - } - - // ── First boot / post-reset: fetch PKI from VM via exec agent ── - // - // The VM init script generates certs on first boot at /opt/openshell/pki/. - // We poll the exec agent with `cat ` for each PEM file until they - // exist, retrying to handle the window between VM boot and PKI generation. - eprintln!("Waiting for VM to generate PKI..."); - let pki_bundle = fetch_pki_over_exec(&exec_socket, std::time::Duration::from_secs(120)) - .map_err(|e| VmError::Bootstrap(format!("VM did not produce PKI within 120s: {e}")))?; - - eprintln!("PKI ready — storing client certs on host..."); - - openshell_bootstrap::store_gateway_metadata(gateway_name, &metadata) - .map_err(|e| VmError::Bootstrap(format!("failed to store metadata: {e}")))?; - - openshell_bootstrap::mtls::store_pki_bundle(gateway_name, &pki_bundle) - .map_err(|e| VmError::Bootstrap(format!("failed to store mTLS creds: {e}")))?; - - openshell_bootstrap::save_active_gateway(gateway_name) - .map_err(|e| VmError::Bootstrap(format!("failed to set active cluster: {e}")))?; - - eprintln!( - "Bootstrap complete [{:.1}s]", - bootstrap_start.elapsed().as_secs_f64() - ); - eprintln!(" Cluster: {gateway_name}"); - eprintln!(" Gateway: https://127.0.0.1:{gateway_port}"); - eprintln!(" mTLS: ~/.config/openshell/gateways/{gateway_name}/mtls/"); - - Ok(()) -} - -/// PKI file names and the corresponding [`PkiBundle`] fields. -const PKI_FILES: &[(&str, &str)] = &[ - ("ca.crt", "ca_cert_pem"), - ("ca.key", "ca_key_pem"), - ("server.crt", "server_cert_pem"), - ("server.key", "server_key_pem"), - ("client.crt", "client_cert_pem"), - ("client.key", "client_key_pem"), -]; - -/// Fetch all six PEM files from `/opt/openshell/pki/` inside the guest by -/// running `cat` via the exec agent. Retries until `timeout` elapses, -/// sleeping 500ms between attempts, to handle the window between VM boot -/// and PKI generation completing. -fn fetch_pki_over_exec( - exec_socket: &Path, - timeout: std::time::Duration, -) -> Result { - let deadline = Instant::now() + timeout; - - loop { - match try_read_pki_files(exec_socket) { - Ok(bundle) => return Ok(bundle), - Err(_) if Instant::now() < deadline => { - std::thread::sleep(std::time::Duration::from_millis(500)); - } - Err(e) => { - return Err(VmError::Bootstrap(format!( - "failed to read PKI files via exec agent: {e}" - ))); - } - } - } -} - -/// Attempt to read all six PEM files from the guest in one pass. 
-fn try_read_pki_files(exec_socket: &Path) -> Result { - let mut pems = std::collections::HashMap::new(); - - for &(filename, _field) in PKI_FILES { - let path = format!("/opt/openshell/pki/{filename}"); - let output = exec_capture(exec_socket, vec!["cat".to_string(), path])?; - let content = String::from_utf8(output).map_err(|e| { - VmError::Bootstrap(format!("PKI file {filename} is not valid UTF-8: {e}")) - })?; - if content.is_empty() { - return Err(VmError::Bootstrap(format!("PKI file {filename} is empty"))); - } - pems.insert(filename, content); - } - - let mut get = |key: &str| -> Result { - pems.remove(key) - .ok_or_else(|| VmError::Bootstrap(format!("PKI file {key} missing from exec output"))) - }; - - Ok(openshell_bootstrap::pki::PkiBundle { - ca_cert_pem: get("ca.crt")?, - ca_key_pem: get("ca.key")?, - server_cert_pem: get("server.crt")?, - server_key_pem: get("server.key")?, - client_cert_pem: get("client.crt")?, - client_key_pem: get("client.key")?, - }) -} - -/// Check whether a previous bootstrap left valid state on disk. -/// -/// A warm boot is detected when both: -/// - Cluster metadata exists: `$XDG_CONFIG_HOME/openshell/gateways/openshell-vm/metadata.json` -/// - mTLS certs exist: `$XDG_CONFIG_HOME/openshell/gateways/openshell-vm/mtls/{ca.crt,tls.crt,tls.key}` -/// -/// When true, the host-side bootstrap (PKI generation, secret manifest writing, -/// metadata storage) can be skipped because the virtio-fs rootfs persists k3s -/// state (TLS certs, kine/SQLite cluster objects, containerd images, helm -/// releases) across VM restarts. The kine database is preserved on normal -/// boots so that pods and other cluster objects survive restarts. -fn is_warm_boot(gateway_name: &str) -> bool { - let Ok(home) = std::env::var("HOME") else { - return false; - }; - - let config_base = - std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); - - let config_dir = PathBuf::from(&config_base) - .join("openshell") - .join("gateways"); - - // Check metadata file. - let metadata_path = config_dir.join(gateway_name).join("metadata.json"); - if !metadata_path.is_file() { - return false; - } - - // Check mTLS cert files. - let mtls_dir = config_dir.join(gateway_name).join("mtls"); - for name in &["ca.crt", "tls.crt", "tls.key"] { - let path = mtls_dir.join(name); - match std::fs::metadata(&path) { - Ok(m) if m.is_file() && m.len() > 0 => {} - _ => return false, - } - } - - true -} - -/// Compare the CA cert on the rootfs (authoritative source) against the -/// host-side copy. If they differ, re-copy all client certs from the rootfs. -/// -/// This catches cases where PKI was regenerated (e.g. rootfs rebuilt, -/// manual reset) but host-side certs survived from a previous boot cycle. 
-fn sync_host_certs_if_stale( - gateway_name: &str, - bundle: &openshell_bootstrap::pki::PkiBundle, -) -> Result<(), VmError> { - let Ok(home) = std::env::var("HOME") else { - return Ok(()); - }; - let config_base = - std::env::var("XDG_CONFIG_HOME").unwrap_or_else(|_| format!("{home}/.config")); - let host_ca = PathBuf::from(&config_base) - .join("openshell/gateways") - .join(gateway_name) - .join("mtls/ca.crt"); - - let host_ca_contents = std::fs::read_to_string(&host_ca) - .map_err(|e| VmError::Bootstrap(format!("failed to read host ca.crt: {e}")))?; - - if bundle.ca_cert_pem.trim() == host_ca_contents.trim() { - return Ok(()); - } - - eprintln!("Cert drift detected — re-syncing mTLS certs from VM..."); - - openshell_bootstrap::mtls::store_pki_bundle(gateway_name, bundle) - .map_err(|e| VmError::Bootstrap(format!("failed to store mTLS creds: {e}")))?; - - eprintln!(" mTLS certs re-synced from VM"); - Ok(()) -} - -static CHILD_PID: std::sync::atomic::AtomicI32 = std::sync::atomic::AtomicI32::new(0); - -extern "C" fn forward_signal(_sig: libc::c_int) { - let pid = CHILD_PID.load(std::sync::atomic::Ordering::Relaxed); - if pid > 0 { - unsafe { - libc::kill(pid, libc::SIGTERM); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::fs; - use std::time::{SystemTime, UNIX_EPOCH}; - - fn temp_runtime_dir() -> PathBuf { - let nanos = SystemTime::now() - .duration_since(UNIX_EPOCH) - .expect("time went backwards") - .as_nanos(); - std::env::temp_dir().join(format!( - "openshell-vm-runtime-{}-{nanos}", - std::process::id() - )) - } - - fn write_runtime_file(path: &Path) { - fs::write(path, b"test").expect("failed to write runtime file"); - } - - #[test] - fn validate_runtime_dir_accepts_minimal_bundle() { - let dir = temp_runtime_dir(); - fs::create_dir_all(&dir).expect("failed to create runtime dir"); - - write_runtime_file(&dir.join(ffi::required_runtime_lib_name())); - write_runtime_file(&dir.join("libkrunfw.test")); - let gvproxy = dir.join("gvproxy"); - write_runtime_file(&gvproxy); - #[cfg(unix)] - { - use std::os::unix::fs::PermissionsExt as _; - - let mut perms = fs::metadata(&gvproxy).expect("stat gvproxy").permissions(); - perms.set_mode(0o755); - fs::set_permissions(&gvproxy, perms).expect("chmod gvproxy"); - } - - validate_runtime_dir(&dir).expect("runtime bundle should validate"); - assert!(gvproxy.exists()); - - let _ = fs::remove_dir_all(&dir); - } - - #[test] - fn validate_runtime_dir_requires_gvproxy() { - let dir = temp_runtime_dir(); - fs::create_dir_all(&dir).expect("failed to create runtime dir"); - - write_runtime_file(&dir.join(ffi::required_runtime_lib_name())); - write_runtime_file(&dir.join("libkrunfw.test")); - - let err = validate_runtime_dir(&dir).expect_err("missing gvproxy should fail"); - match err { - VmError::BinaryNotFound { hint, .. 
} => {
-                assert!(hint.contains("missing gvproxy"));
-            }
-            other => panic!("unexpected error: {other:?}"),
-        }
-
-        let _ = fs::remove_dir_all(&dir);
-    }
-
-    #[test]
-    fn gateway_config_uses_default_state_disk_next_to_rootfs() {
-        let rootfs = PathBuf::from("/tmp/openshell-vm-test/rootfs");
-
-        let config = VmConfig::gateway(rootfs.clone());
-        let state_disk = config
-            .state_disk
-            .expect("gateway should enable a state disk");
-
-        assert_eq!(
-            state_disk.path,
-            rootfs.parent().unwrap().join("rootfs-state.raw")
-        );
-        assert_eq!(state_disk.block_id, DEFAULT_STATE_DISK_BLOCK_ID);
-        assert_eq!(state_disk.guest_device, DEFAULT_STATE_DISK_GUEST_DEVICE);
-        assert_eq!(state_disk.size_bytes, DEFAULT_STATE_DISK_SIZE_BYTES);
-    }
-
-    #[test]
-    fn ensure_state_disk_image_creates_sparse_file() {
-        let dir = temp_runtime_dir();
-        fs::create_dir_all(&dir).expect("failed to create runtime dir");
-
-        let state_disk = StateDiskConfig {
-            path: dir.join("state.raw"),
-            size_bytes: 8 * 1024 * 1024,
-            block_id: DEFAULT_STATE_DISK_BLOCK_ID.to_string(),
-            guest_device: DEFAULT_STATE_DISK_GUEST_DEVICE.to_string(),
-        };
-
-        ensure_state_disk_image(&state_disk).expect("state disk should be created");
-
-        let metadata = fs::metadata(&state_disk.path).expect("stat state disk");
-        assert_eq!(metadata.len(), state_disk.size_bytes);
-
-        let _ = fs::remove_dir_all(&dir);
-    }
-
-    #[test]
-    fn prepare_rootfs_returns_existing_explicit_rootfs() {
-        let dir = temp_runtime_dir();
-        let rootfs = dir.join("rootfs");
-        fs::create_dir_all(&rootfs).expect("failed to create rootfs dir");
-
-        let prepared =
-            prepare_rootfs(Some(rootfs.clone()), "default", false).expect("prepare rootfs");
-
-        assert_eq!(prepared, rootfs);
-
-        let _ = fs::remove_dir_all(&dir);
-    }
-}
diff --git a/crates/openshell-vm/src/main.rs b/crates/openshell-vm/src/main.rs
deleted file mode 100644
index b2dce993e..000000000
--- a/crates/openshell-vm/src/main.rs
+++ /dev/null
@@ -1,279 +0,0 @@
-// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-//! Standalone openshell-vm binary.
-//!
-//! Boots a libkrun microVM running the `OpenShell` control plane (k3s +
-//! openshell-server). Each named instance gets its own rootfs extracted from
-//! the embedded tarball at
-//! `~/.local/share/openshell/openshell-vm/{version}/instances/<name>/rootfs`.
-//!
-//! # Codesigning (macOS)
-//!
-//! This binary must be codesigned with the `com.apple.security.hypervisor`
-//! entitlement. See `entitlements.plist` in this crate.
-//!
-//! ```sh
-//! codesign --entitlements crates/openshell-vm/entitlements.plist --force -s - target/debug/openshell-vm
-//! ```
-
-use std::io::IsTerminal;
-use std::path::PathBuf;
-
-use clap::{Parser, Subcommand, ValueHint};
-
-const DISABLE_STATE_DISK_ENV: &str = "OPENSHELL_VM_DISABLE_STATE_DISK";
-
-/// Boot the `OpenShell` gateway microVM.
-///
-/// Starts a libkrun microVM running a k3s Kubernetes cluster with the
-/// `OpenShell` control plane. Use `--exec` to run a custom process instead.
-#[derive(Parser)]
-#[command(name = "openshell-vm", version)]
-struct Cli {
-    #[command(subcommand)]
-    command: Option<GatewayCommand>,
-
-    /// Path to the rootfs directory (aarch64 Linux).
-    /// Overrides the default instance-based rootfs resolution.
-    #[arg(long, value_hint = ValueHint::DirPath)]
-    rootfs: Option<PathBuf>,
-
-    /// Named VM instance.
-    ///
-    /// When used alone, the rootfs resolves to
-    /// `~/.local/share/openshell/openshell-vm/{version}/instances/<name>/rootfs`
-    /// and is extracted from the embedded tarball on first use.
-    /// When combined with `--rootfs`, only provides the instance identity
-    /// (for exec, gateway name, etc.) while the rootfs comes from the
-    /// explicit path.
-    #[arg(long, default_value = "default")]
-    name: String,
-
-    /// Executable path inside the VM. When set, runs this instead of
-    /// the default k3s server.
-    #[arg(long)]
-    exec: Option<String>,
-
-    /// Arguments to the executable (requires `--exec`).
-    #[arg(long, num_args = 1..)]
-    args: Vec<String>,
-
-    /// Environment variables in `KEY=VALUE` form (requires `--exec`).
-    #[arg(long, num_args = 1..)]
-    env: Vec<String>,
-
-    /// Working directory inside the VM.
-    #[arg(long, default_value = "/")]
-    workdir: String,
-
-    /// Port mappings (`host_port:guest_port`).
-    #[arg(long, short, num_args = 1..)]
-    port: Vec<String>,
-
-    /// Number of virtual CPUs (default: 4 for openshell-vm, 2 for --exec).
-    #[arg(long)]
-    vcpus: Option<u8>,
-
-    /// RAM in MiB (default: 8192 for openshell-vm, 2048 for --exec).
-    #[arg(long)]
-    mem: Option<u32>,
-
-    /// libkrun log level (0=Off .. 5=Trace).
-    #[arg(long, default_value_t = 1)]
-    krun_log_level: u32,
-
-    /// Networking backend: "gvproxy" (default), "tsi", or "none".
-    #[arg(long, default_value = "gvproxy")]
-    net: String,
-
-    /// Wipe all runtime state (containerd, kubelet, k3s) before booting.
-    /// Use this to recover from a corrupted state after a crash or
-    /// unclean shutdown.
-    #[arg(long)]
-    reset: bool,
-}
-
-#[derive(Subcommand)]
-enum GatewayCommand {
-    /// Ensure the target rootfs exists, extracting the embedded rootfs if needed.
-    PrepareRootfs {
-        /// Recreate the target rootfs even if it already exists.
-        #[arg(long)]
-        force: bool,
-    },
-
-    /// Execute a command inside a running openshell-vm VM.
-    Exec {
-        /// Working directory inside the VM.
-        #[arg(long)]
-        workdir: Option<String>,
-
-        /// Environment variables in `KEY=VALUE` form.
-        #[arg(long, num_args = 1..)]
-        env: Vec<String>,
-
-        /// Command and arguments to run inside the VM.
-        #[arg(trailing_var_arg = true)]
-        command: Vec<String>,
-    },
-}
-
-fn main() {
-    // On macOS, libkrun loads libkrunfw.5.dylib via dlopen() with a bare name.
-    // The dynamic linker only finds it if DYLD_LIBRARY_PATH includes the runtime
-    // directory, but env vars set after process start are ignored by dyld. To work
-    // around this, re-exec the binary with DYLD_LIBRARY_PATH set if the runtime
-    // is available and the variable is not already configured.
-    #[cfg(target_os = "macos")]
-    {
-        if std::env::var_os("__OPENSHELL_VM_REEXEC").is_none()
-            && let Ok(runtime_dir) = openshell_vm::configured_runtime_dir()
-        {
-            let needs_reexec = std::env::var_os("DYLD_LIBRARY_PATH").is_none_or(|v| {
-                !v.to_string_lossy()
-                    .contains(runtime_dir.to_str().unwrap_or(""))
-            });
-            if needs_reexec {
-                let mut dyld_paths = vec![runtime_dir];
-                if let Some(existing) = std::env::var_os("DYLD_LIBRARY_PATH") {
-                    dyld_paths.extend(std::env::split_paths(&existing));
-                }
-                let joined = std::env::join_paths(&dyld_paths).expect("join DYLD_LIBRARY_PATH");
-                let exe = std::env::current_exe().expect("current_exe");
-                let args: Vec<String> = std::env::args().skip(1).collect();
-                let err = std::process::Command::new(exe)
-                    .args(&args)
-                    .env("DYLD_LIBRARY_PATH", &joined)
-                    .env("__OPENSHELL_VM_REEXEC", "1")
-                    .status();
-                match err {
-                    Ok(status) => std::process::exit(status.code().unwrap_or(1)),
-                    Err(e) => {
-                        eprintln!("Error: failed to re-exec with DYLD_LIBRARY_PATH: {e}");
-                        std::process::exit(1);
-                    }
-                }
-            }
-        }
-    }
-
-    tracing_subscriber::fmt::init();
-
-    let cli = Cli::parse();
-
-    let code = match run(cli) {
-        Ok(code) => code,
-        Err(e) => {
-            eprintln!("Error: {e}");
-            1
-        }
-    };
-
-    if code != 0 {
-        std::process::exit(code);
-    }
-}
-
-fn run(cli: Cli) -> Result<i32, Box<dyn std::error::Error>> {
-    if let Some(GatewayCommand::PrepareRootfs { force }) = &cli.command {
-        let rootfs = openshell_vm::prepare_rootfs(cli.rootfs.clone(), &cli.name, *force)?;
-        println!("{}", rootfs.display());
-        return Ok(0);
-    }
-
-    if let Some(GatewayCommand::Exec {
-        workdir,
-        env,
-        mut command,
-    }) = cli.command
-    {
-        let effective_tty = std::io::stdin().is_terminal();
-        if command.is_empty() {
-            if effective_tty {
-                command.push("sh".to_string());
-            } else {
-                return Err("openshell-vm exec requires a command when stdin is not a TTY".into());
-            }
-        }
-        return Ok(openshell_vm::exec_running_vm(
-            openshell_vm::VmExecOptions {
-                rootfs: Some(
-                    cli.rootfs
-                        .unwrap_or(openshell_vm::named_rootfs_dir(&cli.name)?),
-                ),
-                command,
-                workdir,
-                env,
-                tty: effective_tty,
-            },
-        )?);
-    }
-
-    let net_backend = match cli.net.as_str() {
-        "tsi" => openshell_vm::NetBackend::Tsi,
-        "none" => openshell_vm::NetBackend::None,
-        "gvproxy" => openshell_vm::NetBackend::Gvproxy {
-            binary: openshell_vm::default_runtime_gvproxy_path(),
-        },
-        other => {
-            return Err(
-                format!("unknown --net backend: {other} (expected: gvproxy, tsi, none)").into(),
-            );
-        }
-    };
-
-    let rootfs = cli
-        .rootfs
-        .map_or_else(|| openshell_vm::ensure_named_rootfs(&cli.name), Ok)?;
-
-    let gateway_name = openshell_vm::gateway_name(&cli.name)?;
-
-    let mut config = if let Some(exec_path) = cli.exec {
-        openshell_vm::VmConfig {
-            rootfs,
-            vcpus: cli.vcpus.unwrap_or(2),
-            mem_mib: cli.mem.unwrap_or(2048),
-            exec_path,
-            args: cli.args,
-            env: cli.env,
-            workdir: cli.workdir,
-            port_map: cli.port,
-            vsock_ports: vec![],
-            log_level: cli.krun_log_level,
-            console_output: None,
-            net: net_backend,
-            reset: cli.reset,
-            gateway_name,
-            state_disk: None,
-        }
-    } else {
-        let mut c = openshell_vm::VmConfig::gateway(rootfs);
-        if !cli.port.is_empty() {
-            c.port_map = cli.port;
-        }
-        if let Some(v) = cli.vcpus {
-            c.vcpus = v;
-        }
-        if let Some(m) = cli.mem {
-            c.mem_mib = m;
-        }
-        c.net = net_backend;
-        c.reset = cli.reset;
-        c.gateway_name = gateway_name;
-        if state_disk_disabled() {
-            c.state_disk = None;
-        }
-        c
-    };
-    config.log_level = cli.krun_log_level;
-
-    Ok(openshell_vm::launch(&config)?)
-} - -fn state_disk_disabled() -> bool { - matches!( - std::env::var(DISABLE_STATE_DISK_ENV).ok().as_deref(), - Some("1" | "true" | "TRUE" | "yes" | "YES") - ) -} diff --git a/crates/openshell-vm/tests/gateway_integration.rs b/crates/openshell-vm/tests/gateway_integration.rs deleted file mode 100644 index fabfd74c8..000000000 --- a/crates/openshell-vm/tests/gateway_integration.rs +++ /dev/null @@ -1,155 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! Integration tests for the standalone `openshell-vm` binary. -//! -//! These tests require: -//! - libkrun installed (e.g. `brew tap slp/krun && brew install libkrun`) -//! - macOS ARM64 with Apple Hypervisor.framework -//! - An `openshell-vm` binary built with an embedded rootfs tarball -//! (for example via `mise run vm:build:embedded`) -//! -//! All tests are `#[ignore]` — run them explicitly: -//! -//! ```sh -//! cargo test -p openshell-vm --test gateway_integration -- --ignored -//! ``` - -#![allow(unsafe_code)] - -use std::net::{SocketAddr, TcpStream}; -use std::process::{Command, Stdio}; -use std::time::{Duration, Instant}; - -/// Path to the built `openshell-vm` binary (resolved by Cargo at compile time). -const GATEWAY: &str = env!("CARGO_BIN_EXE_openshell-vm"); - -// ── Helpers ──────────────────────────────────────────────────────────── - -/// Codesign the binary on macOS so it can access Hypervisor.framework. -fn codesign_if_needed() { - if cfg!(target_os = "macos") { - let entitlements = format!("{}/entitlements.plist", env!("CARGO_MANIFEST_DIR")); - let status = Command::new("codesign") - .args([ - "--entitlements", - &entitlements, - "--force", - "-s", - "-", - GATEWAY, - ]) - .status() - .expect("codesign command failed to execute"); - assert!(status.success(), "failed to codesign openshell-vm binary"); - } -} - -fn assert_runtime_bundle_staged() { - let bundle_dir = std::path::Path::new(GATEWAY) - .parent() - .expect("openshell-vm binary has no parent") - .join("openshell-vm.runtime"); - assert!( - bundle_dir.is_dir(), - "openshell-vm.runtime is missing next to the test binary: {}. Run `mise run vm:bundle-runtime` first.", - bundle_dir.display() - ); -} - -// ── Tests ────────────────────────────────────────────────────────────── - -/// Boot the full `OpenShell` gateway and verify the gRPC service becomes -/// reachable on port 30051. -#[test] -#[ignore = "requires libkrun + rootfs"] -fn gateway_boots_and_service_becomes_reachable() { - codesign_if_needed(); - assert_runtime_bundle_staged(); - - let mut cmd = Command::new(GATEWAY); - cmd.stdout(Stdio::null()).stderr(Stdio::piped()); - - let mut child = cmd.spawn().expect("failed to start openshell-vm"); - - // Poll for the OpenShell gRPC service. - let addr: SocketAddr = ([127, 0, 0, 1], 30051).into(); - let timeout = Duration::from_secs(180); - let start = Instant::now(); - let mut reachable = false; - - while start.elapsed() < timeout { - if TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { - reachable = true; - break; - } - std::thread::sleep(Duration::from_secs(2)); - } - - // Tear down regardless of result. 
- let _ = unsafe { libc::kill(child.id().cast_signed(), libc::SIGTERM) }; - let _ = child.wait(); - - assert!( - reachable, - "openshell-vm service on port 30051 not reachable within {timeout:?}" - ); -} - -/// Run a trivial command inside the VM via `--exec` and verify it exits -/// successfully, proving the VM boots and can execute guest processes. -#[test] -#[ignore = "requires libkrun + rootfs"] -fn gateway_exec_runs_guest_command() { - codesign_if_needed(); - assert_runtime_bundle_staged(); - - let mut cmd = Command::new(GATEWAY); - cmd.args(["--exec", "/bin/true"]); - - let output = cmd.output().expect("failed to run openshell-vm --exec"); - - assert!( - output.status.success(), - "openshell-vm --exec /bin/true failed with status {:?}\nstderr: {}", - output.status, - String::from_utf8_lossy(&output.stderr), - ); -} - -/// Boot the VM, then use `openshell-vm exec` against the running instance. -#[test] -#[ignore = "requires libkrun + rootfs"] -fn gateway_exec_attaches_to_running_vm() { - codesign_if_needed(); - assert_runtime_bundle_staged(); - - let mut boot = Command::new(GATEWAY); - boot.stdout(Stdio::null()).stderr(Stdio::piped()); - let mut child = boot.spawn().expect("failed to start openshell-vm VM"); - - let addr: SocketAddr = ([127, 0, 0, 1], 30051).into(); - let timeout = Duration::from_secs(180); - let start = Instant::now(); - while start.elapsed() < timeout { - if TcpStream::connect_timeout(&addr, Duration::from_secs(1)).is_ok() { - break; - } - std::thread::sleep(Duration::from_secs(2)); - } - - let output = Command::new(GATEWAY) - .args(["exec", "--", "/bin/true"]) - .output() - .expect("failed to run openshell-vm exec"); - - let _ = unsafe { libc::kill(child.id().cast_signed(), libc::SIGTERM) }; - let _ = child.wait(); - - assert!( - output.status.success(), - "openshell-vm exec -- /bin/true failed with status {:?}\nstderr: {}", - output.status, - String::from_utf8_lossy(&output.stderr), - ); -} diff --git a/e2e/rust/e2e-vm.sh b/e2e/rust/e2e-vm.sh index dc462bf98..a4afb16d5 100755 --- a/e2e/rust/e2e-vm.sh +++ b/e2e/rust/e2e-vm.sh @@ -57,7 +57,7 @@ STATE_DIR_ROOT="/tmp" # Smoke test timeouts. First boot extracts the embedded libkrun runtime # (~60-90MB of zstd per architecture) and prepares a sandbox rootfs from the -# configured image. The guest then runs k3s-free sandbox supervisor startup; a +# configured image. The guest then starts the sandbox supervisor directly; a # cold microVM is typically ready within ~15s after image preparation. GATEWAY_READY_TIMEOUT=60 SANDBOX_PROVISION_TIMEOUT=180 diff --git a/rfc/0001-core-architecture/README.md b/rfc/0001-core-architecture/README.md index eb5d91d56..0a3af73a8 100644 --- a/rfc/0001-core-architecture/README.md +++ b/rfc/0001-core-architecture/README.md @@ -235,7 +235,7 @@ Background coordination must also avoid a singleton controller. Reconciliation, ### Published Artifacts -This architecture ships as three core binaries, a standalone VM binary for quick single-player getting started, two container images, and a Kubernetes deployment story. +This architecture ships as three core binaries, a VM compute driver, two container images, and a Kubernetes deployment story. #### Binaries @@ -245,7 +245,7 @@ This architecture ships as three core binaries, a standalone VM binary for quick `openshell-supervisor` **(Supervisor).** The sandbox-side security boundary. Runs inside every sandbox environment, connects outbound to the gateway, and enforces policy. 
Distributed as a standalone executable and as the entrypoint of the supervisor container image. -`openshell-vm` **(VM).** The single-player runtime. Launches a lightweight microVM (via libkrun) that embeds both the gateway and supervisor in a single local VM, providing a fully self-contained sandbox environment without requiring a cluster or container runtime. Distributed as a standalone executable for macOS and Linux. Used by the CLI when running in single-player mode. +`openshell-driver-vm` **(VM compute driver).** Launches per-sandbox microVMs through the gateway compute-driver interface. Distributed as a standalone helper binary for supported macOS and Linux hosts. #### Container images diff --git a/tasks/scripts/vm/build-libkrun.sh b/tasks/scripts/vm/build-libkrun.sh index 46e9c041d..580f6bf4e 100755 --- a/tasks/scripts/vm/build-libkrun.sh +++ b/tasks/scripts/vm/build-libkrun.sh @@ -152,7 +152,7 @@ if [ -f "$KERNEL_CONFIG" ]; then echo " Applied custom kernel config fragment: openshell.kconfig" else echo "Warning: Custom kernel config not found at ${KERNEL_CONFIG}" >&2 - echo " Building with default config (k3s networking may not work)" >&2 + echo " Building with default config (sandbox networking may lack required kernel features)" >&2 fi echo " Building kernel and libkrunfw (this may take 15-20 minutes)..."