From a04e55289aecd5e6fa87e78561a038f22c6ca15e Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 11 May 2026 18:19:08 -0700 Subject: [PATCH 1/5] feat(gateway): add TOML configuration file (RFC 0003) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces an opt-in --config / OPENSHELL_GATEWAY_CONFIG flag that loads a TOML file with gateway-wide settings and per-driver tables. Source precedence is CLI > env > file > built-in default, implemented via clap's ValueSource so existing flags and env vars keep their priority. Driver crates (kubernetes, docker, podman, vm) now derive Deserialize on their config structs. SupervisorSideloadMethod gains Deserialize with kebab-case rename. A per-driver inheritance allowlist on the loader side overlays [openshell.gateway] shared defaults (default_image, supervisor_image, image_pull_policy, guest_tls_*, ssh_handshake_skew_secs, client_tls_secret_name, host_gateway_ip, enable_user_namespaces) onto each [openshell.drivers.] table before deserialization. The Helm chart renders a new gateway-config ConfigMap and mounts it at /etc/openshell/gateway.toml. The migrated OPENSHELL_* env entries are dropped from the StatefulSet — only the Secret-backed OPENSHELL_SSH_HANDSHAKE_SECRET remains. database_url stays on --db-url. Adds examples/gateway/gateway.example.toml and updates architecture/gateway.md with the source precedence and inheritance rules. --- Cargo.lock | 62 +++ Cargo.toml | 1 + architecture/gateway.md | 36 ++ crates/openshell-driver-docker/Cargo.toml | 1 + crates/openshell-driver-docker/src/lib.rs | 16 +- crates/openshell-driver-kubernetes/Cargo.toml | 1 + .../openshell-driver-kubernetes/src/config.rs | 32 +- crates/openshell-driver-podman/src/config.rs | 3 +- crates/openshell-server/Cargo.toml | 1 + crates/openshell-server/src/cli.rs | 502 +++++++++++++++-- crates/openshell-server/src/compute/vm.rs | 3 +- crates/openshell-server/src/config_file.rs | 509 ++++++++++++++++++ crates/openshell-server/src/lib.rs | 213 +++++--- .../openshell/templates/gateway-config.yaml | 91 ++++ .../helm/openshell/templates/statefulset.yaml | 97 +--- .../tests/sandbox_namespace_test.yaml | 24 +- examples/gateway/gateway.example.toml | 78 +++ rfc/0003-gateway-configuration/README.md | 1 - 18 files changed, 1434 insertions(+), 237 deletions(-) create mode 100644 crates/openshell-server/src/config_file.rs create mode 100644 deploy/helm/openshell/templates/gateway-config.yaml create mode 100644 examples/gateway/gateway.example.toml diff --git a/Cargo.lock b/Cargo.lock index 05a1bdff2..40a57647d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3414,6 +3414,7 @@ dependencies = [ "bytes", "futures", "openshell-core", + "serde", "tar", "tempfile", "tokio", @@ -3436,6 +3437,7 @@ dependencies = [ "openshell-core", "prost", "prost-types", + "serde", "serde_json", "thiserror 2.0.18", "tokio", @@ -3667,6 +3669,7 @@ dependencies = [ "tokio-rustls", "tokio-stream", "tokio-tungstenite 0.26.2", + "toml", "tonic", "tower 0.5.3", "tower-http 0.6.8", @@ -5128,6 +5131,15 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -6065,6 +6077,47 @@ dependencies = [ "tokio", ] +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap 2.14.0", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + [[package]] name = "tonic" version = "0.12.3" @@ -7154,6 +7207,15 @@ version = "0.53.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + [[package]] name = "wiremock" version = "0.6.5" diff --git a/Cargo.toml b/Cargo.toml index 9bc3f9ea2..7f283eb24 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,6 +69,7 @@ nix = { version = "0.29", features = ["signal", "process", "user", "fs", "term"] serde = { version = "1", features = ["derive"] } serde_json = "1" serde_yml = "0.0.12" +toml = "0.8" apollo-parser = "0.8.5" # HTTP client diff --git a/architecture/gateway.md b/architecture/gateway.md index a1320cfaa..1279e7036 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -172,6 +172,42 @@ pre-created Secrets) disable the Helm hook via `pkiInitJob.enabled=false`. The chart also ships a `certManager.*` path that produces equivalent Secrets through cert-manager `Issuer`/`Certificate` resources. +## Configuration + +The gateway reads its configuration from three sources, merged in this +precedence (highest first): + +``` +CLI flag > OPENSHELL_* env var > TOML file > built-in default +``` + +The TOML file is opt-in via `--config ` / `OPENSHELL_GATEWAY_CONFIG`. +When unset, the gateway behaves exactly as before — CLI flags and env vars +drive every setting. See `examples/gateway/gateway.example.toml` for a +worked example and RFC 0003 for the full schema. + +Two secrets are env-only and rejected when present in the file: + +| Field | Source | +|------------------------|--------------------------------------------------| +| `database_url` | `OPENSHELL_DB_URL` / `--db-url` | +| `ssh_handshake_secret` | `OPENSHELL_SSH_HANDSHAKE_SECRET` / `--ssh-handshake-secret` | + +### Driver inheritance + +`[openshell.gateway]` carries a small set of values (`default_image`, +`supervisor_image`, `image_pull_policy`, `guest_tls_ca/cert/key`, +`client_tls_secret_name`, `host_gateway_ip`, `ssh_handshake_skew_secs`, +`enable_user_namespaces`) that are inherited into each driver's +`[openshell.drivers.]` table when the driver-specific table does not +override them. The allowlist is per-driver so a gateway-wide default cannot +land in a driver that does not understand it (e.g. +`client_tls_secret_name` is K8s-only). + +Driver-specific values that are not part of the inheritance allowlist +(e.g. K8s `namespace`, Podman `socket_path`, VM `vcpus`) only come from +the driver's own table. + ## Operational Constraints - Gateway TLS and client certificate distribution are deployment concerns owned diff --git a/crates/openshell-driver-docker/Cargo.toml b/crates/openshell-driver-docker/Cargo.toml index 79d4fb37d..e2c97532a 100644 --- a/crates/openshell-driver-docker/Cargo.toml +++ b/crates/openshell-driver-docker/Cargo.toml @@ -19,6 +19,7 @@ futures = { workspace = true } tokio-stream = { workspace = true } tracing = { workspace = true } bytes = { workspace = true } +serde = { workspace = true } bollard = { version = "0.20" } tar = "0.4" tempfile = "3" diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 6059596ab..785cc4cd3 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -127,7 +127,8 @@ pub trait SupervisorReadiness: Send + Sync + 'static { } /// Gateway-local configuration for the Docker compute driver. -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct DockerComputeConfig { /// Optional override for the Linux `openshell-sandbox` binary mounted into containers. pub supervisor_bin: Option, @@ -151,6 +152,19 @@ pub struct DockerComputeConfig { pub network_name: String, } +impl Default for DockerComputeConfig { + fn default() -> Self { + Self { + supervisor_bin: None, + supervisor_image: None, + guest_tls_ca: None, + guest_tls_cert: None, + guest_tls_key: None, + network_name: DEFAULT_DOCKER_NETWORK_NAME.to_string(), + } + } +} + #[derive(Debug, Clone, PartialEq, Eq)] pub(crate) struct DockerGuestTlsPaths { pub(crate) ca: PathBuf, diff --git a/crates/openshell-driver-kubernetes/Cargo.toml b/crates/openshell-driver-kubernetes/Cargo.toml index 5e247dc77..c222c9c31 100644 --- a/crates/openshell-driver-kubernetes/Cargo.toml +++ b/crates/openshell-driver-kubernetes/Cargo.toml @@ -26,6 +26,7 @@ tokio-stream = { workspace = true } kube = { workspace = true } kube-runtime = { workspace = true } k8s-openapi = { workspace = true } +serde = { workspace = true } serde_json = { workspace = true } clap = { workspace = true } tracing = { workspace = true } diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index e9baf0e5c..2eab5a4c7 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -1,8 +1,15 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use openshell_core::config::{ + DEFAULT_IMAGE_PULL_POLICY, DEFAULT_K8S_NAMESPACE, DEFAULT_SSH_HANDSHAKE_SKEW_SECS, + DEFAULT_SUPERVISOR_IMAGE, +}; +use serde::{Deserialize, Serialize}; + /// How the supervisor binary is delivered into sandbox pods. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "kebab-case")] pub enum SupervisorSideloadMethod { /// Mount the supervisor OCI image directly as a read-only volume /// (requires Kubernetes >= v1.33 with the `ImageVolume` feature gate, @@ -37,7 +44,8 @@ impl std::str::FromStr for SupervisorSideloadMethod { } } -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct KubernetesComputeConfig { pub namespace: String, pub default_image: String, @@ -59,3 +67,23 @@ pub struct KubernetesComputeConfig { pub host_gateway_ip: String, pub enable_user_namespaces: bool, } + +impl Default for KubernetesComputeConfig { + fn default() -> Self { + Self { + namespace: DEFAULT_K8S_NAMESPACE.to_string(), + default_image: String::new(), + image_pull_policy: DEFAULT_IMAGE_PULL_POLICY.to_string(), + supervisor_image: DEFAULT_SUPERVISOR_IMAGE.to_string(), + supervisor_image_pull_policy: String::new(), + supervisor_sideload_method: SupervisorSideloadMethod::default(), + grpc_endpoint: String::new(), + ssh_socket_path: "/run/openshell/ssh.sock".to_string(), + ssh_handshake_secret: String::new(), + ssh_handshake_skew_secs: DEFAULT_SSH_HANDSHAKE_SKEW_SECS, + client_tls_secret_name: String::new(), + host_gateway_ip: String::new(), + enable_user_namespaces: false, + } + } +} diff --git a/crates/openshell-driver-podman/src/config.rs b/crates/openshell-driver-podman/src/config.rs index d82b8d0b0..cfb58751e 100644 --- a/crates/openshell-driver-podman/src/config.rs +++ b/crates/openshell-driver-podman/src/config.rs @@ -61,7 +61,8 @@ impl FromStr for ImagePullPolicy { } } -#[derive(Clone)] +#[derive(Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct PodmanComputeConfig { /// Path to the Podman API Unix socket. /// Default: `$XDG_RUNTIME_DIR/podman/podman.sock` (Linux), diff --git a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 9cba99045..f03f6f3cc 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -74,6 +74,7 @@ bytes = { workspace = true } pin-project-lite = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +toml = { workspace = true } tokio-stream = { workspace = true } sqlx = { workspace = true } reqwest = { workspace = true } diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index a3098c1cf..33a37d50f 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -3,7 +3,8 @@ //! Shared CLI entrypoint for the gateway binaries. -use clap::{Command, CommandFactory, FromArgMatches, Parser}; +use clap::parser::ValueSource; +use clap::{ArgMatches, Command, CommandFactory, FromArgMatches, Parser}; use miette::{IntoDiagnostic, Result}; use openshell_core::ComputeDriverKind; use openshell_core::config::{ @@ -17,6 +18,7 @@ use tracing_subscriber::EnvFilter; use crate::certgen; use crate::compute::{DockerComputeConfig, VmComputeConfig}; +use crate::config_file::{self, ConfigFile, GatewayFileSection}; use crate::{run_server, tracing_bus::TracingLogBus}; /// `OpenShell` gateway process - gRPC and HTTP server with protocol multiplexing. @@ -43,6 +45,14 @@ enum Commands { #[derive(clap::Args, Debug)] struct RunArgs { + /// Path to a TOML configuration file (see RFC 0003). + /// + /// When set, gateway-wide settings and per-driver tables are read from + /// the file. Command-line flags and `OPENSHELL_*` environment variables + /// continue to take precedence over file values. + #[arg(long, env = "OPENSHELL_GATEWAY_CONFIG")] + config: Option, + /// IP address to bind the server, health, and metrics listeners to. #[arg(long, default_value = "127.0.0.1", env = "OPENSHELL_BIND_ADDRESS")] bind_address: IpAddr, @@ -310,15 +320,28 @@ pub async fn run_cli() -> Result<()> { .install_default() .map_err(|e| miette::miette!("failed to install rustls crypto provider: {e:?}"))?; - let cli = Cli::from_arg_matches(&command().get_matches()).expect("clap validated args"); + let matches = command().get_matches(); + let cli = Cli::from_arg_matches(&matches).expect("clap validated args"); match cli.command { Some(Commands::GenerateCerts(args)) => certgen::run(args).await, - None => Box::pin(run_from_args(cli.run)).await, + None => Box::pin(run_from_args(cli.run, matches)).await, } } -async fn run_from_args(args: RunArgs) -> Result<()> { +async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { + // Load TOML file when --config / OPENSHELL_GATEWAY_CONFIG is set. + // File values are applied below for any argument that is still at its + // built-in default — CLI flags and OPENSHELL_* env vars always win. + let file: Option = if let Some(path) = args.config.clone() { + Some(config_file::load(&path).map_err(|e| miette::miette!("{e}"))?) + } else { + None + }; + if let Some(file) = file.as_ref() { + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + } + let tracing_log_bus = TracingLogBus::new(); tracing_log_bus.install_subscriber( EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)), @@ -329,15 +352,15 @@ async fn run_from_args(args: RunArgs) -> Result<()> { let tls = if args.disable_tls { None } else { - let cert_path = args.tls_cert.ok_or_else(|| { + let cert_path = args.tls_cert.clone().ok_or_else(|| { miette::miette!( "--tls-cert is required when TLS is enabled (use --disable-tls to skip)" ) })?; - let key_path = args.tls_key.ok_or_else(|| { + let key_path = args.tls_key.clone().ok_or_else(|| { miette::miette!("--tls-key is required when TLS is enabled (use --disable-tls to skip)") })?; - let client_ca_path = args.tls_client_ca.ok_or_else(|| { + let client_ca_path = args.tls_client_ca.clone().ok_or_else(|| { miette::miette!( "--tls-client-ca is required when TLS is enabled (use --disable-tls to skip)" ) @@ -352,6 +375,7 @@ async fn run_from_args(args: RunArgs) -> Result<()> { let db_url = args .db_url + .clone() .ok_or_else(|| miette::miette!("--db-url is required (or set OPENSHELL_DB_URL)"))?; let mut config = openshell_core::Config::new(tls) @@ -388,71 +412,53 @@ async fn run_from_args(args: RunArgs) -> Result<()> { config = config .with_database_url(db_url) - .with_compute_drivers(args.drivers) - .with_sandbox_namespace(args.sandbox_namespace) - .with_ssh_gateway_host(args.ssh_gateway_host) + .with_compute_drivers(args.drivers.clone()) + .with_sandbox_namespace(args.sandbox_namespace.clone()) + .with_ssh_gateway_host(args.ssh_gateway_host.clone()) .with_ssh_gateway_port(args.ssh_gateway_port) .with_sandbox_ssh_port(args.sandbox_ssh_port) .with_ssh_handshake_skew_secs(args.ssh_handshake_skew_secs); - if let Some(image) = args.sandbox_image { + if let Some(image) = args.sandbox_image.clone() { config = config.with_sandbox_image(image); } - if let Some(policy) = args.sandbox_image_pull_policy { + if let Some(policy) = args.sandbox_image_pull_policy.clone() { config = config.with_sandbox_image_pull_policy(policy); } - if let Some(endpoint) = args.grpc_endpoint { + if let Some(endpoint) = args.grpc_endpoint.clone() { config = config.with_grpc_endpoint(endpoint); } - if let Some(secret) = args.ssh_handshake_secret { + if let Some(secret) = args.ssh_handshake_secret.clone() { config = config.with_ssh_handshake_secret(secret); } - if let Some(name) = args.client_tls_secret_name { + if let Some(name) = args.client_tls_secret_name.clone() { config = config.with_client_tls_secret_name(name); } - if let Some(ip) = args.host_gateway_ip { + if let Some(ip) = args.host_gateway_ip.clone() { config = config.with_host_gateway_ip(ip); } - if let Some(issuer) = args.oidc_issuer { + if let Some(issuer) = args.oidc_issuer.clone() { config = config.with_oidc(openshell_core::OidcConfig { issuer, - audience: args.oidc_audience, + audience: args.oidc_audience.clone(), jwks_ttl_secs: args.oidc_jwks_ttl, - roles_claim: args.oidc_roles_claim, - admin_role: args.oidc_admin_role, - user_role: args.oidc_user_role, - scopes_claim: args.oidc_scopes_claim, + roles_claim: args.oidc_roles_claim.clone(), + admin_role: args.oidc_admin_role.clone(), + user_role: args.oidc_user_role.clone(), + scopes_claim: args.oidc_scopes_claim.clone(), }); } config.enable_user_namespaces = args.enable_user_namespaces; - let vm_config = VmComputeConfig { - state_dir: args.vm_driver_state_dir, - driver_dir: args.driver_dir, - default_image: config.sandbox_image.clone(), - krun_log_level: args.vm_krun_log_level, - vcpus: args.vm_vcpus, - mem_mib: args.vm_mem_mib, - guest_tls_ca: args.vm_tls_ca, - guest_tls_cert: args.vm_tls_cert, - guest_tls_key: args.vm_tls_key, - }; - - let docker_config = DockerComputeConfig { - supervisor_bin: args.docker_supervisor_bin, - supervisor_image: args.docker_supervisor_image, - guest_tls_ca: args.docker_tls_ca, - guest_tls_cert: args.docker_tls_cert, - guest_tls_key: args.docker_tls_key, - network_name: args.docker_network_name, - }; + let vm_config = build_vm_config(&args, &matches, &config, file.as_ref())?; + let docker_config = build_docker_config(&args, &matches, file.as_ref())?; if args.disable_tls { info!("TLS disabled — listening on plaintext HTTP"); @@ -462,15 +468,258 @@ async fn run_from_args(args: RunArgs) -> Result<()> { info!(bind = %config.bind_address, "Starting OpenShell server"); - run_server(config, vm_config, docker_config, tracing_log_bus) - .await - .into_diagnostic() + Box::pin(run_server( + config, + vm_config, + docker_config, + file, + tracing_log_bus, + )) + .await + .into_diagnostic() } fn parse_compute_driver(value: &str) -> std::result::Result { value.parse() } +/// Returns `true` when an argument's value came from clap's built-in default +/// (or was never supplied at all). When the predicate is `true`, the loader +/// is free to replace the value with one read from the TOML config file. +fn arg_defaulted(matches: &ArgMatches, id: &str) -> bool { + matches!( + matches.value_source(id), + None | Some(ValueSource::DefaultValue) + ) +} + +/// Apply gateway-wide values from `[openshell.gateway]` onto `RunArgs` for +/// every argument that is still sourced from clap's built-in default. +/// +/// The function intentionally does not touch `database_url` or +/// `ssh_handshake_secret` — those secrets are env-only and the loader +/// already rejected them when they appear in the file. +fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches: &ArgMatches) { + if let Some(addr) = file.bind_address { + if arg_defaulted(matches, "bind_address") { + args.bind_address = addr.ip(); + } + if arg_defaulted(matches, "port") { + args.port = addr.port(); + } + } + if let Some(addr) = file.health_bind_address + && arg_defaulted(matches, "health_port") + { + args.health_port = addr.port(); + } + if let Some(addr) = file.metrics_bind_address + && arg_defaulted(matches, "metrics_port") + { + args.metrics_port = addr.port(); + } + if let Some(level) = &file.log_level + && arg_defaulted(matches, "log_level") + { + args.log_level.clone_from(level); + } + if let Some(drivers) = &file.compute_drivers + && arg_defaulted(matches, "drivers") + { + args.drivers.clone_from(drivers); + } + if let Some(ns) = &file.sandbox_namespace + && arg_defaulted(matches, "sandbox_namespace") + { + args.sandbox_namespace.clone_from(ns); + } + if let Some(port) = file.sandbox_ssh_port + && arg_defaulted(matches, "sandbox_ssh_port") + { + args.sandbox_ssh_port = port; + } + if let Some(host) = &file.ssh_gateway_host + && arg_defaulted(matches, "ssh_gateway_host") + { + args.ssh_gateway_host.clone_from(host); + } + if let Some(port) = file.ssh_gateway_port + && arg_defaulted(matches, "ssh_gateway_port") + { + args.ssh_gateway_port = port; + } + if let Some(skew) = file.ssh_handshake_skew_secs + && arg_defaulted(matches, "ssh_handshake_skew_secs") + { + args.ssh_handshake_skew_secs = skew; + } + if let Some(image) = &file.default_image + && args.sandbox_image.is_none() + && arg_defaulted(matches, "sandbox_image") + { + args.sandbox_image = Some(image.clone()); + } + if let Some(policy) = &file.image_pull_policy + && args.sandbox_image_pull_policy.is_none() + && arg_defaulted(matches, "sandbox_image_pull_policy") + { + args.sandbox_image_pull_policy = Some(policy.clone()); + } + if let Some(secret) = &file.client_tls_secret_name + && args.client_tls_secret_name.is_none() + && arg_defaulted(matches, "client_tls_secret_name") + { + args.client_tls_secret_name = Some(secret.clone()); + } + if let Some(ip) = &file.host_gateway_ip + && args.host_gateway_ip.is_none() + && arg_defaulted(matches, "host_gateway_ip") + { + args.host_gateway_ip = Some(ip.clone()); + } + if let Some(enabled) = file.enable_user_namespaces + && arg_defaulted(matches, "enable_user_namespaces") + { + args.enable_user_namespaces = enabled; + } + // TLS gateway listener fields + if let Some(tls) = &file.tls { + if args.tls_cert.is_none() && arg_defaulted(matches, "tls_cert") { + args.tls_cert = Some(tls.cert_path.clone()); + } + if args.tls_key.is_none() && arg_defaulted(matches, "tls_key") { + args.tls_key = Some(tls.key_path.clone()); + } + if args.tls_client_ca.is_none() && arg_defaulted(matches, "tls_client_ca") { + args.tls_client_ca = Some(tls.client_ca_path.clone()); + } + if tls.allow_unauthenticated && arg_defaulted(matches, "disable_gateway_auth") { + args.disable_gateway_auth = true; + } + } + // OIDC fields + if let Some(oidc) = &file.oidc { + if args.oidc_issuer.is_none() && arg_defaulted(matches, "oidc_issuer") { + args.oidc_issuer = Some(oidc.issuer.clone()); + } + if arg_defaulted(matches, "oidc_audience") { + args.oidc_audience.clone_from(&oidc.audience); + } + if arg_defaulted(matches, "oidc_jwks_ttl") { + args.oidc_jwks_ttl = oidc.jwks_ttl_secs; + } + if arg_defaulted(matches, "oidc_roles_claim") { + args.oidc_roles_claim.clone_from(&oidc.roles_claim); + } + if arg_defaulted(matches, "oidc_admin_role") { + args.oidc_admin_role.clone_from(&oidc.admin_role); + } + if arg_defaulted(matches, "oidc_user_role") { + args.oidc_user_role.clone_from(&oidc.user_role); + } + if arg_defaulted(matches, "oidc_scopes_claim") { + args.oidc_scopes_claim.clone_from(&oidc.scopes_claim); + } + } +} + +/// Build [`VmComputeConfig`] by overlaying CLI args on top of the +/// `[openshell.drivers.vm]` table inherited from `[openshell.gateway]`. +fn build_vm_config( + args: &RunArgs, + matches: &ArgMatches, + config: &openshell_core::Config, + file: Option<&ConfigFile>, +) -> Result { + let mut cfg = if let Some(file) = file { + let merged = config_file::driver_table( + ComputeDriverKind::Vm, + &file.openshell.gateway, + file.openshell.drivers.get("vm"), + ); + merged + .try_into::() + .map_err(|e| miette::miette!("invalid [openshell.drivers.vm] table: {e}"))? + } else { + VmComputeConfig::default() + }; + + // CLI/env overrides — and `state_dir` is also pulled from RunArgs when the + // file did not set it, so the gateway always has a working directory. + if !arg_defaulted(matches, "vm_driver_state_dir") || cfg.state_dir.as_os_str().is_empty() { + cfg.state_dir.clone_from(&args.vm_driver_state_dir); + } + if !arg_defaulted(matches, "driver_dir") || cfg.driver_dir.is_none() { + cfg.driver_dir.clone_from(&args.driver_dir); + } + if !arg_defaulted(matches, "vm_krun_log_level") { + cfg.krun_log_level = args.vm_krun_log_level; + } + if !arg_defaulted(matches, "vm_vcpus") { + cfg.vcpus = args.vm_vcpus; + } + if !arg_defaulted(matches, "vm_mem_mib") { + cfg.mem_mib = args.vm_mem_mib; + } + if let Some(p) = args.vm_tls_ca.clone() { + cfg.guest_tls_ca = Some(p); + } + if let Some(p) = args.vm_tls_cert.clone() { + cfg.guest_tls_cert = Some(p); + } + if let Some(p) = args.vm_tls_key.clone() { + cfg.guest_tls_key = Some(p); + } + // Fall through: image inherited from gateway-wide `sandbox_image` when + // the merged table did not supply `default_image`. + if cfg.default_image.is_empty() { + cfg.default_image.clone_from(&config.sandbox_image); + } + Ok(cfg) +} + +/// Build [`DockerComputeConfig`] using the same inheritance pattern as +/// [`build_vm_config`]. +fn build_docker_config( + args: &RunArgs, + matches: &ArgMatches, + file: Option<&ConfigFile>, +) -> Result { + let mut cfg = if let Some(file) = file { + let merged = config_file::driver_table( + ComputeDriverKind::Docker, + &file.openshell.gateway, + file.openshell.drivers.get("docker"), + ); + merged + .try_into::() + .map_err(|e| miette::miette!("invalid [openshell.drivers.docker] table: {e}"))? + } else { + DockerComputeConfig::default() + }; + + if args.docker_supervisor_bin.is_some() { + cfg.supervisor_bin.clone_from(&args.docker_supervisor_bin); + } + if args.docker_supervisor_image.is_some() { + cfg.supervisor_image + .clone_from(&args.docker_supervisor_image); + } + if args.docker_tls_ca.is_some() { + cfg.guest_tls_ca.clone_from(&args.docker_tls_ca); + } + if args.docker_tls_cert.is_some() { + cfg.guest_tls_cert.clone_from(&args.docker_tls_cert); + } + if args.docker_tls_key.is_some() { + cfg.guest_tls_key.clone_from(&args.docker_tls_key); + } + if !arg_defaulted(matches, "docker_network_name") { + cfg.network_name.clone_from(&args.docker_network_name); + } + Ok(cfg) +} + #[cfg(test)] mod tests { use super::{Cli, command}; @@ -638,4 +887,167 @@ mod tests { assert!(cli.command.is_none()); assert!(cli.run.db_url.is_none()); } + + // ── Config-file merge tests ────────────────────────────────────────── + // + // `merge_file_into_args` is the bridge between `config_file::ConfigFile` + // and `RunArgs`. These cases lock in the precedence rule: + // + // CLI flag > OPENSHELL_* env var > TOML file > built-in default + // + // by exercising each combination on representative gateway fields. + + use super::{ConfigFile, merge_file_into_args}; + use clap::FromArgMatches; + + fn parse_with_args(argv: &[&str]) -> (super::RunArgs, clap::ArgMatches) { + let matches = command().try_get_matches_from(argv).expect("parses"); + let cli = Cli::from_arg_matches(&matches).expect("from arg matches"); + (cli.run, matches) + } + + fn config_file_from_toml(toml: &str) -> ConfigFile { + toml::from_str(toml).expect("valid TOML in test fixture") + } + + #[test] + fn file_value_applies_when_cli_uses_default() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_BIND_ADDRESS"); + let _g2 = EnvVarGuard::remove("OPENSHELL_SERVER_PORT"); + let _g3 = EnvVarGuard::remove("OPENSHELL_LOG_LEVEL"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +bind_address = "0.0.0.0:9090" +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.bind_address, IpAddr::V4(Ipv4Addr::UNSPECIFIED)); + assert_eq!(args.port, 9090); + assert_eq!(args.log_level, "debug"); + } + + #[test] + fn cli_flag_overrides_file_value() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_BIND_ADDRESS"); + let _g2 = EnvVarGuard::remove("OPENSHELL_LOG_LEVEL"); + + let (mut args, matches) = parse_with_args(&[ + "openshell-gateway", + "--db-url", + "sqlite::memory:", + "--log-level", + "warn", + ]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.log_level, "warn", "CLI flag must win over file"); + } + + #[test] + fn env_var_overrides_file_value() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g = EnvVarGuard::set("OPENSHELL_LOG_LEVEL", "trace"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway] +log_level = "debug" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.log_level, "trace", "env var must win over file"); + } + + #[test] + fn file_oidc_block_populates_oidc_args() { + let _lock = ENV_LOCK + .lock() + .unwrap_or_else(std::sync::PoisonError::into_inner); + let _g1 = EnvVarGuard::remove("OPENSHELL_OIDC_ISSUER"); + let _g2 = EnvVarGuard::remove("OPENSHELL_OIDC_AUDIENCE"); + + let (mut args, matches) = + parse_with_args(&["openshell-gateway", "--db-url", "sqlite::memory:"]); + let file = config_file_from_toml( + r#" +[openshell.gateway.oidc] +issuer = "https://idp.example.com" +audience = "openshell-cli" +"#, + ); + merge_file_into_args(&mut args, &file.openshell.gateway, &matches); + + assert_eq!(args.oidc_issuer.as_deref(), Some("https://idp.example.com")); + assert_eq!(args.oidc_audience, "openshell-cli"); + } + + #[test] + fn driver_inherits_shared_image_from_gateway_section() { + // [openshell.gateway].default_image inherits into the K8s driver + // table when the driver-specific table does not set it. + let file = config_file_from_toml( + r#" +[openshell.gateway] +default_image = "ghcr.io/nvidia/openshell/sandbox:1.0" + +[openshell.drivers.kubernetes] +namespace = "agents" +"#, + ); + let merged = crate::config_file::driver_table( + super::ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + let parsed = merged + .try_into::() + .expect("merged table deserializes"); + assert_eq!(parsed.default_image, "ghcr.io/nvidia/openshell/sandbox:1.0"); + assert_eq!(parsed.namespace, "agents"); + } + + #[test] + fn driver_specific_value_overrides_gateway_inheritance() { + let file = config_file_from_toml( + r#" +[openshell.gateway] +default_image = "gateway-default:1.0" + +[openshell.drivers.kubernetes] +default_image = "k8s-specific:1.0" +"#, + ); + let merged = crate::config_file::driver_table( + super::ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + let parsed = merged + .try_into::() + .expect("deserializes"); + assert_eq!(parsed.default_image, "k8s-specific:1.0"); + } } diff --git a/crates/openshell-server/src/compute/vm.rs b/crates/openshell-server/src/compute/vm.rs index 1e62d4942..3a67f381b 100644 --- a/crates/openshell-server/src/compute/vm.rs +++ b/crates/openshell-server/src/compute/vm.rs @@ -60,7 +60,8 @@ const COMPUTE_DRIVER_SOCKET_RUN_DIR: &str = "run"; const COMPUTE_DRIVER_SOCKET_NAME: &str = "compute-driver.sock"; /// Configuration for launching and talking to the VM compute driver. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] +#[serde(default, deny_unknown_fields)] pub struct VmComputeConfig { /// Working directory for VM driver sandbox state. pub state_dir: PathBuf, diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs new file mode 100644 index 000000000..254de12f1 --- /dev/null +++ b/crates/openshell-server/src/config_file.rs @@ -0,0 +1,509 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! TOML configuration file loader for the gateway. +//! +//! See `rfc/0003-gateway-configuration/README.md` for the file format. This +//! module parses the file into [`ConfigFile`], rejects fields that must be +//! supplied via env/CLI (database URL, SSH handshake secret), and provides +//! [`driver_table`] which overlays shared `[openshell.gateway]` defaults onto +//! a `[openshell.drivers.]` table so each driver crate's +//! `Deserialize` impl sees a fully-populated table. +//! +//! The merge precedence at the gateway level is: +//! ```text +//! CLI flag > OPENSHELL_* env var > TOML file > built-in default +//! ``` +//! Per-field application of file values happens in [`crate::cli`], which uses +//! clap's `ArgMatches::value_source` to detect arguments that fell back to +//! their default and are therefore eligible for replacement by file values. + +use std::collections::BTreeMap; +use std::net::SocketAddr; +use std::path::{Path, PathBuf}; + +use openshell_core::config::ComputeDriverKind; +use openshell_core::{OidcConfig, TlsConfig}; +use serde::{Deserialize, Serialize}; + +/// Latest schema version this build understands. +pub const SCHEMA_VERSION: u32 = 1; + +/// Root of the gateway TOML config file. +/// +/// The file is rooted at `[openshell]` to reserve room for future components +/// (CLI, sandbox, router) to share a single config file without key +/// collisions. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct ConfigFile { + #[serde(default)] + pub openshell: OpenShellRoot, +} + +/// `[openshell]` table. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct OpenShellRoot { + /// Reserved for future schema migrations. Versions greater than + /// [`SCHEMA_VERSION`] are rejected at load time. + #[serde(default)] + pub version: Option, + + #[serde(default)] + pub gateway: GatewayFileSection, + + /// `[openshell.drivers.]` tables — passed verbatim to each driver + /// crate's `Deserialize` impl after the gateway-side inheritance merge. + /// Stored as raw [`toml::Value`] so each driver can evolve its schema + /// independently of this crate. + #[serde(default)] + pub drivers: BTreeMap, +} + +/// `[openshell.gateway]` section. +/// +/// All fields are `Option` so the loader can tell whether a key was set +/// in the file (`Some`) or not (`None` — value is taken from CLI/env/default). +/// +/// The fields under "Shared driver defaults" are inherited into +/// `[openshell.drivers.]` tables per [`inheritable_keys`]. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct GatewayFileSection { + // ── Listeners ──────────────────────────────────────────────────────── + #[serde(default)] + pub bind_address: Option, + #[serde(default)] + pub health_bind_address: Option, + #[serde(default)] + pub metrics_bind_address: Option, + + // ── Logging ────────────────────────────────────────────────────────── + #[serde(default)] + pub log_level: Option, + + // ── Drivers ────────────────────────────────────────────────────────── + #[serde(default)] + pub compute_drivers: Option>, + + // ── Sandbox / SSH ──────────────────────────────────────────────────── + #[serde(default)] + pub sandbox_namespace: Option, + #[serde(default)] + pub sandbox_ssh_port: Option, + #[serde(default)] + pub ssh_gateway_host: Option, + #[serde(default)] + pub ssh_gateway_port: Option, + #[serde(default)] + pub ssh_handshake_skew_secs: Option, + #[serde(default)] + pub ssh_session_ttl_secs: Option, + + // ── Shared driver defaults (inherited into [openshell.drivers.]) ─ + #[serde(default)] + pub default_image: Option, + #[serde(default)] + pub supervisor_image: Option, + #[serde(default)] + pub image_pull_policy: Option, + #[serde(default)] + pub client_tls_secret_name: Option, + #[serde(default)] + pub host_gateway_ip: Option, + #[serde(default)] + pub enable_user_namespaces: Option, + #[serde(default)] + pub guest_tls_ca: Option, + #[serde(default)] + pub guest_tls_cert: Option, + #[serde(default)] + pub guest_tls_key: Option, + + // ── Nested tables ──────────────────────────────────────────────────── + #[serde(default)] + pub tls: Option, + #[serde(default)] + pub oidc: Option, + + // ── Disallowed-in-file fields ──────────────────────────────────────── + // + // Captured so we can produce a friendly "set this via env/CLI instead" + // error rather than a generic "unknown field" message. Validated and + // rejected in [`load`]. + #[serde(default)] + pub database_url: Option, + #[serde(default)] + pub ssh_handshake_secret: Option, +} + +#[derive(Debug, thiserror::Error)] +pub enum ConfigFileError { + #[error("failed to read gateway config file '{}': {source}", path.display())] + Io { + path: PathBuf, + #[source] + source: std::io::Error, + }, + #[error("failed to parse gateway config file '{}': {source}", path.display())] + Parse { + path: PathBuf, + #[source] + source: toml::de::Error, + }, + #[error( + "unsupported gateway config version {version}; this build only supports version {SCHEMA_VERSION}" + )] + UnsupportedVersion { version: u32 }, + #[error( + "`{field}` is not allowed in the gateway config file — set the {env} env var or pass {cli} on the command line" + )] + SecretInFile { + field: &'static str, + env: &'static str, + cli: &'static str, + }, +} + +/// Load and validate a TOML config file. +/// +/// Returns `Ok(ConfigFile::default())` for an empty file (the gateway then +/// falls back entirely to CLI/env/built-in defaults). +pub fn load(path: &Path) -> Result { + let contents = std::fs::read_to_string(path).map_err(|source| ConfigFileError::Io { + path: path.to_path_buf(), + source, + })?; + if contents.trim().is_empty() { + return Ok(ConfigFile::default()); + } + let file: ConfigFile = toml::from_str(&contents).map_err(|source| ConfigFileError::Parse { + path: path.to_path_buf(), + source, + })?; + + if let Some(version) = file.openshell.version + && version > SCHEMA_VERSION + { + return Err(ConfigFileError::UnsupportedVersion { version }); + } + + if file.openshell.gateway.database_url.is_some() { + return Err(ConfigFileError::SecretInFile { + field: "database_url", + env: "OPENSHELL_DB_URL", + cli: "--db-url", + }); + } + if file.openshell.gateway.ssh_handshake_secret.is_some() { + return Err(ConfigFileError::SecretInFile { + field: "ssh_handshake_secret", + env: "OPENSHELL_SSH_HANDSHAKE_SECRET", + cli: "--ssh-handshake-secret", + }); + } + + Ok(file) +} + +/// Build the merged TOML table for `driver` by overlaying inheritable +/// `[openshell.gateway]` defaults onto `[openshell.drivers.]`. +/// +/// The returned [`toml::Value`] is a Table ready to feed into the driver's +/// `Deserialize` impl — keys present in `raw` win over the gateway defaults. +/// Keys outside [`inheritable_keys`] for this driver are never copied from +/// the gateway section, which keeps each driver's `deny_unknown_fields` +/// invariant intact. +pub fn driver_table( + driver: ComputeDriverKind, + gateway: &GatewayFileSection, + raw: Option<&toml::Value>, +) -> toml::Value { + let mut merged = match raw { + Some(toml::Value::Table(table)) => table.clone(), + _ => toml::Table::new(), + }; + + for key in inheritable_keys(driver) { + if merged.contains_key(*key) { + continue; + } + if let Some(value) = gateway_inherited_value(gateway, key) { + merged.insert((*key).to_string(), value); + } + } + + toml::Value::Table(merged) +} + +/// Inheritance allowlist (the Q4 "high-overlap set"). Each driver opts in +/// to a specific subset so a gateway-wide default does not accidentally land +/// in a driver table that does not understand the field. +fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { + match driver { + ComputeDriverKind::Kubernetes => &[ + "default_image", + "supervisor_image", + "image_pull_policy", + "client_tls_secret_name", + "host_gateway_ip", + "ssh_handshake_skew_secs", + "enable_user_namespaces", + ], + ComputeDriverKind::Docker => &[ + "supervisor_image", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + ], + ComputeDriverKind::Podman => &[ + "default_image", + "supervisor_image", + "image_pull_policy", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + "ssh_handshake_skew_secs", + ], + ComputeDriverKind::Vm => &[ + "default_image", + "guest_tls_ca", + "guest_tls_cert", + "guest_tls_key", + ], + } +} + +fn gateway_inherited_value(g: &GatewayFileSection, key: &str) -> Option { + match key { + "default_image" => g.default_image.as_deref().map(string_value), + "supervisor_image" => g.supervisor_image.as_deref().map(string_value), + "image_pull_policy" => g.image_pull_policy.as_deref().map(string_value), + "client_tls_secret_name" => g.client_tls_secret_name.as_deref().map(string_value), + "host_gateway_ip" => g.host_gateway_ip.as_deref().map(string_value), + "ssh_handshake_skew_secs" => g.ssh_handshake_skew_secs.and_then(skew_value), + "enable_user_namespaces" => g.enable_user_namespaces.map(toml::Value::Boolean), + "guest_tls_ca" => g.guest_tls_ca.as_deref().map(path_value), + "guest_tls_cert" => g.guest_tls_cert.as_deref().map(path_value), + "guest_tls_key" => g.guest_tls_key.as_deref().map(path_value), + _ => None, + } +} + +fn string_value(s: &str) -> toml::Value { + toml::Value::String(s.to_owned()) +} + +fn path_value(p: &Path) -> toml::Value { + toml::Value::String(p.display().to_string()) +} + +fn skew_value(n: u64) -> Option { + i64::try_from(n).ok().map(toml::Value::Integer) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Write; + + fn write_tmp(contents: &str) -> tempfile::NamedTempFile { + let mut tmp = tempfile::Builder::new() + .suffix(".toml") + .tempfile() + .expect("tempfile"); + tmp.write_all(contents.as_bytes()).expect("write"); + tmp + } + + #[test] + fn empty_file_yields_default_config() { + let tmp = write_tmp(""); + let file = load(tmp.path()).expect("empty file parses"); + assert!(file.openshell.version.is_none()); + assert!(file.openshell.gateway.bind_address.is_none()); + assert!(file.openshell.drivers.is_empty()); + } + + #[test] + fn parses_full_example() { + let toml = r#" +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +log_level = "info" +compute_drivers = ["kubernetes"] +sandbox_namespace = "agents" +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +image_pull_policy = "IfNotPresent" +client_tls_secret_name = "openshell-sandbox-tls" + +[openshell.gateway.tls] +cert_path = "/etc/openshell/certs/gateway.pem" +key_path = "/etc/openshell/certs/gateway-key.pem" +client_ca_path = "/etc/openshell/certs/client-ca.pem" + +[openshell.gateway.oidc] +issuer = "https://idp.example.com/realms/openshell" +audience = "openshell-cli" + +[openshell.drivers.kubernetes] +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +"#; + let tmp = write_tmp(toml); + let file = load(tmp.path()).expect("valid file parses"); + let gw = &file.openshell.gateway; + assert_eq!(gw.log_level.as_deref(), Some("info")); + assert_eq!( + gw.default_image.as_deref(), + Some("ghcr.io/nvidia/openshell/sandbox:latest") + ); + assert!(gw.tls.is_some()); + assert!(gw.oidc.is_some()); + assert!(file.openshell.drivers.contains_key("kubernetes")); + } + + #[test] + fn rejects_database_url_in_file() { + let toml = r#" +[openshell.gateway] +database_url = "sqlite::memory:" +"#; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("database_url must be rejected"); + assert!(matches!( + err, + ConfigFileError::SecretInFile { + field: "database_url", + .. + } + )); + } + + #[test] + fn rejects_ssh_handshake_secret_in_file() { + let toml = r#" +[openshell.gateway] +ssh_handshake_secret = "leaked" +"#; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("ssh_handshake_secret must be rejected"); + assert!(matches!( + err, + ConfigFileError::SecretInFile { + field: "ssh_handshake_secret", + .. + } + )); + } + + #[test] + fn rejects_unknown_gateway_field() { + let toml = r" +[openshell.gateway] +nonsense = true +"; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("unknown field must be rejected"); + assert!(matches!(err, ConfigFileError::Parse { .. })); + } + + #[test] + fn rejects_unsupported_version() { + let toml = r" +[openshell] +version = 2 +"; + let tmp = write_tmp(toml); + let err = load(tmp.path()).expect_err("version > 1 must be rejected"); + assert!(matches!( + err, + ConfigFileError::UnsupportedVersion { version: 2 } + )); + } + + #[test] + fn driver_table_inherits_gateway_defaults() { + let gateway = GatewayFileSection { + default_image: Some("ghcr.io/nvidia/openshell/sandbox:0.9".to_string()), + supervisor_image: Some("ghcr.io/nvidia/openshell/supervisor:0.9".to_string()), + image_pull_policy: Some("IfNotPresent".to_string()), + ..Default::default() + }; + let raw = toml::toml! { + namespace = "agents" + }; + let merged = driver_table( + ComputeDriverKind::Kubernetes, + &gateway, + Some(&toml::Value::Table(raw)), + ); + let table = merged.as_table().expect("table"); + assert_eq!( + table.get("namespace").and_then(|v| v.as_str()), + Some("agents") + ); + assert_eq!( + table.get("default_image").and_then(|v| v.as_str()), + Some("ghcr.io/nvidia/openshell/sandbox:0.9") + ); + assert_eq!( + table.get("supervisor_image").and_then(|v| v.as_str()), + Some("ghcr.io/nvidia/openshell/supervisor:0.9") + ); + } + + #[test] + fn driver_table_specific_value_overrides_gateway_default() { + let gateway = GatewayFileSection { + default_image: Some("gateway-default".to_string()), + ..Default::default() + }; + let raw = toml::toml! { + default_image = "driver-specific" + }; + let merged = driver_table( + ComputeDriverKind::Podman, + &gateway, + Some(&toml::Value::Table(raw)), + ); + assert_eq!( + merged + .as_table() + .unwrap() + .get("default_image") + .and_then(|v| v.as_str()), + Some("driver-specific") + ); + } + + #[test] + fn driver_table_does_not_leak_keys_outside_allowlist() { + // `client_tls_secret_name` is K8s-only; Docker must not receive it + // even when set at gateway scope. + let gateway = GatewayFileSection { + client_tls_secret_name: Some("openshell-sandbox-tls".to_string()), + ..Default::default() + }; + let merged = driver_table(ComputeDriverKind::Docker, &gateway, None); + assert!( + !merged + .as_table() + .unwrap() + .contains_key("client_tls_secret_name") + ); + } + + #[test] + fn missing_path_is_io_error() { + let err = load(Path::new("/nonexistent/openshell-gateway.toml")) + .expect_err("missing file must be io error"); + assert!(matches!(err, ConfigFileError::Io { .. })); + } +} diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index bca6e44aa..fb13ed83f 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -23,6 +23,7 @@ mod auth; pub mod certgen; pub mod cli; mod compute; +pub mod config_file; mod grpc; mod http; mod inference; @@ -151,6 +152,7 @@ pub async fn run_server( config: Config, vm_config: VmComputeConfig, docker_config: DockerComputeConfig, + config_file: Option, tracing_log_bus: TracingLogBus, ) -> Result<()> { let database_url = config.database_url.trim(); @@ -193,6 +195,7 @@ pub async fn run_server( &config, &vm_config, &docker_config, + config_file.as_ref(), store.clone(), sandbox_index.clone(), sandbox_watch_bus.clone(), @@ -460,6 +463,7 @@ async fn build_compute_runtime( config: &Config, vm_config: &VmComputeConfig, docker_config: &DockerComputeConfig, + file: Option<&config_file::ConfigFile>, store: Arc, sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, @@ -471,37 +475,42 @@ async fn build_compute_runtime( match driver { ComputeDriverKind::Kubernetes => { - let supervisor_image = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()); - let supervisor_image_pull_policy = - std::env::var("OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_default(); + let mut k8s = kubernetes_config_from_file(file)?; + // Env overrides file for fields not represented in Config. + if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") + && !v.is_empty() + { + k8s.supervisor_image = v; + } + if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY") + && !v.is_empty() + { + k8s.supervisor_image_pull_policy = v; + } + if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_SIDELOAD_METHOD") + && !v.is_empty() + && let Ok(parsed) = v.parse() + { + k8s.supervisor_sideload_method = parsed; + } + // Shared fields are sourced from Config, which already merged + // file + CLI/env at startup. + k8s.namespace.clone_from(&config.sandbox_namespace); + k8s.default_image.clone_from(&config.sandbox_image); + k8s.image_pull_policy + .clone_from(&config.sandbox_image_pull_policy); + k8s.grpc_endpoint.clone_from(&config.grpc_endpoint); + k8s.ssh_socket_path + .clone_from(&config.sandbox_ssh_socket_path); + k8s.ssh_handshake_secret + .clone_from(&config.ssh_handshake_secret); + k8s.ssh_handshake_skew_secs = config.ssh_handshake_skew_secs; + k8s.client_tls_secret_name + .clone_from(&config.client_tls_secret_name); + k8s.host_gateway_ip.clone_from(&config.host_gateway_ip); + k8s.enable_user_namespaces = config.enable_user_namespaces; ComputeRuntime::new_kubernetes( - KubernetesComputeConfig { - namespace: config.sandbox_namespace.clone(), - default_image: config.sandbox_image.clone(), - image_pull_policy: config.sandbox_image_pull_policy.clone(), - supervisor_image, - supervisor_image_pull_policy, - supervisor_sideload_method: std::env::var( - "OPENSHELL_SUPERVISOR_SIDELOAD_METHOD", - ) - .ok() - .filter(|s| !s.is_empty()) - .and_then(|s| s.parse().ok()) - .unwrap_or_default(), - grpc_endpoint: config.grpc_endpoint.clone(), - ssh_socket_path: config.sandbox_ssh_socket_path.clone(), - ssh_handshake_secret: config.ssh_handshake_secret.clone(), - ssh_handshake_skew_secs: config.ssh_handshake_skew_secs, - client_tls_secret_name: config.client_tls_secret_name.clone(), - host_gateway_ip: config.host_gateway_ip.clone(), - enable_user_namespaces: config.enable_user_namespaces, - }, + k8s, store, sandbox_index, sandbox_watch_bus, @@ -537,63 +546,60 @@ async fn build_compute_runtime( .map_err(|e| Error::execution(format!("failed to create compute runtime: {e}"))) } ComputeDriverKind::Podman => { - let socket_path = std::env::var("OPENSHELL_PODMAN_SOCKET") - .ok() - .filter(|s| !s.is_empty()) - .map_or_else( - openshell_driver_podman::PodmanComputeConfig::default_socket_path, - std::path::PathBuf::from, - ); - - let network_name = std::env::var("OPENSHELL_NETWORK_NAME") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_NETWORK_NAME.to_string()); - - let stop_timeout_secs: u32 = std::env::var("OPENSHELL_STOP_TIMEOUT") - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(openshell_core::config::DEFAULT_STOP_TIMEOUT_SECS); - - let supervisor_image = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") - .ok() - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()); - - // TLS client cert paths for sandbox mTLS. When all three are - // set, the Podman driver bind-mounts them into sandbox - // containers and switches the endpoint to https://. - let podman_tls_ca = std::env::var("OPENSHELL_PODMAN_TLS_CA") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); - let podman_tls_cert = std::env::var("OPENSHELL_PODMAN_TLS_CERT") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); - let podman_tls_key = std::env::var("OPENSHELL_PODMAN_TLS_KEY") - .ok() - .filter(|s| !s.is_empty()) - .map(std::path::PathBuf::from); + let mut podman = podman_config_from_file(file)?; + // Env overrides file for fields not represented in Config. + if let Ok(v) = std::env::var("OPENSHELL_PODMAN_SOCKET") + && !v.is_empty() + { + podman.socket_path = std::path::PathBuf::from(v); + } + if let Ok(v) = std::env::var("OPENSHELL_NETWORK_NAME") + && !v.is_empty() + { + podman.network_name = v; + } + if let Ok(v) = std::env::var("OPENSHELL_STOP_TIMEOUT") + && let Ok(parsed) = v.parse() + { + podman.stop_timeout_secs = parsed; + } + if let Ok(v) = std::env::var("OPENSHELL_SUPERVISOR_IMAGE") + && !v.is_empty() + { + podman.supervisor_image = v; + } + if let Ok(v) = std::env::var("OPENSHELL_PODMAN_TLS_CA") + && !v.is_empty() + { + podman.guest_tls_ca = Some(std::path::PathBuf::from(v)); + } + if let Ok(v) = std::env::var("OPENSHELL_PODMAN_TLS_CERT") + && !v.is_empty() + { + podman.guest_tls_cert = Some(std::path::PathBuf::from(v)); + } + if let Ok(v) = std::env::var("OPENSHELL_PODMAN_TLS_KEY") + && !v.is_empty() + { + podman.guest_tls_key = Some(std::path::PathBuf::from(v)); + } + // Shared fields are sourced from Config (which already merged + // file + CLI/env at startup). + podman.default_image.clone_from(&config.sandbox_image); + podman.image_pull_policy = config.sandbox_image_pull_policy.parse().unwrap_or_default(); + podman.grpc_endpoint.clone_from(&config.grpc_endpoint); + podman.gateway_port = config.bind_address.port(); + podman + .sandbox_ssh_socket_path + .clone_from(&config.sandbox_ssh_socket_path); + podman.ssh_port = config.sandbox_ssh_port; + podman + .ssh_handshake_secret + .clone_from(&config.ssh_handshake_secret); + podman.ssh_handshake_skew_secs = config.ssh_handshake_skew_secs; ComputeRuntime::new_podman( - openshell_driver_podman::PodmanComputeConfig { - socket_path, - default_image: config.sandbox_image.clone(), - image_pull_policy: config.sandbox_image_pull_policy.parse().unwrap_or_default(), - grpc_endpoint: config.grpc_endpoint.clone(), - gateway_port: config.bind_address.port(), - sandbox_ssh_socket_path: config.sandbox_ssh_socket_path.clone(), - network_name, - ssh_port: config.sandbox_ssh_port, - ssh_handshake_secret: config.ssh_handshake_secret.clone(), - ssh_handshake_skew_secs: config.ssh_handshake_skew_secs, - stop_timeout_secs, - supervisor_image, - guest_tls_ca: podman_tls_ca, - guest_tls_cert: podman_tls_cert, - guest_tls_key: podman_tls_key, - }, + podman, store, sandbox_index, sandbox_watch_bus, @@ -606,6 +612,43 @@ async fn build_compute_runtime( } } +/// Build a [`KubernetesComputeConfig`] from the file's +/// `[openshell.drivers.kubernetes]` table merged with inheritable +/// `[openshell.gateway]` defaults. Falls back to the driver's `Default` +/// when no file is present. +fn kubernetes_config_from_file( + file: Option<&config_file::ConfigFile>, +) -> Result { + let Some(file) = file else { + return Ok(KubernetesComputeConfig::default()); + }; + let merged = config_file::driver_table( + ComputeDriverKind::Kubernetes, + &file.openshell.gateway, + file.openshell.drivers.get("kubernetes"), + ); + merged + .try_into() + .map_err(|e| Error::config(format!("invalid [openshell.drivers.kubernetes] table: {e}"))) +} + +/// Same pattern as [`kubernetes_config_from_file`] but for Podman. +fn podman_config_from_file( + file: Option<&config_file::ConfigFile>, +) -> Result { + let Some(file) = file else { + return Ok(openshell_driver_podman::PodmanComputeConfig::default()); + }; + let merged = config_file::driver_table( + ComputeDriverKind::Podman, + &file.openshell.gateway, + file.openshell.drivers.get("podman"), + ); + merged + .try_into() + .map_err(|e| Error::config(format!("invalid [openshell.drivers.podman] table: {e}"))) +} + fn configured_compute_driver(config: &Config) -> Result { match config.compute_drivers.as_slice() { [] => openshell_core::config::detect_driver().ok_or_else(|| { diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml new file mode 100644 index 000000000..6caf44985 --- /dev/null +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -0,0 +1,91 @@ +{{/* +ConfigMap holding the gateway TOML config file (RFC 0003). + +The gateway reads `/etc/openshell/gateway.toml` (mounted from this ConfigMap) +at startup. CLI flags and OPENSHELL_* env vars on the StatefulSet container +still override anything in this file. + +Two values are intentionally NOT rendered here: + - server.dbUrl → passed via --db-url in the StatefulSet args + - sshHandshake secret → injected as OPENSHELL_SSH_HANDSHAKE_SECRET env var + from a Kubernetes Secret reference. +*/}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "openshell.fullname" . }}-config + labels: + {{- include "openshell.labels" . | nindent 4 }} +data: + gateway.toml: | + [openshell] + version = 1 + + [openshell.gateway] + bind_address = "0.0.0.0:{{ .Values.service.port }}" + {{- if .Values.service.healthPort }} + health_bind_address = "0.0.0.0:{{ .Values.service.healthPort }}" + {{- end }} + {{- if .Values.service.metricsPort }} + metrics_bind_address = "0.0.0.0:{{ .Values.service.metricsPort }}" + {{- end }} + log_level = {{ .Values.server.logLevel | quote }} + sandbox_namespace = {{ include "openshell.sandboxNamespace" . | quote }} + default_image = {{ .Values.server.sandboxImage | quote }} + supervisor_image = {{ include "openshell.supervisorImage" . | quote }} + {{- if .Values.server.sandboxImagePullPolicy }} + image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} + {{- end }} + {{- if .Values.supervisor.image.pullPolicy }} + supervisor_image_pull_policy = {{ .Values.supervisor.image.pullPolicy | quote }} + {{- end }} + grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} + {{- if .Values.server.sshGatewayHost }} + ssh_gateway_host = {{ .Values.server.sshGatewayHost | quote }} + {{- end }} + {{- if .Values.server.sshGatewayPort }} + ssh_gateway_port = {{ .Values.server.sshGatewayPort }} + {{- end }} + {{- if .Values.server.hostGatewayIP }} + host_gateway_ip = {{ .Values.server.hostGatewayIP | quote }} + {{- end }} + {{- if .Values.server.enableUserNamespaces }} + enable_user_namespaces = true + {{- end }} + {{- if not .Values.server.disableTls }} + client_tls_secret_name = {{ .Values.server.tls.clientTlsSecretName | quote }} + {{- end }} + + {{- if not .Values.server.disableTls }} + + [openshell.gateway.tls] + cert_path = "/etc/openshell-tls/server/tls.crt" + key_path = "/etc/openshell-tls/server/tls.key" + client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" + {{- if .Values.server.disableGatewayAuth }} + allow_unauthenticated = true + {{- end }} + {{- end }} + + {{- if .Values.server.oidc.issuer }} + + [openshell.gateway.oidc] + issuer = {{ .Values.server.oidc.issuer | quote }} + audience = {{ .Values.server.oidc.audience | quote }} + jwks_ttl_secs = {{ .Values.server.oidc.jwksTtl }} + {{- if .Values.server.oidc.rolesClaim }} + roles_claim = {{ .Values.server.oidc.rolesClaim | quote }} + {{- end }} + {{- if .Values.server.oidc.adminRole }} + admin_role = {{ .Values.server.oidc.adminRole | quote }} + {{- end }} + {{- if .Values.server.oidc.userRole }} + user_role = {{ .Values.server.oidc.userRole | quote }} + {{- end }} + {{- if .Values.server.oidc.scopesClaim }} + scopes_claim = {{ .Values.server.oidc.scopesClaim | quote }} + {{- end }} + {{- end }} + + [openshell.drivers.kubernetes] + supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }} diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index 4a0b70621..cbd70094a 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -47,104 +47,24 @@ spec: image: {{ include "openshell.image" . | quote }} imagePullPolicy: {{ .Values.image.pullPolicy }} args: - - --bind-address - - "0.0.0.0" - - --port - - {{ .Values.service.port | quote }} - - --health-port - - {{ .Values.service.healthPort | quote }} - {{- if .Values.service.metricsPort }} - - --metrics-port - - {{ .Values.service.metricsPort | quote }} - {{- end }} - - --log-level - - {{ .Values.server.logLevel }} + - --config + - /etc/openshell/gateway.toml - --db-url - {{ .Values.server.dbUrl | quote }} env: - - name: OPENSHELL_SANDBOX_NAMESPACE - value: {{ include "openshell.sandboxNamespace" . | quote }} - - name: OPENSHELL_SANDBOX_IMAGE - value: {{ .Values.server.sandboxImage | quote }} - {{- if .Values.server.sandboxImagePullPolicy }} - - name: OPENSHELL_SANDBOX_IMAGE_PULL_POLICY - value: {{ .Values.server.sandboxImagePullPolicy | quote }} - {{- end }} - - name: OPENSHELL_SUPERVISOR_IMAGE - value: {{ include "openshell.supervisorImage" . | quote }} - {{- if .Values.supervisor.image.pullPolicy }} - - name: OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY - value: {{ .Values.supervisor.image.pullPolicy | quote }} - {{- end }} - - name: OPENSHELL_SUPERVISOR_SIDELOAD_METHOD - value: {{ include "openshell.supervisorSideloadMethod" . | quote }} - - name: OPENSHELL_GRPC_ENDPOINT - value: {{ include "openshell.grpcEndpoint" . | quote }} - {{- if .Values.server.sshGatewayHost }} - - name: OPENSHELL_SSH_GATEWAY_HOST - value: {{ .Values.server.sshGatewayHost | quote }} - {{- end }} - {{- if .Values.server.sshGatewayPort }} - - name: OPENSHELL_SSH_GATEWAY_PORT - value: {{ .Values.server.sshGatewayPort | quote }} - {{- end }} - {{- if .Values.server.hostGatewayIP }} - - name: OPENSHELL_HOST_GATEWAY_IP - value: {{ .Values.server.hostGatewayIP | quote }} - {{- end }} - {{- if .Values.server.enableUserNamespaces }} - - name: OPENSHELL_ENABLE_USER_NAMESPACES - value: "true" - {{- end }} + # Secrets are env-only. Everything else lives in the ConfigMap- + # backed TOML file mounted at /etc/openshell/gateway.toml. - name: OPENSHELL_SSH_HANDSHAKE_SECRET valueFrom: secretKeyRef: name: {{ .Values.server.sshHandshakeSecretName | quote }} key: secret - {{- if .Values.server.disableTls }} - - name: OPENSHELL_DISABLE_TLS - value: "true" - {{- else }} - - name: OPENSHELL_TLS_CERT - value: /etc/openshell-tls/server/tls.crt - - name: OPENSHELL_TLS_KEY - value: /etc/openshell-tls/server/tls.key - - name: OPENSHELL_TLS_CLIENT_CA - value: /etc/openshell-tls/client-ca/ca.crt - - name: OPENSHELL_CLIENT_TLS_SECRET_NAME - value: {{ .Values.server.tls.clientTlsSecretName | quote }} - {{- if .Values.server.disableGatewayAuth }} - - name: OPENSHELL_DISABLE_GATEWAY_AUTH - value: "true" - {{- end }} - {{- end }} - {{- if .Values.server.oidc.issuer }} - - name: OPENSHELL_OIDC_ISSUER - value: {{ .Values.server.oidc.issuer | quote }} - - name: OPENSHELL_OIDC_AUDIENCE - value: {{ .Values.server.oidc.audience | quote }} - - name: OPENSHELL_OIDC_JWKS_TTL - value: {{ .Values.server.oidc.jwksTtl | quote }} - {{- if .Values.server.oidc.rolesClaim }} - - name: OPENSHELL_OIDC_ROLES_CLAIM - value: {{ .Values.server.oidc.rolesClaim | quote }} - {{- end }} - {{- if .Values.server.oidc.adminRole }} - - name: OPENSHELL_OIDC_ADMIN_ROLE - value: {{ .Values.server.oidc.adminRole | quote }} - {{- end }} - {{- if .Values.server.oidc.userRole }} - - name: OPENSHELL_OIDC_USER_ROLE - value: {{ .Values.server.oidc.userRole | quote }} - {{- end }} - {{- if .Values.server.oidc.scopesClaim }} - - name: OPENSHELL_OIDC_SCOPES_CLAIM - value: {{ .Values.server.oidc.scopesClaim | quote }} - {{- end }} - {{- end }} volumeMounts: - name: openshell-data mountPath: /var/openshell + - name: gateway-config + mountPath: /etc/openshell + readOnly: true {{- if not .Values.server.disableTls }} - name: tls-cert mountPath: /etc/openshell-tls/server @@ -191,6 +111,9 @@ spec: resources: {{- toYaml .Values.resources | nindent 12 }} volumes: + - name: gateway-config + configMap: + name: {{ include "openshell.fullname" . }}-config {{- if not .Values.server.disableTls }} - name: tls-cert secret: diff --git a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml index a128cd440..2d3461c6f 100644 --- a/deploy/helm/openshell/tests/sandbox_namespace_test.yaml +++ b/deploy/helm/openshell/tests/sandbox_namespace_test.yaml @@ -3,32 +3,28 @@ suite: sandboxNamespace defaulting templates: - - templates/statefulset.yaml + - templates/gateway-config.yaml - templates/networkpolicy.yaml release: name: openshell namespace: my-namespace tests: - - it: defaults OPENSHELL_SANDBOX_NAMESPACE to release namespace - template: templates/statefulset.yaml + - it: defaults sandbox_namespace to release namespace in the TOML config + template: templates/gateway-config.yaml asserts: - - contains: - path: spec.template.spec.containers[0].env - content: - name: OPENSHELL_SANDBOX_NAMESPACE - value: "my-namespace" + - matchRegex: + path: data["gateway.toml"] + pattern: 'sandbox_namespace\s*=\s*"my-namespace"' - it: uses explicit sandboxNamespace when set - template: templates/statefulset.yaml + template: templates/gateway-config.yaml set: server.sandboxNamespace: other-ns asserts: - - contains: - path: spec.template.spec.containers[0].env - content: - name: OPENSHELL_SANDBOX_NAMESPACE - value: "other-ns" + - matchRegex: + path: data["gateway.toml"] + pattern: 'sandbox_namespace\s*=\s*"other-ns"' - it: defaults NetworkPolicy namespace to release namespace template: templates/networkpolicy.yaml diff --git a/examples/gateway/gateway.example.toml b/examples/gateway/gateway.example.toml new file mode 100644 index 000000000..f150bbf2f --- /dev/null +++ b/examples/gateway/gateway.example.toml @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Example gateway configuration. See RFC 0003 for the schema and +# rfc/0003-gateway-configuration/README.md for the field-by-field reference. +# +# Source precedence (highest first): +# CLI flag > OPENSHELL_* env var > this file > built-in default +# +# Two values are deliberately env-only and MUST NOT appear in this file: +# - database_url (OPENSHELL_DB_URL / --db-url) +# - ssh_handshake_secret (OPENSHELL_SSH_HANDSHAKE_SECRET / --ssh-handshake-secret) + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +metrics_bind_address = "0.0.0.0:9090" + +log_level = "info" + +# When empty the gateway auto-detects (Kubernetes -> Podman -> Docker). VM is +# never auto-detected and requires an explicit entry here. +compute_drivers = ["kubernetes"] + +sandbox_namespace = "openshell" +sandbox_ssh_port = 2222 +ssh_gateway_host = "127.0.0.1" +ssh_gateway_port = 8080 +ssh_handshake_skew_secs = 300 + +# Shared driver defaults — inherited into [openshell.drivers.] tables +# when the driver-specific table does not override them. +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +image_pull_policy = "IfNotPresent" +client_tls_secret_name = "openshell-client-tls" + +# Gateway listener TLS (distinct from the per-driver guest_tls_*). +[openshell.gateway.tls] +cert_path = "/etc/openshell/certs/gateway.pem" +key_path = "/etc/openshell/certs/gateway-key.pem" +client_ca_path = "/etc/openshell/certs/client-ca.pem" +allow_unauthenticated = false + +[openshell.gateway.oidc] +issuer = "https://idp.example.com/realms/openshell" +audience = "openshell-cli" +jwks_ttl_secs = 3600 +roles_claim = "realm_access.roles" +admin_role = "openshell-admin" +user_role = "openshell-user" + +# ── Driver tables ────────────────────────────────────────────────────────── +# Each table inherits the shared defaults set above. Keys present here win +# over the inherited value. + +[openshell.drivers.kubernetes] +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" + +[openshell.drivers.docker] +network_name = "openshell-docker" +# supervisor_image and guest_tls_* inherit from [openshell.gateway] + +[openshell.drivers.podman] +socket_path = "/run/podman/podman.sock" +network_name = "openshell" +stop_timeout_secs = 10 + +[openshell.drivers.vm] +state_dir = "/var/lib/openshell/vm" +driver_dir = "/usr/local/libexec/openshell" +vcpus = 2 +mem_mib = 2048 +krun_log_level = 1 diff --git a/rfc/0003-gateway-configuration/README.md b/rfc/0003-gateway-configuration/README.md index 70ca09f7b..3854f6124 100644 --- a/rfc/0003-gateway-configuration/README.md +++ b/rfc/0003-gateway-configuration/README.md @@ -86,7 +86,6 @@ ssh_handshake_skew_secs = 300 ssh_session_ttl_secs = 86400 ssh_gateway_host = "127.0.0.1" ssh_gateway_port = 8080 -ssh_connect_path = "/connect/ssh" sandbox_ssh_port = 2222 # ────────────────────────────────────────────────────────────────────────────── From bd9f482d997de54ddd5a9ddae02177999da32bfe Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 11 May 2026 18:23:01 -0700 Subject: [PATCH 2/5] docs(gateway): drop ssh_handshake_skew_secs and ssh_handshake_secret from examples Both fields are scheduled for removal. Remove the example values and the env-only note so the gateway.toml example and the architecture doc stop recommending settings that will not exist much longer. --- architecture/gateway.md | 19 +++++++------------ examples/gateway/gateway.example.toml | 6 ++---- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/architecture/gateway.md b/architecture/gateway.md index 1279e7036..76a2e8318 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -186,23 +186,18 @@ When unset, the gateway behaves exactly as before — CLI flags and env vars drive every setting. See `examples/gateway/gateway.example.toml` for a worked example and RFC 0003 for the full schema. -Two secrets are env-only and rejected when present in the file: - -| Field | Source | -|------------------------|--------------------------------------------------| -| `database_url` | `OPENSHELL_DB_URL` / `--db-url` | -| `ssh_handshake_secret` | `OPENSHELL_SSH_HANDSHAKE_SECRET` / `--ssh-handshake-secret` | +`database_url` is env-only and rejected when present in the file +(`OPENSHELL_DB_URL` / `--db-url`). ### Driver inheritance `[openshell.gateway]` carries a small set of values (`default_image`, `supervisor_image`, `image_pull_policy`, `guest_tls_ca/cert/key`, -`client_tls_secret_name`, `host_gateway_ip`, `ssh_handshake_skew_secs`, -`enable_user_namespaces`) that are inherited into each driver's -`[openshell.drivers.]` table when the driver-specific table does not -override them. The allowlist is per-driver so a gateway-wide default cannot -land in a driver that does not understand it (e.g. -`client_tls_secret_name` is K8s-only). +`client_tls_secret_name`, `host_gateway_ip`, `enable_user_namespaces`) that +are inherited into each driver's `[openshell.drivers.]` table when +the driver-specific table does not override them. The allowlist is +per-driver so a gateway-wide default cannot land in a driver that does not +understand it (e.g. `client_tls_secret_name` is K8s-only). Driver-specific values that are not part of the inheritance allowlist (e.g. K8s `namespace`, Podman `socket_path`, VM `vcpus`) only come from diff --git a/examples/gateway/gateway.example.toml b/examples/gateway/gateway.example.toml index f150bbf2f..16f1681c3 100644 --- a/examples/gateway/gateway.example.toml +++ b/examples/gateway/gateway.example.toml @@ -7,9 +7,8 @@ # Source precedence (highest first): # CLI flag > OPENSHELL_* env var > this file > built-in default # -# Two values are deliberately env-only and MUST NOT appear in this file: -# - database_url (OPENSHELL_DB_URL / --db-url) -# - ssh_handshake_secret (OPENSHELL_SSH_HANDSHAKE_SECRET / --ssh-handshake-secret) +# `database_url` is deliberately env-only and MUST NOT appear in this file +# (set OPENSHELL_DB_URL / --db-url instead). [openshell] version = 1 @@ -29,7 +28,6 @@ sandbox_namespace = "openshell" sandbox_ssh_port = 2222 ssh_gateway_host = "127.0.0.1" ssh_gateway_port = 8080 -ssh_handshake_skew_secs = 300 # Shared driver defaults — inherited into [openshell.drivers.] tables # when the driver-specific table does not override them. From de05aabaddaae4423e3302ae3b46930d29b20eeb Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 11 May 2026 18:25:48 -0700 Subject: [PATCH 3/5] docs(rfc): correct OPENSHELL_CONFIG to OPENSHELL_GATEWAY_CONFIG in RFC 0003 --- rfc/0003-gateway-configuration/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rfc/0003-gateway-configuration/README.md b/rfc/0003-gateway-configuration/README.md index 3854f6124..9d66e04ef 100644 --- a/rfc/0003-gateway-configuration/README.md +++ b/rfc/0003-gateway-configuration/README.md @@ -37,7 +37,7 @@ Three sources are merged at startup, in descending priority: CLI flags > OPENSHELL_* environment variables > TOML config file > built-in defaults ``` -The TOML file is optional. If neither `--config` nor `OPENSHELL_CONFIG` is set, the gateway behaves exactly as before. Any field present in the file is overridden by a CLI flag or matching environment variable. +The TOML file is optional. If neither `--config` nor `OPENSHELL_GATEWAY_CONFIG` is set, the gateway behaves exactly as before. Any field present in the file is overridden by a CLI flag or matching environment variable. ### Loading the file @@ -260,7 +260,7 @@ The chart owners can migrate one section at a time: `OPENSHELL_*` env vars and t No part of this RFC has shipped yet. The work breaks down as: 1. **Add a config-file loader to `openshell-server`** — define a `GatewayConfigFile` struct that mirrors the schema above, parse it with `serde` + `toml`, and merge it into `openshell_core::Config` plus the per-driver structs in `compute/`. -2. **Wire the merge into `cli.rs`** — add `--config` / `OPENSHELL_CONFIG`, gate each existing flag's "apply from file" path on clap `ValueSource::DefaultValue`, and run cross-field validation after the merge. +2. **Wire the merge into `cli.rs`** — add `--config` / `OPENSHELL_GATEWAY_CONFIG`, gate each existing flag's "apply from file" path on clap `ValueSource::DefaultValue`, and run cross-field validation after the merge. 3. **Per-driver deserialization** — give each driver crate (`openshell-driver-{kubernetes,docker,podman,vm}`) a `from_toml` (or `serde::Deserialize`) entry point so the gateway can hand each driver its own table. 4. **Test coverage** — file parsing, env-overrides-file, CLI-overrides-env, partial TLS error, port-collision error, unknown-field rejection, missing driver table fallback. 5. **Helm chart migration** — add `gateway.config` value tree, render the `ConfigMap`, mount it, switch the gateway container to `--config`. Keep the `OPENSHELL_*` env names available as opt-in overrides for secrets. From dcfe083c9910aad89492fdd8c52c12ff8e3b6862 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Mon, 11 May 2026 19:02:05 -0700 Subject: [PATCH 4/5] docs(gateway): add per-driver TOML example configurations Adds focused single-driver examples next to the comprehensive gateway.example.toml: kubernetes, docker, podman, and microvm. Each one demonstrates the realistic settings for that driver plus how shared [openshell.gateway] defaults inherit into the driver table. A new unit test (`checked_in_examples_parse`) loads every example through the config_file loader so schema drift fails CI rather than silently shipping a broken example. --- crates/openshell-server/src/config_file.rs | 21 +++++++++++++ examples/gateway/docker.example.toml | 27 ++++++++++++++++ examples/gateway/kubernetes.example.toml | 36 ++++++++++++++++++++++ examples/gateway/microvm.example.toml | 31 +++++++++++++++++++ examples/gateway/podman.example.toml | 30 ++++++++++++++++++ 5 files changed, 145 insertions(+) create mode 100644 examples/gateway/docker.example.toml create mode 100644 examples/gateway/kubernetes.example.toml create mode 100644 examples/gateway/microvm.example.toml create mode 100644 examples/gateway/podman.example.toml diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index 254de12f1..4ba8ec528 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -506,4 +506,25 @@ version = 2 .expect_err("missing file must be io error"); assert!(matches!(err, ConfigFileError::Io { .. })); } + + /// Lock in that every checked-in example under `examples/gateway/` + /// round-trips through the loader. Catches schema drift the moment a + /// field is renamed or a doc snippet falls out of date. + #[test] + fn checked_in_examples_parse() { + let repo_root = Path::new(env!("CARGO_MANIFEST_DIR")) + .parent() + .and_then(Path::parent) + .expect("crate manifest has grandparent"); + for name in [ + "gateway.example.toml", + "kubernetes.example.toml", + "docker.example.toml", + "podman.example.toml", + "microvm.example.toml", + ] { + let path = repo_root.join("examples/gateway").join(name); + load(&path).unwrap_or_else(|e| panic!("{name} failed to parse: {e}")); + } + } } diff --git a/examples/gateway/docker.example.toml b/examples/gateway/docker.example.toml new file mode 100644 index 000000000..27238fe0a --- /dev/null +++ b/examples/gateway/docker.example.toml @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Docker-driver example. Sandboxes run as containers on a local bridge +# network. The supervisor binary is bind-mounted from the host (no in-cluster +# image pull required); guest mTLS material is supplied as host paths. +# +# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +compute_drivers = ["docker"] + +# Shared defaults inherited into [openshell.drivers.docker]. +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +guest_tls_ca = "/etc/openshell/certs/ca.pem" +guest_tls_cert = "/etc/openshell/certs/client.pem" +guest_tls_key = "/etc/openshell/certs/client-key.pem" + +[openshell.drivers.docker] +network_name = "openshell-docker" +# Skip the image-pull-and-extract step by pointing at a locally built binary. +supervisor_bin = "/usr/local/libexec/openshell/openshell-sandbox" diff --git a/examples/gateway/kubernetes.example.toml b/examples/gateway/kubernetes.example.toml new file mode 100644 index 000000000..9a5b77605 --- /dev/null +++ b/examples/gateway/kubernetes.example.toml @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Kubernetes-driver example. The gateway runs as a Pod and creates sandbox +# Pods in another namespace. mTLS material for sandboxes is delivered via a +# Kubernetes Secret rather than host-side file paths. +# +# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "0.0.0.0:8080" +health_bind_address = "0.0.0.0:8081" +metrics_bind_address = "0.0.0.0:9090" +log_level = "info" +compute_drivers = ["kubernetes"] + +# Shared defaults inherited into [openshell.drivers.kubernetes]. +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +image_pull_policy = "IfNotPresent" +client_tls_secret_name = "openshell-client-tls" + +[openshell.gateway.tls] +cert_path = "/etc/openshell-tls/server/tls.crt" +key_path = "/etc/openshell-tls/server/tls.key" +client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" + +[openshell.drivers.kubernetes] +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +# Use the image volume on K8s >= 1.35 (GA in 1.36); switch to "init-container" +# on older clusters or where the ImageVolume feature gate is off. +supervisor_sideload_method = "image-volume" diff --git a/examples/gateway/microvm.example.toml b/examples/gateway/microvm.example.toml new file mode 100644 index 000000000..60bbd4356 --- /dev/null +++ b/examples/gateway/microvm.example.toml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# microVM-driver example. Each sandbox runs inside its own libkrun microVM +# managed by the standalone `openshell-driver-vm` subprocess. Use this driver +# when you want stronger isolation than container namespaces alone. +# +# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +# VM is never auto-detected; an explicit entry here is required. +compute_drivers = ["vm"] + +# Shared defaults inherited into [openshell.drivers.vm]. +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +guest_tls_ca = "/var/lib/openshell/guest-tls/ca.pem" +guest_tls_cert = "/var/lib/openshell/guest-tls/client.pem" +guest_tls_key = "/var/lib/openshell/guest-tls/client-key.pem" + +[openshell.drivers.vm] +state_dir = "/var/lib/openshell/vm" +# Where the gateway looks for the openshell-driver-vm subprocess binary. +driver_dir = "/usr/local/libexec/openshell" +vcpus = 2 +mem_mib = 2048 +krun_log_level = 1 diff --git a/examples/gateway/podman.example.toml b/examples/gateway/podman.example.toml new file mode 100644 index 000000000..11d241d6e --- /dev/null +++ b/examples/gateway/podman.example.toml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Podman-driver example. Sandboxes run as Podman containers on a user-mode +# bridge network. The supervisor image is mounted read-only via Podman's +# `type=image` mount; guest mTLS material is supplied as host paths. +# +# `database_url` is env-only — set OPENSHELL_DB_URL / --db-url. + +[openshell] +version = 1 + +[openshell.gateway] +bind_address = "127.0.0.1:8080" +log_level = "info" +compute_drivers = ["podman"] + +# Shared defaults inherited into [openshell.drivers.podman]. +default_image = "ghcr.io/nvidia/openshell/sandbox:latest" +supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" +image_pull_policy = "missing" # Podman pull policy: always | missing | never | newer +guest_tls_ca = "/etc/openshell/certs/ca.pem" +guest_tls_cert = "/etc/openshell/certs/client.pem" +guest_tls_key = "/etc/openshell/certs/client-key.pem" + +[openshell.drivers.podman] +# Rootless socket path. For root Podman use /run/podman/podman.sock. +socket_path = "/run/user/1000/podman/podman.sock" +network_name = "openshell" +stop_timeout_secs = 10 From 02df5f70df019615112c52e782666e45168c2633 Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Tue, 12 May 2026 15:36:13 -0700 Subject: [PATCH 5/5] refactor(gateway): drop image_pull_policy from shared inheritance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kubernetes and Podman use mutually-incompatible vocabularies for the same TOML key: - Kubernetes: `Always | IfNotPresent | Never` (free-form string passed verbatim to the K8s API). - Podman: `always | missing | never | newer` (strict lowercase enum deserialised into `ImagePullPolicy`). No value means the same thing in both drivers. Sharing the key at `[openshell.gateway]` scope and inheriting it into every active driver's table meant any value safe for one driver was either wrong or silently dropped for the other (`IfNotPresent` → `ImagePullPolicy::Missing` after `.unwrap_or_default()`). Operators run one driver per gateway, so the "shared default" never pays for itself. Make `image_pull_policy` driver-local: - Remove the field from `GatewayFileSection` and from `inheritable_keys()` for both Kubernetes and Podman. - Drop the file→`RunArgs` merge for the gateway-scope key. - Stop unconditionally clobbering the driver value with `config.sandbox_image_pull_policy` in the runtime wiring — only apply the CLI/env override when it was set (and, for Podman, only when it parses into the lowercase enum). - Move the key under `[openshell.drivers.kubernetes]` and `[openshell.drivers.podman]` in every example, the RFC, the architecture doc, and the Helm-rendered gateway ConfigMap. The supervisor pull policy follows the same shape: it is K8s-only and moves into `[openshell.drivers.kubernetes]` alongside `image_pull_policy` in the Helm template. --- architecture/gateway.md | 18 ++++++++++------ crates/openshell-server/src/cli.rs | 6 ------ crates/openshell-server/src/config_file.rs | 7 ------- crates/openshell-server/src/lib.rs | 21 ++++++++++++++++--- .../openshell/templates/gateway-config.yaml | 12 +++++------ examples/gateway/gateway.example.toml | 12 ++++++++--- examples/gateway/kubernetes.example.toml | 2 +- examples/gateway/podman.example.toml | 2 +- rfc/0003-gateway-configuration/README.md | 2 +- 9 files changed, 48 insertions(+), 34 deletions(-) diff --git a/architecture/gateway.md b/architecture/gateway.md index 76a2e8318..0311dd861 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -192,12 +192,18 @@ worked example and RFC 0003 for the full schema. ### Driver inheritance `[openshell.gateway]` carries a small set of values (`default_image`, -`supervisor_image`, `image_pull_policy`, `guest_tls_ca/cert/key`, -`client_tls_secret_name`, `host_gateway_ip`, `enable_user_namespaces`) that -are inherited into each driver's `[openshell.drivers.]` table when -the driver-specific table does not override them. The allowlist is -per-driver so a gateway-wide default cannot land in a driver that does not -understand it (e.g. `client_tls_secret_name` is K8s-only). +`supervisor_image`, `guest_tls_ca/cert/key`, `client_tls_secret_name`, +`host_gateway_ip`, `enable_user_namespaces`) that are inherited into each +driver's `[openshell.drivers.]` table when the driver-specific table +does not override them. The allowlist is per-driver so a gateway-wide +default cannot land in a driver that does not understand it (e.g. +`client_tls_secret_name` is K8s-only). + +`image_pull_policy` is intentionally **not** inheritable: Kubernetes uses +`Always | IfNotPresent | Never` (passed verbatim to the K8s API) while +Podman uses the lowercase enum `always | missing | never | newer`. No +value means the same thing in both, so the key lives only under each +driver's own table. Driver-specific values that are not part of the inheritance allowlist (e.g. K8s `namespace`, Podman `socket_path`, VM `vcpus`) only come from diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index 33a37d50f..687ee6022 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -559,12 +559,6 @@ fn merge_file_into_args(args: &mut RunArgs, file: &GatewayFileSection, matches: { args.sandbox_image = Some(image.clone()); } - if let Some(policy) = &file.image_pull_policy - && args.sandbox_image_pull_policy.is_none() - && arg_defaulted(matches, "sandbox_image_pull_policy") - { - args.sandbox_image_pull_policy = Some(policy.clone()); - } if let Some(secret) = &file.client_tls_secret_name && args.client_tls_secret_name.is_none() && arg_defaulted(matches, "client_tls_secret_name") diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index 4ba8ec528..bcdab1947 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -107,8 +107,6 @@ pub struct GatewayFileSection { #[serde(default)] pub supervisor_image: Option, #[serde(default)] - pub image_pull_policy: Option, - #[serde(default)] pub client_tls_secret_name: Option, #[serde(default)] pub host_gateway_ip: Option, @@ -245,7 +243,6 @@ fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { ComputeDriverKind::Kubernetes => &[ "default_image", "supervisor_image", - "image_pull_policy", "client_tls_secret_name", "host_gateway_ip", "ssh_handshake_skew_secs", @@ -260,7 +257,6 @@ fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { ComputeDriverKind::Podman => &[ "default_image", "supervisor_image", - "image_pull_policy", "guest_tls_ca", "guest_tls_cert", "guest_tls_key", @@ -279,7 +275,6 @@ fn gateway_inherited_value(g: &GatewayFileSection, key: &str) -> Option g.default_image.as_deref().map(string_value), "supervisor_image" => g.supervisor_image.as_deref().map(string_value), - "image_pull_policy" => g.image_pull_policy.as_deref().map(string_value), "client_tls_secret_name" => g.client_tls_secret_name.as_deref().map(string_value), "host_gateway_ip" => g.host_gateway_ip.as_deref().map(string_value), "ssh_handshake_skew_secs" => g.ssh_handshake_skew_secs.and_then(skew_value), @@ -340,7 +335,6 @@ compute_drivers = ["kubernetes"] sandbox_namespace = "agents" default_image = "ghcr.io/nvidia/openshell/sandbox:latest" supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -image_pull_policy = "IfNotPresent" client_tls_secret_name = "openshell-sandbox-tls" [openshell.gateway.tls] @@ -433,7 +427,6 @@ version = 2 let gateway = GatewayFileSection { default_image: Some("ghcr.io/nvidia/openshell/sandbox:0.9".to_string()), supervisor_image: Some("ghcr.io/nvidia/openshell/supervisor:0.9".to_string()), - image_pull_policy: Some("IfNotPresent".to_string()), ..Default::default() }; let raw = toml::toml! { diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index fb13ed83f..5a0a97ead 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -497,8 +497,14 @@ async fn build_compute_runtime( // file + CLI/env at startup. k8s.namespace.clone_from(&config.sandbox_namespace); k8s.default_image.clone_from(&config.sandbox_image); - k8s.image_pull_policy - .clone_from(&config.sandbox_image_pull_policy); + // Only let the gateway-wide CLI/env value overwrite the per-driver + // file value when it was actually set — otherwise the empty CLI + // default would silently clobber `image_pull_policy` configured + // under `[openshell.drivers.kubernetes]`. + if !config.sandbox_image_pull_policy.is_empty() { + k8s.image_pull_policy + .clone_from(&config.sandbox_image_pull_policy); + } k8s.grpc_endpoint.clone_from(&config.grpc_endpoint); k8s.ssh_socket_path .clone_from(&config.sandbox_ssh_socket_path); @@ -586,7 +592,16 @@ async fn build_compute_runtime( // Shared fields are sourced from Config (which already merged // file + CLI/env at startup). podman.default_image.clone_from(&config.sandbox_image); - podman.image_pull_policy = config.sandbox_image_pull_policy.parse().unwrap_or_default(); + // The CLI/env `image_pull_policy` is K8s-shaped + // (e.g. `IfNotPresent`) and won't parse into Podman's lowercase + // enum. Only apply it when the operator set a Podman-shaped value + // explicitly; otherwise keep whatever `[openshell.drivers.podman]` + // (or the struct default) provided. + if !config.sandbox_image_pull_policy.is_empty() + && let Ok(policy) = config.sandbox_image_pull_policy.parse() + { + podman.image_pull_policy = policy; + } podman.grpc_endpoint.clone_from(&config.grpc_endpoint); podman.gateway_port = config.bind_address.port(); podman diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 6caf44985..b504af537 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -33,12 +33,6 @@ data: sandbox_namespace = {{ include "openshell.sandboxNamespace" . | quote }} default_image = {{ .Values.server.sandboxImage | quote }} supervisor_image = {{ include "openshell.supervisorImage" . | quote }} - {{- if .Values.server.sandboxImagePullPolicy }} - image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} - {{- end }} - {{- if .Values.supervisor.image.pullPolicy }} - supervisor_image_pull_policy = {{ .Values.supervisor.image.pullPolicy | quote }} - {{- end }} grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} {{- if .Values.server.sshGatewayHost }} ssh_gateway_host = {{ .Values.server.sshGatewayHost | quote }} @@ -89,3 +83,9 @@ data: [openshell.drivers.kubernetes] supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . | quote }} + {{- if .Values.server.sandboxImagePullPolicy }} + image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} + {{- end }} + {{- if .Values.supervisor.image.pullPolicy }} + supervisor_image_pull_policy = {{ .Values.supervisor.image.pullPolicy | quote }} + {{- end }} diff --git a/examples/gateway/gateway.example.toml b/examples/gateway/gateway.example.toml index 16f1681c3..b7cdac91b 100644 --- a/examples/gateway/gateway.example.toml +++ b/examples/gateway/gateway.example.toml @@ -33,8 +33,12 @@ ssh_gateway_port = 8080 # when the driver-specific table does not override them. default_image = "ghcr.io/nvidia/openshell/sandbox:latest" supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -image_pull_policy = "IfNotPresent" client_tls_secret_name = "openshell-client-tls" +# `image_pull_policy` is intentionally NOT a shared key: Kubernetes uses +# `Always | IfNotPresent | Never` (passed as a string to the K8s API) while +# Podman uses the lowercase `always | missing | never | newer` enum. There is +# no value that means the same thing in both, so set it under the relevant +# `[openshell.drivers.]` table instead. # Gateway listener TLS (distinct from the per-driver guest_tls_*). [openshell.gateway.tls] @@ -56,8 +60,9 @@ user_role = "openshell-user" # over the inherited value. [openshell.drivers.kubernetes] -namespace = "agents" -grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +namespace = "agents" +grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +image_pull_policy = "IfNotPresent" # K8s vocabulary: Always | IfNotPresent | Never [openshell.drivers.docker] network_name = "openshell-docker" @@ -67,6 +72,7 @@ network_name = "openshell-docker" socket_path = "/run/podman/podman.sock" network_name = "openshell" stop_timeout_secs = 10 +image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer [openshell.drivers.vm] state_dir = "/var/lib/openshell/vm" diff --git a/examples/gateway/kubernetes.example.toml b/examples/gateway/kubernetes.example.toml index 9a5b77605..6dd78e5e9 100644 --- a/examples/gateway/kubernetes.example.toml +++ b/examples/gateway/kubernetes.example.toml @@ -20,7 +20,6 @@ compute_drivers = ["kubernetes"] # Shared defaults inherited into [openshell.drivers.kubernetes]. default_image = "ghcr.io/nvidia/openshell/sandbox:latest" supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -image_pull_policy = "IfNotPresent" client_tls_secret_name = "openshell-client-tls" [openshell.gateway.tls] @@ -31,6 +30,7 @@ client_ca_path = "/etc/openshell-tls/client-ca/ca.crt" [openshell.drivers.kubernetes] namespace = "agents" grpc_endpoint = "https://openshell-gateway.agents.svc:8080" +image_pull_policy = "IfNotPresent" # Use the image volume on K8s >= 1.35 (GA in 1.36); switch to "init-container" # on older clusters or where the ImageVolume feature gate is off. supervisor_sideload_method = "image-volume" diff --git a/examples/gateway/podman.example.toml b/examples/gateway/podman.example.toml index 11d241d6e..3d963c176 100644 --- a/examples/gateway/podman.example.toml +++ b/examples/gateway/podman.example.toml @@ -18,7 +18,6 @@ compute_drivers = ["podman"] # Shared defaults inherited into [openshell.drivers.podman]. default_image = "ghcr.io/nvidia/openshell/sandbox:latest" supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" -image_pull_policy = "missing" # Podman pull policy: always | missing | never | newer guest_tls_ca = "/etc/openshell/certs/ca.pem" guest_tls_cert = "/etc/openshell/certs/client.pem" guest_tls_key = "/etc/openshell/certs/client-key.pem" @@ -28,3 +27,4 @@ guest_tls_key = "/etc/openshell/certs/client-key.pem" socket_path = "/run/user/1000/podman/podman.sock" network_name = "openshell" stop_timeout_secs = 10 +image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer diff --git a/rfc/0003-gateway-configuration/README.md b/rfc/0003-gateway-configuration/README.md index 9d66e04ef..d109ce656 100644 --- a/rfc/0003-gateway-configuration/README.md +++ b/rfc/0003-gateway-configuration/README.md @@ -136,7 +136,7 @@ guest_tls_key = "/etc/openshell/certs/client-key.pem" [openshell.drivers.podman] socket_path = "/run/podman/podman.sock" default_image = "ghcr.io/nvidia/openshell/sandbox:latest" -image_pull_policy = "IfNotPresent" +image_pull_policy = "missing" # Podman vocabulary: always | missing | never | newer supervisor_image = "ghcr.io/nvidia/openshell/supervisor:latest" network_name = "openshell" stop_timeout_secs = 10