diff --git a/.agents/skills/debug-openshell-cluster/SKILL.md b/.agents/skills/debug-openshell-cluster/SKILL.md index 177c52f31..6b7f8e6b2 100644 --- a/.agents/skills/debug-openshell-cluster/SKILL.md +++ b/.agents/skills/debug-openshell-cluster/SKILL.md @@ -116,9 +116,16 @@ Check required Helm deployment secrets: kubectl -n openshell get secret \ openshell-server-tls \ openshell-server-client-ca \ - openshell-client-tls + openshell-client-tls \ + openshell-jwt-keys ``` +If the gateway exits with `failed to read sandbox JWT signing key from +/etc/openshell-jwt/signing.pem`, verify that `openshell-jwt-keys` contains +`signing.pem`, `public.pem`, and `kid`, and that the StatefulSet mounts the +`sandbox-jwt` secret at `/etc/openshell-jwt`. The sandbox JWT mount is required +even when local Helm values disable TLS. + Check the image references currently used by the gateway deployment: ```bash diff --git a/Cargo.lock b/Cargo.lock index cba681774..57e7afb78 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3395,6 +3395,7 @@ dependencies = [ "rcgen", "serde", "serde_json", + "sha2 0.10.9", "tar", "tempfile", "tokio", @@ -3681,6 +3682,7 @@ name = "openshell-server" version = "0.0.0" dependencies = [ "anyhow", + "async-trait", "axum 0.8.9", "bytes", "clap", @@ -3737,6 +3739,7 @@ dependencies = [ "tower-http 0.6.8", "tracing", "tracing-subscriber", + "url", "uuid", "wiremock", "x509-parser", diff --git a/crates/openshell-bootstrap/Cargo.toml b/crates/openshell-bootstrap/Cargo.toml index c0fb7e9f4..578d59e65 100644 --- a/crates/openshell-bootstrap/Cargo.toml +++ b/crates/openshell-bootstrap/Cargo.toml @@ -16,6 +16,7 @@ bytes = { workspace = true } futures = { workspace = true } miette = { workspace = true } rcgen = { workspace = true } +sha2 = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } tar = "0.4" diff --git a/crates/openshell-bootstrap/src/jwt.rs b/crates/openshell-bootstrap/src/jwt.rs new file mode 100644 index 000000000..cf8ab0dc1 --- 
/dev/null +++ b/crates/openshell-bootstrap/src/jwt.rs @@ -0,0 +1,112 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Gateway-minted JWT signing-key generation. +//! +//! The gateway mints per-sandbox identity tokens (see PR 2 of the +//! per-sandbox identity series, issue #1354) signed with an Ed25519 +//! keypair generated once at gateway init and persisted alongside the +//! existing PKI bundle. The signing key never leaves the gateway; the +//! public key plus a stable `kid` are consumed by the gateway's own +//! validator and any future external verifiers. + +use miette::{IntoDiagnostic, Result, WrapErr}; +use rcgen::{KeyPair, PKCS_ED25519}; +use sha2::{Digest, Sha256}; + +/// All PEM-encoded material needed to mint and validate sandbox JWTs. +/// +/// The signing key stays in the gateway process. The public key is shared +/// across gateway replicas (so any replica can validate a JWT minted by +/// any other replica). The `kid` is published in every minted JWT's +/// header so the validator can pick the right key after a future rotation. +pub struct JwtKeyMaterial { + /// PKCS#8 PEM-encoded Ed25519 private key. + pub signing_key_pem: String, + /// `SubjectPublicKeyInfo` PEM-encoded Ed25519 public key. + pub public_key_pem: String, + /// Stable identifier derived from the public key (SHA-256 hex prefix). + /// Embedded in every minted JWT's `kid` header so future rotation can + /// be performed in-place by adding a second key without breaking + /// in-flight tokens. + pub kid: String, +} + +/// Generate a fresh Ed25519 JWT signing key. +/// +/// Output PEM is in the formats `jsonwebtoken` consumes via +/// `EncodingKey::from_ed_pem` (signing) and `DecodingKey::from_ed_pem` +/// (validation), so the gateway can round-trip its own tokens with no +/// further conversion. 
+pub fn generate_jwt_key() -> Result { + let keypair = KeyPair::generate_for(&PKCS_ED25519) + .into_diagnostic() + .wrap_err("failed to generate Ed25519 JWT signing key")?; + let signing_key_pem = keypair.serialize_pem(); + let public_key_pem = keypair.public_key_pem(); + let kid = kid_from_public_key_der(&keypair.public_key_der()); + Ok(JwtKeyMaterial { + signing_key_pem, + public_key_pem, + kid, + }) +} + +/// Stable `kid` derived from the SHA-256 of the public-key DER. +/// +/// First 16 bytes hex-encoded — collision-resistant for the small N of +/// signing keys a single deployment ever has, while staying short enough +/// to keep JWT headers compact. +fn kid_from_public_key_der(public_key_der: &[u8]) -> String { + let digest = Sha256::digest(public_key_der); + hex_encode_prefix(&digest, 16) +} + +fn hex_encode_prefix(bytes: &[u8], n: usize) -> String { + use std::fmt::Write as _; + let mut out = String::with_capacity(n * 2); + for byte in bytes.iter().take(n) { + let _ = write!(out, "{byte:02x}"); + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn generate_jwt_key_produces_parseable_pem() { + let material = generate_jwt_key().expect("generate_jwt_key"); + assert!(material.signing_key_pem.contains("BEGIN PRIVATE KEY")); + assert!(material.public_key_pem.contains("BEGIN PUBLIC KEY")); + assert_eq!(material.kid.len(), 32, "kid is 16 bytes hex-encoded"); + assert!(material.kid.chars().all(|c| c.is_ascii_hexdigit())); + } + + #[test] + fn kid_is_stable_for_identical_public_keys() { + // Same input -> same kid. Hash of a fixed byte string. 
+ let kid_a = kid_from_public_key_der(b"abc"); + let kid_b = kid_from_public_key_der(b"abc"); + assert_eq!(kid_a, kid_b); + } + + #[test] + fn kid_differs_for_different_public_keys() { + let kid_a = kid_from_public_key_der(b"first"); + let kid_b = kid_from_public_key_der(b"second"); + assert_ne!(kid_a, kid_b); + } + + #[test] + fn generated_keys_are_unique() { + let a = generate_jwt_key().expect("generate_jwt_key"); + let b = generate_jwt_key().expect("generate_jwt_key"); + assert_ne!( + a.kid, b.kid, + "fresh keypairs must produce distinct public keys" + ); + assert_ne!(a.signing_key_pem, b.signing_key_pem); + } +} diff --git a/crates/openshell-bootstrap/src/lib.rs b/crates/openshell-bootstrap/src/lib.rs index 0988c4b6b..8845f0392 100644 --- a/crates/openshell-bootstrap/src/lib.rs +++ b/crates/openshell-bootstrap/src/lib.rs @@ -3,6 +3,7 @@ pub mod build; pub mod edge_token; +pub mod jwt; pub mod oidc_token; mod metadata; diff --git a/crates/openshell-bootstrap/src/pki.rs b/crates/openshell-bootstrap/src/pki.rs index ed93850df..bb103bf46 100644 --- a/crates/openshell-bootstrap/src/pki.rs +++ b/crates/openshell-bootstrap/src/pki.rs @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use crate::jwt::{JwtKeyMaterial, generate_jwt_key}; use miette::{IntoDiagnostic, Result, WrapErr}; use rcgen::{BasicConstraints, CertificateParams, DnType, Ia5String, IsCa, KeyPair, SanType}; use std::net::IpAddr; @@ -15,6 +16,12 @@ pub struct PkiBundle { pub server_key_pem: String, pub client_cert_pem: String, pub client_key_pem: String, + /// PKCS#8 PEM Ed25519 private key for minting per-sandbox JWTs. + pub jwt_signing_key_pem: String, + /// SPKI PEM Ed25519 public key, paired with `jwt_signing_key_pem`. + pub jwt_public_key_pem: String, + /// Stable identifier embedded in the `kid` header of every minted JWT. 
+ pub jwt_key_id: String, } /// Default SANs always included on the server certificate. @@ -95,6 +102,13 @@ pub fn generate_pki(extra_sans: &[String]) -> Result { .into_diagnostic() .wrap_err("failed to sign client certificate")?; + // --- JWT signing key (Ed25519, used to mint per-sandbox identity tokens) --- + let JwtKeyMaterial { + signing_key_pem: jwt_signing_key_pem, + public_key_pem: jwt_public_key_pem, + kid: jwt_key_id, + } = generate_jwt_key().wrap_err("failed to generate JWT signing key")?; + Ok(PkiBundle { ca_cert_pem: ca_cert.pem(), ca_key_pem: ca_key.serialize_pem(), @@ -102,6 +116,9 @@ pub fn generate_pki(extra_sans: &[String]) -> Result { server_key_pem: server_key.serialize_pem(), client_cert_pem: client_cert.pem(), client_key_pem: client_key.serialize_pem(), + jwt_signing_key_pem, + jwt_public_key_pem, + jwt_key_id, }) } @@ -144,6 +161,9 @@ mod tests { assert!(bundle.server_key_pem.contains("BEGIN PRIVATE KEY")); assert!(bundle.client_cert_pem.contains("BEGIN CERTIFICATE")); assert!(bundle.client_key_pem.contains("BEGIN PRIVATE KEY")); + assert!(bundle.jwt_signing_key_pem.contains("BEGIN PRIVATE KEY")); + assert!(bundle.jwt_public_key_pem.contains("BEGIN PUBLIC KEY")); + assert_eq!(bundle.jwt_key_id.len(), 32, "kid is 16 bytes hex-encoded"); } #[test] diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 198cb4b0a..61c79af4f 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -739,6 +739,11 @@ fn import_local_package_mtls_bundle(name: &str) -> Result> { client_key_pem: std::fs::read_to_string(&key) .into_diagnostic() .wrap_err_with(|| format!("failed to read {}", key.display()))?, + // CLI never holds the gateway's JWT signing material — only the + // gateway needs it. Fill the JWT fields with placeholders. 
+ jwt_signing_key_pem: String::new(), + jwt_public_key_pem: String::new(), + jwt_key_id: String::new(), }; openshell_bootstrap::mtls::store_pki_bundle(name, &bundle) .wrap_err_with(|| format!("failed to store mTLS bundle for gateway '{name}'"))?; diff --git a/crates/openshell-cli/tests/ensure_providers_integration.rs b/crates/openshell-cli/tests/ensure_providers_integration.rs index bd4262b31..96f173172 100644 --- a/crates/openshell-cli/tests/ensure_providers_integration.rs +++ b/crates/openshell-cli/tests/ensure_providers_integration.rs @@ -488,6 +488,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/mtls_integration.rs b/crates/openshell-cli/tests/mtls_integration.rs index 7102ed9b6..22de566bf 100644 --- a/crates/openshell-cli/tests/mtls_integration.rs +++ b/crates/openshell-cli/tests/mtls_integration.rs @@ -397,6 +397,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/provider_commands_integration.rs b/crates/openshell-cli/tests/provider_commands_integration.rs index 49b933e67..96dce3a5b 100644 --- 
a/crates/openshell-cli/tests/provider_commands_integration.rs +++ b/crates/openshell-cli/tests/provider_commands_integration.rs @@ -620,6 +620,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index 1ad00dd6e..6ae868487 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -574,6 +574,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs index 531599dcf..b9c52b685 100644 --- a/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs +++ b/crates/openshell-cli/tests/sandbox_name_fallback_integration.rs @@ -409,6 +409,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not 
implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-core/src/config.rs b/crates/openshell-core/src/config.rs index e045d0a52..4505226c3 100644 --- a/crates/openshell-core/src/config.rs +++ b/crates/openshell-core/src/config.rs @@ -205,6 +205,13 @@ pub struct Config { #[serde(default)] pub oidc: Option, + /// Gateway-minted sandbox JWT configuration. When `Some`, the gateway + /// loads the signing key from disk and accepts gateway-issued sandbox + /// JWTs as `Principal::Sandbox`. Required for the per-sandbox identity + /// flow (issue #1354). + #[serde(default)] + pub gateway_jwt: Option, + /// Database URL for persistence. pub database_url: String, @@ -317,6 +324,37 @@ const fn default_jwks_ttl_secs() -> u64 { 3600 } +/// Gateway-minted sandbox JWT configuration. +/// +/// Points the gateway at the Ed25519 signing key (produced by `certgen`) +/// and identifies the issuer string embedded in every minted token. The +/// signing key never leaves the gateway process; the public key is loaded +/// by the same gateway so it can validate its own tokens. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct GatewayJwtConfig { + /// Path to the Ed25519 signing key (PKCS#8 PEM). + pub signing_key_path: PathBuf, + /// Path to the matching public key (SPKI PEM). + pub public_key_path: PathBuf, + /// Path to the `kid` value (plain text, one line). + pub kid_path: PathBuf, + /// Stable gateway identity embedded in `iss`/`aud`. Defaults to the + /// hostname-or-`openshell` placeholder if unset. + #[serde(default = "default_gateway_id")] + pub gateway_id: String, + /// Token lifetime in seconds. Defaults to 24 hours. 
+ #[serde(default = "default_sandbox_token_ttl_secs")] + pub ttl_secs: u64, +} + +fn default_gateway_id() -> String { + "openshell".to_string() +} + +const fn default_sandbox_token_ttl_secs() -> u64 { + 86_400 +} + fn default_roles_claim() -> String { "realm_access.roles".to_string() } @@ -340,6 +378,7 @@ impl Config { log_level: default_log_level(), tls, oidc: None, + gateway_jwt: None, database_url: String::new(), compute_drivers: vec![], ssh_session_ttl_secs: default_ssh_session_ttl_secs(), diff --git a/crates/openshell-core/src/lib.rs b/crates/openshell-core/src/lib.rs index 037174221..f58405bcb 100644 --- a/crates/openshell-core/src/lib.rs +++ b/crates/openshell-core/src/lib.rs @@ -26,7 +26,7 @@ pub mod sandbox_env; pub mod settings; pub mod time; -pub use config::{ComputeDriverKind, Config, OidcConfig, TlsConfig}; +pub use config::{ComputeDriverKind, Config, GatewayJwtConfig, OidcConfig, TlsConfig}; pub use error::{ComputeDriverError, Error, Result}; pub use metadata::{ObjectId, ObjectLabels, ObjectName}; diff --git a/crates/openshell-core/src/sandbox_env.rs b/crates/openshell-core/src/sandbox_env.rs index d345762ca..b367e450c 100644 --- a/crates/openshell-core/src/sandbox_env.rs +++ b/crates/openshell-core/src/sandbox_env.rs @@ -34,3 +34,22 @@ pub const TLS_CERT: &str = "OPENSHELL_TLS_CERT"; /// Path to the private key for mTLS communication with the gateway. pub const TLS_KEY: &str = "OPENSHELL_TLS_KEY"; + +/// Raw gateway-minted JWT identifying this sandbox. Mutually exclusive with +/// [`SANDBOX_TOKEN_FILE`] / [`K8S_SA_TOKEN_FILE`]. Set in the container environment +/// by the Docker and Podman drivers; test harnesses may also set it directly. +pub const SANDBOX_TOKEN: &str = "OPENSHELL_SANDBOX_TOKEN"; + +/// Path to the file holding a gateway-minted sandbox JWT. +/// +/// Intended for drivers that write the token to a bundle file at +/// sandbox-create time (presumably the VM driver; verify). Read once at supervisor startup; +/// the token is held in process memory thereafter. 
+pub const SANDBOX_TOKEN_FILE: &str = "OPENSHELL_SANDBOX_TOKEN_FILE"; + +/// Path to the projected `ServiceAccount` JWT (Kubernetes driver). +/// +/// Used to bootstrap a gateway-minted JWT via `IssueSandboxToken`. Kubelet +/// writes and rotates this file; the supervisor exchanges its contents +/// for a gateway JWT at startup and on refresh. +pub const K8S_SA_TOKEN_FILE: &str = "OPENSHELL_K8S_SA_TOKEN_FILE"; diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 30507422b..b0b06e2d1 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -988,6 +988,19 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig ); } + // Gateway-minted sandbox JWT (PR 3 of the per-sandbox identity series). + // Passed via env var since Docker has no native secret mount that is + // simpler than the existing bind-mount pattern; the trust boundary + // (`docker inspect` access) is already equivalent to the TLS key mount. 
+ if let Some(spec) = sandbox.spec.as_ref() + && !spec.sandbox_token.is_empty() + { + environment.insert( + openshell_core::sandbox_env::SANDBOX_TOKEN.to_string(), + spec.sandbox_token.clone(), + ); + } + let mut pairs = environment.into_iter().collect::>(); pairs.sort_by(|left, right| left.0.cmp(&right.0)); pairs diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index 62a6b89e4..c0ce10a04 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -33,6 +33,7 @@ fn test_sandbox() -> DriverSandbox { }), gpu: false, gpu_device: String::new(), + sandbox_token: String::new(), }), status: None, } diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index 28c04deb3..8e4275ab7 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -64,8 +64,25 @@ pub struct KubernetesComputeConfig { pub client_tls_secret_name: String, pub host_gateway_ip: String, pub enable_user_namespaces: bool, + /// Lifetime (seconds) of the projected `ServiceAccount` token kubelet + /// writes into each sandbox pod. Used only for the one-shot + /// `IssueSandboxToken` bootstrap exchange — the gateway-minted JWT + /// that follows has its own TTL set via `gateway_jwt.ttl_secs`. + /// + /// Kubelet enforces a minimum of 600 seconds; the supervisor uses + /// this token within a few seconds of pod start, so any value at + /// the floor is sufficient. Default 3600. + pub sa_token_ttl_secs: i64, } +/// Lower bound enforced by kubelet for projected SA tokens. +pub const MIN_SA_TOKEN_TTL_SECS: i64 = 600; + +/// Cap at 24h — operators who want longer-lived bootstrap tokens are +/// almost certainly misconfigured (the token is consumed seconds after +/// pod start). 
+pub const MAX_SA_TOKEN_TTL_SECS: i64 = 86_400; + impl Default for KubernetesComputeConfig { fn default() -> Self { Self { @@ -84,6 +101,22 @@ impl Default for KubernetesComputeConfig { client_tls_secret_name: String::new(), host_gateway_ip: String::new(), enable_user_namespaces: false, + sa_token_ttl_secs: 3600, + } + } +} + +impl KubernetesComputeConfig { + /// Clamp `sa_token_ttl_secs` into the `[MIN_SA_TOKEN_TTL_SECS, + /// MAX_SA_TOKEN_TTL_SECS]` range used by the projected-volume spec. + /// Invalid (≤0) values fall back to the default 3600. + #[must_use] + pub fn effective_sa_token_ttl_secs(&self) -> i64 { + if self.sa_token_ttl_secs <= 0 { + 3600 + } else { + self.sa_token_ttl_secs + .clamp(MIN_SA_TOKEN_TTL_SECS, MAX_SA_TOKEN_TTL_SECS) } } } diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 21ec7f5bf..da9ade3eb 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -327,6 +327,7 @@ impl KubernetesComputeDriver { client_tls_secret_name: &self.config.client_tls_secret_name, host_gateway_ip: &self.config.host_gateway_ip, enable_user_namespaces: self.config.enable_user_namespaces, + sa_token_ttl_secs: self.config.effective_sa_token_ttl_secs(), }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), ¶ms); let api = self.api(); @@ -1042,7 +1043,6 @@ fn default_workspace_volume_claim_templates() -> serde_json::Value { } /// Parameters shared by `sandbox_to_k8s_spec` and `sandbox_template_to_k8s`. -#[derive(Default)] struct SandboxPodParams<'a> { default_image: &'a str, image_pull_policy: &'a str, @@ -1056,6 +1056,29 @@ struct SandboxPodParams<'a> { client_tls_secret_name: &'a str, host_gateway_ip: &'a str, enable_user_namespaces: bool, + /// Lifetime (seconds) of the projected `ServiceAccount` token used + /// for the bootstrap `IssueSandboxToken` exchange. 
+ sa_token_ttl_secs: i64, +} + +impl Default for SandboxPodParams<'_> { + fn default() -> Self { + Self { + default_image: "", + image_pull_policy: "", + supervisor_image: "", + supervisor_image_pull_policy: "", + supervisor_sideload_method: SupervisorSideloadMethod::default(), + sandbox_id: "", + sandbox_name: "", + grpc_endpoint: "", + ssh_socket_path: "", + client_tls_secret_name: "", + host_gateway_ip: "", + enable_user_namespaces: false, + sa_token_ttl_secs: 3600, + } + } } fn spec_pod_env(spec: Option<&SandboxSpec>) -> std::collections::HashMap { @@ -1147,8 +1170,28 @@ fn sandbox_template_to_k8s( if !template.labels.is_empty() { metadata.insert("labels".to_string(), serde_json::json!(template.labels)); } - if let Some(annotations) = platform_config_struct(template, "annotations") { - metadata.insert("annotations".to_string(), annotations); + // Carry the sandbox UUID as a pod annotation so the gateway can resolve + // a projected SA token claim (pod name + uid) back to a sandbox identity + // when the supervisor calls `IssueSandboxToken` at startup. The gateway's + // K8s Role does NOT grant `patch pods`, so this annotation is + // effectively immutable post-create (see plan §11.8). + let mut pod_annotations = platform_config_struct(template, "annotations") + .and_then(|v| match v { + serde_json::Value::Object(map) => Some(map), + _ => None, + }) + .unwrap_or_default(); + if !params.sandbox_id.is_empty() { + pod_annotations.insert( + "openshell.io/sandbox-id".to_string(), + serde_json::Value::String(params.sandbox_id.to_string()), + ); + } + if !pod_annotations.is_empty() { + metadata.insert( + "annotations".to_string(), + serde_json::Value::Object(pod_annotations), + ); } let mut spec = serde_json::Map::new(); @@ -1235,17 +1278,26 @@ fn sandbox_template_to_k8s( }), ); - // Mount client TLS secret for mTLS to the server. 
+ // Mount client TLS secret for mTLS to the server, plus the projected + // ServiceAccount token used to bootstrap the sandbox's gateway JWT + // via `IssueSandboxToken`. + let mut volume_mounts: Vec = Vec::new(); if !params.client_tls_secret_name.is_empty() { - container.insert( - "volumeMounts".to_string(), - serde_json::json!([{ - "name": "openshell-client-tls", - "mountPath": "/etc/openshell-tls/client", - "readOnly": true - }]), - ); - } + volume_mounts.push(serde_json::json!({ + "name": "openshell-client-tls", + "mountPath": "/etc/openshell-tls/client", + "readOnly": true + })); + } + volume_mounts.push(serde_json::json!({ + "name": "openshell-sa-token", + "mountPath": "/var/run/secrets/openshell", + "readOnly": true, + })); + container.insert( + "volumeMounts".to_string(), + serde_json::Value::Array(volume_mounts), + ); if let Some(resources) = container_resources(template, gpu) { container.insert("resources".to_string(), resources); @@ -1257,15 +1309,31 @@ fn sandbox_template_to_k8s( // Add TLS secret volume. Mode 0400 (owner-read) prevents the // unprivileged sandbox user from reading the mTLS private key. + let mut volumes: Vec = Vec::new(); if !params.client_tls_secret_name.is_empty() { - spec.insert( - "volumes".to_string(), - serde_json::json!([{ - "name": "openshell-client-tls", - "secret": { "secretName": params.client_tls_secret_name, "defaultMode": 256 } - }]), - ); - } + volumes.push(serde_json::json!({ + "name": "openshell-client-tls", + "secret": { "secretName": params.client_tls_secret_name, "defaultMode": 256 } + })); + } + // Projected ServiceAccountToken volume — kubelet writes a short-lived + // audience-bound JWT into /var/run/secrets/openshell/token and rotates + // it automatically. The supervisor exchanges this for a gateway-minted + // JWT via `IssueSandboxToken` once at startup. 
+ volumes.push(serde_json::json!({ + "name": "openshell-sa-token", + "projected": { + "sources": [{ + "serviceAccountToken": { + "audience": "openshell-gateway", + "expirationSeconds": params.sa_token_ttl_secs, + "path": "token" + } + }], + "defaultMode": 256 + } + })); + spec.insert("volumes".to_string(), serde_json::Value::Array(volumes)); // Add hostAliases so sandbox pods can reach the Docker host. if !params.host_gateway_ip.is_empty() { @@ -1444,6 +1512,14 @@ fn apply_required_env( "/etc/openshell-tls/client/tls.key", ); } + // Projected ServiceAccount token written by kubelet (see the volume + // definition in `sandbox_template_to_k8s`). The supervisor reads this + // and exchanges it for a gateway-minted JWT via `IssueSandboxToken`. + upsert_env( + env, + openshell_core::sandbox_env::K8S_SA_TOKEN_FILE, + "/var/run/secrets/openshell/token", + ); } fn upsert_env(env: &mut Vec, name: &str, value: &str) { diff --git a/crates/openshell-driver-kubernetes/src/main.rs b/crates/openshell-driver-kubernetes/src/main.rs index a170b5785..ac500e650 100644 --- a/crates/openshell-driver-kubernetes/src/main.rs +++ b/crates/openshell-driver-kubernetes/src/main.rs @@ -68,6 +68,13 @@ struct Args { #[arg(long, env = "OPENSHELL_ENABLE_USER_NAMESPACES")] enable_user_namespaces: bool, + + /// Lifetime (seconds) of the projected `ServiceAccount` token + /// kubelet writes into each sandbox pod for the `IssueSandboxToken` + /// bootstrap exchange. Kubelet enforces a minimum of 600s; the + /// gateway clamps values outside `[600, 86400]`. Default 3600. 
+ #[arg(long, env = "OPENSHELL_K8S_SA_TOKEN_TTL_SECS", default_value_t = 3600)] + sa_token_ttl_secs: i64, } #[tokio::main] @@ -93,6 +100,7 @@ async fn main() -> Result<()> { client_tls_secret_name: args.client_tls_secret_name.unwrap_or_default(), host_gateway_ip: args.host_gateway_ip.unwrap_or_default(), enable_user_namespaces: args.enable_user_namespaces, + sa_token_ttl_secs: args.sa_token_ttl_secs, }) .await .into_diagnostic()?; diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index 1cb58e338..e00439703 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -299,6 +299,17 @@ fn build_env( ); } + // 4. Gateway-minted sandbox JWT (PR 3 of the per-sandbox identity + // series). Passed via env var; the supervisor reads it directly. + if let Some(s) = spec + && !s.sandbox_token.is_empty() + { + env.insert( + openshell_core::sandbox_env::SANDBOX_TOKEN.into(), + s.sandbox_token.clone(), + ); + } + env } diff --git a/crates/openshell-sandbox/src/debug_rpc.rs b/crates/openshell-sandbox/src/debug_rpc.rs new file mode 100644 index 000000000..013099198 --- /dev/null +++ b/crates/openshell-sandbox/src/debug_rpc.rs @@ -0,0 +1,236 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! One-shot debug RPCs exposed via `openshell-sandbox debug-rpc`. +//! +//! Designed for end-to-end verification of the per-sandbox identity +//! flow (issue #1354). A `docker exec` (or `kubectl exec`) into a +//! running sandbox can issue raw sandbox-class gRPC calls without +//! standing up a custom binary inside the sandbox image — useful for +//! confirming the cross-sandbox IDOR guard and refresh semantics. +//! +//! Subcommands: +//! - `get-sandbox-config --sandbox-id ` — call `GetSandboxConfig` +//! - `refresh` — call `RefreshSandboxToken` +//! 
- `show-token` — print the raw gateway JWT bytes +//! - `show-principal` — pretty-print the decoded JWT claims +//! (no signature verification — the supervisor already trusts the +//! token's origin) + +use base64::Engine as _; +use miette::{IntoDiagnostic, Result, WrapErr}; +use openshell_core::proto::{ + GetSandboxConfigRequest, RefreshSandboxTokenRequest, open_shell_client::OpenShellClient, +}; + +use crate::grpc_client::{AuthedChannel, connect_channel_pub}; + +/// Entry point for the `debug-rpc` subcommand. Returns the process exit +/// code; `main` propagates it. +pub async fn run(args: &[String]) -> Result { + let cmd = args + .first() + .map(String::as_str) + .ok_or_else(|| miette::miette!("{}", USAGE))?; + + match cmd { + "get-sandbox-config" => run_get_sandbox_config(&args[1..]).await, + "refresh" => run_refresh().await, + "show-token" => run_show_token(), + "show-principal" => run_show_principal(), + "--help" | "-h" => { + println!("{USAGE}"); + Ok(0) + } + other => Err(miette::miette!( + "unknown debug-rpc command '{other}'\n\n{USAGE}" + )), + } +} + +const USAGE: &str = "\ +usage: openshell-sandbox debug-rpc [options] + +commands: + get-sandbox-config --sandbox-id call GetSandboxConfig + refresh call RefreshSandboxToken + show-token print raw gateway JWT + show-principal print decoded JWT claims + +requires: OPENSHELL_ENDPOINT in env, plus one of OPENSHELL_SANDBOX_TOKEN, +OPENSHELL_SANDBOX_TOKEN_FILE, or OPENSHELL_K8S_SA_TOKEN_FILE so the +supervisor's normal token-acquisition path can resolve a JWT."; + +async fn open_client() -> Result> { + let endpoint = std::env::var(openshell_core::sandbox_env::ENDPOINT) + .into_diagnostic() + .wrap_err("OPENSHELL_ENDPOINT must be set")?; + let channel = connect_channel_pub(&endpoint).await?; + Ok(OpenShellClient::new(channel)) +} + +async fn run_get_sandbox_config(args: &[String]) -> Result { + let sandbox_id = parse_flag(args, "--sandbox-id") + .ok_or_else(|| miette::miette!("get-sandbox-config: --sandbox-id is 
required"))?; + let mut client = open_client().await?; + let resp = client + .get_sandbox_config(GetSandboxConfigRequest { + sandbox_id: sandbox_id.to_string(), + }) + .await; + match resp { + Ok(r) => { + let inner = r.into_inner(); + println!( + "version={} policy_hash={} config_revision={}", + inner.version, inner.policy_hash, inner.config_revision + ); + Ok(0) + } + Err(status) => { + eprintln!("{}: {}", code_name(status.code()), status.message()); + // Map gRPC status to a non-zero exit so callers can branch + // (e.g. expect-permission-denied in a shell test). + Ok(match status.code() { + tonic::Code::PermissionDenied => 7, + tonic::Code::Unauthenticated => 16, + tonic::Code::NotFound => 5, + _ => 1, + }) + } + } +} + +async fn run_refresh() -> Result { + let mut client = open_client().await?; + let resp = client + .refresh_sandbox_token(RefreshSandboxTokenRequest {}) + .await; + match resp { + Ok(r) => { + let inner = r.into_inner(); + println!( + "token={}\nexpires_at_ms={}", + inner.token, inner.expires_at_ms + ); + Ok(0) + } + Err(status) => { + eprintln!("{}: {}", code_name(status.code()), status.message()); + Ok(1) + } + } +} + +fn run_show_token() -> Result { + let token = read_local_token()?; + println!("{token}"); + Ok(0) +} + +fn run_show_principal() -> Result { + let token = read_local_token()?; + let payload_b64 = token + .split('.') + .nth(1) + .ok_or_else(|| miette::miette!("token has no payload segment"))?; + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD + .decode(payload_b64) + .into_diagnostic() + .wrap_err("failed to base64-decode token payload")?; + let claims: serde_json::Value = serde_json::from_slice(&payload) + .into_diagnostic() + .wrap_err("failed to parse token payload as JSON")?; + println!( + "{}", + serde_json::to_string_pretty(&claims).into_diagnostic()? 
+ ); + Ok(0) +} + +/// Read the token from the env/file/SA-bootstrap chain, but only the +/// "already a gateway JWT" paths — show-token / show-principal don't +/// want to actually exchange an SA token. +fn read_local_token() -> Result { + if let Ok(t) = std::env::var(openshell_core::sandbox_env::SANDBOX_TOKEN) + && !t.is_empty() + { + return Ok(t); + } + if let Ok(path) = std::env::var(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) + && !path.is_empty() + { + return Ok(std::fs::read_to_string(&path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read sandbox token from {path}"))? + .trim() + .to_string()); + } + Err(miette::miette!( + "no in-process gateway JWT available — set OPENSHELL_SANDBOX_TOKEN or \ + OPENSHELL_SANDBOX_TOKEN_FILE. The K8s SA-bootstrap path is intentionally \ + excluded from `show-token` / `show-principal` to avoid issuing a fresh \ + token just for inspection." + )) +} + +fn parse_flag<'a>(args: &'a [String], name: &str) -> Option<&'a str> { + let mut iter = args.iter(); + while let Some(a) = iter.next() { + if a == name { + return iter.next().map(String::as_str); + } + if let Some(rest) = a.strip_prefix(&format!("{name}=")) { + return Some(rest); + } + } + None +} + +fn code_name(c: tonic::Code) -> &'static str { + match c { + tonic::Code::Ok => "OK", + tonic::Code::Cancelled => "Cancelled", + tonic::Code::Unknown => "Unknown", + tonic::Code::InvalidArgument => "InvalidArgument", + tonic::Code::DeadlineExceeded => "DeadlineExceeded", + tonic::Code::NotFound => "NotFound", + tonic::Code::AlreadyExists => "AlreadyExists", + tonic::Code::PermissionDenied => "PermissionDenied", + tonic::Code::ResourceExhausted => "ResourceExhausted", + tonic::Code::FailedPrecondition => "FailedPrecondition", + tonic::Code::Aborted => "Aborted", + tonic::Code::OutOfRange => "OutOfRange", + tonic::Code::Unimplemented => "Unimplemented", + tonic::Code::Internal => "Internal", + tonic::Code::Unavailable => "Unavailable", + tonic::Code::DataLoss => 
"DataLoss", + tonic::Code::Unauthenticated => "Unauthenticated", + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_flag_handles_space_separated() { + let args: Vec = ["--sandbox-id", "abc-123"] + .iter() + .map(ToString::to_string) + .collect(); + assert_eq!(parse_flag(&args, "--sandbox-id"), Some("abc-123")); + } + + #[test] + fn parse_flag_handles_equals_separated() { + let args: Vec = ["--sandbox-id=abc-123".to_string()].to_vec(); + assert_eq!(parse_flag(&args, "--sandbox-id"), Some("abc-123")); + } + + #[test] + fn parse_flag_returns_none_when_missing() { + let args: Vec = ["--other".to_string(), "x".to_string()].to_vec(); + assert!(parse_flag(&args, "--sandbox-id").is_none()); + } +} diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs index 28492b543..934dff2b5 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-sandbox/src/grpc_client.rs @@ -3,22 +3,100 @@ //! gRPC client for fetching sandbox policy, provider environment, and inference //! route bundles from `OpenShell` server. +//! +//! Every request carries a gateway-minted JWT in the `Authorization` header +//! (PR 3 of the per-sandbox identity series; see issue #1354). The token is +//! resolved at startup from one of three sources: +//! +//! 1. `OPENSHELL_SANDBOX_TOKEN` — raw JWT in the env (test harness path). +//! 2. `OPENSHELL_SANDBOX_TOKEN_FILE` — file containing the JWT (Docker / +//! Podman / VM drivers write this to a bundle file at sandbox-create +//! time). +//! 3. `OPENSHELL_K8S_SA_TOKEN_FILE` — projected `ServiceAccount` JWT; the +//! supervisor exchanges it for a gateway JWT via `IssueSandboxToken` +//! once at startup. +//! +//! The resolved gateway JWT is held in process memory thereafter and +//! injected on every outbound call by [`AuthInterceptor`]. 
use std::collections::HashMap; -use std::time::Duration; +use std::sync::{Arc, OnceLock, RwLock}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; use miette::{IntoDiagnostic, Result, WrapErr}; use openshell_core::proto::{ DenialSummary, GetDraftPolicyRequest, GetInferenceBundleRequest, GetInferenceBundleResponse, - GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, PolicyChunk, PolicySource, - PolicyStatus, ReportPolicyStatusRequest, SandboxPolicy as ProtoSandboxPolicy, - SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, UpdateConfigRequest, - inference_client::InferenceClient, open_shell_client::OpenShellClient, + GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, IssueSandboxTokenRequest, + PolicyChunk, PolicySource, PolicyStatus, RefreshSandboxTokenRequest, ReportPolicyStatusRequest, + SandboxPolicy as ProtoSandboxPolicy, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, + UpdateConfigRequest, inference_client::InferenceClient, open_shell_client::OpenShellClient, }; +use openshell_core::sandbox_env; +use tonic::Status; +use tonic::metadata::AsciiMetadataValue; +use tonic::service::interceptor::InterceptedService; use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint, Identity}; -use tracing::debug; +use tracing::{debug, info, warn}; -/// Create a channel to the `OpenShell` server. +/// Channel type after the [`AuthInterceptor`] is applied. Aliased so the +/// generated client type signatures stay readable. +pub type AuthedChannel = InterceptedService; + +/// Shared, refreshable Bearer header. All [`AuthInterceptor`] clones read +/// the same slot, so the PR-5 refresh task can rotate the token in place +/// without rebuilding the channel. +type TokenSlot = Arc>; + +/// Process-wide token slot. Initialized by the first [`connect_channel`] +/// call and shared with every subsequent client + the refresh loop. 
+static TOKEN_SLOT: OnceLock = OnceLock::new(); + +/// One-shot guard so the refresh loop spawns at most once per process. +static REFRESH_SPAWNED: OnceLock<()> = OnceLock::new(); + +fn install_token_slot(token: &str) -> Result { + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")) + .into_diagnostic() + .wrap_err("sandbox JWT contained characters not valid for a header value")?; + if let Some(existing) = TOKEN_SLOT.get() { + *existing.write().expect("token slot poisoned") = bearer; + return Ok(existing.clone()); + } + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let _ = TOKEN_SLOT.set(slot.clone()); + Ok(TOKEN_SLOT.get().cloned().unwrap_or(slot)) +} + +/// gRPC interceptor that injects `authorization: Bearer ` on every +/// outbound request. The token lives in a shared [`TokenSlot`] so the +/// PR-5 refresh task can replace it without rebuilding clients. +#[derive(Clone)] +pub struct AuthInterceptor { + bearer: TokenSlot, +} + +impl AuthInterceptor { + fn new(bearer: TokenSlot) -> Self { + Self { bearer } + } +} + +impl tonic::service::Interceptor for AuthInterceptor { + fn call( + &mut self, + mut req: tonic::Request<()>, + ) -> std::result::Result, Status> { + let bearer = self + .bearer + .read() + .expect("auth interceptor token slot poisoned") + .clone(); + req.metadata_mut().insert("authorization", bearer); + Ok(req) + } +} + +/// Build the plain (un-intercepted) gRPC channel. /// /// When the endpoint uses `https://`, mTLS is configured using these env vars: /// - `OPENSHELL_TLS_CA` -- path to the CA certificate @@ -27,7 +105,7 @@ use tracing::debug; /// /// When the endpoint uses `http://`, a plaintext connection is used (for /// deployments where TLS is disabled, e.g. behind a Cloudflare Tunnel). -async fn connect_channel(endpoint: &str) -> Result { +async fn build_plain_channel(endpoint: &str) -> Result { let mut ep = Endpoint::from_shared(endpoint.to_string()) .into_diagnostic() .wrap_err("invalid gRPC endpoint")? 
@@ -43,13 +121,13 @@ async fn connect_channel(endpoint: &str) -> Result { let tls_enabled = endpoint.starts_with("https://"); if tls_enabled { - let ca_path = std::env::var(openshell_core::sandbox_env::TLS_CA) + let ca_path = std::env::var(sandbox_env::TLS_CA) .into_diagnostic() .wrap_err("OPENSHELL_TLS_CA is required")?; - let cert_path = std::env::var(openshell_core::sandbox_env::TLS_CERT) + let cert_path = std::env::var(sandbox_env::TLS_CERT) .into_diagnostic() .wrap_err("OPENSHELL_TLS_CERT is required")?; - let key_path = std::env::var(openshell_core::sandbox_env::TLS_KEY) + let key_path = std::env::var(sandbox_env::TLS_KEY) .into_diagnostic() .wrap_err("OPENSHELL_TLS_KEY is required")?; @@ -79,24 +157,246 @@ async fn connect_channel(endpoint: &str) -> Result { .wrap_err("failed to connect to OpenShell server") } -/// Create a channel to the `OpenShell` server (public for use by `supervisor_session`). -pub async fn connect_channel_pub(endpoint: &str) -> Result { +/// Build a Bearer-authenticated channel to the gateway. +/// +/// First call per process resolves the sandbox JWT via the three-step +/// lookup (env → file → K8s SA bootstrap exchange) and installs it into +/// the process-wide [`TOKEN_SLOT`]. Subsequent calls reuse the cached +/// slot — the refresh loop keeps the value fresh, so re-running the +/// bootstrap is both unnecessary and (on the K8s SA path) expensive +/// (one apiserver round-trip per call). The refresh loop itself is +/// spawned once per process via [`REFRESH_SPAWNED`]. +async fn connect_channel(endpoint: &str) -> Result { + let channel = build_plain_channel(endpoint).await?; + let slot = if let Some(existing) = TOKEN_SLOT.get() { + existing.clone() + } else { + let token = acquire_sandbox_token(endpoint, &channel).await?; + install_token_slot(&token)? 
+ }; + let intercepted = InterceptedService::new(channel, AuthInterceptor::new(slot.clone())); + if REFRESH_SPAWNED.set(()).is_ok() { + let refresh_channel = intercepted.clone(); + tokio::spawn(async move { + refresh_token_loop(refresh_channel, slot).await; + }); + } + Ok(intercepted) +} + +/// Resolve the sandbox JWT used to authenticate every outbound RPC. +/// +/// `endpoint` is logged on errors but never used for transport here; the +/// actual network call lives inside this function only on the K8s +/// bootstrap path, which uses `plain_channel` to call `IssueSandboxToken` +/// once before the steady-state Bearer-authenticated channel is built. +async fn acquire_sandbox_token(endpoint: &str, plain_channel: &Channel) -> Result { + if let Ok(t) = std::env::var(sandbox_env::SANDBOX_TOKEN) + && !t.is_empty() + { + debug!(source = "env", "loaded sandbox token"); + return Ok(t); + } + + if let Ok(path) = std::env::var(sandbox_env::SANDBOX_TOKEN_FILE) + && !path.is_empty() + { + let contents = std::fs::read_to_string(&path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read sandbox token from {path}"))?; + debug!(source = "file", path = %path, "loaded sandbox token"); + return Ok(contents.trim().to_string()); + } + + if let Ok(sa_path) = std::env::var(sandbox_env::K8S_SA_TOKEN_FILE) + && !sa_path.is_empty() + { + let sa_token = std::fs::read_to_string(&sa_path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read K8s SA token from {sa_path}"))? + .trim() + .to_string(); + info!(endpoint = %endpoint, "exchanging K8s ServiceAccount token for sandbox JWT"); + // The bootstrap exchange uses a one-off interceptor pinned to the + // SA token; the resulting gateway JWT becomes the value in the + // shared `TOKEN_SLOT` once `connect_channel` returns. 
+ let bootstrap_slot: TokenSlot = Arc::new(RwLock::new( + AsciiMetadataValue::try_from(format!("Bearer {sa_token}")) + .into_diagnostic() + .wrap_err("SA token contained characters not valid for a header value")?, + )); + let interceptor = AuthInterceptor::new(bootstrap_slot); + let bootstrap = InterceptedService::new(plain_channel.clone(), interceptor); + let mut client = OpenShellClient::new(bootstrap); + let resp = client + .issue_sandbox_token(IssueSandboxTokenRequest {}) + .await + .into_diagnostic() + .wrap_err("IssueSandboxToken bootstrap exchange failed")?; + return Ok(resp.into_inner().token); + } + + Err(miette::miette!( + "no sandbox token source available — set one of {}, {}, or {}", + sandbox_env::SANDBOX_TOKEN, + sandbox_env::SANDBOX_TOKEN_FILE, + sandbox_env::K8S_SA_TOKEN_FILE, + )) +} + +/// Build an authenticated channel for direct external use (e.g. the +/// long-lived `supervisor_session` control stream). +pub async fn connect_channel_pub(endpoint: &str) -> Result { connect_channel(endpoint).await } +/// Background task that rotates the sandbox JWT at ~80% of its remaining +/// lifetime. The new token replaces the value in [`TOKEN_SLOT`], so all +/// in-flight and future clients pick it up on their next request. The +/// loop never panics: every failure is logged and re-attempted after a +/// bounded backoff. 
+async fn refresh_token_loop(channel: AuthedChannel, slot: TokenSlot) { + let mut client = OpenShellClient::new(channel); + loop { + let sleep = compute_refresh_delay(&slot); + tokio::time::sleep(sleep).await; + match client + .refresh_sandbox_token(RefreshSandboxTokenRequest {}) + .await + { + Ok(resp) => { + let new_token = resp.into_inner().token; + match AsciiMetadataValue::try_from(format!("Bearer {new_token}")) { + Ok(value) => { + if let Ok(mut guard) = slot.write() { + *guard = value; + info!("rotated gateway sandbox JWT in-place"); + } + } + Err(e) => warn!(error = %e, "refreshed JWT contained invalid header bytes"), + } + } + Err(status) => { + warn!(error = %status, "RefreshSandboxToken failed; will retry"); + // Backoff so we don't spin against a sustained failure. + tokio::time::sleep(Duration::from_secs(60)).await; + } + } + } +} + +/// Compute the next refresh delay: 80 % of the time remaining until the +/// current token's `exp`, plus up to 10 % jitter, floored at 60 s and +/// capped at 12 h. If the token can't be parsed (legacy/non-JWT bearer) +/// default to 6 h. +fn compute_refresh_delay(slot: &TokenSlot) -> Duration { + let token = slot + .read() + .ok() + .and_then(|v| v.to_str().ok().map(str::to_string)) + .unwrap_or_default(); + let bearer = token.strip_prefix("Bearer ").unwrap_or(&token); + let now_ms = i64::try_from( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_or(0, |d| d.as_millis()), + ) + .unwrap_or(i64::MAX); + let remaining_ms = parse_jwt_exp_ms(bearer).map_or(21_600_000, |exp| exp - now_ms); // 6 h fallback + let mut delay_ms = (remaining_ms.max(0) * 8 / 10).clamp(60_000, 43_200_000); + // Up to 10 % jitter, derived deterministically from token bytes so + // unit tests are reproducible without injecting an RNG. 
+ let jitter_pct = (token.len() % 10) as u64; + let jitter_ms = (u64::try_from(delay_ms).unwrap_or(0) * jitter_pct) / 100; + delay_ms = delay_ms.saturating_add(i64::try_from(jitter_ms).unwrap_or(0)); + Duration::from_millis(u64::try_from(delay_ms).unwrap_or(0)) +} + +/// Decode the `exp` claim from a JWT without verifying its signature. +/// Returns the expiry in milliseconds since the Unix epoch, or `None` if +/// the token is not a parseable JWT. +fn parse_jwt_exp_ms(jwt: &str) -> Option { + use base64::Engine; + let mut parts = jwt.splitn(3, '.'); + let _header = parts.next()?; + let payload_b64 = parts.next()?; + let decoded = base64::engine::general_purpose::URL_SAFE_NO_PAD + .decode(payload_b64) + .ok()?; + let value: serde_json::Value = serde_json::from_slice(&decoded).ok()?; + let exp_secs = value.get("exp")?.as_i64()?; + exp_secs.checked_mul(1000) +} + +#[cfg(test)] +mod auth_tests { + use super::*; + + #[test] + fn parse_jwt_exp_reads_unsigned_payload() { + use base64::Engine as _; + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD + .encode(br#"{"exp":1234567890,"sandbox_id":"sb-1"}"#); + let token = format!("h.{payload}.sig"); + assert_eq!(parse_jwt_exp_ms(&token), Some(1_234_567_890_000)); + } + + #[test] + fn parse_jwt_exp_returns_none_for_malformed_token() { + assert!(parse_jwt_exp_ms("not-a-jwt").is_none()); + assert!(parse_jwt_exp_ms("only.two").is_none()); + assert!(parse_jwt_exp_ms("a.!!!.c").is_none()); + } + + #[test] + fn compute_refresh_delay_uses_80_percent_when_token_present() { + // Build a JWT whose exp is 1000 seconds in the future. With 0-jitter + // the delay should be roughly 800 seconds. 
+ use base64::Engine as _; + let now_s = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_secs(); + let exp = now_s + 1000; + let payload_json = format!(r#"{{"exp":{exp}}}"#); + let payload = base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(payload_json); + let token = format!("h.{payload}.s"); + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")).unwrap(); + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let delay = compute_refresh_delay(&slot); + // 800 s baseline + up to 10 % jitter → 800..=880 s, with some slack + // for the 1-second resolution of the exp claim. + let secs = delay.as_secs(); + assert!( + (700..=900).contains(&secs), + "expected 80%-of-1000s delay, got {secs}s" + ); + } + + #[test] + fn compute_refresh_delay_floors_at_60_seconds() { + // Already-expired token still produces a 60 s floor so the loop + // doesn't busy-spin. + use base64::Engine as _; + let exp = 1; // past + let payload = + base64::engine::general_purpose::URL_SAFE_NO_PAD.encode(format!(r#"{{"exp":{exp}}}"#)); + let token = format!("h.{payload}.s"); + let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")).unwrap(); + let slot: TokenSlot = Arc::new(RwLock::new(bearer)); + let delay = compute_refresh_delay(&slot); + assert!(delay.as_secs() >= 60); + } +} + /// Connect to the `OpenShell` server. -/// -/// Sandboxes authenticate to the gateway via the mTLS client certificate -/// configured by `connect_channel`. They do not present an OIDC Bearer -/// token; the gateway recognises sandbox-class callers by absence of a -/// Bearer header on the request. -async fn connect(endpoint: &str) -> Result> { +async fn connect(endpoint: &str) -> Result> { let channel = connect_channel(endpoint).await?; Ok(OpenShellClient::new(channel)) } /// Connect to the inference service. 
-async fn connect_inference(endpoint: &str) -> Result> { +async fn connect_inference(endpoint: &str) -> Result> { let channel = connect_channel(endpoint).await?; Ok(InferenceClient::new(channel)) } @@ -118,7 +418,7 @@ pub async fn fetch_policy(endpoint: &str, sandbox_id: &str) -> Result, + client: &mut OpenShellClient, sandbox_id: &str, ) -> Result> { let response = client @@ -142,7 +442,7 @@ async fn fetch_policy_with_client( /// Sync a locally-discovered policy using an existing client connection. async fn sync_policy_with_client( - client: &mut OpenShellClient, + client: &mut OpenShellClient, sandbox: &str, policy: &ProtoSandboxPolicy, ) -> Result<()> { @@ -236,7 +536,7 @@ pub async fn fetch_provider_environment( /// and status reporting, avoiding per-request TLS handshake overhead. #[derive(Clone)] pub struct CachedOpenShellClient { - client: OpenShellClient, + client: OpenShellClient, } /// Settings poll result returned by [`CachedOpenShellClient::poll_settings`]. @@ -266,7 +566,7 @@ impl CachedOpenShellClient { } /// Get a clone of the underlying tonic client for direct RPC calls. - pub fn raw_client(&self) -> OpenShellClient { + pub fn raw_client(&self) -> OpenShellClient { self.client.clone() } diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index e297b9262..a9ea57fe0 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -7,6 +7,7 @@ pub mod bypass_monitor; mod child_env; +pub mod debug_rpc; pub mod denial_aggregator; mod grpc_client; mod identity; diff --git a/crates/openshell-sandbox/src/main.rs b/crates/openshell-sandbox/src/main.rs index 4a6cb1955..3c9e21578 100644 --- a/crates/openshell-sandbox/src/main.rs +++ b/crates/openshell-sandbox/src/main.rs @@ -24,6 +24,15 @@ use openshell_sandbox::run_sandbox; /// performs the copy in pure Rust. const COPY_SELF_SUBCOMMAND: &str = "copy-self"; +/// Subcommand for one-shot debug RPCs from inside a sandbox container. 
+/// +/// Reads the same token sources as the supervisor (env, file, K8s SA +/// bootstrap) and issues a single gRPC call against the gateway. Useful +/// for end-to-end verification: e.g. `docker exec` into a sandbox, then +/// run `openshell-sandbox debug-rpc get-sandbox-config --sandbox-id ` +/// to confirm the cross-sandbox IDOR guard fires. +const DEBUG_RPC_SUBCOMMAND: &str = "debug-rpc"; + /// `OpenShell` Sandbox - process isolation and monitoring. #[derive(Parser, Debug)] #[command(name = "openshell-sandbox")] @@ -150,6 +159,20 @@ fn main() -> Result<()> { return copy_self(dest); } + // Handle `debug-rpc [args]` before clap. Uses a small + // dedicated runtime so we don't pay the supervisor's full startup cost. + if raw_args.get(1).map(String::as_str) == Some(DEBUG_RPC_SUBCOMMAND) { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .into_diagnostic()?; + return runtime.block_on(async move { + let _ = rustls::crypto::ring::default_provider().install_default(); + let exit = openshell_sandbox::debug_rpc::run(&raw_args[2..]).await?; + std::process::exit(exit); + }); + } + let args = Args::parse(); // Try to open a rolling log file; fall back to stderr-only logging if it fails diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index 3d2f6d576..8c6eb77f3 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -155,6 +155,15 @@ impl ProcessHandle { .kill_on_drop(true) .env(openshell_core::sandbox_env::SANDBOX, "1"); + // Strip supervisor-only credentials from the entrypoint's inherited + // environment. The entrypoint drops to the sandbox user before + // `exec`; without this strip, anything running as the sandbox user + // (e.g. an SSH-spawned shell) could read /proc//environ + // and recover the gateway-minted JWT. Issue #1354. 
+ cmd.env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN) + .env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) + .env_remove(openshell_core::sandbox_env::K8S_SA_TOKEN_FILE); + inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { @@ -281,6 +290,15 @@ impl ProcessHandle { .kill_on_drop(true) .env(openshell_core::sandbox_env::SANDBOX, "1"); + // Strip supervisor-only credentials from the entrypoint's inherited + // environment. The entrypoint drops to the sandbox user before + // `exec`; without this strip, anything running as the sandbox user + // (e.g. an SSH-spawned shell) could read /proc//environ + // and recover the gateway-minted JWT. Issue #1354. + cmd.env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN) + .env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) + .env_remove(openshell_core::sandbox_env::K8S_SA_TOKEN_FILE); + inject_provider_env(&mut cmd, provider_env); if let Some(dir) = workdir { diff --git a/crates/openshell-sandbox/src/supervisor_session.rs b/crates/openshell-sandbox/src/supervisor_session.rs index 6485dddf0..4d7392ee3 100644 --- a/crates/openshell-sandbox/src/supervisor_session.rs +++ b/crates/openshell-sandbox/src/supervisor_session.rs @@ -28,7 +28,6 @@ use openshell_ocsf::{ use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio::sync::mpsc; use tokio_stream::StreamExt; -use tonic::transport::Channel; use tracing::{debug, warn}; use crate::grpc_client; @@ -371,7 +370,7 @@ fn handle_gateway_message( sandbox_id: &str, ssh_socket_path: &std::path::Path, netns_fd: Option, - channel: &Channel, + channel: &grpc_client::AuthedChannel, tx: &mpsc::Sender, ) { match &msg.payload { @@ -436,7 +435,7 @@ async fn handle_relay_open( relay_open: RelayOpen, ssh_socket_path: &std::path::Path, netns_fd: Option, - channel: Channel, + channel: grpc_client::AuthedChannel, tx: mpsc::Sender, ) -> Result<(), Box> { let channel_id = relay_open.channel_id.clone(); diff --git 
a/crates/openshell-server/Cargo.toml b/crates/openshell-server/Cargo.toml index 4bbfe24fc..fa19ab526 100644 --- a/crates/openshell-server/Cargo.toml +++ b/crates/openshell-server/Cargo.toml @@ -82,6 +82,8 @@ uuid = { workspace = true } hmac = "0.12" sha2 = { workspace = true } jsonwebtoken = { workspace = true } +async-trait = "0.1" +url = { workspace = true } hex = "0.4" russh = "0.57" rand = { workspace = true } diff --git a/crates/openshell-server/src/auth/authenticator.rs b/crates/openshell-server/src/auth/authenticator.rs new file mode 100644 index 000000000..ee11f8f35 --- /dev/null +++ b/crates/openshell-server/src/auth/authenticator.rs @@ -0,0 +1,277 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Pluggable authentication trait + chain dispatch. +//! +//! The gateway runs every authenticated request through an +//! [`AuthenticatorChain`] of [`Authenticator`] implementations. The chain +//! evaluates authenticators in order; the first one that recognizes the +//! caller produces the [`Principal`]. An authenticator that does not apply +//! (e.g. an OIDC authenticator seeing no Bearer header) returns `Ok(None)` +//! so the chain falls through to the next. An authenticator that *does* +//! apply but rejects the caller returns `Err(Status)`, which terminates +//! the chain — fail-closed. +//! +//! Live authenticators slotting into the chain: +//! - [`super::sandbox_jwt::SandboxJwtAuthenticator`] — gateway-minted JWTs +//! - [`super::k8s_sa::K8sServiceAccountAuthenticator`] — K8s projected SA +//! tokens (path-scoped to `IssueSandboxToken`) +//! - [`super::oidc::OidcAuthenticator`] — user OIDC Bearer tokens +//! - [`PermissiveUserAuthenticator`] — final-fallback dev-mode catch-all +//! that produces a synthetic user principal when no OIDC is +//! configured. Preserves the pre-PR-1 "no OIDC = open" posture for +//! singleplayer / helm-dev deployments. 
+ +use super::identity::{Identity, IdentityProvider}; +use super::principal::{Principal, UserPrincipal}; +use async_trait::async_trait; +use std::sync::Arc; +use tonic::Status; + +/// Pluggable authentication step. +/// +/// Implementations are expected to be cheap to clone (they live behind +/// `Arc` inside an [`AuthenticatorChain`]). +#[async_trait] +pub trait Authenticator: Send + Sync + 'static { + /// Inspect an inbound request and return the authenticated principal. + /// + /// - `Ok(Some(principal))` — this authenticator recognized the caller. + /// The chain stops and the principal is inserted into request + /// extensions. + /// - `Ok(None)` — this authenticator does not apply (e.g. no Bearer + /// token for an OIDC authenticator). The chain falls through to + /// the next authenticator. + /// - `Err(status)` — this authenticator applies but rejected the + /// caller. The chain terminates and the status is returned to the + /// client. Fail-closed. + async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status>; +} + +/// First-match-wins authenticator chain. +/// +/// The chain owns its authenticators behind `Arc` so the entire chain is +/// cheap to clone — required because `tower::Service::call` clones the +/// router on every request. +#[derive(Clone)] +pub struct AuthenticatorChain { + authenticators: Arc<[Arc]>, +} + +impl AuthenticatorChain { + /// Build a chain from an ordered list of authenticators. Earlier + /// entries are evaluated first. + pub fn new(authenticators: Vec>) -> Self { + Self { + authenticators: Arc::from(authenticators), + } + } + + /// Run the chain. Returns the first principal produced. If every + /// authenticator returns `Ok(None)`, the result is `Ok(None)` — the + /// router translates that to `unauthenticated`. 
+ pub async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + for authenticator in self.authenticators.iter() { + if let Some(principal) = authenticator.authenticate(headers, path).await? { + return Ok(Some(principal)); + } + } + Ok(None) + } +} + +impl std::fmt::Debug for AuthenticatorChain { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AuthenticatorChain") + .field("len", &self.authenticators.len()) + .finish() + } +} + +/// Final-fallback authenticator that produces a synthetic user principal +/// for any request the earlier authenticators didn't claim. Used only +/// when no user-side authentication is configured (no OIDC, no fronting +/// proxy contract) — the pre-PR-1 gateway accepted such requests with +/// no auth at all; this preserves that posture in a principal-aware +/// way so handlers always see *some* principal in extensions. +/// +/// Producing a User principal (rather than Anonymous) means dev-mode +/// requests pass the per-handler IDOR guard via the User-bypass +/// branch — equivalent to "RBAC was the user's gate" with the dev +/// default of "every caller is a user." +pub struct PermissiveUserAuthenticator { + subject: String, +} + +impl PermissiveUserAuthenticator { + pub fn new(subject: impl Into) -> Self { + Self { + subject: subject.into(), + } + } +} + +#[async_trait] +impl Authenticator for PermissiveUserAuthenticator { + async fn authenticate( + &self, + _headers: &http::HeaderMap, + _path: &str, + ) -> Result, Status> { + Ok(Some(Principal::User(UserPrincipal { + identity: Identity { + subject: self.subject.clone(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Internal, + }, + }))) + } +} + +#[cfg(test)] +pub mod test_support { + use super::*; + use std::sync::Mutex; + + /// Authenticator that always returns the configured outcome. 
Used by + /// tests to inject a known principal (or rejection) without running real + /// crypto. Each call records the path it was invoked with so tests can + /// assert chain ordering. + pub struct MockAuthenticator { + pub outcome: Result, Status>, + pub calls: Mutex>, + } + + impl MockAuthenticator { + pub fn returning(outcome: Result, Status>) -> Self { + Self { + outcome, + calls: Mutex::new(Vec::new()), + } + } + + pub fn call_count(&self) -> usize { + self.calls.lock().unwrap().len() + } + } + + #[async_trait] + impl Authenticator for MockAuthenticator { + async fn authenticate( + &self, + _headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + self.calls.lock().unwrap().push(path.to_string()); + self.outcome.clone() + } + } +} + +#[cfg(test)] +mod tests { + use super::test_support::MockAuthenticator; + use super::*; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::UserPrincipal; + + fn user_principal(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + #[tokio::test] + async fn chain_returns_first_match() { + let first = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "alice", + ))))); + let second = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap() + .expect("expected a principal"); + match result { + Principal::User(u) => assert_eq!(u.identity.subject, "alice"), + _ => panic!("expected user principal"), + } + assert_eq!(first.call_count(), 1); + assert_eq!( + second.call_count(), + 0, + "second authenticator must be skipped after first matches" + ); + } + + #[tokio::test] + async fn chain_falls_through_on_none() { 
+ let first = Arc::new(MockAuthenticator::returning(Ok(None))); + let second = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap() + .expect("expected a principal"); + match result { + Principal::User(u) => assert_eq!(u.identity.subject, "bob"), + _ => panic!("expected user principal"), + } + assert_eq!(first.call_count(), 1); + assert_eq!(second.call_count(), 1); + } + + #[tokio::test] + async fn chain_fails_closed_on_first_error() { + let first = Arc::new(MockAuthenticator::returning(Err(Status::unauthenticated( + "bad token", + )))); + let second = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "bob", + ))))); + let chain = AuthenticatorChain::new(vec![first.clone(), second.clone()]); + let err = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .expect_err("must short-circuit on error"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + assert_eq!(first.call_count(), 1); + assert_eq!( + second.call_count(), + 0, + "must not consult later authenticators after an error" + ); + } + + #[tokio::test] + async fn empty_chain_returns_none() { + let chain = AuthenticatorChain::new(vec![]); + let result = chain + .authenticate(&http::HeaderMap::new(), "/some/path") + .await + .unwrap(); + assert!(result.is_none()); + } +} diff --git a/crates/openshell-server/src/auth/guard.rs b/crates/openshell-server/src/auth/guard.rs new file mode 100644 index 000000000..f5cdb8131 --- /dev/null +++ b/crates/openshell-server/src/auth/guard.rs @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Per-handler sandbox-scope guards. +//! +//! Closes the IDOR half of issue #1354: a sandbox principal may only +//! 
reference its own sandbox, identified by its [`Principal::Sandbox`]'s +//! `sandbox_id`. User principals retain the broad scope the RBAC layer +//! already evaluated. + +use super::principal::Principal; +use tonic::Status; +use tracing::info; + +/// Reject a sandbox-class request whose body references a sandbox other +/// than the one the calling principal was authenticated against. +/// +/// - [`Principal::User`] passes through (RBAC has already evaluated user +/// scope at the router level). +/// - [`Principal::Sandbox`] must reference the same canonical UUID it +/// was authenticated with. +/// - [`Principal::Anonymous`] is rejected — sandbox-class methods are +/// never anonymously callable. +/// +/// `claimed_sandbox_id` is the canonical UUID the request is operating +/// on. Name-keyed handlers must resolve the name to a UUID via the +/// store before calling this guard. +#[allow(clippy::result_large_err)] +pub fn ensure_sandbox_scope(principal: &Principal, claimed_sandbox_id: &str) -> Result<(), Status> { + match principal { + Principal::User(_) => Ok(()), + Principal::Sandbox(p) => { + if p.sandbox_id == claimed_sandbox_id { + Ok(()) + } else { + info!( + principal_sandbox_id = %p.sandbox_id, + requested_sandbox_id = %claimed_sandbox_id, + "cross-sandbox access denied" + ); + Err(Status::permission_denied( + "cross-sandbox access denied: principal does not own this sandbox", + )) + } + } + Principal::Anonymous => Err(Status::unauthenticated( + "sandbox-scoped methods require an authenticated caller", + )), + } +} + +/// Convenience: read the `Principal` out of a request and apply +/// [`ensure_sandbox_scope`]. Returns the principal so callers can read it +/// further (e.g. for audit logging). 
+#[allow(clippy::result_large_err)] +pub fn enforce_sandbox_scope( + request: &tonic::Request, + claimed_sandbox_id: &str, +) -> Result { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + ensure_sandbox_scope(&principal, claimed_sandbox_id)?; + Ok(principal) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{SandboxIdentitySource, SandboxPrincipal, UserPrincipal}; + + fn user(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + fn sandbox(id: &str) -> Principal { + Principal::Sandbox(SandboxPrincipal { + sandbox_id: id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-1".to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + #[test] + fn user_principal_bypasses_equality_check() { + // RBAC was the user's gate at the router layer. 
+ assert!(ensure_sandbox_scope(&user("alice"), "any-sandbox").is_ok()); + } + + #[test] + fn sandbox_principal_matching_id_is_allowed() { + assert!(ensure_sandbox_scope(&sandbox("sbx-1"), "sbx-1").is_ok()); + } + + #[test] + fn sandbox_principal_mismatched_id_is_denied() { + let err = + ensure_sandbox_scope(&sandbox("sbx-1"), "sbx-2").expect_err("must deny cross-sandbox"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[test] + fn anonymous_principal_is_rejected() { + let err = + ensure_sandbox_scope(&Principal::Anonymous, "sbx-1").expect_err("must reject anon"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } + + #[test] + fn enforce_reads_from_request_extensions() { + let mut req = tonic::Request::new(()); + req.extensions_mut().insert(sandbox("sbx-1")); + let result = enforce_sandbox_scope(&req, "sbx-1").expect("scope OK"); + assert!(matches!(result, Principal::Sandbox(_))); + } + + #[test] + fn enforce_rejects_request_without_principal() { + let req = tonic::Request::new(()); + let err = enforce_sandbox_scope(&req, "sbx-1").expect_err("must require principal"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } +} diff --git a/crates/openshell-server/src/auth/k8s_sa.rs b/crates/openshell-server/src/auth/k8s_sa.rs new file mode 100644 index 000000000..be625c703 --- /dev/null +++ b/crates/openshell-server/src/auth/k8s_sa.rs @@ -0,0 +1,598 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Kubernetes `ServiceAccount` bootstrap authenticator. +//! +//! Path-scoped to `IssueSandboxToken`. Validates a projected SA token +//! presented by a sandbox pod, reads the pod's `openshell.io/sandbox-id` +//! annotation, and returns a [`Principal::Sandbox`] with +//! [`SandboxIdentitySource::K8sServiceAccount`]. The `IssueSandboxToken` +//! handler then mints a gateway-signed JWT for that sandbox id; subsequent +//! 
gRPC calls from the supervisor use the gateway-minted JWT validated by +//! [`super::sandbox_jwt::SandboxJwtAuthenticator`]. +//! +//! This is the only authenticator that talks to the K8s apiserver. It is +//! optional — the gateway boots without it in singleplayer deployments. + +use super::authenticator::Authenticator; +use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; +use async_trait::async_trait; +use jsonwebtoken::{Algorithm, DecodingKey, Validation, decode, decode_header}; +use k8s_openapi::api::core::v1::Pod; +use kube::api::Api; +use serde::Deserialize; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::sync::{Mutex, RwLock}; +use tonic::Status; +use tracing::{debug, info, warn}; + +/// gRPC method path that this authenticator accepts. All other paths fall +/// through (return `Ok(None)`) so a gateway-minted JWT is required there. +pub const ISSUE_SANDBOX_TOKEN_PATH: &str = "/openshell.v1.OpenShell/IssueSandboxToken"; + +/// Pod annotation that binds a sandbox pod to its UUID. Set by the +/// Kubernetes compute driver at pod-create time. The gateway treats this +/// annotation as authoritative; the K8s `Role` granted to the gateway must +/// not include `patch pods` (see plan §11.8). +pub const SANDBOX_ID_ANNOTATION: &str = "openshell.io/sandbox-id"; + +/// Resolved identity extracted from a validated SA token + pod lookup. +#[derive(Debug, Clone)] +pub struct ResolvedK8sIdentity { + pub sandbox_id: String, + pub pod_name: String, + pub pod_uid: String, +} + +/// Apiserver-facing operations the authenticator depends on. Split out so +/// tests can fake the apiserver without standing up a kube cluster. +#[async_trait] +pub trait K8sIdentityResolver: Send + Sync + 'static { + /// Validate `token` via `TokenReview` (`aud == openshell-gateway`), + /// extract the pod name/uid, then `GET` the pod and read + /// `openshell.io/sandbox-id`. 
Returns `Ok(None)` when the token is + /// well-formed but does not authenticate (e.g. wrong audience); returns + /// `Err` for transport/server errors. + async fn resolve(&self, token: &str) -> Result, Status>; +} + +/// Authenticator wrapper around a [`K8sIdentityResolver`]. +pub struct K8sServiceAccountAuthenticator { + resolver: Arc, +} + +impl std::fmt::Debug for K8sServiceAccountAuthenticator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("K8sServiceAccountAuthenticator") + .finish_non_exhaustive() + } +} + +impl K8sServiceAccountAuthenticator { + pub fn new(resolver: Arc) -> Self { + Self { resolver } + } +} + +#[async_trait] +impl Authenticator for K8sServiceAccountAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + path: &str, + ) -> Result, Status> { + // Scope: only the bootstrap RPC. Other paths fall through so the + // SandboxJwtAuthenticator (or OIDC) handles them. + if path != ISSUE_SANDBOX_TOKEN_PATH { + return Ok(None); + } + + let Some(token) = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|v| v.strip_prefix("Bearer ")) + else { + return Ok(None); + }; + + let Some(resolved) = self.resolver.resolve(token).await? else { + debug!("K8s SA token did not authenticate; falling through"); + return Ok(None); + }; + + if resolved.sandbox_id.is_empty() { + warn!( + pod = %resolved.pod_name, + "pod missing openshell.io/sandbox-id annotation; rejecting" + ); + return Err(Status::permission_denied( + "pod is not bound to a sandbox identity", + )); + } + + Ok(Some(Principal::Sandbox(SandboxPrincipal { + sandbox_id: resolved.sandbox_id, + source: SandboxIdentitySource::K8sServiceAccount { + pod_name: resolved.pod_name, + pod_uid: resolved.pod_uid, + }, + trust_domain: Some("openshell".to_string()), + }))) + } +} + +/// K8s apiserver discovery document (subset of fields used). 
+#[derive(Deserialize)] +struct ApiserverDiscovery { + issuer: String, + jwks_uri: String, +} + +/// JWKS key set returned by the apiserver's `/openid/v1/jwks` endpoint. +#[derive(Deserialize)] +struct JwkSet { + keys: Vec, +} + +#[derive(Deserialize)] +struct JwkKey { + kid: Option, + kty: String, + #[serde(default)] + n: String, + #[serde(default)] + e: String, + alg: Option, +} + +/// Claims subset extracted from a validated projected SA token. `exp`, +/// `aud`, and `serviceaccount` are validated by `jsonwebtoken` but we +/// don't read them post-decode — dead-code-allowed so the structural +/// match against the token shape stays explicit. +#[derive(Debug, Deserialize)] +#[allow(dead_code)] +struct K8sSaClaims { + /// `system:serviceaccount::` + sub: String, + iss: String, + /// The audience claim is always an array for projected SA tokens. + #[serde(default)] + aud: Vec, + exp: i64, + #[serde(rename = "kubernetes.io")] + kubernetes: K8sClaim, +} + +#[derive(Debug, Deserialize)] +#[allow(dead_code)] +struct K8sClaim { + namespace: String, + pod: K8sPodClaim, + #[serde(default)] + serviceaccount: Option, +} + +#[derive(Debug, Deserialize)] +struct K8sPodClaim { + name: String, + uid: String, +} + +#[derive(Debug, Deserialize)] +struct K8sSaClaim { + #[allow(dead_code)] + name: String, + #[allow(dead_code)] + uid: String, +} + +/// JWKS cache for the K8s apiserver's projected `ServiceAccount` token +/// issuer. Discovery + key fetch lazily on first validate; subsequent +/// validations are in-process signature checks. Refreshes on `kid` miss +/// so apiserver key rotation propagates without a restart. 
+pub struct K8sApiserverJwks { + client: kube::Client, + expected_audience: String, + state: RwLock, + refresh: Mutex<()>, +} + +#[derive(Default)] +struct JwksState { + issuer: Option, + jwks_path: Option, + keys: HashMap, +} + +impl K8sApiserverJwks { + pub fn new(client: kube::Client, expected_audience: String) -> Self { + Self { + client, + expected_audience, + state: RwLock::new(JwksState::default()), + refresh: Mutex::new(()), + } + } + + /// Validate `token`, returning the parsed claims on success. + #[allow(clippy::result_large_err)] + async fn validate(&self, token: &str) -> Result { + // Decode the header to find the kid first; we lazily load on demand. + let header = decode_header(token).map_err(|e| { + debug!(error = %e, "K8s SA JWT header decode failed"); + Status::unauthenticated("invalid token") + })?; + let kid = header + .kid + .ok_or_else(|| Status::unauthenticated("invalid token: missing kid"))?; + + let (issuer, key) = if let Some(pair) = self.cached_key(&kid).await { + pair + } else { + self.refresh_keys().await?; + self.cached_key(&kid).await.ok_or_else(|| { + debug!(kid = %kid, "K8s SA JWT kid not found in apiserver JWKS"); + Status::unauthenticated("invalid token: unknown signing key") + })? 
+ }; + + let mut validation = Validation::new(Algorithm::RS256); + validation.algorithms = vec![Algorithm::RS256]; + validation.set_issuer(&[&issuer]); + validation.set_audience(&[&self.expected_audience]); + validation.set_required_spec_claims(&["iss", "aud", "exp", "sub"]); + + let data = decode::(token, &key, &validation).map_err(|e| { + debug!(error = %e, "K8s SA JWT validation failed"); + Status::unauthenticated(format!("invalid SA token: {e}")) + })?; + Ok(data.claims) + } + + async fn cached_key(&self, kid: &str) -> Option<(String, DecodingKey)> { + let state = self.state.read().await; + let issuer = state.issuer.clone()?; + let key = state.keys.get(kid).cloned()?; + Some((issuer, key)) + } + + /// Fetch the discovery document + JWKS and replace the cached state. + /// Coalesces concurrent refreshes so the apiserver sees one fetch. + #[allow(clippy::result_large_err)] + async fn refresh_keys(&self) -> Result<(), Status> { + let _guard = self.refresh.lock().await; + info!("refreshing K8s apiserver JWKS"); + let discovery: ApiserverDiscovery = self + .request_apiserver("/.well-known/openid-configuration") + .await?; + let jwks_path = jwks_path_from_uri(&discovery.jwks_uri).ok_or_else(|| { + Status::internal(format!( + "apiserver returned unusable jwks_uri '{}'", + discovery.jwks_uri + )) + })?; + let jwks: JwkSet = self.request_apiserver(&jwks_path).await?; + let mut keys = HashMap::new(); + for key in &jwks.keys { + if key.kty != "RSA" { + continue; + } + let Some(ref kid) = key.kid else { + continue; + }; + if let Some(alg) = key.alg.as_deref() + && alg != "RS256" + { + continue; + } + match DecodingKey::from_rsa_components(&key.n, &key.e) { + Ok(dk) => { + keys.insert(kid.clone(), dk); + } + Err(e) => warn!(kid = %kid, error = %e, "skipped malformed apiserver JWK"), + } + } + info!( + count = keys.len(), + issuer = %discovery.issuer, + "loaded apiserver JWKS" + ); + let mut state = self.state.write().await; + state.issuer = Some(discovery.issuer); + 
state.jwks_path = Some(jwks_path); + state.keys = keys; + Ok(()) + } + + #[allow(clippy::result_large_err)] + async fn request_apiserver( + &self, + path: &str, + ) -> Result { + let req = http::Request::builder() + .uri(path) + .body(Vec::new()) + .map_err(|e| Status::internal(format!("apiserver request build: {e}")))?; + self.client + .request::(req) + .await + .map_err(|e| Status::internal(format!("apiserver request failed: {e}"))) + } +} + +/// Pull a path-only URI out of the `jwks_uri` field. The apiserver's +/// discovery doc returns an absolute URL (e.g. +/// `https://kubernetes.default.svc.cluster.local/openid/v1/jwks`); we +/// strip to the path so `kube::Client::request` can be reused. +fn jwks_path_from_uri(uri: &str) -> Option { + if uri.starts_with('/') { + return Some(uri.to_string()); + } + let parsed = url::Url::parse(uri).ok()?; + let mut out = parsed.path().to_string(); + if let Some(q) = parsed.query() { + out.push('?'); + out.push_str(q); + } + Some(out) +} + +/// Resolver backed by the apiserver's JWKS endpoint (for SA-token +/// signature verification) and `kube::Client` (for the per-pod +/// annotation lookup). +pub struct LiveK8sResolver { + jwks: Arc, + pods_api: Api, +} + +impl LiveK8sResolver { + pub fn new(client: kube::Client, namespace: &str, expected_audience: String) -> Self { + let pods_api: Api = Api::namespaced(client.clone(), namespace); + let jwks = Arc::new(K8sApiserverJwks::new(client, expected_audience)); + Self { jwks, pods_api } + } +} + +#[async_trait] +impl K8sIdentityResolver for LiveK8sResolver { + async fn resolve(&self, token: &str) -> Result, Status> { + let claims = match self.jwks.validate(token).await { + Ok(c) => c, + Err(status) if status.code() == tonic::Code::Unauthenticated => { + // Returning Ok(None) lets the chain fall through; the + // outer router then returns Unauthenticated to the client. 
+ return Ok(None); + } + Err(other) => return Err(other), + }; + + debug!( + sub = %claims.sub, + iss = %claims.iss, + pod_name = %claims.kubernetes.pod.name, + "validated K8s SA token" + ); + + // Look up the pod and read its sandbox-id annotation. + let pod = self + .pods_api + .get_opt(&claims.kubernetes.pod.name) + .await + .map_err(|e| { + warn!( + pod = %claims.kubernetes.pod.name, + error = %e, + "failed to fetch sandbox pod for annotation lookup" + ); + Status::internal(format!("pod GET failed: {e}")) + })?; + let Some(pod) = pod else { + warn!( + pod = %claims.kubernetes.pod.name, + "sandbox pod referenced by SA token not found in this namespace" + ); + return Err(Status::not_found("sandbox pod not found")); + }; + + // Defense-in-depth: confirm the pod UID matches the SA token's + // `kubernetes.io.pod.uid`. Prevents a replayed token from a + // recreated pod with the same name. + let actual_uid = pod.metadata.uid.as_deref().unwrap_or_default(); + if actual_uid != claims.kubernetes.pod.uid { + warn!( + pod = %claims.kubernetes.pod.name, + claimed_uid = %claims.kubernetes.pod.uid, + actual_uid = %actual_uid, + "SA token pod UID does not match live pod; rejecting" + ); + return Err(Status::permission_denied("SA token pod UID mismatch")); + } + + let sandbox_id = pod + .metadata + .annotations + .as_ref() + .and_then(|a| a.get(SANDBOX_ID_ANNOTATION)) + .cloned() + .unwrap_or_default(); + + Ok(Some(ResolvedK8sIdentity { + sandbox_id, + pod_name: claims.kubernetes.pod.name, + pod_uid: claims.kubernetes.pod.uid, + })) + } +} + +#[cfg(test)] +pub mod test_support { + use super::*; + use std::sync::Mutex; + + /// Fake resolver for unit tests. Returns the configured outcome on + /// every call and records the tokens it observed. 
+ pub struct FakeResolver { + pub outcome: Result, Status>, + pub seen_tokens: Mutex>, + } + + impl FakeResolver { + pub fn returning(outcome: Result, Status>) -> Self { + Self { + outcome, + seen_tokens: Mutex::new(Vec::new()), + } + } + } + + #[async_trait] + impl K8sIdentityResolver for FakeResolver { + async fn resolve(&self, token: &str) -> Result, Status> { + self.seen_tokens.lock().unwrap().push(token.to_string()); + match &self.outcome { + Ok(opt) => Ok(opt.clone()), + Err(s) => Err(Status::new(s.code(), s.message())), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::test_support::FakeResolver; + use super::*; + + fn bearer_headers(token: &str) -> http::HeaderMap { + let mut h = http::HeaderMap::new(); + h.insert( + "authorization", + http::HeaderValue::from_str(&format!("Bearer {token}")).unwrap(), + ); + h + } + + #[test] + fn jwks_path_extracts_absolute_url() { + let path = + jwks_path_from_uri("https://kubernetes.default.svc.cluster.local/openid/v1/jwks") + .expect("apiserver-style URL must parse"); + assert_eq!(path, "/openid/v1/jwks"); + } + + #[test] + fn jwks_path_preserves_relative_path() { + let path = jwks_path_from_uri("/openid/v1/jwks").expect("relative path must round-trip"); + assert_eq!(path, "/openid/v1/jwks"); + } + + #[test] + fn jwks_path_preserves_query_string() { + let path = jwks_path_from_uri("https://apiserver/openid/v1/jwks?version=v1") + .expect("query strings must be preserved"); + assert_eq!(path, "/openid/v1/jwks?version=v1"); + } + + #[test] + fn jwks_path_rejects_garbage() { + assert!(jwks_path_from_uri("not a url").is_none()); + } + + #[tokio::test] + async fn authenticates_on_issue_path_only() { + let resolved = ResolvedK8sIdentity { + sandbox_id: "sandbox-a".to_string(), + pod_name: "openshell-sandbox-a".to_string(), + pod_uid: "uid-a".to_string(), + }; + let fake = Arc::new(FakeResolver::returning(Ok(Some(resolved)))); + let auth = K8sServiceAccountAuthenticator::new(fake.clone()); + + let on_issue = auth + 
.authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + .unwrap() + .expect("expected principal"); + match on_issue { + Principal::Sandbox(p) => { + assert_eq!(p.sandbox_id, "sandbox-a"); + assert!(matches!( + p.source, + SandboxIdentitySource::K8sServiceAccount { .. } + )); + } + _ => panic!("expected sandbox principal"), + } + + let off_issue = auth + .authenticate( + &bearer_headers("sa-jwt"), + "/openshell.v1.OpenShell/GetSandboxConfig", + ) + .await + .unwrap(); + assert!( + off_issue.is_none(), + "K8s SA authenticator must be scoped to IssueSandboxToken" + ); + assert_eq!( + fake.seen_tokens.lock().unwrap().len(), + 1, + "off-path call must not consult the apiserver" + ); + } + + #[tokio::test] + async fn missing_bearer_yields_none() { + let fake = Arc::new(FakeResolver::returning(Ok(None))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let result = auth + .authenticate(&http::HeaderMap::new(), ISSUE_SANDBOX_TOKEN_PATH) + .await + .unwrap(); + assert!(result.is_none()); + } + + #[tokio::test] + async fn resolver_returning_none_falls_through() { + let fake = Arc::new(FakeResolver::returning(Ok(None))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let result = auth + .authenticate( + &bearer_headers("not-a-real-sa-token"), + ISSUE_SANDBOX_TOKEN_PATH, + ) + .await + .unwrap(); + assert!(result.is_none(), "non-authenticating tokens fall through"); + } + + #[tokio::test] + async fn pod_without_annotation_is_rejected() { + let resolved = ResolvedK8sIdentity { + sandbox_id: String::new(), + pod_name: "stray-pod".to_string(), + pod_uid: "uid".to_string(), + }; + let fake = Arc::new(FakeResolver::returning(Ok(Some(resolved)))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let err = auth + .authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + .expect_err("unbound pod must be rejected"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[tokio::test] + async fn 
resolver_error_propagates() { + let fake = Arc::new(FakeResolver::returning(Err(Status::unavailable( + "apiserver down", + )))); + let auth = K8sServiceAccountAuthenticator::new(fake); + let err = auth + .authenticate(&bearer_headers("sa-jwt"), ISSUE_SANDBOX_TOKEN_PATH) + .await + .expect_err("resolver error must propagate"); + assert_eq!(err.code(), tonic::Code::Unavailable); + } +} diff --git a/crates/openshell-server/src/auth/mod.rs b/crates/openshell-server/src/auth/mod.rs index 8e4f332d8..d4c6978af 100644 --- a/crates/openshell-server/src/auth/mod.rs +++ b/crates/openshell-server/src/auth/mod.rs @@ -8,9 +8,15 @@ //! - `identity`: Provider-agnostic identity representation //! - `http`: HTTP endpoints for auth discovery and token exchange +pub mod authenticator; pub mod authz; +pub mod guard; mod http; pub mod identity; +pub mod k8s_sa; pub mod oidc; +pub mod principal; +pub mod revocation; +pub mod sandbox_jwt; pub use http::router; diff --git a/crates/openshell-server/src/auth/oidc.rs b/crates/openshell-server/src/auth/oidc.rs index 92298579e..6c1339e4f 100644 --- a/crates/openshell-server/src/auth/oidc.rs +++ b/crates/openshell-server/src/auth/oidc.rs @@ -10,7 +10,10 @@ //! This module owns authentication (verifying who the caller is). //! Authorization (deciding what the caller can do) is in `authz.rs`. +use super::authenticator::Authenticator; use super::identity::{Identity, IdentityProvider}; +use super::principal::{Principal, UserPrincipal}; +use async_trait::async_trait; use jsonwebtoken::{Algorithm, DecodingKey, Validation, decode, decode_header}; use openshell_core::OidcConfig; use reqwest::Client; @@ -22,15 +25,6 @@ use tokio::sync::RwLock; use tonic::Status; use tracing::{debug, info, warn}; -/// Internal metadata header set by the auth middleware to mark a request as -/// originating from a sandbox. This is stripped from all incoming requests -/// first so external callers cannot spoof it. 
-pub const INTERNAL_AUTH_SOURCE_HEADER: &str = "x-openshell-auth-source"; -/// Internal auth-source marker for requests originating from a sandbox -/// (no OIDC Bearer; trust derives from the mTLS channel or operator's -/// fronting proxy). -pub const AUTH_SOURCE_SANDBOX: &str = "sandbox"; - /// Truly unauthenticated methods — health probes and infrastructure. const UNAUTHENTICATED_METHODS: &[&str] = &[ "/openshell.v1.OpenShell/Health", @@ -40,40 +34,6 @@ const UNAUTHENTICATED_METHODS: &[&str] = &[ /// Path prefixes that bypass OIDC validation (gRPC reflection, health probes). const UNAUTHENTICATED_PREFIXES: &[&str] = &["/grpc.reflection.", "/grpc.health."]; -/// Sandbox-to-server RPCs that are called by sandboxes instead of CLI -/// users. These do not require an OIDC Bearer token; the gRPC channel's -/// mTLS handshake (or the operator's fronting proxy when -/// `--disable-gateway-auth` is set) is the trust boundary. -const SANDBOX_METHODS: &[&str] = &[ - "/openshell.v1.OpenShell/ReportPolicyStatus", - "/openshell.v1.OpenShell/PushSandboxLogs", - "/openshell.v1.OpenShell/GetSandboxProviderEnvironment", - "/openshell.v1.OpenShell/SubmitPolicyAnalysis", - "/openshell.sandbox.v1.SandboxService/GetSandboxConfig", - "/openshell.inference.v1.Inference/GetInferenceBundle", -]; - -/// Methods that accept either an OIDC Bearer token (CLI users, full scope) -/// or no Bearer (sandbox supervisor, sandbox-restricted scope). -/// `UpdateConfig` is called by both CLI (policy/settings mutations) and the -/// sandbox supervisor (policy sync on startup). -/// `OpenShell/GetSandboxConfig` serves CLI settings reads while remaining -/// compatible with sandbox callers. -/// `GetDraftPolicy` serves CLI reviewer surfaces (`openshell rule get`, -/// TUI inbox) AND the sandbox-side `policy.local /wait` long-poll that -/// blocks on the agent's proposal until the developer decides. 
-const DUAL_AUTH_METHODS: &[&str] = &[ - "/openshell.v1.OpenShell/UpdateConfig", - "/openshell.v1.OpenShell/GetSandboxConfig", - "/openshell.v1.OpenShell/GetDraftPolicy", -]; - -/// Returns `true` if the method accepts either an OIDC Bearer token or a -/// sandbox-class caller (no Bearer). -pub fn is_dual_auth_method(path: &str) -> bool { - DUAL_AUTH_METHODS.contains(&path) -} - /// Returns `true` if the method needs no authentication at all. pub fn is_unauthenticated_method(path: &str) -> bool { UNAUTHENTICATED_METHODS.contains(&path) @@ -82,34 +42,6 @@ pub fn is_unauthenticated_method(path: &str) -> bool { .any(|prefix| path.starts_with(prefix)) } -/// Returns `true` if the method is an exclusively sandbox-class call (does -/// not accept OIDC Bearer). -pub fn is_sandbox_method(path: &str) -> bool { - SANDBOX_METHODS.contains(&path) -} - -/// Remove internal auth-source markers from the request before any auth -/// decision is made so external callers cannot spoof them. -pub fn clear_internal_auth_markers(headers: &mut http::HeaderMap) { - headers.remove(INTERNAL_AUTH_SOURCE_HEADER); -} - -/// Mark the request as originating from a sandbox caller. -pub fn mark_sandbox_caller(headers: &mut http::HeaderMap) { - headers.insert( - INTERNAL_AUTH_SOURCE_HEADER, - http::HeaderValue::from_static(AUTH_SOURCE_SANDBOX), - ); -} - -/// Returns `true` if the request metadata indicates a sandbox caller. -pub fn is_sandbox_caller(metadata: &tonic::metadata::MetadataMap) -> bool { - metadata - .get(INTERNAL_AUTH_SOURCE_HEADER) - .and_then(|v| v.to_str().ok()) - == Some(AUTH_SOURCE_SANDBOX) -} - /// Cached JWKS key set fetched from the OIDC issuer. /// /// A `refresh_mutex` ensures that only one refresh runs at a time, @@ -419,6 +351,42 @@ impl JwksCache { } } +/// Authenticator that validates `Authorization: Bearer ` headers against +/// the configured OIDC issuer. 
+/// +/// Returns `Ok(None)` when no Bearer header is present, so the chain can fall +/// through to other authenticators (e.g. the gateway-minted sandbox JWT +/// authenticator added in PR 2). +pub struct OidcAuthenticator { + cache: Arc, +} + +impl OidcAuthenticator { + pub fn new(cache: Arc) -> Self { + Self { cache } + } +} + +#[async_trait] +impl Authenticator for OidcAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + _path: &str, + ) -> Result, Status> { + let Some(token) = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|v| v.strip_prefix("Bearer ")) + else { + return Ok(None); + }; + + let identity = self.cache.validate_token(token).await?; + Ok(Some(Principal::User(UserPrincipal { identity }))) + } +} + #[cfg(test)] mod tests { use super::*; @@ -433,7 +401,6 @@ mod tests { assert!(!is_unauthenticated_method( "/openshell.v1.OpenShell/CreateSandbox" )); - assert!(!is_sandbox_method("/openshell.v1.OpenShell/CreateSandbox")); } #[test] @@ -451,74 +418,6 @@ mod tests { assert!(is_unauthenticated_method("/grpc.health.v1.Health/Check")); } - #[test] - fn sandbox_rpcs_are_sandbox_methods() { - assert!(is_sandbox_method( - "/openshell.sandbox.v1.SandboxService/GetSandboxConfig" - )); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/GetSandboxProviderEnvironment" - )); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/ReportPolicyStatus" - )); - assert!(is_sandbox_method("/openshell.v1.OpenShell/PushSandboxLogs")); - assert!(is_sandbox_method( - "/openshell.v1.OpenShell/SubmitPolicyAnalysis" - )); - assert!(is_sandbox_method( - "/openshell.inference.v1.Inference/GetInferenceBundle" - )); - } - - #[test] - fn openshell_get_sandbox_config_is_dual_auth() { - assert!(!is_sandbox_method( - "/openshell.v1.OpenShell/GetSandboxConfig" - )); - assert!(is_dual_auth_method( - "/openshell.v1.OpenShell/GetSandboxConfig" - )); - } - - #[test] - fn openshell_get_draft_policy_is_dual_auth() { - // policy.local 
calls GetDraftPolicy from inside the sandbox - // supervisor (no Bearer, authenticated via mTLS), and the CLI/TUI - // reviewer surfaces call it with an OIDC Bearer. Sandbox-only - // would lock CLI out; Bearer-only would 401 the /wait long-poll - // in OIDC-enabled deployments. - assert!(!is_sandbox_method("/openshell.v1.OpenShell/GetDraftPolicy")); - assert!(is_dual_auth_method( - "/openshell.v1.OpenShell/GetDraftPolicy" - )); - } - - #[test] - fn sandbox_caller_marker_round_trips_through_metadata() { - let mut headers = http::HeaderMap::new(); - mark_sandbox_caller(&mut headers); - let metadata = tonic::metadata::MetadataMap::from_headers(headers); - assert!(is_sandbox_caller(&metadata)); - } - - #[test] - fn unmarked_request_is_not_sandbox_caller() { - let metadata = tonic::metadata::MetadataMap::new(); - assert!(!is_sandbox_caller(&metadata)); - } - - #[test] - fn clear_internal_markers_strips_spoofed_header() { - let mut headers = http::HeaderMap::new(); - headers.insert( - INTERNAL_AUTH_SOURCE_HEADER, - http::HeaderValue::from_static(AUTH_SOURCE_SANDBOX), - ); - clear_internal_auth_markers(&mut headers); - assert!(headers.get(INTERNAL_AUTH_SOURCE_HEADER).is_none()); - } - #[test] fn extract_roles_keycloak_path() { let json = serde_json::json!({ diff --git a/crates/openshell-server/src/auth/principal.rs b/crates/openshell-server/src/auth/principal.rs new file mode 100644 index 000000000..fac3f6099 --- /dev/null +++ b/crates/openshell-server/src/auth/principal.rs @@ -0,0 +1,82 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Authenticated caller principals. +//! +//! A `Principal` is the result of running the [`super::authenticator::Authenticator`] +//! chain on an inbound request. It generalizes over the kinds of callers the +//! gateway recognizes — human users (OIDC), sandbox supervisors (gateway-minted +//! 
JWT, future SPIFFE), and anonymous callers (truly unauthenticated methods +//! like health probes). +//! +//! Handlers read the principal from the gRPC `Request` extensions and gate +//! access accordingly. Sandbox-class handlers MUST compare +//! `Principal::Sandbox.sandbox_id` against the request body's `sandbox_id` +//! to prevent cross-sandbox access (see issue #1354). + +use super::identity::Identity; + +/// Who is calling. +/// +/// Inserted into `tonic::Request::extensions` by the auth router. Handlers +/// retrieve it via `req.extensions().get::()`. +#[derive(Debug, Clone)] +pub enum Principal { + /// Human caller authenticated via OIDC (Keycloak, Entra ID, Okta, etc.). + User(UserPrincipal), + /// Sandbox supervisor authenticated by an identity bound to a specific + /// sandbox UUID. The wrapped `sandbox_id` MUST match any sandbox referenced + /// in the request body for sandbox-class methods (PR-4 guard). + Sandbox(#[allow(dead_code)] SandboxPrincipal), + /// Truly unauthenticated caller (health probes, reflection). Sandbox-class + /// and user-class methods reject this variant. + #[allow(dead_code)] + Anonymous, +} + +/// User caller — wraps the existing provider-agnostic [`Identity`]. +#[derive(Debug, Clone)] +pub struct UserPrincipal { + /// The verified identity from the authentication provider. + pub identity: Identity, +} + +/// Sandbox caller — bound to one specific sandbox UUID. +/// +/// `sandbox_id` and `source` are consumed by the PR-4 handler guard; until +/// then they only exist in the type so the trait shape is stable across the +/// PR series. +#[derive(Debug, Clone)] +#[allow(dead_code)] +pub struct SandboxPrincipal { + /// Canonical sandbox UUID. Empty string only for the PR-1 legacy marker; + /// PR 2 onwards always populates this from a verified credential. + pub sandbox_id: String, + /// How this principal was verified — used for audit logs and to gate the + /// PR-4 IDOR check against unverified sources. 
+ pub source: SandboxIdentitySource, + /// SPIFFE trust domain. Populated when the credential is SPIFFE-shaped; + /// reserved for future per-sandbox cert / SPIRE authenticators. + pub trust_domain: Option, +} + +/// How a [`SandboxPrincipal`] was authenticated. +/// +/// Variant fields are populated by the producing authenticator and consumed +/// by audit logging + the PR-4 IDOR guard. Until PR 4 lands those readers +/// they look unused to the dead-code lint. +#[derive(Debug, Clone)] +#[allow(dead_code)] +pub enum SandboxIdentitySource { + /// Gateway-minted JWT validated against the gateway's signing key. + /// Produced by [`super::sandbox_jwt::SandboxJwtAuthenticator`]. + BootstrapJwt { issuer: String, jti: String }, + /// Per-sandbox client certificate. Reserved for the v2 channel-bound + /// identity follow-up. + BootstrapCert { fingerprint: String }, + /// SPIRE-issued SVID. Reserved for the SPIFFE/SPIRE follow-up. + SpiffeSvid { spiffe_id: String }, + /// K8s `ServiceAccount` token used to bootstrap a gateway-minted JWT + /// via `IssueSandboxToken`. Populated only on that one RPC path. + K8sServiceAccount { pod_name: String, pod_uid: String }, +} diff --git a/crates/openshell-server/src/auth/revocation.rs b/crates/openshell-server/src/auth/revocation.rs new file mode 100644 index 000000000..3cca82211 --- /dev/null +++ b/crates/openshell-server/src/auth/revocation.rs @@ -0,0 +1,100 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Sandbox-JWT revocation set. +//! +//! Tracks `jti` claims that have been explicitly revoked (sandbox deleted +//! or token refreshed). The validator consults this set on every sandbox +//! JWT validation and rejects matches as `Unauthenticated`. +//! +//! PR-2 implementation is in-memory only; a gateway restart clears the +//! set. The token TTL (24 h default) bounds the exposure window. PR 5 +//! 
(refresh RPC) introduces persistence to `Store` so revocations survive +//! restarts. + +use std::collections::HashMap; +use std::sync::RwLock; +use std::time::{SystemTime, UNIX_EPOCH}; + +/// In-memory `jti` deny-list with TTL-based pruning. +#[derive(Debug, Default)] +pub struct RevocationSet { + entries: RwLock>, +} + +impl RevocationSet { + pub fn new() -> Self { + Self::default() + } + + /// Mark `jti` as revoked until `expires_at_ms` (after which the token + /// fails `exp` validation regardless, so the entry can be dropped). + pub fn revoke(&self, jti: &str, expires_at_ms: i64) { + let mut entries = self.entries.write().expect("revocation lock poisoned"); + entries.insert(jti.to_string(), expires_at_ms); + } + + /// Returns true if `jti` is currently revoked. + pub fn is_revoked(&self, jti: &str) -> bool { + let entries = self.entries.read().expect("revocation lock poisoned"); + entries.contains_key(jti) + } + + /// Drop entries whose `exp` is in the past. Called periodically (or on + /// demand from tests) to bound memory growth. + pub fn prune_expired(&self) -> usize { + let now = now_ms(); + let mut entries = self.entries.write().expect("revocation lock poisoned"); + let before = entries.len(); + entries.retain(|_, exp| *exp > now); + before - entries.len() + } + + /// Number of currently tracked revocations. Test/diagnostic only.
+ #[cfg(test)] + pub fn len(&self) -> usize { + self.entries.read().unwrap().len() + } +} + +fn now_ms() -> i64 { + i64::try_from( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_or(0, |d| d.as_millis()), + ) + .unwrap_or(i64::MAX) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn revoked_jti_is_detected() { + let set = RevocationSet::new(); + let future = now_ms() + 60_000; + set.revoke("abc", future); + assert!(set.is_revoked("abc")); + assert!(!set.is_revoked("xyz")); + } + + #[test] + fn prune_drops_expired_entries() { + let set = RevocationSet::new(); + set.revoke("expired", now_ms() - 1_000); + set.revoke("future", now_ms() + 60_000); + let dropped = set.prune_expired(); + assert_eq!(dropped, 1); + assert!(!set.is_revoked("expired")); + assert!(set.is_revoked("future")); + } + + #[test] + fn re_revoking_overwrites_expiry() { + let set = RevocationSet::new(); + set.revoke("dup", now_ms() + 1_000); + set.revoke("dup", now_ms() + 99_000); + assert_eq!(set.len(), 1); + } +} diff --git a/crates/openshell-server/src/auth/sandbox_jwt.rs b/crates/openshell-server/src/auth/sandbox_jwt.rs new file mode 100644 index 000000000..6b1736dbe --- /dev/null +++ b/crates/openshell-server/src/auth/sandbox_jwt.rs @@ -0,0 +1,397 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Gateway-minted per-sandbox JWTs. +//! +//! The gateway signs an Ed25519 JWT for each sandbox at create time and +//! the sandbox supervisor presents it as `Authorization: Bearer <token>` on +//! every gRPC call (PR 3). This module implements both sides of the +//! gateway-controlled token: +//! - [`SandboxJwtIssuer`] mints fresh tokens (called from +//! `handle_create_sandbox` and the `IssueSandboxToken` RPC). +//! - [`SandboxJwtAuthenticator`] validates tokens on inbound requests and +//! produces a [`Principal::Sandbox`] with [`SandboxIdentitySource::BootstrapJwt`]. +//! +//!
Algorithm: `EdDSA` (Ed25519). Pinned via `Validation::algorithms` to +//! prevent algorithm-confusion attacks. + +use super::authenticator::Authenticator; +use super::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; +use super::revocation::RevocationSet; +use async_trait::async_trait; +use jsonwebtoken::{ + Algorithm, DecodingKey, EncodingKey, Header, Validation, decode, decode_header, encode, +}; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tonic::Status; +use tracing::{debug, warn}; +use uuid::Uuid; + +/// SPIFFE-shaped subject prefix. Embedded in the `sub` claim of every +/// minted token so a future migration to per-sandbox certs or SPIRE can +/// reuse the same subject namespace without breaking handler equality +/// checks. +const SPIFFE_SUBJECT_PREFIX: &str = "spiffe://openshell/sandbox/"; + +/// JWT claim set serialized in every gateway-minted sandbox token. +#[derive(Debug, Serialize, Deserialize)] +pub struct SandboxJwtClaims { + /// `spiffe://openshell/sandbox/<sandbox_id>`. SPIFFE-shaped for forward + /// compatibility with channel-bound identity (per-sandbox cert / SPIRE). + pub sub: String, + /// Gateway identity (`openshell-gateway:<gateway_id>`). Both `iss` and + /// `aud` use the same value so any future replicas of the same + /// deployment validate each other's tokens without configuration. + pub iss: String, + pub aud: String, + pub iat: i64, + pub exp: i64, + pub jti: String, + /// Canonical sandbox UUID, denormalized from `sub` for cheap parsing + /// without a SPIFFE library. + pub sandbox_id: String, +} + +/// Mints fresh sandbox JWTs.
+pub struct SandboxJwtIssuer { + encoding_key: EncodingKey, + kid: String, + issuer: String, + audience: String, + ttl: Duration, +} + +impl std::fmt::Debug for SandboxJwtIssuer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SandboxJwtIssuer") + .field("kid", &self.kid) + .field("issuer", &self.issuer) + .field("audience", &self.audience) + .field("ttl", &self.ttl) + .finish_non_exhaustive() + } +} + +/// Outcome of a successful mint — caller persists the `jti` so the same +/// token can be revoked on `DeleteSandbox` / refresh. +#[derive(Debug, Clone)] +pub struct MintedToken { + pub token: String, + pub jti: String, + pub expires_at_ms: i64, +} + +impl SandboxJwtIssuer { + pub fn from_pem( + signing_key_pem: &[u8], + kid: String, + gateway_id: &str, + ttl: Duration, + ) -> Result { + let encoding_key = EncodingKey::from_ed_pem(signing_key_pem) + .map_err(|e| format!("failed to parse Ed25519 signing key PEM: {e}"))?; + let identity = format!("openshell-gateway:{gateway_id}"); + Ok(Self { + encoding_key, + kid, + issuer: identity.clone(), + audience: identity, + ttl, + }) + } + + /// Mint a fresh token for `sandbox_id`. The caller MUST track the + /// returned `jti` (and `expires_at_ms`) so the token can be handed to + /// `RevocationSet::revoke` on `DeleteSandbox` or token refresh.
+ #[allow(clippy::result_large_err)] // `tonic::Status` is the natural error here + pub fn mint(&self, sandbox_id: &str) -> Result { + let now = now_secs(); + let exp = now + i64::try_from(self.ttl.as_secs()).unwrap_or(86_400); + let jti = Uuid::new_v4().to_string(); + let claims = SandboxJwtClaims { + sub: format!("{SPIFFE_SUBJECT_PREFIX}{sandbox_id}"), + iss: self.issuer.clone(), + aud: self.audience.clone(), + iat: now, + exp, + jti: jti.clone(), + sandbox_id: sandbox_id.to_string(), + }; + let mut header = Header::new(Algorithm::EdDSA); + header.kid = Some(self.kid.clone()); + let token = encode(&header, &claims, &self.encoding_key).map_err(|e| { + warn!(error = %e, "failed to mint sandbox JWT"); + Status::internal("failed to mint sandbox token") + })?; + Ok(MintedToken { + token, + jti, + expires_at_ms: exp.saturating_mul(1000), + }) + } + + pub fn ttl(&self) -> Duration { + self.ttl + } +} + +/// Authenticator that validates gateway-minted sandbox JWTs. +pub struct SandboxJwtAuthenticator { + decoding_key: DecodingKey, + kid: String, + issuer: String, + audience: String, + revocation: Arc, +} + +impl std::fmt::Debug for SandboxJwtAuthenticator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SandboxJwtAuthenticator") + .field("kid", &self.kid) + .field("issuer", &self.issuer) + .field("audience", &self.audience) + .finish_non_exhaustive() + } +} + +impl SandboxJwtAuthenticator { + pub fn from_pem( + public_key_pem: &[u8], + kid: String, + gateway_id: &str, + revocation: Arc, + ) -> Result { + let decoding_key = DecodingKey::from_ed_pem(public_key_pem) + .map_err(|e| format!("failed to parse Ed25519 public key PEM: {e}"))?; + let identity = format!("openshell-gateway:{gateway_id}"); + Ok(Self { + decoding_key, + kid, + issuer: identity.clone(), + audience: identity, + revocation, + }) + } + + #[allow(clippy::result_large_err)] + fn validate_bearer(&self, token: &str) -> Result, Status> { + let header = 
decode_header(token).map_err(|e| { + debug!(error = %e, "sandbox JWT header decode failed"); + Status::unauthenticated("invalid token") + })?; + + // Fall through to other authenticators when the kid does not match — + // OIDC issuers may share the Bearer slot. + if header.kid.as_deref() != Some(self.kid.as_str()) { + return Ok(None); + } + if !matches!(header.alg, Algorithm::EdDSA) { + return Ok(None); + } + + let mut validation = Validation::new(Algorithm::EdDSA); + validation.algorithms = vec![Algorithm::EdDSA]; + validation.set_issuer(&[&self.issuer]); + validation.set_audience(&[&self.audience]); + validation.set_required_spec_claims(&["iss", "aud", "exp", "sub"]); + + let data = + decode::(token, &self.decoding_key, &validation).map_err(|e| { + debug!(error = %e, "sandbox JWT validation failed"); + Status::unauthenticated(format!("invalid token: {e}")) + })?; + + let claims = data.claims; + if self.revocation.is_revoked(&claims.jti) { + debug!(jti = %claims.jti, "sandbox JWT rejected: jti revoked"); + return Err(Status::unauthenticated("revoked token")); + } + + Ok(Some(Principal::Sandbox(SandboxPrincipal { + sandbox_id: claims.sandbox_id, + source: SandboxIdentitySource::BootstrapJwt { + issuer: claims.iss, + jti: claims.jti, + }, + trust_domain: Some("openshell".to_string()), + }))) + } +} + +#[async_trait] +impl Authenticator for SandboxJwtAuthenticator { + async fn authenticate( + &self, + headers: &http::HeaderMap, + _path: &str, + ) -> Result, Status> { + let Some(token) = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|v| v.strip_prefix("Bearer ")) + else { + return Ok(None); + }; + self.validate_bearer(token) + } +} + +fn now_secs() -> i64 { + i64::try_from( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_or(0, |d| d.as_secs()), + ) + .unwrap_or(i64::MAX) +} + +#[cfg(test)] +mod tests { + use super::*; + use openshell_bootstrap::jwt::generate_jwt_key; + + fn header_map_with_bearer(token: &str) -> http::HeaderMap 
{ + let mut h = http::HeaderMap::new(); + h.insert( + "authorization", + http::HeaderValue::from_str(&format!("Bearer {token}")).unwrap(), + ); + h + } + + fn pair() -> ( + SandboxJwtIssuer, + SandboxJwtAuthenticator, + Arc, + ) { + let mat = generate_jwt_key().expect("jwt key"); + let revocation = Arc::new(RevocationSet::new()); + let issuer = SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid.clone(), + "test-gateway", + Duration::from_secs(3600), + ) + .unwrap(); + let auth = SandboxJwtAuthenticator::from_pem( + mat.public_key_pem.as_bytes(), + mat.kid, + "test-gateway", + revocation.clone(), + ) + .unwrap(); + (issuer, auth, revocation) + } + + #[tokio::test] + async fn mint_and_validate_round_trip() { + let (issuer, auth, _) = pair(); + let minted = issuer.mint("sandbox-a").unwrap(); + let principal = auth + .authenticate(&header_map_with_bearer(&minted.token), "/anything") + .await + .unwrap() + .expect("expected principal"); + match principal { + Principal::Sandbox(p) => { + assert_eq!(p.sandbox_id, "sandbox-a"); + match p.source { + SandboxIdentitySource::BootstrapJwt { issuer: iss, jti } => { + assert_eq!(iss, "openshell-gateway:test-gateway"); + assert_eq!(jti, minted.jti); + } + other => panic!("unexpected source: {other:?}"), + } + } + _ => panic!("expected Sandbox principal"), + } + } + + #[tokio::test] + async fn revoked_jti_is_rejected() { + let (issuer, auth, revocation) = pair(); + let minted = issuer.mint("sandbox-a").unwrap(); + revocation.revoke(&minted.jti, minted.expires_at_ms); + let err = auth + .authenticate(&header_map_with_bearer(&minted.token), "/anything") + .await + .expect_err("revoked must reject"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } + + #[tokio::test] + async fn token_signed_by_other_key_is_rejected() { + let (_, auth_a, _) = pair(); + let (issuer_b, _, _) = pair(); // different keypair + let minted = issuer_b.mint("sandbox-b").unwrap(); + // The token has a different `kid` than auth_a 
expects, so the + // authenticator yields None (lets the chain fall through). That is + // the documented behavior for cross-issuer Bearer headers. + let result = auth_a + .authenticate(&header_map_with_bearer(&minted.token), "/anything") + .await + .unwrap(); + assert!(result.is_none(), "different kid must fall through"); + } + + #[tokio::test] + async fn missing_bearer_yields_none() { + let (_, auth, _) = pair(); + let result = auth + .authenticate(&http::HeaderMap::new(), "/anything") + .await + .unwrap(); + assert!(result.is_none()); + } + + #[tokio::test] + async fn malformed_token_is_rejected() { + let (_, auth, _) = pair(); + let err = auth + .authenticate(&header_map_with_bearer("not.a.jwt"), "/anything") + .await + .expect_err("malformed must reject"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } + + #[tokio::test] + async fn expired_token_is_rejected() { + // Mint a token whose iat is far in the past so its TTL window is + // already closed by `now`. We sign the JWT directly with the same + // signing key to bypass the issuer's TTL-vs-now coupling. 
+ let mat = generate_jwt_key().unwrap(); + let issuer = SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid.clone(), + "g", + Duration::from_secs(3600), + ) + .unwrap(); + let auth = SandboxJwtAuthenticator::from_pem( + mat.public_key_pem.as_bytes(), + mat.kid.clone(), + "g", + Arc::new(RevocationSet::new()), + ) + .unwrap(); + let claims = SandboxJwtClaims { + sub: format!("{SPIFFE_SUBJECT_PREFIX}sandbox-c"), + iss: "openshell-gateway:g".to_string(), + aud: "openshell-gateway:g".to_string(), + iat: now_secs() - 7200, + exp: now_secs() - 3600, + jti: Uuid::new_v4().to_string(), + sandbox_id: "sandbox-c".to_string(), + }; + let mut header = Header::new(Algorithm::EdDSA); + header.kid = Some(mat.kid); + let token = encode(&header, &claims, &issuer.encoding_key).unwrap(); + let err = auth + .authenticate(&header_map_with_bearer(&token), "/anything") + .await + .expect_err("expired token must reject"); + assert_eq!(err.code(), tonic::Code::Unauthenticated); + } +} diff --git a/crates/openshell-server/src/certgen.rs b/crates/openshell-server/src/certgen.rs index f7dcc0803..9fa94e08d 100644 --- a/crates/openshell-server/src/certgen.rs +++ b/crates/openshell-server/src/certgen.rs @@ -52,6 +52,12 @@ pub struct CertgenArgs { #[arg(long, required_unless_present = "output_dir")] client_secret_name: Option, + /// Name of the sandbox-JWT signing-key Secret (`Opaque`) to create. + /// Holds `signing.pem`, `public.pem`, and `kid` keys. Mounted on the + /// gateway pod (only) so it can mint and validate per-sandbox JWTs. + #[arg(long, required_unless_present = "output_dir")] + jwt_secret_name: Option, + /// Extra Subject Alternative Name for the server certificate. Repeatable. /// Auto-detected as an IP address or DNS name. 
#[arg(long = "server-san", value_name = "SAN")] @@ -93,10 +99,10 @@ enum K8sAction { Create, } -fn decide_k8s(server_exists: bool, client_exists: bool) -> K8sAction { - match (server_exists, client_exists) { - (true, true) => K8sAction::SkipExists, - (false, false) => K8sAction::Create, +fn decide_k8s(server_exists: bool, client_exists: bool, jwt_exists: bool) -> K8sAction { + match (server_exists, client_exists, jwt_exists) { + (true, true, true) => K8sAction::SkipExists, + (false, false, false) => K8sAction::Create, _ => K8sAction::PartialState, } } @@ -114,6 +120,10 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .client_secret_name .as_deref() .ok_or_else(|| miette::miette!("--client-secret-name is required"))?; + let jwt_name = args + .jwt_secret_name + .as_deref() + .ok_or_else(|| miette::miette!("--jwt-secret-name is required"))?; let client = Client::try_default() .await @@ -133,22 +143,29 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .into_diagnostic() .wrap_err_with(|| format!("failed to read secret {client_name}"))? .is_some(); + let jwt_exists = api + .get_opt(jwt_name) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to read secret {jwt_name}"))? + .is_some(); - match decide_k8s(server_exists, client_exists) { + match decide_k8s(server_exists, client_exists, jwt_exists) { K8sAction::SkipExists => { info!( namespace = %namespace, server = %server_name, client = %client_name, + jwt = %jwt_name, "PKI secrets already exist, skipping." ); return Ok(()); } K8sAction::PartialState => { return Err(miette::miette!( - "partial PKI state in namespace {namespace}: exactly one of \ - {server_name} / {client_name} exists. Recover with: \ - kubectl delete secret -n {namespace} {server_name} {client_name}", + "partial PKI state in namespace {namespace}: only some of \ + {server_name} / {client_name} / {jwt_name} exist. 
Recover with: \ + kubectl delete secret -n {namespace} {server_name} {client_name} {jwt_name}", )); } K8sAction::Create => {} @@ -166,6 +183,12 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { &bundle.client_key_pem, &bundle.ca_cert_pem, ); + let jwt_secret = jwt_signing_secret( + jwt_name, + &bundle.jwt_signing_key_pem, + &bundle.jwt_public_key_pem, + &bundle.jwt_key_id, + ); api.create(&PostParams::default(), &server_secret) .await @@ -175,11 +198,16 @@ async fn run_kubernetes(args: &CertgenArgs, bundle: &PkiBundle) -> Result<()> { .await .into_diagnostic() .wrap_err_with(|| format!("failed to create secret {client_name}"))?; + api.create(&PostParams::default(), &jwt_secret) + .await + .into_diagnostic() + .wrap_err_with(|| format!("failed to create secret {jwt_name}"))?; info!( namespace = %namespace, server = %server_name, client = %client_name, + jwt = %jwt_name, "PKI secrets created." ); Ok(()) @@ -207,6 +235,31 @@ fn tls_secret(name: &str, crt_pem: &str, key_pem: &str, ca_pem: &str) -> Secret } } +/// Build an `Opaque` Secret carrying the gateway-minted sandbox JWT +/// signing material. Mounted only on the gateway pod — sandbox pods +/// receive a per-pod gateway-signed token, never the signing key itself. 
+fn jwt_signing_secret(name: &str, signing_pem: &str, public_pem: &str, kid: &str) -> Secret { + let mut data = BTreeMap::new(); + data.insert( + "signing.pem".to_string(), + ByteString(signing_pem.as_bytes().to_vec()), + ); + data.insert( + "public.pem".to_string(), + ByteString(public_pem.as_bytes().to_vec()), + ); + data.insert("kid".to_string(), ByteString(kid.as_bytes().to_vec())); + Secret { + metadata: ObjectMeta { + name: Some(name.to_string()), + ..Default::default() + }, + type_: Some("Opaque".to_string()), + data: Some(data), + ..Default::default() + } +} + // ─────────────────────────────── Local mode ─────────────────────────────── #[derive(Debug, PartialEq, Eq)] @@ -235,12 +288,17 @@ struct LocalPaths { client_dir: PathBuf, client_crt: PathBuf, client_key: PathBuf, + jwt_dir: PathBuf, + jwt_signing: PathBuf, + jwt_public: PathBuf, + jwt_kid: PathBuf, } impl LocalPaths { fn resolve(dir: &Path) -> Self { let server_dir = dir.join("server"); let client_dir = dir.join("client"); + let jwt_dir = dir.join("jwt"); Self { ca_crt: dir.join("ca.crt"), ca_key: dir.join("ca.key"), @@ -250,10 +308,14 @@ impl LocalPaths { client_crt: client_dir.join("tls.crt"), client_key: client_dir.join("tls.key"), client_dir, + jwt_signing: jwt_dir.join("signing.pem"), + jwt_public: jwt_dir.join("public.pem"), + jwt_kid: jwt_dir.join("kid"), + jwt_dir, } } - fn all_files(&self) -> [&Path; 6] { + fn all_files(&self) -> [&Path; 9] { [ &self.ca_crt, &self.ca_key, @@ -261,6 +323,9 @@ impl LocalPaths { &self.server_key, &self.client_crt, &self.client_key, + &self.jwt_signing, + &self.jwt_public, + &self.jwt_kid, ] } @@ -271,7 +336,7 @@ impl LocalPaths { fn decide_local(present: usize) -> LocalAction { match present { - 6 => LocalAction::Skip, + 9 => LocalAction::Skip, 0 => LocalAction::Create, _ => LocalAction::PartialState, } @@ -318,6 +383,9 @@ fn read_local_bundle(paths: &LocalPaths) -> Result { server_key_pem: read_pem(&paths.server_key)?, client_cert_pem: 
read_pem(&paths.client_crt)?, client_key_pem: read_pem(&paths.client_key)?, + jwt_signing_key_pem: read_pem(&paths.jwt_signing)?, + jwt_public_key_pem: read_pem(&paths.jwt_public)?, + jwt_key_id: read_pem(&paths.jwt_kid)?.trim().to_string(), }) } @@ -339,9 +407,11 @@ fn write_local_bundle(dir: &Path, bundle: &PkiBundle, paths: &LocalPaths) -> Res let temp_server = temp.join("server"); let temp_client = temp.join("client"); + let temp_jwt = temp.join("jwt"); create_dir_restricted(&temp)?; create_dir_restricted(&temp_server)?; create_dir_restricted(&temp_client)?; + create_dir_restricted(&temp_jwt)?; write_pem(&temp.join("ca.crt"), &bundle.ca_cert_pem, false)?; write_pem(&temp.join("ca.key"), &bundle.ca_key_pem, true)?; @@ -349,19 +419,34 @@ fn write_local_bundle(dir: &Path, bundle: &PkiBundle, paths: &LocalPaths) -> Res write_pem(&temp_server.join("tls.key"), &bundle.server_key_pem, true)?; write_pem(&temp_client.join("tls.crt"), &bundle.client_cert_pem, false)?; write_pem(&temp_client.join("tls.key"), &bundle.client_key_pem, true)?; + write_pem( + &temp_jwt.join("signing.pem"), + &bundle.jwt_signing_key_pem, + true, + )?; + write_pem( + &temp_jwt.join("public.pem"), + &bundle.jwt_public_key_pem, + false, + )?; + write_pem(&temp_jwt.join("kid"), &bundle.jwt_key_id, false)?; // Final destination (might not exist yet on first run). 
create_dir_restricted(dir)?; create_dir_restricted(&paths.server_dir)?; create_dir_restricted(&paths.client_dir)?; + create_dir_restricted(&paths.jwt_dir)?; - let renames: [(PathBuf, &Path); 6] = [ + let renames: [(PathBuf, &Path); 9] = [ (temp.join("ca.crt"), paths.ca_crt.as_path()), (temp.join("ca.key"), paths.ca_key.as_path()), (temp_server.join("tls.crt"), paths.server_crt.as_path()), (temp_server.join("tls.key"), paths.server_key.as_path()), (temp_client.join("tls.crt"), paths.client_crt.as_path()), (temp_client.join("tls.key"), paths.client_key.as_path()), + (temp_jwt.join("signing.pem"), paths.jwt_signing.as_path()), + (temp_jwt.join("public.pem"), paths.jwt_public.as_path()), + (temp_jwt.join("kid"), paths.jwt_kid.as_path()), ]; for (from, to) in &renames { std::fs::rename(from, to) @@ -406,8 +491,8 @@ fn print_bundle(bundle: &PkiBundle) { #[cfg(test)] mod tests { use super::{ - K8sAction, LocalAction, LocalPaths, decide_k8s, decide_local, read_local_bundle, - sibling_temp_dir, tls_secret, write_local_bundle, + K8sAction, LocalAction, LocalPaths, decide_k8s, decide_local, jwt_signing_secret, + read_local_bundle, sibling_temp_dir, tls_secret, write_local_bundle, }; use openshell_bootstrap::pki::generate_pki; use std::path::Path; @@ -415,23 +500,32 @@ mod tests { // ── Kubernetes-mode decision ── #[test] - fn decide_k8s_skip_when_both_exist() { - assert_eq!(decide_k8s(true, true), K8sAction::SkipExists); + fn decide_k8s_skip_when_all_three_exist() { + assert_eq!(decide_k8s(true, true, true), K8sAction::SkipExists); } #[test] - fn decide_k8s_create_when_neither_exists() { - assert_eq!(decide_k8s(false, false), K8sAction::Create); + fn decide_k8s_create_when_none_exist() { + assert_eq!(decide_k8s(false, false, false), K8sAction::Create); } #[test] - fn decide_k8s_partial_when_only_server_exists() { - assert_eq!(decide_k8s(true, false), K8sAction::PartialState); - } - - #[test] - fn decide_k8s_partial_when_only_client_exists() { - assert_eq!(decide_k8s(false, 
true), K8sAction::PartialState); + fn decide_k8s_partial_for_any_mixed_state() { + let mixes = [ + (true, false, false), + (false, true, false), + (false, false, true), + (true, true, false), + (true, false, true), + (false, true, true), + ]; + for (s, c, j) in mixes { + assert_eq!( + decide_k8s(s, c, j), + K8sAction::PartialState, + "({s},{c},{j})" + ); + } } #[test] @@ -446,11 +540,23 @@ mod tests { assert_eq!(data["ca.crt"].0, b"CA-PEM"); } + #[test] + fn jwt_signing_secret_has_opaque_type_and_three_keys() { + let s = jwt_signing_secret("jwt", "SIGN", "PUB", "kid-1"); + assert_eq!(s.metadata.name.as_deref(), Some("jwt")); + assert_eq!(s.type_.as_deref(), Some("Opaque")); + let data = s.data.expect("data set"); + assert_eq!(data.len(), 3); + assert_eq!(data["signing.pem"].0, b"SIGN"); + assert_eq!(data["public.pem"].0, b"PUB"); + assert_eq!(data["kid"].0, b"kid-1"); + } + // ── Local-mode decision ── #[test] - fn decide_local_skip_when_all_six_present() { - assert_eq!(decide_local(6), LocalAction::Skip); + fn decide_local_skip_when_all_nine_present() { + assert_eq!(decide_local(9), LocalAction::Skip); } #[test] @@ -460,7 +566,7 @@ mod tests { #[test] fn decide_local_partial_for_any_count_in_between() { - for n in 1..=5 { + for n in 1..=8 { assert_eq!(decide_local(n), LocalAction::PartialState, "n = {n}"); } } diff --git a/crates/openshell-server/src/cli.rs b/crates/openshell-server/src/cli.rs index a2cfacde5..913da706d 100644 --- a/crates/openshell-server/src/cli.rs +++ b/crates/openshell-server/src/cli.rs @@ -332,6 +332,18 @@ async fn run_from_args(mut args: RunArgs, matches: ArgMatches) -> Result<()> { }); } + // PR-2 wires gateway_jwt via the config file only — there's no CLI + // flag yet because the standard deployments (helm chart + RPM init + // script) drop the keypair to a known path and pass that path through + // the TOML. A CLI shortcut can be added if a singleplayer operator + // needs to override. 
+ if let Some(jwt) = file + .as_ref() + .and_then(|f| f.openshell.gateway.gateway_jwt.clone()) + { + config.gateway_jwt = Some(jwt); + } + let vm_config = build_vm_config(file.as_ref())?; let docker_config = build_docker_config(file.as_ref())?; @@ -746,6 +758,8 @@ mod tests { "openshell-server-tls", "--client-secret-name", "openshell-client-tls", + "--jwt-secret-name", + "openshell-jwt-keys", "--server-san", "openshell.example.com", "--server-san", diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index d8e823df9..427a219ce 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -421,7 +421,11 @@ impl ComputeRuntime { .map(|_| ()) } - pub async fn create_sandbox(&self, sandbox: Sandbox) -> Result { + pub async fn create_sandbox( + &self, + sandbox: Sandbox, + sandbox_token: Option, + ) -> Result { let existing = self .store .get_message_by_name::(sandbox.object_name()) @@ -440,7 +444,12 @@ impl ComputeRuntime { .await .map_err(|e| Status::internal(format!("persist sandbox failed: {e}")))?; - let driver_sandbox = driver_sandbox_from_public(&sandbox); + let mut driver_sandbox = driver_sandbox_from_public(&sandbox); + if let Some(token) = sandbox_token + && let Some(spec) = driver_sandbox.spec.as_mut() + { + spec.sandbox_token = token; + } match self .driver .create_sandbox(Request::new(CreateSandboxRequest { @@ -1131,6 +1140,7 @@ fn driver_sandbox_spec_from_public(spec: &SandboxSpec) -> DriverSandboxSpec { .map(driver_sandbox_template_from_public), gpu: spec.gpu, gpu_device: spec.gpu_device.clone(), + sandbox_token: String::new(), } } diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index 2a1320a55..eddcd2615 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -25,7 +25,7 @@ use std::net::SocketAddr; use std::path::{Path, PathBuf}; use 
openshell_core::config::ComputeDriverKind; -use openshell_core::{OidcConfig, TlsConfig}; +use openshell_core::{GatewayJwtConfig, OidcConfig, TlsConfig}; use serde::{Deserialize, Serialize}; /// Latest schema version this build understands. @@ -115,6 +115,11 @@ pub struct GatewayFileSection { pub host_gateway_ip: Option, #[serde(default)] pub enable_user_namespaces: Option, + /// Lifetime (seconds) of the projected `ServiceAccount` token kubelet + /// writes for the `IssueSandboxToken` bootstrap exchange. Driver + /// clamps to `[600, 86400]`. + #[serde(default)] + pub sa_token_ttl_secs: Option, #[serde(default)] pub guest_tls_ca: Option, #[serde(default)] @@ -133,6 +138,8 @@ pub struct GatewayFileSection { pub tls: Option, #[serde(default)] pub oidc: Option, + #[serde(default)] + pub gateway_jwt: Option, // ── Disallowed-in-file fields ──────────────────────────────────────── // @@ -247,6 +254,7 @@ fn inheritable_keys(driver: ComputeDriverKind) -> &'static [&'static str] { "client_tls_secret_name", "host_gateway_ip", "enable_user_namespaces", + "sa_token_ttl_secs", ], ComputeDriverKind::Docker => &[ "sandbox_namespace", @@ -281,6 +289,7 @@ fn gateway_inherited_value(g: &GatewayFileSection, key: &str) -> Option g.client_tls_secret_name.as_deref().map(string_value), "host_gateway_ip" => g.host_gateway_ip.as_deref().map(string_value), "enable_user_namespaces" => g.enable_user_namespaces.map(toml::Value::Boolean), + "sa_token_ttl_secs" => g.sa_token_ttl_secs.map(toml::Value::Integer), "guest_tls_ca" => g.guest_tls_ca.as_deref().map(path_value), "guest_tls_cert" => g.guest_tls_cert.as_deref().map(path_value), "guest_tls_key" => g.guest_tls_key.as_deref().map(path_value), diff --git a/crates/openshell-server/src/grpc/auth_rpc.rs b/crates/openshell-server/src/grpc/auth_rpc.rs new file mode 100644 index 000000000..2519035be --- /dev/null +++ b/crates/openshell-server/src/grpc/auth_rpc.rs @@ -0,0 +1,307 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA 
CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Authentication-related RPC handlers. +//! +//! Hosts the two sandbox-identity RPCs: +//! - `IssueSandboxToken` — bootstrap exchange (K8s SA token → gateway JWT) +//! - `RefreshSandboxToken` — rotate a still-valid gateway JWT +//! +//! Both end in a fresh gateway-signed JWT minted by +//! [`crate::auth::sandbox_jwt::SandboxJwtIssuer`]. `RefreshSandboxToken` +//! additionally revokes the previous JWT's `jti` so the old token +//! becomes unusable as soon as the new one is handed back. + +use crate::ServerState; +use crate::auth::principal::{Principal, SandboxIdentitySource}; +use openshell_core::proto::{ + IssueSandboxTokenRequest, IssueSandboxTokenResponse, RefreshSandboxTokenRequest, + RefreshSandboxTokenResponse, +}; +use std::sync::Arc; +use std::time::SystemTime; +use tonic::{Request, Response, Status}; +use tracing::{debug, info, warn}; + +#[allow(clippy::result_large_err, clippy::unused_async)] +pub async fn handle_issue_sandbox_token( + state: &Arc, + request: Request, +) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + + let Principal::Sandbox(sandbox) = principal else { + return Err(Status::permission_denied( + "IssueSandboxToken requires a sandbox principal", + )); + }; + + // Only the bootstrap K8s ServiceAccount path can mint a fresh + // gateway JWT via this RPC. Sandboxes already holding a gateway JWT + // use `RefreshSandboxToken` instead, which also revokes the old jti. + if !matches!( + sandbox.source, + SandboxIdentitySource::K8sServiceAccount { .. 
} + ) { + debug!( + sandbox_id = %sandbox.sandbox_id, + "IssueSandboxToken rejected: non-bootstrap principal source" + ); + return Err(Status::permission_denied( + "this principal cannot mint a sandbox token; use RefreshSandboxToken", + )); + } + + let issuer = state.sandbox_jwt_issuer.as_ref().ok_or_else(|| { + warn!( + sandbox_id = %sandbox.sandbox_id, + "IssueSandboxToken called but sandbox JWT issuer is not configured" + ); + Status::unavailable("sandbox JWT minting is not configured on this gateway") + })?; + + let minted = issuer.mint(&sandbox.sandbox_id)?; + info!( + sandbox_id = %sandbox.sandbox_id, + jti = %minted.jti, + "issued gateway sandbox JWT" + ); + Ok(Response::new(IssueSandboxTokenResponse { + token: minted.token, + expires_at_ms: minted.expires_at_ms, + })) +} + +#[allow(clippy::result_large_err, clippy::unused_async)] +pub async fn handle_refresh_sandbox_token( + state: &Arc, + request: Request, +) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; + + let Principal::Sandbox(sandbox) = principal else { + return Err(Status::permission_denied( + "RefreshSandboxToken requires a sandbox principal", + )); + }; + + // Only callers already holding a gateway-minted JWT may refresh; the + // K8s bootstrap path must use `IssueSandboxToken`. + let SandboxIdentitySource::BootstrapJwt { jti: old_jti, .. 
} = &sandbox.source else { + debug!( + sandbox_id = %sandbox.sandbox_id, + "RefreshSandboxToken rejected: non-gateway-JWT principal source" + ); + return Err(Status::permission_denied( + "this principal cannot refresh; use IssueSandboxToken for bootstrap", + )); + }; + + let issuer = state.sandbox_jwt_issuer.as_ref().ok_or_else(|| { + warn!( + sandbox_id = %sandbox.sandbox_id, + "RefreshSandboxToken called but sandbox JWT issuer is not configured" + ); + Status::unavailable("sandbox JWT minting is not configured on this gateway") + })?; + + // Mint the new token first; only revoke the old jti after we have a + // replacement so a failure here doesn't leave the sandbox stranded. + let minted = issuer.mint(&sandbox.sandbox_id)?; + + // Best-effort revocation of the old token. The plan calls for the + // jti deny-list to live in memory in PR 2; PR 5 only needs to drop + // the old jti into it. We use the new token's expiry as a safe upper + // bound for the revocation entry — the old jti can't outlive its own + // `exp`, and on TTL pruning the entry drops out cleanly. 
+ state + .sandbox_jwt_revocation + .revoke(old_jti, minted.expires_at_ms.max(now_ms())); + info!( + sandbox_id = %sandbox.sandbox_id, + revoked_jti = %old_jti, + new_jti = %minted.jti, + "refreshed gateway sandbox JWT" + ); + + Ok(Response::new(RefreshSandboxTokenResponse { + token: minted.token, + expires_at_ms: minted.expires_at_ms, + })) +} + +fn now_ms() -> i64 { + i64::try_from( + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .map_or(0, |d| d.as_millis()), + ) + .unwrap_or(i64::MAX) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ServerState; + use crate::auth::principal::{Principal, SandboxPrincipal, UserPrincipal}; + use crate::auth::revocation::RevocationSet; + use crate::auth::sandbox_jwt::SandboxJwtIssuer; + use crate::compute::new_test_runtime; + use crate::persistence::Store; + use crate::sandbox_index::SandboxIndex; + use crate::sandbox_watch::SandboxWatchBus; + use crate::supervisor_session::SupervisorSessionRegistry; + use crate::tracing_bus::TracingLogBus; + use openshell_bootstrap::jwt::generate_jwt_key; + use openshell_core::Config; + use std::time::Duration; + + async fn state_with_issuer() -> (Arc, SandboxJwtIssuer, Arc) { + let mat = generate_jwt_key().expect("jwt key"); + let revocation = Arc::new(RevocationSet::new()); + let issuer = SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid, + "test-gateway", + Duration::from_secs(3600), + ) + .expect("issuer"); + let store = Arc::new( + Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(), + ); + let compute = new_test_runtime(store.clone()).await; + let mut state = ServerState::new( + Config::new(None).with_database_url("sqlite::memory:?cache=shared"), + store, + compute, + SandboxIndex::new(), + SandboxWatchBus::new(), + TracingLogBus::new(), + Arc::new(SupervisorSessionRegistry::new()), + None, + ); + state.sandbox_jwt_revocation = revocation.clone(); + // We don't need the authenticator for these tests; only the issuer. 
+ // The handler tests only exercise the mint+revoke path; they + // don't need the issuer to be the same instance that produced + // `issuer` above. A fresh keypair is fine. + let issuer_clone = SandboxJwtIssuer::from_pem( + generate_jwt_key().unwrap().signing_key_pem.as_bytes(), + "kid".to_string(), + "test-gateway", + Duration::from_secs(3600), + ) + .unwrap(); + state.sandbox_jwt_issuer = Some(Arc::new(issuer_clone)); + (Arc::new(state), issuer, revocation) + } + + fn sandbox_principal(sandbox_id: &str, jti: &str) -> Principal { + use crate::auth::principal::SandboxIdentitySource; + Principal::Sandbox(SandboxPrincipal { + sandbox_id: sandbox_id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test-gateway".to_string(), + jti: jti.to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + #[tokio::test] + async fn refresh_revokes_old_jti_and_returns_new_token() { + let (state, _issuer, revocation) = state_with_issuer().await; + let old_jti = "j-original"; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut() + .insert(sandbox_principal("sandbox-a", old_jti)); + let resp = handle_refresh_sandbox_token(&state, req) + .await + .expect("refresh OK") + .into_inner(); + assert!(!resp.token.is_empty()); + assert!(revocation.is_revoked(old_jti), "old jti must be revoked"); + } + + #[tokio::test] + async fn refresh_rejects_user_principal() { + use crate::auth::identity::{Identity, IdentityProvider}; + let (state, _, _) = state_with_issuer().await; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut().insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "alice".to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + })); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("user must not refresh"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); 
+ } + + #[tokio::test] + async fn refresh_rejects_k8s_sa_principal() { + // K8s SA-bootstrap principals must use IssueSandboxToken, not + // RefreshSandboxToken — the refresh path assumes a still-valid + // gateway-minted JWT exists. + use crate::auth::principal::SandboxIdentitySource; + let (state, _, _) = state_with_issuer().await; + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: "sandbox-a".to_string(), + source: SandboxIdentitySource::K8sServiceAccount { + pod_name: "pod-a".to_string(), + pod_uid: "uid-a".to_string(), + }, + trust_domain: Some("openshell".to_string()), + })); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("K8s SA principal must not refresh"); + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + + #[tokio::test] + async fn refresh_fails_when_issuer_not_configured() { + // Build a ServerState without the issuer to confirm the handler + // returns Unavailable. 
+ let store = Arc::new( + Store::connect("sqlite::memory:?cache=shared") + .await + .unwrap(), + ); + let compute = new_test_runtime(store.clone()).await; + let state = Arc::new(ServerState::new( + Config::new(None).with_database_url("sqlite::memory:?cache=shared"), + store, + compute, + SandboxIndex::new(), + SandboxWatchBus::new(), + TracingLogBus::new(), + Arc::new(SupervisorSessionRegistry::new()), + None, + )); + let mut req = Request::new(RefreshSandboxTokenRequest {}); + req.extensions_mut() + .insert(sandbox_principal("sandbox-a", "j-1")); + let err = handle_refresh_sandbox_token(&state, req) + .await + .expect_err("missing issuer must yield unavailable"); + assert_eq!(err.code(), tonic::Code::Unavailable); + } +} diff --git a/crates/openshell-server/src/grpc/mod.rs b/crates/openshell-server/src/grpc/mod.rs index 9ea8d7ece..85c47a19a 100644 --- a/crates/openshell-server/src/grpc/mod.rs +++ b/crates/openshell-server/src/grpc/mod.rs @@ -3,6 +3,7 @@ //! gRPC service implementation. 
+mod auth_rpc; pub mod policy; mod provider; mod sandbox; @@ -25,15 +26,16 @@ use openshell_core::proto::{ GetSandboxLogsResponse, GetSandboxPolicyStatusRequest, GetSandboxPolicyStatusResponse, GetSandboxProviderEnvironmentRequest, GetSandboxProviderEnvironmentResponse, GetSandboxRequest, GetServiceRequest, HealthRequest, HealthResponse, ImportProviderProfilesRequest, - ImportProviderProfilesResponse, LintProviderProfilesRequest, LintProviderProfilesResponse, - ListProviderProfilesRequest, ListProviderProfilesResponse, ListProvidersRequest, - ListProvidersResponse, ListSandboxPoliciesRequest, ListSandboxPoliciesResponse, - ListSandboxProvidersRequest, ListSandboxProvidersResponse, ListSandboxesRequest, - ListSandboxesResponse, ListServicesRequest, ListServicesResponse, ProviderProfileResponse, - ProviderResponse, PushSandboxLogsRequest, PushSandboxLogsResponse, RejectDraftChunkRequest, - RejectDraftChunkResponse, RelayFrame, ReportPolicyStatusRequest, ReportPolicyStatusResponse, - RevokeSshSessionRequest, RevokeSshSessionResponse, SandboxResponse, SandboxStreamEvent, - ServiceEndpointResponse, ServiceStatus, SubmitPolicyAnalysisRequest, + ImportProviderProfilesResponse, IssueSandboxTokenRequest, IssueSandboxTokenResponse, + LintProviderProfilesRequest, LintProviderProfilesResponse, ListProviderProfilesRequest, + ListProviderProfilesResponse, ListProvidersRequest, ListProvidersResponse, + ListSandboxPoliciesRequest, ListSandboxPoliciesResponse, ListSandboxProvidersRequest, + ListSandboxProvidersResponse, ListSandboxesRequest, ListSandboxesResponse, ListServicesRequest, + ListServicesResponse, ProviderProfileResponse, ProviderResponse, PushSandboxLogsRequest, + PushSandboxLogsResponse, RefreshSandboxTokenRequest, RefreshSandboxTokenResponse, + RejectDraftChunkRequest, RejectDraftChunkResponse, RelayFrame, ReportPolicyStatusRequest, + ReportPolicyStatusResponse, RevokeSshSessionRequest, RevokeSshSessionResponse, SandboxResponse, + SandboxStreamEvent, 
ServiceEndpointResponse, ServiceStatus, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, SupervisorMessage, TcpForwardFrame, UndoDraftChunkRequest, UndoDraftChunkResponse, UpdateConfigRequest, UpdateConfigResponse, UpdateProviderRequest, WatchSandboxRequest, open_shell_server::OpenShell, @@ -510,6 +512,22 @@ impl OpenShell for OpenShellService { policy::handle_get_draft_history(&self.state, request).await } + // --- Sandbox identity --- + + async fn issue_sandbox_token( + &self, + request: Request, + ) -> Result, Status> { + auth_rpc::handle_issue_sandbox_token(&self.state, request).await + } + + async fn refresh_sandbox_token( + &self, + request: Request, + ) -> Result, Status> { + auth_rpc::handle_refresh_sandbox_token(&self.state, request).await + } + // --- Supervisor session --- type ConnectSupervisorStream = diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs index 315b06f3c..edee228cc 100644 --- a/crates/openshell-server/src/grpc/policy.rs +++ b/crates/openshell-server/src/grpc/policy.rs @@ -10,9 +10,9 @@ #![allow(clippy::cast_precision_loss)] // f64->f32 for confidence scores #![allow(clippy::items_after_statements)] // DB_PORTS const inside function +use crate::ServerState; use crate::persistence::{DraftChunkRecord, ObjectId, ObjectName, ObjectType, PolicyRecord, Store}; use crate::policy_store::PolicyStoreExt; -use crate::{ServerState, auth::oidc}; use openshell_core::proto::policy_merge_operation; use openshell_core::proto::setting_value; use openshell_core::proto::{ @@ -314,8 +314,14 @@ fn truncate_for_log(input: &str, max_chars: usize) -> String { } } +#[cfg(test)] fn is_sandbox_caller(request: &Request) -> bool { - oidc::is_sandbox_caller(request.metadata()) + matches!( + request + .extensions() + .get::(), + Some(crate::auth::principal::Principal::Sandbox(_)) + ) } /// Sandbox-class callers may only perform sandbox-scoped policy sync. 
They @@ -352,7 +358,9 @@ pub(super) async fn handle_get_sandbox_config( state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_id = request.into_inner().sandbox_id; + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; + drop(request); let sandbox = state .store @@ -609,7 +617,9 @@ pub(super) async fn handle_get_sandbox_provider_environment( state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_id = request.into_inner().sandbox_id; + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; + drop(request); let sandbox = state .store @@ -651,10 +661,32 @@ pub(super) async fn handle_update_config( state: &Arc, request: Request, ) -> Result, Status> { - let sandbox_caller = is_sandbox_caller(&request); + let principal = request + .extensions() + .get::() + .cloned(); + let sandbox_caller = matches!( + principal, + Some(crate::auth::principal::Principal::Sandbox(_)) + ); let req = request.into_inner(); if sandbox_caller { validate_sandbox_caller_update(&req)?; + // Resolve req.name to a sandbox UUID and verify the calling + // sandbox principal owns it. User callers (CLI / TUI) bypass + // this check because RBAC was their gate. + let sandbox = state + .store + .get_message_by_name::(&req.name) + .await + .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
+ .ok_or_else(|| Status::not_found("sandbox not found"))?; + crate::auth::guard::ensure_sandbox_scope( + principal + .as_ref() + .expect("sandbox_caller implies principal"), + sandbox.object_id(), + )?; } let key = req.setting_key.trim(); let has_policy = req.policy.is_some(); @@ -1180,6 +1212,8 @@ pub(super) async fn handle_report_policy_status( state: &Arc, request: Request, ) -> Result, Status> { + let sandbox_id = request.get_ref().sandbox_id.clone(); + crate::auth::guard::enforce_sandbox_scope(&request, &sandbox_id)?; let req = request.into_inner(); if req.sandbox_id.is_empty() { return Err(Status::invalid_argument("sandbox_id is required")); @@ -1294,6 +1328,11 @@ pub(super) async fn handle_push_sandbox_logs( state: &Arc, request: Request>, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let mut stream = request.into_inner(); let mut validated = false; @@ -1307,6 +1346,10 @@ pub(super) async fn handle_push_sandbox_logs( } if !validated { + // The streaming RPC carries the sandbox_id in every frame, but + // the equality check only needs to run once on the first frame + // — the principal is stable across the stream. + crate::auth::guard::ensure_sandbox_scope(&principal, &batch.sandbox_id)?; state .store .get_message::(&batch.sandbox_id) @@ -1335,6 +1378,11 @@ pub(super) async fn handle_submit_policy_analysis( state: &Arc, request: Request, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let req = request.into_inner(); if req.name.is_empty() { return Err(Status::invalid_argument("name is required")); @@ -1347,6 +1395,9 @@ pub(super) async fn handle_submit_policy_analysis( .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? 
.ok_or_else(|| Status::not_found("sandbox not found"))?; let sandbox_id = sandbox.object_id().to_string(); + // Name → id resolved; now enforce that a sandbox principal only acts + // on its own sandbox. User principals are unaffected. + crate::auth::guard::ensure_sandbox_scope(&principal, &sandbox_id)?; let current_version = state .store @@ -1463,6 +1514,11 @@ pub(super) async fn handle_get_draft_policy( state: &Arc, request: Request, ) -> Result, Status> { + let principal = request + .extensions() + .get::() + .cloned() + .ok_or_else(|| Status::unauthenticated("missing principal"))?; let req = request.into_inner(); if req.name.is_empty() { return Err(Status::invalid_argument("name is required")); @@ -1475,6 +1531,7 @@ pub(super) async fn handle_get_draft_policy( .map_err(|e| Status::internal(format!("fetch sandbox failed: {e}")))? .ok_or_else(|| Status::not_found("sandbox not found"))?; let sandbox_id = sandbox.object_id().to_string(); + crate::auth::guard::ensure_sandbox_scope(&principal, &sandbox_id)?; let status_filter = if req.status_filter.is_empty() { None @@ -2789,6 +2846,10 @@ fn materialize_global_settings( mod tests { use super::*; use crate::ServerState; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{ + Principal, SandboxIdentitySource, SandboxPrincipal, UserPrincipal, + }; use crate::compute::new_test_runtime; use crate::persistence::Store; use crate::sandbox_index::SandboxIndex; @@ -2800,6 +2861,41 @@ mod tests { use std::sync::Arc; use tonic::Code; + /// Wrap a request with a user `Principal` so handlers' scope guards + /// (introduced in PR 4) treat the test caller as a CLI user — equivalent + /// to the pre-PR-4 behavior where all tests effectively ran as user. 
+ fn with_user(mut request: Request) -> Request { + request + .extensions_mut() + .insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "test-user".to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + })); + request + } + + /// Wrap a request with a sandbox `Principal` bound to `sandbox_id`. + /// Use for tests that exercise sandbox-caller code paths. + #[allow(dead_code)] + fn with_sandbox(mut request: Request, sandbox_id: &str) -> Request { + request + .extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: sandbox_id.to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-test".to_string(), + }, + trust_domain: Some("openshell".to_string()), + })); + request + } + #[test] fn sandbox_caller_update_validation_allows_sandbox_policy_sync() { let req = UpdateConfigRequest { @@ -2834,15 +2930,201 @@ mod tests { } #[test] - fn sandbox_caller_marker_detected_from_metadata() { + fn sandbox_caller_detected_from_principal_extension() { + use crate::auth::principal::{Principal, SandboxIdentitySource, SandboxPrincipal}; let mut req = Request::new(()); - req.metadata_mut().insert( - oidc::INTERNAL_AUTH_SOURCE_HEADER, - oidc::AUTH_SOURCE_SANDBOX.parse().unwrap(), - ); + req.extensions_mut() + .insert(Principal::Sandbox(SandboxPrincipal { + sandbox_id: "test-sandbox".to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-1".to_string(), + }, + trust_domain: None, + })); assert!(is_sandbox_caller(&req)); } + #[test] + fn user_principal_not_treated_as_sandbox_caller() { + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{Principal, UserPrincipal}; + let mut req = Request::new(()); + req.extensions_mut().insert(Principal::User(UserPrincipal { + identity: Identity { + subject: "alice".to_string(), + display_name: 
None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + })); + assert!(!is_sandbox_caller(&req)); + } + + // ---- PR-4 IDOR guard (issue #1354) ---- + + #[tokio::test] + async fn cross_sandbox_get_sandbox_config_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + // Two sandboxes; the caller is principal of A, the request body + // references B. + for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-b".to_string(), + }), + "sb-a", + ); + let err = handle_get_sandbox_config(&state, req) + .await + .expect_err("cross-sandbox call must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn same_sandbox_get_sandbox_config_allowed() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: "sb-self".to_string(), + name: "self".to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + let req = with_sandbox( + Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-self".to_string(), + }), + "sb-self", + ); + handle_get_sandbox_config(&state, req) + .await + .expect("matching principal must be 
allowed"); + } + + #[tokio::test] + async fn cross_sandbox_submit_policy_analysis_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(SubmitPolicyAnalysisRequest { + name: "sandbox-b".to_string(), + ..Default::default() + }), + "sb-a", + ); + let err = handle_submit_policy_analysis(&state, req) + .await + .expect_err("cross-sandbox submit must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn cross_sandbox_get_draft_policy_denied() { + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + for (id, name) in [("sb-a", "sandbox-a"), ("sb-b", "sandbox-b")] { + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: id.to_string(), + name: name.to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + } + let req = with_sandbox( + Request::new(GetDraftPolicyRequest { + name: "sandbox-b".to_string(), + status_filter: String::new(), + }), + "sb-a", + ); + let err = handle_get_draft_policy(&state, req) + .await + .expect_err("cross-sandbox draft read must be denied"); + assert_eq!(err.code(), Code::PermissionDenied); + } + + #[tokio::test] + async fn 
user_principal_can_read_any_sandbox_config() { + // RBAC was the user gate; the IDOR guard must NOT trip for users. + use openshell_core::proto::{SandboxPhase, SandboxSpec}; + let state = test_server_state().await; + let sandbox = Sandbox { + metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta { + id: "sb-x".to_string(), + name: "x".to_string(), + created_at_ms: 1_000_000, + labels: HashMap::new(), + }), + spec: Some(SandboxSpec { + policy: None, + ..Default::default() + }), + phase: SandboxPhase::Provisioning as i32, + ..Default::default() + }; + state.store.put_message(&sandbox).await.unwrap(); + let req = with_user(Request::new(GetSandboxConfigRequest { + sandbox_id: "sb-x".to_string(), + })); + handle_get_sandbox_config(&state, req) + .await + .expect("user principal must succeed"); + } + // ---- Sandbox without policy ---- #[tokio::test] @@ -2951,9 +3233,9 @@ mod tests { async fn get_sandbox_policy(state: &Arc, sandbox_id: &str) -> ProtoSandboxPolicy { handle_get_sandbox_config( state, - Request::new(GetSandboxConfigRequest { + with_user(Request::new(GetSandboxConfigRequest { sandbox_id: sandbox_id.to_string(), - }), + })), ) .await .unwrap() @@ -3391,9 +3673,9 @@ mod tests { let legacy_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-env".to_string(), - }), + })), ) .await .unwrap() @@ -3403,9 +3685,9 @@ mod tests { enable_providers_v2(&state).await; let v2_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-env".to_string(), - }), + })), ) .await .unwrap() @@ -3437,9 +3719,9 @@ mod tests { let first = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + 
with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-revision".to_string(), - }), + })), ) .await .unwrap() @@ -3453,9 +3735,9 @@ mod tests { let second = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-provider-revision".to_string(), - }), + })), ) .await .unwrap() @@ -3507,9 +3789,9 @@ mod tests { ); let baseline_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3517,10 +3799,10 @@ mod tests { handle_attach_sandbox_provider( &state, - Request::new(AttachSandboxProviderRequest { + with_user(Request::new(AttachSandboxProviderRequest { sandbox_name: "attach-lifecycle".to_string(), provider_name: "work-github".to_string(), - }), + })), ) .await .unwrap(); @@ -3534,9 +3816,9 @@ mod tests { let attached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3569,9 +3851,9 @@ mod tests { let detached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3665,9 +3947,9 @@ mod tests { ); let baseline_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3675,10 +3957,10 @@ mod tests { 
handle_attach_sandbox_provider( &state, - Request::new(AttachSandboxProviderRequest { + with_user(Request::new(AttachSandboxProviderRequest { sandbox_name: "custom-attach-lifecycle".to_string(), provider_name: "work-custom".to_string(), - }), + })), ) .await .unwrap(); @@ -3695,9 +3977,9 @@ mod tests { let attached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3729,9 +4011,9 @@ mod tests { ); let detached_env = handle_get_sandbox_provider_environment( &state, - Request::new(GetSandboxProviderEnvironmentRequest { + with_user(Request::new(GetSandboxProviderEnvironmentRequest { sandbox_id: "sb-custom-attach-lifecycle".to_string(), - }), + })), ) .await .unwrap() @@ -3827,9 +4109,9 @@ mod tests { let response = handle_get_sandbox_config( &state, - Request::new(GetSandboxConfigRequest { + with_user(Request::new(GetSandboxConfigRequest { sandbox_id: "sb-global-profile".to_string(), - }), + })), ) .await .unwrap() @@ -3972,7 +4254,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -3986,7 +4268,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -3998,10 +4280,10 @@ mod tests { let draft_policy = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4068,10 +4350,10 @@ mod tests { let draft_policy_after_undo = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: 
sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4120,10 +4402,10 @@ mod tests { let draft_policy_after_clear = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4181,7 +4463,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4190,7 +4472,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4211,10 +4493,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4289,7 +4571,7 @@ mod tests { async move { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name, analysis_mode: "agent_authored".to_string(), proposed_chunks: vec![PolicyChunk { @@ -4298,7 +4580,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4318,10 +4600,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name.clone(), status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4395,7 +4677,7 @@ mod tests { async move { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name, analysis_mode: "mechanistic".to_string(), proposed_chunks: vec![PolicyChunk { @@ -4404,7 +4686,7 @@ mod tests { 
..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4418,10 +4700,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4490,7 +4772,7 @@ mod tests { let submit = handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_name.clone(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4498,7 +4780,7 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap() @@ -4538,10 +4820,10 @@ mod tests { let draft = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_name, status_filter: String::new(), - }), + })), ) .await .unwrap() @@ -4610,7 +4892,7 @@ mod tests { handle_submit_policy_analysis( &state, - Request::new(SubmitPolicyAnalysisRequest { + with_user(Request::new(SubmitPolicyAnalysisRequest { name: sandbox_a.object_name().to_string(), proposed_chunks: vec![PolicyChunk { rule_name: "allow_example".to_string(), @@ -4624,17 +4906,17 @@ mod tests { ..Default::default() }], ..Default::default() - }), + })), ) .await .unwrap(); let draft_policy = handle_get_draft_policy( &state, - Request::new(GetDraftPolicyRequest { + with_user(Request::new(GetDraftPolicyRequest { name: sandbox_a.object_name().to_string(), status_filter: String::new(), - }), + })), ) .await .unwrap() diff --git a/crates/openshell-server/src/grpc/sandbox.rs b/crates/openshell-server/src/grpc/sandbox.rs index 5c523b10e..a66a48202 100644 --- a/crates/openshell-server/src/grpc/sandbox.rs +++ b/crates/openshell-server/src/grpc/sandbox.rs @@ -128,7 +128,28 @@ pub(super) async fn handle_create_sandbox( status })?; - let sandbox = 
state.compute.create_sandbox(sandbox).await?; + // Mint the gateway JWT for singleplayer drivers. K8s sandboxes skip + // this mint and bootstrap via `IssueSandboxToken` at supervisor + // startup; identifying "is this K8s?" lives in the compute layer, so + // we mint unconditionally here when the issuer is configured and let + // the K8s driver simply ignore the field. + let sandbox_token = state.sandbox_jwt_issuer.as_ref().map(|issuer| { + issuer.mint(&id).map(|minted| { + tracing::info!( + sandbox_id = %id, + jti = %minted.jti, + "minted sandbox JWT" + ); + minted.token + }) + }); + let sandbox_token = match sandbox_token { + Some(Ok(token)) => Some(token), + Some(Err(status)) => return Err(status), + None => None, + }; + + let sandbox = state.compute.create_sandbox(sandbox, sandbox_token).await?; info!( sandbox_id = %id, diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index 50d1e8df3..c219fc5ce 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -57,8 +57,25 @@ impl ObjectType for InferenceRoute { impl Inference for InferenceService { async fn get_inference_bundle( &self, - _request: Request, + request: Request, ) -> Result, Status> { + // GetInferenceBundle is gateway-wide (no per-sandbox routes yet), + // so it has no `sandbox_id` to compare against. Just reject + // anonymous callers; both user and sandbox principals are allowed. 
+ match request + .extensions() + .get::() + { + Some( + crate::auth::principal::Principal::User(_) + | crate::auth::principal::Principal::Sandbox(_), + ) => {} + Some(crate::auth::principal::Principal::Anonymous) | None => { + return Err(Status::unauthenticated( + "GetInferenceBundle requires an authenticated caller", + )); + } + } resolve_inference_bundle(self.state.store.as_ref()) .await .map(Response::new) diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index a6e337dec..3fa5b313b 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -103,6 +103,26 @@ pub struct ServerState { /// OIDC JWKS cache for JWT validation. `None` when OIDC is not configured. pub oidc_cache: Option>, + + /// Gateway-minted sandbox JWT issuer. `None` when `config.gateway_jwt` + /// is not configured; in that mode `IssueSandboxToken` returns + /// `Status::unavailable`. Populated at startup from the on-disk key + /// material that `certgen` writes. + pub sandbox_jwt_issuer: Option>, + + /// Authenticator that validates gateway-minted sandbox JWTs on every + /// inbound request. Always set when `sandbox_jwt_issuer` is, so callers + /// presenting a freshly minted token are recognized. + pub sandbox_jwt_authenticator: Option>, + + /// Optional K8s `ServiceAccount` authenticator that backs the + /// `IssueSandboxToken` bootstrap path. Only present when the gateway + /// runs in-cluster. + pub k8s_sa_authenticator: Option>, + + /// In-memory revocation set for gateway-minted sandbox JWTs. + /// Populated by `DeleteSandbox` and (in PR 5) `RefreshSandboxToken`. 
+ pub sandbox_jwt_revocation: Arc, } fn is_benign_tls_handshake_failure(error: &std::io::Error) -> bool { @@ -147,6 +167,10 @@ impl ServerState { settings_mutex: tokio::sync::Mutex::new(()), supervisor_sessions, oidc_cache, + sandbox_jwt_issuer: None, + sandbox_jwt_authenticator: None, + k8s_sa_authenticator: None, + sandbox_jwt_revocation: Arc::new(auth::revocation::RevocationSet::new()), } } } @@ -204,7 +228,7 @@ pub async fn run_server( supervisor_sessions.clone(), ) .await?; - let state = Arc::new(ServerState::new( + let mut state = ServerState::new( config.clone(), store.clone(), compute, @@ -213,7 +237,101 @@ pub async fn run_server( tracing_log_bus, supervisor_sessions, oidc_cache, - )); + ); + + // Load the gateway-minted sandbox JWT signing key when configured. + // Optional in PR 2 so single-driver dev deployments without certgen + // continue to start. The helm-deployed gateway and the RPM init script + // populate `gateway_jwt` once `certgen` has produced the on-disk + // material. + if let Some(ref jwt) = config.gateway_jwt { + let signing_pem = std::fs::read(&jwt.signing_key_path).map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT signing key from {}: {e}", + jwt.signing_key_path.display() + )) + })?; + let public_pem = std::fs::read(&jwt.public_key_path).map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT public key from {}: {e}", + jwt.public_key_path.display() + )) + })?; + let kid = std::fs::read_to_string(&jwt.kid_path) + .map_err(|e| { + Error::config(format!( + "failed to read sandbox JWT kid from {}: {e}", + jwt.kid_path.display() + )) + })? 
+ .trim() + .to_string(); + if kid.is_empty() { + return Err(Error::config(format!( + "sandbox JWT kid file {} is empty", + jwt.kid_path.display() + ))); + } + let issuer = auth::sandbox_jwt::SandboxJwtIssuer::from_pem( + &signing_pem, + kid.clone(), + &jwt.gateway_id, + Duration::from_secs(jwt.ttl_secs), + ) + .map_err(Error::config)?; + let authenticator = auth::sandbox_jwt::SandboxJwtAuthenticator::from_pem( + &public_pem, + kid, + &jwt.gateway_id, + state.sandbox_jwt_revocation.clone(), + ) + .map_err(Error::config)?; + info!( + gateway_id = %jwt.gateway_id, + ttl_secs = jwt.ttl_secs, + "gateway-minted sandbox JWT enabled" + ); + state.sandbox_jwt_issuer = Some(Arc::new(issuer)); + state.sandbox_jwt_authenticator = Some(Arc::new(authenticator)); + } + + // K8s ServiceAccount bootstrap authenticator. Only constructed when + // the gateway is running in-cluster (kubelet provides the API host + // env var) and has a sandbox JWT issuer to mint replacements against; + // outside the cluster we can't talk to the apiserver's JWKS endpoint, + // and without the issuer there's nothing to exchange the SA token + // for. + if state.sandbox_jwt_issuer.is_some() && std::env::var_os("KUBERNETES_SERVICE_HOST").is_some() { + // Pod lookups must target the sandbox namespace (where the K8s + // driver places sandbox pods), not the gateway's own pod + // namespace. Sourced from the merged + // `[openshell.drivers.kubernetes].namespace` config, falling + // back to "default" only if the driver config can't be parsed. 
+ let sandbox_namespace = kubernetes_config_from_file(config_file.as_ref()) + .map_or_else(|_| "default".to_string(), |cfg| cfg.namespace); + match kube::Client::try_default().await { + Ok(client) => { + let resolver = Arc::new(auth::k8s_sa::LiveK8sResolver::new( + client, + &sandbox_namespace, + "openshell-gateway".to_string(), + )); + let authenticator = auth::k8s_sa::K8sServiceAccountAuthenticator::new(resolver); + state.k8s_sa_authenticator = Some(Arc::new(authenticator)); + info!( + namespace = %sandbox_namespace, + "K8s ServiceAccount bootstrap authenticator enabled" + ); + } + Err(e) => warn!( + error = %e, + "in-cluster K8s client construction failed; \ + K8s ServiceAccount bootstrap is disabled" + ), + } + } + + let state = Arc::new(state); // Resume sandboxes that were stopped during the previous gateway // shutdown so the running compute state matches the persisted store. diff --git a/crates/openshell-server/src/multiplex.rs b/crates/openshell-server/src/multiplex.rs index deac9ee78..567df2272 100644 --- a/crates/openshell-server/src/multiplex.rs +++ b/crates/openshell-server/src/multiplex.rs @@ -31,8 +31,15 @@ use tower_http::request_id::{MakeRequestId, RequestId}; use tracing::Span; use crate::{ - OpenShellService, ServerState, auth::authz::AuthzPolicy, auth::identity::Identity, auth::oidc, - http_router, inference::InferenceService, service_http_router, + OpenShellService, ServerState, + auth::authenticator::{AuthenticatorChain, PermissiveUserAuthenticator}, + auth::authz::AuthzPolicy, + auth::identity::Identity, + auth::oidc::{self, OidcAuthenticator}, + auth::principal::{Principal, UserPrincipal}, + http_router, + inference::InferenceService, + service_http_router, }; /// Request-ID generator that produces a UUID v4 for each inbound request. 
@@ -153,17 +160,11 @@ impl MultiplexService { user_role: oidc.user_role.clone(), scopes_enabled: !oidc.scopes_claim.is_empty(), }); - let has_client_ca = self - .state - .config - .tls - .as_ref() - .is_some_and(|tls| tls.client_ca_path.is_some()); - let grpc_service = AuthGrpcRouter::new( + let authenticator_chain = build_authenticator_chain(&self.state); + let grpc_service = AuthGrpcRouter::with_peer_identity( GrpcRouter::new(openshell, inference), - self.state.oidc_cache.clone(), + authenticator_chain, authz_policy, - has_client_ca, peer_identity, ); let http_service = http_router(self.state.clone()); @@ -256,50 +257,103 @@ where } } -/// gRPC router wrapper that authenticates and authorizes requests. +/// Assemble the authenticator chain for the gateway. /// -/// When `oidc_cache` is `Some`, extracts the `authorization: Bearer ` -/// header, validates the JWT (authentication), then checks RBAC roles -/// (authorization) before forwarding to the inner gRPC router. +/// Chain order (first-match-wins): +/// 1. `K8sServiceAccountAuthenticator` (path-scoped to `IssueSandboxToken`) +/// — exchanges a projected SA token for a `Principal::Sandbox` so the +/// `IssueSandboxToken` handler can mint a gateway JWT. No-op on every +/// other path; only present when the gateway runs in-cluster. +/// 2. `SandboxJwtAuthenticator` — validates gateway-minted JWTs. Recognized +/// via a distinctive `kid` so non-matching Bearer tokens fall through. +/// 3. `OidcAuthenticator` — validates user Bearer tokens against the +/// configured OIDC issuer. Returns `Unauthenticated` for missing +/// Bearer headers so non-OIDC clients can't sneak through. +/// 4. `PermissiveUserAuthenticator` — installed only when no OIDC is +/// configured (singleplayer / helm-dev). Catches anything the +/// sandbox authenticators didn't claim and produces a synthetic +/// user principal, preserving the pre-PR-1 "no OIDC = open" posture. 
/// -/// Authentication is provider-specific (currently OIDC via `oidc.rs`). -/// Authorization is provider-agnostic (via `authz.rs`). This separation -/// aligns with RFC 0001's control-plane identity design. +/// When neither OIDC nor gateway-minted JWTs are configured (a barebones +/// dev gateway), the chain is left as `None` so the router short-circuits +/// to pass-through. +fn build_authenticator_chain(state: &ServerState) -> Option { + let mut authenticators: Vec> = Vec::new(); + if let Some(k8s) = state.k8s_sa_authenticator.clone() { + authenticators.push(k8s); + } + if let Some(jwt) = state.sandbox_jwt_authenticator.clone() { + authenticators.push(jwt); + } + if let Some(cache) = state.oidc_cache.clone() { + authenticators.push(Arc::new(OidcAuthenticator::new(cache))); + } else if !authenticators.is_empty() { + // No OIDC, but sandbox-side authentication IS configured — + // user CLI calls must still pass through, so install a + // permissive final fallback. Production deployments configure + // OIDC and this branch is unused. + authenticators.push(Arc::new(PermissiveUserAuthenticator::new("dev-anonymous"))); + } + if authenticators.is_empty() { + return None; + } + Some(AuthenticatorChain::new(authenticators)) +} + +/// gRPC router wrapper that runs the [`AuthenticatorChain`] and inserts the +/// resulting [`Principal`] into the request's extensions. /// -/// Sandbox-class methods (`oidc::is_sandbox_method`) accept callers without -/// a Bearer token: the gRPC channel's mTLS handshake is the trust -/// boundary. The router marks such requests with the -/// `INTERNAL_AUTH_SOURCE_HEADER` so handlers (`policy.rs`) can apply -/// sandbox-restricted scope. +/// Behavior: +/// - Strip any external `x-openshell-auth-source` marker first (so callers +/// cannot spoof a sandbox identity). +/// - Health probes / reflection bypass the chain entirely. 
+/// - When no chain is configured (no OIDC, sandbox JWT, or K8s SA authenticator), +/// forward without authentication — preserves today's pass-through behavior. +/// - Otherwise, run the chain. The first match produces a `Principal`. +/// `Principal::User` is gated by the RBAC `AuthzPolicy`. The legacy +/// sandbox marker also inserts the metadata marker for backwards-compat +/// with handlers that still consume it (PR-1 only; removed in PR 3). #[derive(Clone)] pub struct AuthGrpcRouter { inner: S, - oidc_cache: Option>, + authenticator_chain: Option, authz_policy: Option, - /// Whether a client CA is configured (mTLS is a valid auth mechanism). - has_client_ca: bool, /// mTLS peer identity extracted from the TLS handshake. peer_identity: Option, } impl AuthGrpcRouter { + #[cfg(test)] fn new( inner: S, - oidc_cache: Option>, + authenticator_chain: Option, + authz_policy: Option, + ) -> Self { + Self::with_peer_identity(inner, authenticator_chain, authz_policy, None) + } + + fn with_peer_identity( + inner: S, + authenticator_chain: Option, authz_policy: Option, - has_client_ca: bool, peer_identity: Option, ) -> Self { Self { inner, - oidc_cache, + authenticator_chain, authz_policy, - has_client_ca, peer_identity, } } } +fn status_response(status: tonic::Status) -> Response { + let response = status.into_http(); + let (parts, body) = response.into_parts(); + let body = tonic::body::BoxBody::new(body); + Response::from_parts(parts, body) +} + impl tower::Service> for AuthGrpcRouter where S: tower::Service, Response = Response> @@ -319,28 +373,21 @@ where } fn call(&mut self, req: Request) -> Self::Future { - let oidc_cache = self.oidc_cache.clone(); + let chain = self.authenticator_chain.clone(); let authz_policy = self.authz_policy.clone(); - let has_client_ca = self.has_client_ca; let peer_identity = self.peer_identity.clone(); let mut inner = self.inner.clone(); Box::pin(async move { let mut req = req; - oidc::clear_internal_auth_markers(req.headers_mut()); - // No auth configured — pass
through. - if oidc_cache.is_none() && !has_client_ca { + // No chain configured — pass through. Preserves today's + // "auth not configured means open" behavior for dev / + // fronting-proxy deployments. + let Some(chain) = chain else { return inner.ready().await?.call(req).await; - } - - // mTLS-only (no OIDC) — TLS layer already enforced client certs, - // so if we got here the peer is authenticated. - if oidc_cache.is_none() && has_client_ca { - return inner.ready().await?.call(req).await; - } + }; - let cache = oidc_cache.expect("checked above"); let path = req.uri().path().to_string(); // Health probes and reflection — truly unauthenticated. @@ -348,72 +395,32 @@ where return inner.ready().await?.call(req).await; } - // Sandbox-class RPCs — no Bearer expected. The gRPC channel's - // mTLS handshake (or the operator's fronting proxy when - // `--disable-gateway-auth` is set) is the trust boundary. - if oidc::is_sandbox_method(&path) { - oidc::mark_sandbox_caller(req.headers_mut()); - return inner.ready().await?.call(req).await; - } - - // Dual-auth methods (e.g. UpdateConfig) — Bearer present grants - // full scope (CLI users); Bearer absent marks the caller as - // sandbox-class for restricted scope downstream. - if oidc::is_dual_auth_method(&path) && !has_bearer_token(req.headers()) { - oidc::mark_sandbox_caller(req.headers_mut()); - return inner.ready().await?.call(req).await; - } - - // Extract Bearer token from the authorization header. - let token = req - .headers() - .get("authorization") - .and_then(|v| v.to_str().ok()) - .and_then(|v| v.strip_prefix("Bearer ")); - - let Some(token) = token else { - // No bearer token — fall back to mTLS if a client cert was - // presented (only possible when both OIDC and client CA are - // configured and require_client_auth is false). 
- if let Some(ref identity) = peer_identity { - if let Some(ref policy) = authz_policy - && let Err(status) = policy.check(identity, &path) - { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); + let principal = match chain.authenticate(req.headers(), &path).await { + Ok(Some(p)) => p, + Ok(None) => { + if let Some(identity) = peer_identity { + Principal::User(UserPrincipal { identity }) + } else { + return Ok(status_response(tonic::Status::unauthenticated( + "missing authorization header", + ))); } - return inner.ready().await?.call(req).await; } - let status = tonic::Status::unauthenticated("missing authorization header"); - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); + Err(status) => return Ok(status_response(status)), }; - // Authenticate: validate the JWT and produce an Identity. - let identity = match cache.validate_token(token).await { - Ok(id) => id, - Err(status) => { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); - } - }; - - // Authorize: check RBAC roles against the method. - if let Some(ref policy) = authz_policy - && let Err(status) = policy.check(&identity, &path) + // Authorize user principals via RBAC. Sandbox principals get + // a per-handler `sandbox_id` equality check in PR 4; right now + // they bypass RBAC because the public sandbox-class methods + // they call were path-bypassed before this refactor too. 
+ if let Principal::User(ref user) = principal + && let Some(ref policy) = authz_policy + && let Err(status) = policy.check(&user.identity, &path) { - let response = status.into_http(); - let (parts, body) = response.into_parts(); - let body = tonic::body::BoxBody::new(body); - return Ok(Response::from_parts(parts, body)); + return Ok(status_response(status)); } + req.extensions_mut().insert(principal); inner.ready().await?.call(req).await }) } @@ -513,13 +520,6 @@ where } } -fn has_bearer_token(headers: &http::HeaderMap) -> bool { - headers - .get("authorization") - .and_then(|v| v.to_str().ok()) - .is_some_and(|v| v.starts_with("Bearer ")) -} - fn grpc_method_from_path(path: &str) -> String { path.rsplit('/').next().unwrap_or(path).to_string() } @@ -860,4 +860,187 @@ mod tests { fn normalize_root_path() { assert_eq!(normalize_http_path("/"), "unknown"); } + + mod auth_router { + use super::*; + use crate::auth::authenticator::test_support::MockAuthenticator; + use crate::auth::identity::{Identity, IdentityProvider}; + use crate::auth::principal::{ + Principal, SandboxIdentitySource, SandboxPrincipal, UserPrincipal, + }; + use http_body_util::Full; + use std::sync::Arc; + use std::sync::Mutex; + use tower::Service; + + type RecordedPrincipal = Arc>>; + + /// Service that snapshots the `Principal` from request extensions + /// and returns 200 OK. Used by router-level tests to assert the + /// chain's effect on the downstream service. 
+ #[derive(Clone)] + struct PrincipalRecorder { + recorded: RecordedPrincipal, + } + + impl PrincipalRecorder { + fn new() -> (Self, RecordedPrincipal) { + let recorded = Arc::new(Mutex::new(None)); + ( + Self { + recorded: recorded.clone(), + }, + recorded, + ) + } + } + + impl Service> for PrincipalRecorder { + type Response = Response; + type Error = std::convert::Infallible; + type Future = Pin> + Send>>; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + let principal = req.extensions().get::().cloned(); + *self.recorded.lock().unwrap() = principal; + Box::pin(async move { + let body = tonic::body::BoxBody::new( + Full::new(Bytes::new()) + .map_err(|never| match never {}) + .boxed_unsync(), + ); + Ok(Response::new(body)) + }) + } + } + + fn empty_request(path: &str) -> Request> { + Request::builder() + .uri(path) + .body(Full::new(Bytes::new())) + .unwrap() + } + + fn user_principal(subject: &str) -> Principal { + Principal::User(UserPrincipal { + identity: Identity { + subject: subject.to_string(), + display_name: None, + roles: vec![], + scopes: vec![], + provider: IdentityProvider::Oidc, + }, + }) + } + + fn sandbox_principal() -> Principal { + Principal::Sandbox(SandboxPrincipal { + sandbox_id: "sandbox-a".to_string(), + source: SandboxIdentitySource::BootstrapJwt { + issuer: "openshell-gateway:test".to_string(), + jti: "j-1".to_string(), + }, + trust_domain: Some("openshell".to_string()), + }) + } + + #[tokio::test] + async fn user_principal_lands_in_request_extensions() { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(user_principal( + "alice", + ))))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let _ = router + .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + let principal = 
seen.lock().unwrap().clone().expect("principal"); + match principal { + Principal::User(u) => assert_eq!(u.identity.subject, "alice"), + _ => panic!("expected user principal"), + } + } + + #[tokio::test] + async fn sandbox_principal_lands_in_request_extensions() { + let mock = Arc::new(MockAuthenticator::returning(Ok(Some(sandbox_principal())))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let _ = router + .call(empty_request("/openshell.v1.OpenShell/ReportPolicyStatus")) + .await + .unwrap(); + let captured = seen.lock().unwrap().clone(); + match captured { + Some(Principal::Sandbox(p)) => assert_eq!(p.sandbox_id, "sandbox-a"), + other => panic!("expected sandbox principal, got {other:?}"), + } + } + + #[tokio::test] + async fn missing_principal_returns_unauthenticated() { + let mock = Arc::new(MockAuthenticator::returning(Ok(None))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + assert!(seen.lock().unwrap().is_none()); + // tonic sets grpc-status=16 (UNAUTHENTICATED) in trailers. 
+ let grpc_status = res + .headers() + .get("grpc-status") + .map(|v| v.to_str().unwrap().to_string()); + assert_eq!(grpc_status.as_deref(), Some("16")); + } + + #[tokio::test] + async fn authenticator_error_short_circuits() { + let mock = Arc::new(MockAuthenticator::returning(Err( + tonic::Status::unauthenticated("forged"), + ))); + let chain = AuthenticatorChain::new(vec![mock]); + let (recorder, seen) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/ListSandboxes")) + .await + .unwrap(); + assert!(seen.lock().unwrap().is_none()); + assert_eq!( + res.headers() + .get("grpc-status") + .map(|v| v.to_str().unwrap().to_string()) + .as_deref(), + Some("16") + ); + } + + #[tokio::test] + async fn health_methods_bypass_chain() { + // Authenticator is wired to fail-closed; the request still gets + // through because the path is exempt. + let mock = Arc::new(MockAuthenticator::returning(Err( + tonic::Status::unauthenticated("would reject"), + ))); + let chain = AuthenticatorChain::new(vec![mock.clone()]); + let (recorder, _) = PrincipalRecorder::new(); + let mut router = AuthGrpcRouter::new(recorder, Some(chain), None); + let res = router + .call(empty_request("/openshell.v1.OpenShell/Health")) + .await + .unwrap(); + assert_eq!(res.status(), 200); + assert_eq!(mock.call_count(), 0, "health must not consult the chain"); + } + } } diff --git a/crates/openshell-server/tests/auth_endpoint_integration.rs b/crates/openshell-server/tests/auth_endpoint_integration.rs index 59c2a23f6..bed244145 100644 --- a/crates/openshell-server/tests/auth_endpoint_integration.rs +++ b/crates/openshell-server/tests/auth_endpoint_integration.rs @@ -779,6 +779,22 @@ impl openshell_core::proto::open_shell_server::OpenShell for TestOpenShell { Err(tonic::Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> 
Result, tonic::Status> + { + Err(tonic::Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, tonic::Status> + { + Err(tonic::Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/edge_tunnel_auth.rs b/crates/openshell-server/tests/edge_tunnel_auth.rs index 73ad0aff0..fc676ae7b 100644 --- a/crates/openshell-server/tests/edge_tunnel_auth.rs +++ b/crates/openshell-server/tests/edge_tunnel_auth.rs @@ -409,6 +409,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/multiplex_integration.rs b/crates/openshell-server/tests/multiplex_integration.rs index 14a63c566..572308d1e 100644 --- a/crates/openshell-server/tests/multiplex_integration.rs +++ b/crates/openshell-server/tests/multiplex_integration.rs @@ -378,6 +378,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/multiplex_tls_integration.rs b/crates/openshell-server/tests/multiplex_tls_integration.rs index 
00ed1657f..17e045a9e 100644 --- a/crates/openshell-server/tests/multiplex_tls_integration.rs +++ b/crates/openshell-server/tests/multiplex_tls_integration.rs @@ -391,6 +391,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/crates/openshell-server/tests/supervisor_relay_integration.rs b/crates/openshell-server/tests/supervisor_relay_integration.rs index d82c9c261..0e3de66d1 100644 --- a/crates/openshell-server/tests/supervisor_relay_integration.rs +++ b/crates/openshell-server/tests/supervisor_relay_integration.rs @@ -367,6 +367,18 @@ impl OpenShell for RelayGateway { ) -> Result, Status> { Err(Status::unimplemented("unused")) } + async fn issue_sandbox_token( + &self, + _: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("unused")) + } + async fn refresh_sandbox_token( + &self, + _: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("unused")) + } } // --------------------------------------------------------------------------- diff --git a/crates/openshell-server/tests/ws_tunnel_integration.rs b/crates/openshell-server/tests/ws_tunnel_integration.rs index 277cffb51..28b615c2f 100644 --- a/crates/openshell-server/tests/ws_tunnel_integration.rs +++ b/crates/openshell-server/tests/ws_tunnel_integration.rs @@ -404,6 +404,20 @@ impl OpenShell for TestOpenShell { Err(Status::unimplemented("not implemented in test")) } + async fn issue_sandbox_token( + &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + + async fn refresh_sandbox_token( 
+ &self, + _request: tonic::Request, + ) -> Result, Status> { + Err(Status::unimplemented("not implemented in test")) + } + async fn connect_supervisor( &self, _request: tonic::Request>, diff --git a/deploy/helm/openshell/templates/certgen.yaml b/deploy/helm/openshell/templates/certgen.yaml index ef4500db6..61203760b 100644 --- a/deploy/helm/openshell/templates/certgen.yaml +++ b/deploy/helm/openshell/templates/certgen.yaml @@ -100,6 +100,7 @@ spec: - generate-certs - --server-secret-name={{ .Values.server.tls.certSecretName }} - --client-secret-name={{ .Values.server.tls.clientTlsSecretName }} + - --jwt-secret-name={{ .Values.server.sandboxJwt.signingSecretName | default (printf "%s-jwt-keys" (include "openshell.fullname" .)) }} {{- range .Values.pkiInitJob.serverDnsNames }} - --server-san={{ . }} {{- end }} diff --git a/deploy/helm/openshell/templates/gateway-config.yaml b/deploy/helm/openshell/templates/gateway-config.yaml index 9d95e45c1..302a5806f 100644 --- a/deploy/helm/openshell/templates/gateway-config.yaml +++ b/deploy/helm/openshell/templates/gateway-config.yaml @@ -64,6 +64,13 @@ data: {{- end }} {{- end }} + [openshell.gateway.gateway_jwt] + signing_key_path = "/etc/openshell-jwt/signing.pem" + public_key_path = "/etc/openshell-jwt/public.pem" + kid_path = "/etc/openshell-jwt/kid" + gateway_id = {{ .Values.server.sandboxJwt.gatewayId | default (include "openshell.fullname" .) | quote }} + ttl_secs = {{ .Values.server.sandboxJwt.ttlSecs | default 86400 }} + {{- if .Values.server.oidc.issuer }} [openshell.gateway.oidc] @@ -87,6 +94,7 @@ data: [openshell.drivers.kubernetes] grpc_endpoint = {{ include "openshell.grpcEndpoint" . | quote }} supervisor_sideload_method = {{ include "openshell.supervisorSideloadMethod" . 
| quote }} + sa_token_ttl_secs = {{ .Values.server.sandboxJwt.k8sSaTokenTtlSecs | default 3600 }} {{- if .Values.server.sandboxImagePullPolicy }} image_pull_policy = {{ .Values.server.sandboxImagePullPolicy | quote }} {{- end }} diff --git a/deploy/helm/openshell/templates/role.yaml b/deploy/helm/openshell/templates/role.yaml index 1d756117c..4d26451bf 100644 --- a/deploy/helm/openshell/templates/role.yaml +++ b/deploy/helm/openshell/templates/role.yaml @@ -29,3 +29,14 @@ rules: - get - list - watch + # Per-sandbox identity (issue #1354): the gateway resolves a sandbox + # pod's projected SA token to its `openshell.io/sandbox-id` annotation + # via a pod GET when the supervisor calls IssueSandboxToken. patch is + # intentionally NOT granted — the annotation is set once at pod create + # and must remain immutable for the lifetime of the sandbox. + - apiGroups: + - "" + resources: + - pods + verbs: + - get diff --git a/deploy/helm/openshell/templates/statefulset.yaml b/deploy/helm/openshell/templates/statefulset.yaml index c6ff21491..5dd4f1caf 100644 --- a/deploy/helm/openshell/templates/statefulset.yaml +++ b/deploy/helm/openshell/templates/statefulset.yaml @@ -75,6 +75,9 @@ spec: - name: gateway-config mountPath: /etc/openshell readOnly: true + - name: sandbox-jwt + mountPath: /etc/openshell-jwt + readOnly: true {{- if not .Values.server.disableTls }} - name: tls-cert mountPath: /etc/openshell-tls/server @@ -84,12 +87,12 @@ spec: mountPath: /etc/openshell-tls/client-ca readOnly: true {{- end }} + {{- end }} {{- if and .Values.server.oidc.issuer .Values.server.oidc.caConfigMapName }} - name: oidc-ca mountPath: /etc/openshell-tls/oidc-ca readOnly: true {{- end }} - {{- end }} ports: - name: grpc containerPort: {{ .Values.service.port }} @@ -131,6 +134,10 @@ spec: - name: gateway-config configMap: name: {{ include "openshell.fullname" . 
}}-config + - name: sandbox-jwt + secret: + secretName: {{ .Values.server.sandboxJwt.signingSecretName | default (printf "%s-jwt-keys" (include "openshell.fullname" .)) }} + defaultMode: 0400 {{- if not .Values.server.disableTls }} - name: tls-cert secret: @@ -147,12 +154,12 @@ spec: secretName: {{ .Values.server.tls.clientCaSecretName }} {{- end }} {{- end }} + {{- end }} {{- if and .Values.server.oidc.issuer .Values.server.oidc.caConfigMapName }} - name: oidc-ca configMap: name: {{ .Values.server.oidc.caConfigMapName }} {{- end }} - {{- end }} {{- with .Values.nodeSelector }} nodeSelector: {{- toYaml . | nindent 8 }} diff --git a/deploy/helm/openshell/values.yaml b/deploy/helm/openshell/values.yaml index c7fa50296..54323068b 100644 --- a/deploy/helm/openshell/values.yaml +++ b/deploy/helm/openshell/values.yaml @@ -128,6 +128,24 @@ server: clientCaSecretName: openshell-server-client-ca # K8s secret mounted into sandbox pods for mTLS to the server clientTlsSecretName: openshell-client-tls + # Gateway-minted sandbox JWT signing keys. The pre-install certgen hook + # generates an Ed25519 keypair and writes it to a secret containing + # signing.pem (PKCS#8), public.pem (SPKI), and kid (plain text). + sandboxJwt: + # Name of the Opaque Secret holding the signing key material. Empty + # falls back to "<fullname>-jwt-keys". + signingSecretName: "" + # Stable gateway identity embedded in iss/aud of every minted token. + # Defaults to the chart fullname (`openshell.fullname`) so HA replicas share identity. + gatewayId: "" + # Token TTL in seconds. Defaults to 86400 (24h). + ttlSecs: 86400 + # Lifetime (seconds) of the projected ServiceAccount token kubelet + # writes into each sandbox pod for the IssueSandboxToken bootstrap + # exchange. Kubelet enforces a minimum of 600s; the driver clamps + # values outside [600, 86400]. Default 3600 — generous, since the + # supervisor consumes the token within seconds of pod start.
+ k8sSaTokenTtlSecs: 3600 # OIDC (OpenID Connect) configuration for JWT-based authentication. # When issuer is set, the server validates Bearer tokens on gRPC requests. oidc: diff --git a/e2e/support/gateway-common.sh b/e2e/support/gateway-common.sh index d8acbd191..cd2948ec4 100644 --- a/e2e/support/gateway-common.sh +++ b/e2e/support/gateway-common.sh @@ -77,6 +77,37 @@ EOF printf '%s' "${name}" >"${config_home}/openshell/active_gateway" } +e2e_toml_string() { + local value="$1" + value="${value//\\/\\\\}" + value="${value//\"/\\\"}" + printf '"%s"' "${value}" +} + +e2e_generate_gateway_jwt() { + local jwt_dir=$1 + + mkdir -p "${jwt_dir}" + ( + umask 077 + openssl genpkey -algorithm Ed25519 -out "${jwt_dir}/signing.pem" >/dev/null 2>&1 + ) + openssl pkey -in "${jwt_dir}/signing.pem" -pubout -out "${jwt_dir}/public.pem" >/dev/null 2>&1 + openssl rand -hex 16 >"${jwt_dir}/kid" +} + +e2e_write_gateway_jwt_config() { + local jwt_dir=$1 + local gateway_id=$2 + + printf '[openshell.gateway.gateway_jwt]\n' + printf 'signing_key_path = %s\n' "$(e2e_toml_string "${jwt_dir}/signing.pem")" + printf 'public_key_path = %s\n' "$(e2e_toml_string "${jwt_dir}/public.pem")" + printf 'kid_path = %s\n' "$(e2e_toml_string "${jwt_dir}/kid")" + printf 'gateway_id = %s\n' "$(e2e_toml_string "${gateway_id}")" + printf 'ttl_secs = 86400\n\n' +} + e2e_build_gateway_binaries() { local root=$1 local target_var=$2 @@ -160,4 +191,3 @@ e2e_print_gateway_log_on_failure() { echo "=== end gateway log ===" fi } - diff --git a/e2e/with-docker-gateway.sh b/e2e/with-docker-gateway.sh index 83e4185f2..06a27742b 100755 --- a/e2e/with-docker-gateway.sh +++ b/e2e/with-docker-gateway.sh @@ -429,6 +429,7 @@ cd "${ROOT}" HOST_PORT=$(e2e_pick_port) STATE_DIR="${WORKDIR}/state" mkdir -p "${STATE_DIR}" +JWT_DIR="${STATE_DIR}/jwt" GATEWAY_ENDPOINT="https://host.openshell.internal:${HOST_PORT}" E2E_NAMESPACE="e2e-docker-$$-${HOST_PORT}" @@ -448,6 +449,7 @@ else fi echo "Starting openshell-gateway on port 
${HOST_PORT} (namespace: ${E2E_NAMESPACE})..." +e2e_generate_gateway_jwt "${JWT_DIR}" # Driver-specific options moved from CLI flags into a TOML config table # (commit 560550d2). Synthesize a minimal config here and pass --config. @@ -466,6 +468,7 @@ GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" { printf '[openshell]\nversion = 1\n\n' printf '[openshell.gateway]\nlog_level = "info"\n\n' + e2e_write_gateway_jwt_config "${JWT_DIR}" "openshell-e2e-docker-${HOST_PORT}" printf '[openshell.drivers.docker]\n' printf 'sandbox_namespace = %s\n' "$(toml_string "${E2E_NAMESPACE}")" printf 'network_name = %s\n' "$(toml_string "${DOCKER_NETWORK_NAME}")" diff --git a/e2e/with-podman-gateway.sh b/e2e/with-podman-gateway.sh index 727737d25..940b706db 100755 --- a/e2e/with-podman-gateway.sh +++ b/e2e/with-podman-gateway.sh @@ -332,6 +332,7 @@ HOST_PORT=$(e2e_pick_port) HEALTH_PORT=$(e2e_pick_port) STATE_DIR="${WORKDIR}/state" mkdir -p "${STATE_DIR}" +JWT_DIR="${STATE_DIR}/jwt" E2E_NAMESPACE="e2e-podman-$$-${HOST_PORT}" PODMAN_NETWORK_NAME="${E2E_NAMESPACE}" @@ -343,6 +344,7 @@ export OPENSHELL_E2E_NETWORK_NAME="${PODMAN_NETWORK_NAME}" export OPENSHELL_E2E_SANDBOX_NAMESPACE="${E2E_NAMESPACE}" echo "Starting openshell-gateway on port ${HOST_PORT} (namespace: ${E2E_NAMESPACE})..." +e2e_generate_gateway_jwt "${JWT_DIR}" # Driver-specific options moved from CLI flags into a TOML config table # (commit 560550d2). Synthesize a minimal config here and pass --config. @@ -359,6 +361,7 @@ GATEWAY_CONFIG="${STATE_DIR}/gateway.toml" { printf '[openshell]\nversion = 1\n\n' printf '[openshell.gateway]\nlog_level = "info"\n\n' + e2e_write_gateway_jwt_config "${JWT_DIR}" "openshell-e2e-podman-${HOST_PORT}" printf '[openshell.drivers.podman]\n' # The Podman driver scopes isolation by network rather than namespace. 
printf 'network_name = %s\n' "$(toml_string "${PODMAN_NETWORK_NAME}")" diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index 3c4308f3f..6de13f3e5 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -90,6 +90,13 @@ message DriverSandboxSpec { // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the // first available GPU. string gpu_device = 10; + // Gateway-minted JWT identifying this sandbox to the gateway. Set by + // the gateway on create; the driver materialises it via its native + // secret mechanism (Docker/Podman/VM bind-mount a per-sandbox file; + // the Kubernetes driver ignores this field and relies on its projected + // ServiceAccount token bootstrap instead). Never echoed to the public + // Sandbox proto. + string sandbox_token = 11; } // Driver-owned runtime template consumed by the compute platform. diff --git a/proto/openshell.proto b/proto/openshell.proto index e4a1b0673..0ca74d4f0 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -208,6 +208,51 @@ service OpenShell { // Get decision history for a sandbox's draft policy. rpc GetDraftHistory(GetDraftHistoryRequest) returns (GetDraftHistoryResponse); + + // Exchange a sandbox-bootstrap credential (e.g. a Kubernetes projected + // ServiceAccount token) for a gateway-minted JWT bound to the calling + // sandbox's UUID. Used by the Kubernetes driver path; singleplayer + // drivers receive the gateway JWT directly from the create-sandbox flow + // and never call this RPC. + rpc IssueSandboxToken(IssueSandboxTokenRequest) returns (IssueSandboxTokenResponse); + + // Rotate the calling sandbox's gateway JWT. The previously-issued + // token is revoked (its jti added to the gateway's deny list) and a + // fresh token bound to the same sandbox UUID is returned. 
The + // supervisor calls this from a background task at ~80% of the token's + // lifetime; the new token is cached in memory only — the on-disk + // bootstrap file is intentionally not rewritten. + rpc RefreshSandboxToken(RefreshSandboxTokenRequest) + returns (RefreshSandboxTokenResponse); +} + +// IssueSandboxToken request. Empty body; identity is established by the +// authentication credentials carried in the request headers (a projected +// Kubernetes ServiceAccount JWT in the K8s driver path). +message IssueSandboxTokenRequest {} + +// IssueSandboxToken response. The supervisor caches the returned token in +// memory and presents it as `Authorization: Bearer` on every subsequent +// gateway RPC. +message IssueSandboxTokenResponse { + // Gateway-minted JWT bound to the calling sandbox's UUID. + string token = 1; + // Absolute expiry of the issued token, milliseconds since the epoch. + int64 expires_at_ms = 2; +} + +// RefreshSandboxToken request. Empty body; the calling principal must +// already be a sandbox principal (i.e. the request carries a still-valid +// gateway-minted JWT in its Authorization header). +message RefreshSandboxTokenRequest {} + +// RefreshSandboxToken response. The previous token is revoked server-side +// before this response is sent. +message RefreshSandboxTokenResponse { + // Fresh gateway-minted JWT bound to the same sandbox UUID. + string token = 1; + // Absolute expiry of the new token, milliseconds since the epoch. + int64 expires_at_ms = 2; } // Health check request.