From 504c75c0307d987843d3d58ab844e283a34923b2 Mon Sep 17 00:00:00 2001 From: Ryan Angilly Date: Tue, 12 May 2026 14:58:07 -0600 Subject: [PATCH 1/3] feat(sandbox): add GPU count support Signed-off-by: Ryan Angilly --- architecture/compute-runtimes.md | 5 +- crates/openshell-cli/src/main.rs | 41 ++++++ crates/openshell-cli/src/run.rs | 13 +- .../sandbox_create_lifecycle_integration.rs | 55 ++++++- crates/openshell-driver-docker/src/tests.rs | 1 + crates/openshell-driver-kubernetes/README.md | 8 +- .../openshell-driver-kubernetes/src/driver.rs | 136 ++++++++++++++++-- crates/openshell-server/src/compute/mod.rs | 26 +++- .../openshell-server/src/grpc/validation.rs | 30 ++++ docs/kubernetes/setup.mdx | 1 + docs/sandboxes/manage-sandboxes.mdx | 10 ++ proto/compute_driver.proto | 3 + proto/openshell.proto | 3 + 13 files changed, 310 insertions(+), 22 deletions(-) diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index 9ab512f98..b048af605 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -22,7 +22,7 @@ Each runtime receives a sandbox spec from the gateway and is responsible for: |---|---|---|---| | Docker | Local development with Docker available. | Container plus nested sandbox namespace. | Uses host networking so loopback gateway endpoints work from the supervisor. | | Podman | Rootless or single-machine deployments. | Container plus nested sandbox namespace. | Uses the Podman REST API, OCI image volumes, and CDI GPU devices when available. | -| Kubernetes | Cluster deployment through Helm. | Pod plus nested sandbox namespace. | Uses Kubernetes API objects, service accounts, secrets, PVC-backed workspace storage, and GPU resources. | +| Kubernetes | Cluster deployment through Helm. | Pod plus nested sandbox namespace. | Uses Kubernetes API objects, service accounts, secrets, PVC-backed workspace storage, and `nvidia.com/gpu` limits for GPU requests. | | VM | Experimental microVM isolation. 
| Per-sandbox libkrun VM. | Gateway spawns `openshell-driver-vm` as a subprocess over a private, state-local Unix socket. | VM runtime state paths are derived only from driver-validated sandbox IDs @@ -62,7 +62,8 @@ users. Custom sandbox images must include the agent runtime and any system dependencies, but they should not need to include the gateway. GPU-capable images must include the user-space libraries required by the workload. The -runtime still owns GPU device injection. +runtime still owns GPU device injection or resource scheduling. Kubernetes maps +GPU counts to pod `nvidia.com/gpu` limits when the cluster exposes that resource. ## Deployment Shape diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 9cffb243b..8e5d54faa 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -1084,6 +1084,13 @@ enum SandboxCommands { #[arg(long)] gpu: bool, + /// Request a specific number of GPUs. + /// + /// This implies --gpu. Kubernetes-backed gateways schedule pods with + /// the corresponding nvidia.com/gpu resource limit. + #[arg(long, value_name = "COUNT", conflicts_with = "gpu_device", value_parser = clap::value_parser!(u32).range(1..))] + gpu_count: Option, + /// Target a driver-specific GPU device. Docker and Podman use CDI device IDs /// (for example "nvidia.com/gpu=0"); VM uses a PCI BDF or index. /// Only valid with --gpu. When omitted with --gpu, the driver uses its default GPU selection. 
@@ -2364,6 +2371,7 @@ async fn main() -> Result<()> { no_keep, editor, gpu, + gpu_count, gpu_device, providers, policy, @@ -2430,6 +2438,7 @@ async fn main() -> Result<()> { upload_spec.as_ref(), keep, gpu, + gpu_count, gpu_device.as_deref(), editor, &providers, @@ -3821,4 +3830,36 @@ mod tests { other => panic!("expected service delete command, got: {other:?}"), } } + + #[test] + fn sandbox_create_gpu_count_rejects_zero() { + let result = Cli::try_parse_from(["openshell", "sandbox", "create", "--gpu-count", "0"]); + assert!( + result.is_err(), + "sandbox create --gpu-count 0 should be rejected" + ); + let err = result.unwrap_err().to_string(); + assert!(err.contains("--gpu-count")); + } + + #[test] + fn sandbox_create_gpu_count_conflicts_with_gpu_device() { + let result = Cli::try_parse_from([ + "openshell", + "sandbox", + "create", + "--gpu", + "--gpu-count", + "2", + "--gpu-device", + "nvidia.com/gpu=0", + ]); + assert!( + result.is_err(), + "sandbox create should reject combining --gpu-count with --gpu-device" + ); + let err = result.unwrap_err().to_string(); + assert!(err.contains("--gpu-count")); + assert!(err.contains("--gpu-device")); + } } diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 3205b8f68..5e5a48ac5 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -1467,6 +1467,7 @@ pub async fn sandbox_create( upload: Option<&(String, Option, bool)>, keep: bool, gpu: bool, + gpu_count: Option, gpu_device: Option<&str>, editor: Option, providers: &[String], @@ -1483,6 +1484,14 @@ pub async fn sandbox_create( "--editor cannot be used with a trailing command; use `openshell sandbox connect --editor ...` after the sandbox is ready" )); } + if gpu_count == Some(0) { + return Err(miette::miette!("--gpu-count must be greater than zero")); + } + if gpu_count.is_some() && gpu_device.is_some() { + return Err(miette::miette!( + "--gpu-count cannot be combined with --gpu-device" + )); + } // Check port 
availability *before* creating the sandbox so we don't // leave an orphaned sandbox behind when the forward would fail. @@ -1518,7 +1527,8 @@ pub async fn sandbox_create( } None => None, }; - let requested_gpu = gpu || image.as_deref().is_some_and(image_requests_gpu); + let requested_gpu = + gpu || gpu_count.is_some() || image.as_deref().is_some_and(image_requests_gpu); let inferred_types: Vec = inferred_provider_type(command).into_iter().collect(); let configured_providers = ensure_required_providers( @@ -1539,6 +1549,7 @@ pub async fn sandbox_create( let request = CreateSandboxRequest { spec: Some(SandboxSpec { gpu: requested_gpu, + gpu_count: gpu_count.unwrap_or_default(), gpu_device: gpu_device.unwrap_or_default().to_string(), policy, providers: configured_providers, diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index 6e7d66d11..dfdf975a4 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -84,6 +84,7 @@ impl Drop for EnvVarGuard { #[derive(Clone, Default)] struct SandboxState { deleted_names: Arc>>>, + created_requests: Arc>>, } #[derive(Clone, Default)] @@ -107,7 +108,9 @@ impl OpenShell for TestOpenShell { &self, request: tonic::Request, ) -> Result, Status> { - let name = request.into_inner().name; + let request = request.into_inner(); + let name = request.name.clone(); + self.state.created_requests.lock().await.push(request); let sandbox_name = if name.is_empty() { "test-sandbox".to_string() } else { @@ -648,6 +651,10 @@ async fn deleted_names(server: &TestServer) -> Vec> { server.openshell.state.deleted_names.lock().await.clone() } +async fn created_requests(server: &TestServer) -> Vec { + server.openshell.state.created_requests.lock().await.clone() +} + fn test_tls(server: &TestServer) -> TlsOptions { server.tls.with_gateway_name("openshell") } @@ 
-671,6 +678,7 @@ async fn sandbox_create_keeps_command_sessions_by_default() { false, None, None, + None, &[], None, None, @@ -691,6 +699,47 @@ async fn sandbox_create_keeps_command_sessions_by_default() { ); } +#[tokio::test] +async fn sandbox_create_gpu_count_sets_gpu_intent_and_count() { + let server = run_server().await; + let fake_ssh_dir = tempfile::tempdir().unwrap(); + let xdg_dir = tempfile::tempdir().unwrap(); + let _env = test_env(&fake_ssh_dir, &xdg_dir); + let tls = test_tls(&server); + install_fake_ssh(&fake_ssh_dir); + + run::sandbox_create( + &server.endpoint, + Some("gpu-count"), + None, + "openshell", + None, + true, + false, + Some(2), + None, + None, + &[], + None, + None, + &["echo".to_string(), "OK".to_string()], + Some(false), + Some(false), + &HashMap::new(), + &tls, + ) + .await + .expect("sandbox create should succeed"); + + let requests = created_requests(&server).await; + let spec = requests + .last() + .and_then(|request| request.spec.as_ref()) + .expect("create request should include a spec"); + assert!(spec.gpu); + assert_eq!(spec.gpu_count, 2); +} + #[tokio::test] async fn sandbox_create_deletes_command_sessions_with_no_keep() { let server = run_server().await; @@ -710,6 +759,7 @@ async fn sandbox_create_deletes_command_sessions_with_no_keep() { false, None, None, + None, &[], None, None, @@ -752,6 +802,7 @@ async fn sandbox_create_deletes_shell_sessions_with_no_keep() { false, None, None, + None, &[], None, None, @@ -794,6 +845,7 @@ async fn sandbox_create_keeps_sandbox_with_hidden_keep_flag() { false, None, None, + None, &[], None, None, @@ -836,6 +888,7 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() { false, None, None, + None, &[], None, Some(openshell_core::forward::ForwardSpec::new(forward_port)), diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index df68d39d6..f5155fd13 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ 
b/crates/openshell-driver-docker/src/tests.rs @@ -33,6 +33,7 @@ fn test_sandbox() -> DriverSandbox { }), gpu: false, gpu_device: String::new(), + gpu_count: 0, }), status: None, } diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 4a8a8f76b..6202e69f6 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -44,6 +44,8 @@ pods do not need direct external ingress for SSH. ## GPU Support When a sandbox requests GPU support, the driver checks node allocatable capacity -for `nvidia.com/gpu` and requests one GPU resource in the workload spec. The -sandbox image must provide the user-space libraries needed by the agent -workload. +for `nvidia.com/gpu`. A request with only `gpu=true` asks for one GPU. A request +with `gpu_count > 0` sets the pod resource limit to that count. The cluster must +expose `nvidia.com/gpu` through the NVIDIA device plugin or an equivalent +device-plugin implementation, and the sandbox image must provide the user-space +libraries needed by the agent workload. 
diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 56b73447a..a434731c3 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -194,7 +194,10 @@ impl KubernetesComputeDriver { } pub async fn validate_sandbox_create(&self, sandbox: &Sandbox) -> Result<(), tonic::Status> { - let gpu_requested = sandbox.spec.as_ref().is_some_and(|spec| spec.gpu); + let gpu_requested = sandbox + .spec + .as_ref() + .is_some_and(|spec| spec.gpu || spec.gpu_count > 0); if gpu_requested && !self.has_gpu_capacity().await.map_err(|err| { tonic::Status::internal(format!("check GPU node capacity failed: {err}")) @@ -1011,7 +1014,14 @@ fn sandbox_to_k8s_spec( if let Some(template) = spec.template.as_ref() { root.insert( "podTemplate".to_string(), - sandbox_template_to_k8s(template, spec.gpu, &pod_env, inject_workspace, params), + sandbox_template_to_k8s( + template, + spec.gpu, + spec.gpu_count, + &pod_env, + inject_workspace, + params, + ), ); if !template.agent_socket_path.is_empty() { root.insert( @@ -1044,6 +1054,7 @@ fn sandbox_to_k8s_spec( sandbox_template_to_k8s( &SandboxTemplate::default(), spec.is_some_and(|s| s.gpu), + spec.map_or(0, |s| s.gpu_count), &pod_env, inject_workspace, params, @@ -1059,6 +1070,7 @@ fn sandbox_to_k8s_spec( fn sandbox_template_to_k8s( template: &SandboxTemplate, gpu: bool, + gpu_count: u32, spec_environment: &std::collections::HashMap, inject_workspace: bool, params: &SandboxPodParams<'_>, @@ -1091,7 +1103,7 @@ fn sandbox_template_to_k8s( if use_user_namespaces { spec.insert("hostUsers".to_string(), serde_json::json!(false)); - if gpu { + if gpu || gpu_count > 0 { warn!( "GPU sandbox with user namespaces enabled — \ NVIDIA device plugin compatibility is unverified" @@ -1169,7 +1181,7 @@ fn sandbox_template_to_k8s( ); } - if let Some(resources) = container_resources(template, gpu) { + if let Some(resources) = 
container_resources(template, gpu, gpu_count) { container.insert("resources".to_string(), resources); } spec.insert( @@ -1225,7 +1237,11 @@ fn sandbox_template_to_k8s( result } -fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option { +fn container_resources( + template: &SandboxTemplate, + gpu: bool, + gpu_count: u32, +) -> Option { // Start from the raw resources passthrough in platform_config (preserves // custom resource types like GPU limits that users set via the public API // Struct), then overlay the typed DriverResourceRequirements on top. @@ -1247,8 +1263,11 @@ fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option 0 { + let quantity = gpu_count.to_string(); + apply_gpu_limit(&mut resources, &quantity); + } else if gpu { + apply_gpu_limit(&mut resources, GPU_RESOURCE_QUANTITY); } if resources.as_object().is_some_and(serde_json::Map::is_empty) { None @@ -1257,10 +1276,10 @@ fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option Value { + Value { + kind: Some(Kind::StringValue(value.to_string())), + } + } + + fn struct_value(fields: impl IntoIterator) -> Value { + Value { + kind: Some(Kind::StructValue(Struct { + fields: fields + .into_iter() + .map(|(key, value)| (key.to_string(), value)) + .collect(), + })), + } + } + #[test] fn apply_required_env_always_injects_ssh_handshake_secret() { let mut env = Vec::new(); @@ -1788,6 +1821,7 @@ mod tests { sandbox_template_to_k8s( &SandboxTemplate::default(), true, + 0, &std::collections::HashMap::new(), true, ¶ms, @@ -1804,6 +1838,26 @@ mod tests { ); } + #[test] + fn gpu_count_sandbox_sets_requested_gpu_limit() { + let pod_template = { + let params = SandboxPodParams::default(); + sandbox_template_to_k8s( + &SandboxTemplate::default(), + true, + 4, + &std::collections::HashMap::new(), + true, + ¶ms, + ) + }; + + assert_eq!( + pod_template["spec"]["containers"][0]["resources"]["limits"][GPU_RESOURCE_NAME], + serde_json::json!("4") + ); + } + #[test] fn 
gpu_sandbox_uses_template_runtime_class_name_when_set() { let template = SandboxTemplate { @@ -1824,6 +1878,7 @@ mod tests { sandbox_template_to_k8s( &template, true, + 0, &std::collections::HashMap::new(), true, ¶ms, @@ -1856,6 +1911,7 @@ mod tests { sandbox_template_to_k8s( &template, false, + 0, &std::collections::HashMap::new(), true, ¶ms, @@ -1884,6 +1940,7 @@ mod tests { sandbox_template_to_k8s( &template, true, + 0, &std::collections::HashMap::new(), true, ¶ms, @@ -1898,6 +1955,49 @@ mod tests { ); } + #[test] + fn gpu_count_preserves_existing_resource_limits() { + use openshell_core::proto::compute::v1::DriverResourceRequirements; + + let template = SandboxTemplate { + resources: Some(DriverResourceRequirements { + cpu_limit: "2".to_string(), + ..Default::default() + }), + platform_config: Some(Struct { + fields: std::iter::once(( + "resources_raw".to_string(), + struct_value([( + "limits", + struct_value([ + ("example.com/custom", string_value("7")), + (GPU_RESOURCE_NAME, string_value("2")), + ]), + )]), + )) + .collect(), + }), + ..SandboxTemplate::default() + }; + + let pod_template = { + let params = SandboxPodParams::default(); + sandbox_template_to_k8s( + &template, + true, + 4, + &std::collections::HashMap::new(), + true, + ¶ms, + ) + }; + + let limits = &pod_template["spec"]["containers"][0]["resources"]["limits"]; + assert_eq!(limits["cpu"], serde_json::json!("2")); + assert_eq!(limits["example.com/custom"], serde_json::json!("7")); + assert_eq!(limits[GPU_RESOURCE_NAME], serde_json::json!("4")); + } + #[test] fn host_aliases_injected_when_gateway_ip_set() { let pod_template = { @@ -1908,6 +2008,7 @@ mod tests { sandbox_template_to_k8s( &SandboxTemplate::default(), false, + 0, &std::collections::HashMap::new(), true, ¶ms, @@ -1933,6 +2034,7 @@ mod tests { sandbox_template_to_k8s( &SandboxTemplate::default(), false, + 0, &std::collections::HashMap::new(), true, ¶ms, @@ -1956,6 +2058,7 @@ mod tests { sandbox_template_to_k8s( &template, false, + 0, 
&std::collections::HashMap::new(), true, ¶ms, @@ -2095,6 +2198,7 @@ mod tests { let pod_template = sandbox_template_to_k8s( &SandboxTemplate::default(), false, + 0, &std::collections::HashMap::new(), false, // user provided custom VCTs ¶ms, @@ -2133,6 +2237,7 @@ mod tests { sandbox_template_to_k8s( &SandboxTemplate::default(), false, + 0, &std::collections::HashMap::new(), true, ¶ms, @@ -2198,6 +2303,7 @@ mod tests { let pod_template = sandbox_template_to_k8s( &template, false, + 0, &std::collections::HashMap::new(), true, ¶ms, @@ -2236,6 +2342,7 @@ mod tests { let pod_template = sandbox_template_to_k8s( &template, false, + 0, &std::collections::HashMap::new(), true, ¶ms, @@ -2262,6 +2369,7 @@ mod tests { sandbox_template_to_k8s( &SandboxTemplate::default(), false, + 0, &std::collections::HashMap::new(), true, ¶ms, diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index d2fd34011..38120d67b 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -1132,6 +1132,7 @@ fn driver_sandbox_spec_from_public(spec: &SandboxSpec) -> DriverSandboxSpec { .map(driver_sandbox_template_from_public), gpu: spec.gpu, gpu_device: spec.gpu_device.clone(), + gpu_count: spec.gpu_count, } } @@ -1491,7 +1492,8 @@ fn derive_phase(status: Option<&DriverSandboxStatus>) -> SandboxPhase { } fn rewrite_user_facing_conditions(status: &mut Option, spec: Option<&SandboxSpec>) { - let gpu_requested = spec.is_some_and(|sandbox_spec| sandbox_spec.gpu); + let gpu_requested = + spec.is_some_and(|sandbox_spec| sandbox_spec.gpu || sandbox_spec.gpu_count > 0); if !gpu_requested { return; } @@ -2008,6 +2010,28 @@ mod tests { assert!(build_platform_config(&template).is_none()); } + #[test] + fn driver_sandbox_spec_from_public_maps_gpu_count() { + let public = SandboxSpec { + gpu: true, + gpu_count: 4, + ..Default::default() + }; + + let driver = driver_sandbox_spec_from_public(&public); + + 
assert!(driver.gpu); + assert_eq!(driver.gpu_count, 4); + } + + #[test] + fn driver_sandbox_spec_from_public_preserves_default_gpu_count() { + let driver = driver_sandbox_spec_from_public(&SandboxSpec::default()); + + assert!(!driver.gpu); + assert_eq!(driver.gpu_count, 0); + } + #[test] fn build_platform_config_preserves_non_typed_resource_fields() { let template = SandboxTemplate { diff --git a/crates/openshell-server/src/grpc/validation.rs b/crates/openshell-server/src/grpc/validation.rs index 160b7e031..518babb20 100644 --- a/crates/openshell-server/src/grpc/validation.rs +++ b/crates/openshell-server/src/grpc/validation.rs @@ -131,6 +131,13 @@ pub(super) fn validate_sandbox_spec( validate_sandbox_template(tmpl)?; } + // --- GPU allocation mode --- + if spec.gpu_count > 0 && !spec.gpu_device.is_empty() { + return Err(Status::invalid_argument( + "gpu_count cannot be combined with gpu_device", + )); + } + // --- spec.policy serialized size --- if let Some(ref policy) = spec.policy { let size = policy.encoded_len(); @@ -674,6 +681,29 @@ mod tests { assert!(validate_sandbox_spec("gpu-sandbox", &spec).is_ok()); } + #[test] + fn validate_sandbox_spec_accepts_gpu_count() { + let spec = SandboxSpec { + gpu_count: 4, + ..Default::default() + }; + assert!(validate_sandbox_spec("gpu-count-sandbox", &spec).is_ok()); + } + + #[test] + fn validate_sandbox_spec_rejects_gpu_count_with_gpu_device() { + let spec = SandboxSpec { + gpu: true, + gpu_count: 4, + gpu_device: "nvidia.com/gpu=0".to_string(), + ..Default::default() + }; + let err = validate_sandbox_spec("gpu-count-sandbox", &spec).unwrap_err(); + assert_eq!(err.code(), Code::InvalidArgument); + assert!(err.message().contains("gpu_count")); + assert!(err.message().contains("gpu_device")); + } + #[test] fn validate_sandbox_spec_accepts_empty_defaults() { assert!(validate_sandbox_spec("", &default_spec()).is_ok()); diff --git a/docs/kubernetes/setup.mdx b/docs/kubernetes/setup.mdx index 067e0d122..4adccecde 100644 --- 
a/docs/kubernetes/setup.mdx +++ b/docs/kubernetes/setup.mdx @@ -25,6 +25,7 @@ Make sure the following are in place before you install. | Agent Sandbox controller and CRDs | Yes | Install before the OpenShell chart. Refer to [Install Agent Sandbox](#install-agent-sandbox). | | cert-manager | No | Refer to [Managing Certificates](/kubernetes/managing-certificates). Use cert-manager only if you prefer it over the built-in PKI job. | | Kubernetes Gateway API | No | Refer to [Ingress](/kubernetes/ingress). Use it only for external access without port-forwarding. | +| NVIDIA device plugin or equivalent | No | Required only for GPU sandboxes. The cluster must expose allocatable `nvidia.com/gpu` resources for `--gpu` and `--gpu-count`. | ## Install Agent Sandbox diff --git a/docs/sandboxes/manage-sandboxes.mdx b/docs/sandboxes/manage-sandboxes.mdx index 3eddecec5..eddbe210b 100644 --- a/docs/sandboxes/manage-sandboxes.mdx +++ b/docs/sandboxes/manage-sandboxes.mdx @@ -35,6 +35,16 @@ To request GPU resources, add `--gpu`: openshell sandbox create --gpu -- claude ``` +To request multiple GPUs on a Kubernetes-backed gateway, set a GPU count: + +```shell +openshell sandbox create --gpu-count 4 -- claude +``` + +GPU counts schedule Kubernetes sandbox pods with an `nvidia.com/gpu` resource +limit. Use `--gpu-device` only when you need driver-specific device selection; +it cannot be combined with `--gpu-count`. + For Docker-backed sandboxes, GPU injection uses Docker CDI. If you enable Docker CDI after the gateway starts, restart the gateway so OpenShell can detect the updated Docker daemon capability. diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index 3c4308f3f..603f5e954 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -90,6 +90,9 @@ message DriverSandboxSpec { // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the // first available GPU. 
string gpu_device = 10; + // Number of NVIDIA GPUs requested for this sandbox. 0 means unspecified; + // when gpu=true and gpu_count=0, drivers keep their existing default. + uint32 gpu_count = 11; } // Driver-owned runtime template consumed by the compute platform. diff --git a/proto/openshell.proto b/proto/openshell.proto index bb2ce6cec..f2c17f451 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -256,6 +256,9 @@ message SandboxSpec { // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the // first available GPU. string gpu_device = 10; + // Number of NVIDIA GPUs requested for this sandbox. 0 means unspecified; + // when gpu=true and gpu_count=0, drivers keep their existing default. + uint32 gpu_count = 11; } // Public sandbox template mapped onto compute-driver template inputs. From 3dd8d5a0c050028eb306d10945a43661b68c9eb7 Mon Sep 17 00:00:00 2001 From: Ryan Angilly Date: Tue, 12 May 2026 19:38:33 -0600 Subject: [PATCH 2/3] feat(sandbox): add resource JSON passthrough Signed-off-by: Ryan Angilly --- architecture/compute-runtimes.md | 3 +- crates/openshell-cli/src/main.rs | 46 ++-- crates/openshell-cli/src/run.rs | 198 ++++++++++++++++-- .../sandbox_create_lifecycle_integration.rs | 92 +++++++- crates/openshell-driver-docker/src/tests.rs | 1 - crates/openshell-driver-kubernetes/README.md | 11 +- .../openshell-driver-kubernetes/src/driver.rs | 72 +------ crates/openshell-server/src/compute/mod.rs | 26 +-- .../openshell-server/src/grpc/validation.rs | 30 --- docs/sandboxes/manage-sandboxes.mdx | 14 +- proto/compute_driver.proto | 3 - proto/openshell.proto | 3 - 12 files changed, 334 insertions(+), 165 deletions(-) diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index b048af605..11f6de22a 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -63,7 +63,8 @@ Custom sandbox images must include the agent runtime and any system dependencies, but they should not need to 
include the gateway. GPU-capable images must include the user-space libraries required by the workload. The runtime still owns GPU device injection or resource scheduling. Kubernetes maps -GPU counts to pod `nvidia.com/gpu` limits when the cluster exposes that resource. +template resource limits such as `nvidia.com/gpu` into the sandbox pod when the +cluster exposes those resources. ## Deployment Shape diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 8e5d54faa..d1bc7ca40 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -1088,9 +1088,17 @@ enum SandboxCommands { /// /// This implies --gpu. Kubernetes-backed gateways schedule pods with /// the corresponding nvidia.com/gpu resource limit. - #[arg(long, value_name = "COUNT", conflicts_with = "gpu_device", value_parser = clap::value_parser!(u32).range(1..))] + #[arg(long, value_name = "COUNT", value_parser = clap::value_parser!(u32).range(1..))] gpu_count: Option, + /// Set compute resource requests and limits as JSON. + /// + /// The JSON must be an object with the same shape as + /// SandboxTemplate.resources, for example: + /// {"requests":{"cpu":"2"},"limits":{"cpu":"16"}} + #[arg(long, value_name = "JSON")] + resources_json: Option, + /// Target a driver-specific GPU device. Docker and Podman use CDI device IDs /// (for example "nvidia.com/gpu=0"); VM uses a PCI BDF or index. /// Only valid with --gpu. When omitted with --gpu, the driver uses its default GPU selection. 
@@ -2372,6 +2380,7 @@ async fn main() -> Result<()> { editor, gpu, gpu_count, + resources_json, gpu_device, providers, policy, @@ -2439,6 +2448,7 @@ async fn main() -> Result<()> { keep, gpu, gpu_count, + resources_json.as_deref(), gpu_device.as_deref(), editor, &providers, @@ -3843,23 +3853,27 @@ mod tests { } #[test] - fn sandbox_create_gpu_count_conflicts_with_gpu_device() { - let result = Cli::try_parse_from([ + fn sandbox_create_resources_json_parses() { + let cli = Cli::try_parse_from([ "openshell", "sandbox", "create", - "--gpu", - "--gpu-count", - "2", - "--gpu-device", - "nvidia.com/gpu=0", - ]); - assert!( - result.is_err(), - "sandbox create should reject combining --gpu-count with --gpu-device" - ); - let err = result.unwrap_err().to_string(); - assert!(err.contains("--gpu-count")); - assert!(err.contains("--gpu-device")); + "--resources-json", + r#"{"requests":{"cpu":"2"},"limits":{"cpu":"16"}}"#, + ]) + .expect("sandbox create --resources-json should parse"); + + match cli.command { + Some(Commands::Sandbox { + command: Some(SandboxCommands::Create { resources_json, .. }), + .. + }) => { + assert_eq!( + resources_json.as_deref(), + Some(r#"{"requests":{"cpu":"2"},"limits":{"cpu":"16"}}"#) + ); + } + other => panic!("expected sandbox create command, got: {other:?}"), + } } } diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 5e5a48ac5..07ee6fd69 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -67,6 +67,8 @@ pub use openshell_core::forward::{ find_forward_by_port, list_forwards, stop_forward, stop_forwards_for_sandbox, }; +const NVIDIA_GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; + /// Convert a sandbox phase integer to a human-readable string. 
fn phase_name(phase: i32) -> &'static str { match SandboxPhase::try_from(phase) { @@ -1457,6 +1459,93 @@ async fn finalize_sandbox_create_session( session_result } +fn parse_sandbox_resources_json( + resources_json: Option<&str>, + gpu_count: Option, +) -> Result> { + let mut resources = match resources_json { + Some(raw) => { + if raw.trim().is_empty() { + return Err(miette::miette!("--resources-json cannot be empty")); + } + + let value: serde_json::Value = serde_json::from_str(raw) + .into_diagnostic() + .wrap_err("invalid --resources-json JSON")?; + let serde_json::Value::Object(fields) = value else { + return Err(miette::miette!("--resources-json must be a JSON object")); + }; + + Some(json_object_to_prost_struct(fields)) + } + None => None, + }; + + if let Some(count) = gpu_count { + if count == 0 { + return Err(miette::miette!("--gpu-count must be greater than zero")); + } + inject_gpu_count_resource(resources.get_or_insert_with(Default::default), count); + } + + Ok(resources) +} + +fn json_object_to_prost_struct( + fields: serde_json::Map, +) -> prost_types::Struct { + prost_types::Struct { + fields: fields + .into_iter() + .map(|(key, value)| (key, json_value_to_prost_value(value))) + .collect(), + } +} + +fn json_value_to_prost_value(value: serde_json::Value) -> prost_types::Value { + use prost_types::{ListValue, Value, value::Kind}; + + let kind = match value { + serde_json::Value::Null => Kind::NullValue(0), + serde_json::Value::Bool(value) => Kind::BoolValue(value), + serde_json::Value::Number(value) => Kind::NumberValue(value.as_f64().unwrap_or_default()), + serde_json::Value::String(value) => Kind::StringValue(value), + serde_json::Value::Array(values) => Kind::ListValue(ListValue { + values: values.into_iter().map(json_value_to_prost_value).collect(), + }), + serde_json::Value::Object(fields) => Kind::StructValue(json_object_to_prost_struct(fields)), + }; + + Value { kind: Some(kind) } +} + +fn inject_gpu_count_resource(resources: &mut 
prost_types::Struct, gpu_count: u32) { + use prost_types::{Struct, Value, value::Kind}; + + let limits_kind = &mut resources + .fields + .entry("limits".to_string()) + .or_insert_with(|| Value { + kind: Some(Kind::StructValue(Struct::default())), + }) + .kind; + + if !matches!(limits_kind, Some(Kind::StructValue(_))) { + *limits_kind = Some(Kind::StructValue(Struct::default())); + } + + let Some(Kind::StructValue(limits)) = limits_kind.as_mut() else { + unreachable!("limits kind was normalized to a StructValue"); + }; + + limits.fields.insert( + NVIDIA_GPU_RESOURCE_NAME.to_string(), + Value { + kind: Some(Kind::StringValue(gpu_count.to_string())), + }, + ); +} + /// Create a sandbox with default settings. #[allow(clippy::too_many_arguments, clippy::implicit_hasher)] // user-facing CLI command; default hasher is fine pub async fn sandbox_create( @@ -1468,6 +1557,7 @@ pub async fn sandbox_create( keep: bool, gpu: bool, gpu_count: Option, + resources_json: Option<&str>, gpu_device: Option<&str>, editor: Option, providers: &[String], @@ -1484,14 +1574,7 @@ pub async fn sandbox_create( "--editor cannot be used with a trailing command; use `openshell sandbox connect --editor ...` after the sandbox is ready" )); } - if gpu_count == Some(0) { - return Err(miette::miette!("--gpu-count must be greater than zero")); - } - if gpu_count.is_some() && gpu_device.is_some() { - return Err(miette::miette!( - "--gpu-count cannot be combined with --gpu-device" - )); - } + let resources = parse_sandbox_resources_json(resources_json, gpu_count)?; // Check port availability *before* creating the sandbox so we don't // leave an orphaned sandbox behind when the forward would fail. 
@@ -1541,15 +1624,19 @@ pub async fn sandbox_create( let policy = load_sandbox_policy(policy)?; - let template = image.map(|img| SandboxTemplate { - image: img, - ..SandboxTemplate::default() - }); + let template = if image.is_some() || resources.is_some() { + Some(SandboxTemplate { + image: image.unwrap_or_default(), + resources, + ..SandboxTemplate::default() + }) + } else { + None + }; let request = CreateSandboxRequest { spec: Some(SandboxSpec { gpu: requested_gpu, - gpu_count: gpu_count.unwrap_or_default(), gpu_device: gpu_device.unwrap_or_default().to_string(), policy, providers: configured_providers, @@ -6028,9 +6115,9 @@ mod tests { gateway_env_override_warning, gateway_select_with, gateway_type_label, git_sync_files, http_health_check, image_requests_gpu, import_local_package_mtls_bundle, inferred_provider_type, package_managed_tls_dirs, parse_cli_setting_value, - parse_credential_pairs, plaintext_gateway_is_remote, provisioning_timeout_message, - ready_false_condition_message, resolve_from, sandbox_should_persist, - service_expose_status_error, service_url_for_gateway, + parse_credential_pairs, parse_sandbox_resources_json, plaintext_gateway_is_remote, + provisioning_timeout_message, ready_false_condition_message, resolve_from, + sandbox_should_persist, service_expose_status_error, service_url_for_gateway, }; use crate::TEST_ENV_LOCK; use hyper::StatusCode; @@ -6048,6 +6135,85 @@ mod tests { Provider, SandboxCondition, SandboxStatus, datamodel::v1::ObjectMeta, }; + fn prost_struct_field<'a>( + value: &'a prost_types::Struct, + field: &str, + ) -> &'a prost_types::Struct { + let value = value + .fields + .get(field) + .and_then(|value| value.kind.as_ref()) + .unwrap_or_else(|| panic!("expected {field} field")); + let prost_types::value::Kind::StructValue(value) = value else { + panic!("expected {field} to be an object"); + }; + value + } + + fn prost_string_field(value: &prost_types::Struct, field: &str) -> String { + let value = value + .fields + 
.get(field) + .and_then(|value| value.kind.as_ref()) + .unwrap_or_else(|| panic!("expected {field} field")); + let prost_types::value::Kind::StringValue(value) = value else { + panic!("expected {field} to be a string"); + }; + value.clone() + } + + #[test] + fn parse_sandbox_resources_json_rejects_empty_string() { + let err = parse_sandbox_resources_json(Some(""), None) + .expect_err("empty resources JSON should fail"); + + assert!(err.to_string().contains("--resources-json")); + assert!(err.to_string().contains("empty")); + } + + #[test] + fn parse_sandbox_resources_json_rejects_invalid_json() { + let err = parse_sandbox_resources_json(Some("{bad"), None) + .expect_err("invalid resources JSON should fail"); + + assert!(err.to_string().contains("invalid --resources-json JSON")); + } + + #[test] + fn parse_sandbox_resources_json_rejects_non_object_json() { + let err = parse_sandbox_resources_json(Some(r#"["cpu"]"#), None) + .expect_err("non-object resources JSON should fail"); + + assert!(err.to_string().contains("--resources-json")); + assert!(err.to_string().contains("JSON object")); + } + + #[test] + fn parse_sandbox_resources_json_injects_gpu_count_resource() { + let resources = parse_sandbox_resources_json( + Some(r#"{"requests":{"cpu":"2"},"limits":{"cpu":"16"}}"#), + Some(4), + ) + .expect("resources should parse") + .expect("resources should be present"); + + let requests = prost_struct_field(&resources, "requests"); + let limits = prost_struct_field(&resources, "limits"); + + assert_eq!(prost_string_field(requests, "cpu"), "2"); + assert_eq!(prost_string_field(limits, "cpu"), "16"); + assert_eq!(prost_string_field(limits, "nvidia.com/gpu"), "4"); + } + + #[test] + fn parse_sandbox_resources_json_rejects_zero_gpu_count() { + let err = + parse_sandbox_resources_json(None, Some(0)).expect_err("zero GPU count should fail"); + + assert!(err.to_string().contains("--gpu-count")); + assert!(err.to_string().contains("greater than zero")); + } + struct EnvVarGuard { 
key: &'static str, original: Option, diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index dfdf975a4..861c8c011 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -659,6 +659,30 @@ fn test_tls(server: &TestServer) -> TlsOptions { server.tls.with_gateway_name("openshell") } +fn struct_field<'a>(value: &'a prost_types::Struct, field: &str) -> &'a prost_types::Struct { + let value = value + .fields + .get(field) + .and_then(|value| value.kind.as_ref()) + .unwrap_or_else(|| panic!("expected {field} field")); + let prost_types::value::Kind::StructValue(value) = value else { + panic!("expected {field} to be an object"); + }; + value +} + +fn string_field(value: &prost_types::Struct, field: &str) -> String { + let value = value + .fields + .get(field) + .and_then(|value| value.kind.as_ref()) + .unwrap_or_else(|| panic!("expected {field} field")); + let prost_types::value::Kind::StringValue(value) = value else { + panic!("expected {field} to be a string"); + }; + value.clone() +} + #[tokio::test] async fn sandbox_create_keeps_command_sessions_by_default() { let server = run_server().await; @@ -679,6 +703,7 @@ async fn sandbox_create_keeps_command_sessions_by_default() { None, None, None, + None, &[], None, None, @@ -700,7 +725,7 @@ async fn sandbox_create_keeps_command_sessions_by_default() { } #[tokio::test] -async fn sandbox_create_gpu_count_sets_gpu_intent_and_count() { +async fn sandbox_create_gpu_count_sets_gpu_intent_and_resources() { let server = run_server().await; let fake_ssh_dir = tempfile::tempdir().unwrap(); let xdg_dir = tempfile::tempdir().unwrap(); @@ -719,6 +744,7 @@ async fn sandbox_create_gpu_count_sets_gpu_intent_and_count() { Some(2), None, None, + None, &[], None, None, @@ -737,7 +763,65 @@ async fn 
sandbox_create_gpu_count_sets_gpu_intent_and_count() { .and_then(|request| request.spec.as_ref()) .expect("create request should include a spec"); assert!(spec.gpu); - assert_eq!(spec.gpu_count, 2); + + let resources = spec + .template + .as_ref() + .and_then(|template| template.resources.as_ref()) + .expect("gpu count should create template resources"); + let limits = struct_field(resources, "limits"); + assert_eq!(string_field(limits, "nvidia.com/gpu"), "2"); +} + +#[tokio::test] +async fn sandbox_create_resources_json_sets_template_resources() { + let server = run_server().await; + let fake_ssh_dir = tempfile::tempdir().unwrap(); + let xdg_dir = tempfile::tempdir().unwrap(); + let _env = test_env(&fake_ssh_dir, &xdg_dir); + let tls = test_tls(&server); + install_fake_ssh(&fake_ssh_dir); + + run::sandbox_create( + &server.endpoint, + Some("resources-json"), + None, + "openshell", + None, + true, + false, + None, + Some(r#"{"requests":{"cpu":"2"},"limits":{"cpu":"16"}}"#), + None, + None, + &[], + None, + None, + &["echo".to_string(), "OK".to_string()], + Some(false), + Some(false), + &HashMap::new(), + &tls, + ) + .await + .expect("sandbox create should succeed"); + + let requests = created_requests(&server).await; + let spec = requests + .last() + .and_then(|request| request.spec.as_ref()) + .expect("create request should include a spec"); + assert!(!spec.gpu); + + let resources = spec + .template + .as_ref() + .and_then(|template| template.resources.as_ref()) + .expect("resources JSON should create template resources"); + let requests = struct_field(resources, "requests"); + let limits = struct_field(resources, "limits"); + assert_eq!(string_field(requests, "cpu"), "2"); + assert_eq!(string_field(limits, "cpu"), "16"); } #[tokio::test] @@ -760,6 +844,7 @@ async fn sandbox_create_deletes_command_sessions_with_no_keep() { None, None, None, + None, &[], None, None, @@ -803,6 +888,7 @@ async fn sandbox_create_deletes_shell_sessions_with_no_keep() { None, None, 
None, + None, &[], None, None, @@ -846,6 +932,7 @@ async fn sandbox_create_keeps_sandbox_with_hidden_keep_flag() { None, None, None, + None, &[], None, None, @@ -889,6 +976,7 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() { None, None, None, + None, &[], None, Some(openshell_core::forward::ForwardSpec::new(forward_port)), diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index f5155fd13..df68d39d6 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -33,7 +33,6 @@ fn test_sandbox() -> DriverSandbox { }), gpu: false, gpu_device: String::new(), - gpu_count: 0, }), status: None, } diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 6202e69f6..f2324b757 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -44,8 +44,9 @@ pods do not need direct external ingress for SSH. ## GPU Support When a sandbox requests GPU support, the driver checks node allocatable capacity -for `nvidia.com/gpu`. A request with only `gpu=true` asks for one GPU. A request -with `gpu_count > 0` sets the pod resource limit to that count. The cluster must -expose `nvidia.com/gpu` through the NVIDIA device plugin or an equivalent -device-plugin implementation, and the sandbox image must provide the user-space -libraries needed by the agent workload. +for `nvidia.com/gpu`. A request with only `gpu=true` asks for one GPU unless the +template already provides an explicit `limits["nvidia.com/gpu"]` value through +the resource passthrough. The cluster must expose `nvidia.com/gpu` through the +NVIDIA device plugin or an equivalent device-plugin implementation, and the +sandbox image must provide the user-space libraries needed by the agent +workload. 
diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index a434731c3..5b2fa8984 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -194,10 +194,7 @@ impl KubernetesComputeDriver { } pub async fn validate_sandbox_create(&self, sandbox: &Sandbox) -> Result<(), tonic::Status> { - let gpu_requested = sandbox - .spec - .as_ref() - .is_some_and(|spec| spec.gpu || spec.gpu_count > 0); + let gpu_requested = sandbox.spec.as_ref().is_some_and(|spec| spec.gpu); if gpu_requested && !self.has_gpu_capacity().await.map_err(|err| { tonic::Status::internal(format!("check GPU node capacity failed: {err}")) @@ -1014,14 +1011,7 @@ fn sandbox_to_k8s_spec( if let Some(template) = spec.template.as_ref() { root.insert( "podTemplate".to_string(), - sandbox_template_to_k8s( - template, - spec.gpu, - spec.gpu_count, - &pod_env, - inject_workspace, - params, - ), + sandbox_template_to_k8s(template, spec.gpu, &pod_env, inject_workspace, params), ); if !template.agent_socket_path.is_empty() { root.insert( @@ -1054,7 +1044,6 @@ fn sandbox_to_k8s_spec( sandbox_template_to_k8s( &SandboxTemplate::default(), spec.is_some_and(|s| s.gpu), - spec.map_or(0, |s| s.gpu_count), &pod_env, inject_workspace, params, @@ -1070,7 +1059,6 @@ fn sandbox_to_k8s_spec( fn sandbox_template_to_k8s( template: &SandboxTemplate, gpu: bool, - gpu_count: u32, spec_environment: &std::collections::HashMap, inject_workspace: bool, params: &SandboxPodParams<'_>, @@ -1103,7 +1091,7 @@ fn sandbox_template_to_k8s( if use_user_namespaces { spec.insert("hostUsers".to_string(), serde_json::json!(false)); - if gpu || gpu_count > 0 { + if gpu { warn!( "GPU sandbox with user namespaces enabled — \ NVIDIA device plugin compatibility is unverified" @@ -1181,7 +1169,7 @@ fn sandbox_template_to_k8s( ); } - if let Some(resources) = container_resources(template, gpu, gpu_count) { + if let Some(resources) = 
container_resources(template, gpu) { container.insert("resources".to_string(), resources); } spec.insert( @@ -1237,11 +1225,7 @@ fn sandbox_template_to_k8s( result } -fn container_resources( - template: &SandboxTemplate, - gpu: bool, - gpu_count: u32, -) -> Option { +fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option { // Start from the raw resources passthrough in platform_config (preserves // custom resource types like GPU limits that users set via the public API // Struct), then overlay the typed DriverResourceRequirements on top. @@ -1263,10 +1247,7 @@ fn container_resources( apply("limits", "memory", &req.memory_limit); } - if gpu_count > 0 { - let quantity = gpu_count.to_string(); - apply_gpu_limit(&mut resources, &quantity); - } else if gpu { + if gpu { apply_gpu_limit(&mut resources, GPU_RESOURCE_QUANTITY); } if resources.as_object().is_some_and(serde_json::Map::is_empty) { @@ -1290,7 +1271,9 @@ fn apply_gpu_limit(resources: &mut serde_json::Value, quantity: &str) { return apply_gpu_limit(resources, quantity); }; - limits_obj.insert(GPU_RESOURCE_NAME.to_string(), serde_json::json!(quantity)); + limits_obj + .entry(GPU_RESOURCE_NAME.to_string()) + .or_insert_with(|| serde_json::json!(quantity)); } #[allow(clippy::too_many_arguments)] @@ -1821,7 +1804,6 @@ mod tests { sandbox_template_to_k8s( &SandboxTemplate::default(), true, - 0, &std::collections::HashMap::new(), true, ¶ms, @@ -1838,26 +1820,6 @@ mod tests { ); } - #[test] - fn gpu_count_sandbox_sets_requested_gpu_limit() { - let pod_template = { - let params = SandboxPodParams::default(); - sandbox_template_to_k8s( - &SandboxTemplate::default(), - true, - 4, - &std::collections::HashMap::new(), - true, - ¶ms, - ) - }; - - assert_eq!( - pod_template["spec"]["containers"][0]["resources"]["limits"][GPU_RESOURCE_NAME], - serde_json::json!("4") - ); - } - #[test] fn gpu_sandbox_uses_template_runtime_class_name_when_set() { let template = SandboxTemplate { @@ -1878,7 +1840,6 @@ mod tests { 
sandbox_template_to_k8s( &template, true, - 0, &std::collections::HashMap::new(), true, ¶ms, @@ -1911,7 +1872,6 @@ mod tests { sandbox_template_to_k8s( &template, false, - 0, &std::collections::HashMap::new(), true, ¶ms, @@ -1940,7 +1900,6 @@ mod tests { sandbox_template_to_k8s( &template, true, - 0, &std::collections::HashMap::new(), true, ¶ms, @@ -1956,7 +1915,7 @@ mod tests { } #[test] - fn gpu_count_preserves_existing_resource_limits() { + fn gpu_sandbox_preserves_explicit_gpu_resource_limit() { use openshell_core::proto::compute::v1::DriverResourceRequirements; let template = SandboxTemplate { @@ -1985,7 +1944,6 @@ mod tests { sandbox_template_to_k8s( &template, true, - 4, &std::collections::HashMap::new(), true, ¶ms, @@ -1995,7 +1953,7 @@ mod tests { let limits = &pod_template["spec"]["containers"][0]["resources"]["limits"]; assert_eq!(limits["cpu"], serde_json::json!("2")); assert_eq!(limits["example.com/custom"], serde_json::json!("7")); - assert_eq!(limits[GPU_RESOURCE_NAME], serde_json::json!("4")); + assert_eq!(limits[GPU_RESOURCE_NAME], serde_json::json!("2")); } #[test] @@ -2008,7 +1966,6 @@ mod tests { sandbox_template_to_k8s( &SandboxTemplate::default(), false, - 0, &std::collections::HashMap::new(), true, ¶ms, @@ -2034,7 +1991,6 @@ mod tests { sandbox_template_to_k8s( &SandboxTemplate::default(), false, - 0, &std::collections::HashMap::new(), true, ¶ms, @@ -2058,7 +2014,6 @@ mod tests { sandbox_template_to_k8s( &template, false, - 0, &std::collections::HashMap::new(), true, ¶ms, @@ -2198,7 +2153,6 @@ mod tests { let pod_template = sandbox_template_to_k8s( &SandboxTemplate::default(), false, - 0, &std::collections::HashMap::new(), false, // user provided custom VCTs ¶ms, @@ -2237,7 +2191,6 @@ mod tests { sandbox_template_to_k8s( &SandboxTemplate::default(), false, - 0, &std::collections::HashMap::new(), true, ¶ms, @@ -2303,7 +2256,6 @@ mod tests { let pod_template = sandbox_template_to_k8s( &template, false, - 0, &std::collections::HashMap::new(), 
true, ¶ms, @@ -2342,7 +2294,6 @@ mod tests { let pod_template = sandbox_template_to_k8s( &template, false, - 0, &std::collections::HashMap::new(), true, ¶ms, @@ -2369,7 +2320,6 @@ mod tests { sandbox_template_to_k8s( &SandboxTemplate::default(), false, - 0, &std::collections::HashMap::new(), true, ¶ms, diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 38120d67b..d2fd34011 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -1132,7 +1132,6 @@ fn driver_sandbox_spec_from_public(spec: &SandboxSpec) -> DriverSandboxSpec { .map(driver_sandbox_template_from_public), gpu: spec.gpu, gpu_device: spec.gpu_device.clone(), - gpu_count: spec.gpu_count, } } @@ -1492,8 +1491,7 @@ fn derive_phase(status: Option<&DriverSandboxStatus>) -> SandboxPhase { } fn rewrite_user_facing_conditions(status: &mut Option, spec: Option<&SandboxSpec>) { - let gpu_requested = - spec.is_some_and(|sandbox_spec| sandbox_spec.gpu || sandbox_spec.gpu_count > 0); + let gpu_requested = spec.is_some_and(|sandbox_spec| sandbox_spec.gpu); if !gpu_requested { return; } @@ -2010,28 +2008,6 @@ mod tests { assert!(build_platform_config(&template).is_none()); } - #[test] - fn driver_sandbox_spec_from_public_maps_gpu_count() { - let public = SandboxSpec { - gpu: true, - gpu_count: 4, - ..Default::default() - }; - - let driver = driver_sandbox_spec_from_public(&public); - - assert!(driver.gpu); - assert_eq!(driver.gpu_count, 4); - } - - #[test] - fn driver_sandbox_spec_from_public_preserves_default_gpu_count() { - let driver = driver_sandbox_spec_from_public(&SandboxSpec::default()); - - assert!(!driver.gpu); - assert_eq!(driver.gpu_count, 0); - } - #[test] fn build_platform_config_preserves_non_typed_resource_fields() { let template = SandboxTemplate { diff --git a/crates/openshell-server/src/grpc/validation.rs b/crates/openshell-server/src/grpc/validation.rs index 518babb20..160b7e031 100644 --- 
a/crates/openshell-server/src/grpc/validation.rs +++ b/crates/openshell-server/src/grpc/validation.rs @@ -131,13 +131,6 @@ pub(super) fn validate_sandbox_spec( validate_sandbox_template(tmpl)?; } - // --- GPU allocation mode --- - if spec.gpu_count > 0 && !spec.gpu_device.is_empty() { - return Err(Status::invalid_argument( - "gpu_count cannot be combined with gpu_device", - )); - } - // --- spec.policy serialized size --- if let Some(ref policy) = spec.policy { let size = policy.encoded_len(); @@ -681,29 +674,6 @@ mod tests { assert!(validate_sandbox_spec("gpu-sandbox", &spec).is_ok()); } - #[test] - fn validate_sandbox_spec_accepts_gpu_count() { - let spec = SandboxSpec { - gpu_count: 4, - ..Default::default() - }; - assert!(validate_sandbox_spec("gpu-count-sandbox", &spec).is_ok()); - } - - #[test] - fn validate_sandbox_spec_rejects_gpu_count_with_gpu_device() { - let spec = SandboxSpec { - gpu: true, - gpu_count: 4, - gpu_device: "nvidia.com/gpu=0".to_string(), - ..Default::default() - }; - let err = validate_sandbox_spec("gpu-count-sandbox", &spec).unwrap_err(); - assert_eq!(err.code(), Code::InvalidArgument); - assert!(err.message().contains("gpu_count")); - assert!(err.message().contains("gpu_device")); - } - #[test] fn validate_sandbox_spec_accepts_empty_defaults() { assert!(validate_sandbox_spec("", &default_spec()).is_ok()); diff --git a/docs/sandboxes/manage-sandboxes.mdx b/docs/sandboxes/manage-sandboxes.mdx index eddbe210b..0ded3ddf3 100644 --- a/docs/sandboxes/manage-sandboxes.mdx +++ b/docs/sandboxes/manage-sandboxes.mdx @@ -42,8 +42,18 @@ openshell sandbox create --gpu-count 4 -- claude ``` GPU counts schedule Kubernetes sandbox pods with an `nvidia.com/gpu` resource -limit. Use `--gpu-device` only when you need driver-specific device selection; -it cannot be combined with `--gpu-count`. +limit. 
+ +To pass generic resource requests and limits through to the sandbox template, +use `--resources-json` with a JSON object: + +```shell +openshell sandbox create \ + --resources-json '{"requests":{"cpu":"2"},"limits":{"cpu":"16"}}' +``` + +The JSON is forwarded as `SandboxTemplate.resources`. Use Kubernetes resource +quantity strings for CPU, memory, and extended resources. For Docker-backed sandboxes, GPU injection uses Docker CDI. If you enable Docker CDI after the gateway starts, restart the gateway so OpenShell can detect the diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index 603f5e954..3c4308f3f 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -90,9 +90,6 @@ message DriverSandboxSpec { // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the // first available GPU. string gpu_device = 10; - // Number of NVIDIA GPUs requested for this sandbox. 0 means unspecified; - // when gpu=true and gpu_count=0, drivers keep their existing default. - uint32 gpu_count = 11; } // Driver-owned runtime template consumed by the compute platform. diff --git a/proto/openshell.proto b/proto/openshell.proto index f2c17f451..bb2ce6cec 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -256,9 +256,6 @@ message SandboxSpec { // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the // first available GPU. string gpu_device = 10; - // Number of NVIDIA GPUs requested for this sandbox. 0 means unspecified; - // when gpu=true and gpu_count=0, drivers keep their existing default. - uint32 gpu_count = 11; } // Public sandbox template mapped onto compute-driver template inputs. 
From 9c167294be5013752924437f60d5e90ec32511ae Mon Sep 17 00:00:00 2001 From: Ryan Angilly Date: Wed, 13 May 2026 10:07:32 -0600 Subject: [PATCH 3/3] refactor(sandbox): add ResourceSpec for sandbox resources Signed-off-by: Ryan Angilly --- architecture/compute-runtimes.md | 4 +- crates/openshell-cli/src/main.rs | 100 +++++-- crates/openshell-cli/src/run.rs | 257 +++++++----------- .../sandbox_create_lifecycle_integration.rs | 79 ++---- crates/openshell-driver-docker/src/tests.rs | 1 + crates/openshell-driver-kubernetes/README.md | 13 +- .../openshell-driver-kubernetes/src/driver.rs | 48 +++- crates/openshell-server/src/compute/mod.rs | 117 +++++++- .../openshell-server/src/grpc/validation.rs | 192 ++++++++++++- docs/sandboxes/manage-sandboxes.mdx | 21 +- proto/compute_driver.proto | 4 + proto/openshell.proto | 22 ++ 12 files changed, 592 insertions(+), 266 deletions(-) diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index 11f6de22a..af12291a8 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -63,8 +63,8 @@ Custom sandbox images must include the agent runtime and any system dependencies, but they should not need to include the gateway. GPU-capable images must include the user-space libraries required by the workload. The runtime still owns GPU device injection or resource scheduling. Kubernetes maps -template resource limits such as `nvidia.com/gpu` into the sandbox pod when the -cluster exposes those resources. +portable `SandboxSpec.resources` CPU, memory, and GPU count requirements into +pod resource requests and limits when the cluster exposes those resources. 
## Deployment Shape diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index d1bc7ca40..542a57ab7 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -1030,6 +1030,7 @@ enum DoctorCommands { } #[derive(Subcommand, Debug)] +#[allow(clippy::large_enum_variant)] enum SandboxCommands { /// Create a sandbox. #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] @@ -1088,21 +1089,33 @@ enum SandboxCommands { /// /// This implies --gpu. Kubernetes-backed gateways schedule pods with /// the corresponding nvidia.com/gpu resource limit. - #[arg(long, value_name = "COUNT", value_parser = clap::value_parser!(u32).range(1..))] + #[arg(long, value_name = "COUNT", value_parser = clap::value_parser!(u32).range(1..), conflicts_with = "gpu_device")] gpu_count: Option, - /// Set compute resource requests and limits as JSON. - /// - /// The JSON must be an object with the same shape as - /// SandboxTemplate.resources, for example: - /// {"requests":{"cpu":"2"},"limits":{"cpu":"16"}} - #[arg(long, value_name = "JSON")] - resources_json: Option, + /// Minimum CPU cores requested, e.g. "500m" or "2". + #[arg(long, value_name = "QUANTITY")] + cpu_request: Option, + + /// Maximum CPU cores allowed, e.g. "2" or "4". + #[arg(long, value_name = "QUANTITY")] + cpu_limit: Option, + + /// Minimum memory requested, e.g. "512Mi" or "4Gi". + #[arg(long, value_name = "QUANTITY")] + memory_request: Option, + + /// Maximum memory allowed, e.g. "1Gi" or "8Gi". + #[arg(long, value_name = "QUANTITY")] + memory_limit: Option, + + /// Driver-specific resource configuration as KEY=VALUE. + #[arg(long = "resource-config", value_name = "KEY=VALUE")] + resource_config: Vec, /// Target a driver-specific GPU device. Docker and Podman use CDI device IDs /// (for example "nvidia.com/gpu=0"); VM uses a PCI BDF or index. /// Only valid with --gpu. When omitted with --gpu, the driver uses its default GPU selection. 
- #[arg(long, requires = "gpu")] + #[arg(long, requires = "gpu", conflicts_with = "gpu_count")] gpu_device: Option, /// Provider names to attach to this sandbox. @@ -2380,7 +2393,11 @@ async fn main() -> Result<()> { editor, gpu, gpu_count, - resources_json, + cpu_request, + cpu_limit, + memory_request, + memory_limit, + resource_config, gpu_device, providers, policy, @@ -2447,8 +2464,14 @@ async fn main() -> Result<()> { upload_spec.as_ref(), keep, gpu, - gpu_count, - resources_json.as_deref(), + run::SandboxResourceArgs { + cpu_request: cpu_request.as_deref(), + cpu_limit: cpu_limit.as_deref(), + memory_request: memory_request.as_deref(), + memory_limit: memory_limit.as_deref(), + gpu_count, + driver_config: &resource_config, + }, gpu_device.as_deref(), editor, &providers, @@ -3853,27 +3876,66 @@ mod tests { } #[test] - fn sandbox_create_resources_json_parses() { + fn sandbox_create_resource_spec_flags_parse() { let cli = Cli::try_parse_from([ "openshell", "sandbox", "create", - "--resources-json", - r#"{"requests":{"cpu":"2"},"limits":{"cpu":"16"}}"#, + "--cpu-request", + "2", + "--cpu-limit", + "4", + "--memory-request", + "8Gi", + "--memory-limit", + "16Gi", + "--resource-config", + "kubernetes.resource-name=nvidia.com/gpu", ]) - .expect("sandbox create --resources-json should parse"); + .expect("sandbox create resource flags should parse"); match cli.command { Some(Commands::Sandbox { - command: Some(SandboxCommands::Create { resources_json, .. }), + command: + Some(SandboxCommands::Create { + cpu_request, + cpu_limit, + memory_request, + memory_limit, + resource_config, + .. + }), .. 
}) => { + assert_eq!(cpu_request.as_deref(), Some("2")); + assert_eq!(cpu_limit.as_deref(), Some("4")); + assert_eq!(memory_request.as_deref(), Some("8Gi")); + assert_eq!(memory_limit.as_deref(), Some("16Gi")); assert_eq!( - resources_json.as_deref(), - Some(r#"{"requests":{"cpu":"2"},"limits":{"cpu":"16"}}"#) + resource_config, + vec!["kubernetes.resource-name=nvidia.com/gpu".to_string()] ); } other => panic!("expected sandbox create command, got: {other:?}"), } } + + #[test] + fn sandbox_create_gpu_count_conflicts_with_gpu_device() { + let result = Cli::try_parse_from([ + "openshell", + "sandbox", + "create", + "--gpu", + "--gpu-count", + "2", + "--gpu-device", + "nvidia.com/gpu=0", + ]); + + assert!( + result.is_err(), + "sandbox create should reject combining --gpu-count and --gpu-device" + ); + } } diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 07ee6fd69..f4d931445 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -36,7 +36,7 @@ use openshell_core::proto::{ LintProviderProfilesRequest, ListProviderProfilesRequest, ListProvidersRequest, ListSandboxPoliciesRequest, ListSandboxProvidersRequest, ListSandboxesRequest, ListServicesRequest, PolicySource, PolicyStatus, Provider, ProviderProfile, - ProviderProfileDiagnostic, ProviderProfileImportItem, RejectDraftChunkRequest, + ProviderProfileDiagnostic, ProviderProfileImportItem, RejectDraftChunkRequest, ResourceSpec, RevokeSshSessionRequest, Sandbox, SandboxPhase, SandboxPolicy, SandboxSpec, SandboxTemplate, ServiceEndpointResponse, SetClusterInferenceRequest, SettingScope, SettingValue, TcpForwardFrame, TcpForwardInit, TcpRelayTarget, UpdateConfigRequest, UpdateProviderRequest, @@ -67,8 +67,6 @@ pub use openshell_core::forward::{ find_forward_by_port, list_forwards, stop_forward, stop_forwards_for_sandbox, }; -const NVIDIA_GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; - /// Convert a sandbox phase integer to a human-readable string. 
 fn phase_name(phase: i32) -> &'static str {
     match SandboxPhase::try_from(phase) {
@@ -1459,91 +1457,52 @@ async fn finalize_sandbox_create_session(
     session_result
 }
 
-fn parse_sandbox_resources_json(
-    resources_json: Option<&str>,
-    gpu_count: Option<u32>,
-) -> Result<Option<prost_types::Struct>> {
-    let mut resources = match resources_json {
-        Some(raw) => {
-            if raw.trim().is_empty() {
-                return Err(miette::miette!("--resources-json cannot be empty"));
-            }
-
-            let value: serde_json::Value = serde_json::from_str(raw)
-                .into_diagnostic()
-                .wrap_err("invalid --resources-json JSON")?;
-            let serde_json::Value::Object(fields) = value else {
-                return Err(miette::miette!("--resources-json must be a JSON object"));
-            };
+#[derive(Clone, Copy, Debug, Default)]
+pub struct SandboxResourceArgs<'a> {
+    pub cpu_request: Option<&'a str>,
+    pub cpu_limit: Option<&'a str>,
+    pub memory_request: Option<&'a str>,
+    pub memory_limit: Option<&'a str>,
+    pub gpu_count: Option<u32>,
+    pub driver_config: &'a [String],
+}
 
-            Some(json_object_to_prost_struct(fields))
-        }
-        None => None,
+fn build_resource_spec(args: SandboxResourceArgs<'_>) -> Result<Option<ResourceSpec>> {
+    let mut resources = ResourceSpec {
+        driver_config: parse_key_value_pairs(args.driver_config, "--resource-config")?,
+        ..ResourceSpec::default()
     };
 
-    if let Some(count) = gpu_count {
+    if let Some(value) = args.cpu_request {
+        resources.cpu_request = value.to_string();
+    }
+    if let Some(value) = args.cpu_limit {
+        resources.cpu_limit = value.to_string();
+    }
+    if let Some(value) = args.memory_request {
+        resources.memory_request = value.to_string();
+    }
+    if let Some(value) = args.memory_limit {
+        resources.memory_limit = value.to_string();
+    }
+    if let Some(count) = args.gpu_count {
         if count == 0 {
             return Err(miette::miette!("--gpu-count must be greater than zero"));
         }
-        inject_gpu_count_resource(resources.get_or_insert_with(Default::default), count);
-    }
-
-    Ok(resources)
-}
-
-fn json_object_to_prost_struct(
-    fields: serde_json::Map<String, serde_json::Value>,
-) -> 
prost_types::Struct { - fields: fields - .into_iter() - .map(|(key, value)| (key, json_value_to_prost_value(value))) - .collect(), + resources.gpu_count = count; } -} - -fn json_value_to_prost_value(value: serde_json::Value) -> prost_types::Value { - use prost_types::{ListValue, Value, value::Kind}; - - let kind = match value { - serde_json::Value::Null => Kind::NullValue(0), - serde_json::Value::Bool(value) => Kind::BoolValue(value), - serde_json::Value::Number(value) => Kind::NumberValue(value.as_f64().unwrap_or_default()), - serde_json::Value::String(value) => Kind::StringValue(value), - serde_json::Value::Array(values) => Kind::ListValue(ListValue { - values: values.into_iter().map(json_value_to_prost_value).collect(), - }), - serde_json::Value::Object(fields) => Kind::StructValue(json_object_to_prost_struct(fields)), - }; - Value { kind: Some(kind) } -} - -fn inject_gpu_count_resource(resources: &mut prost_types::Struct, gpu_count: u32) { - use prost_types::{Struct, Value, value::Kind}; - - let limits_kind = &mut resources - .fields - .entry("limits".to_string()) - .or_insert_with(|| Value { - kind: Some(Kind::StructValue(Struct::default())), - }) - .kind; - - if !matches!(limits_kind, Some(Kind::StructValue(_))) { - *limits_kind = Some(Kind::StructValue(Struct::default())); + if resources.cpu_request.is_empty() + && resources.cpu_limit.is_empty() + && resources.memory_request.is_empty() + && resources.memory_limit.is_empty() + && resources.gpu_count == 0 + && resources.driver_config.is_empty() + { + Ok(None) + } else { + Ok(Some(resources)) } - - let Some(Kind::StructValue(limits)) = limits_kind.as_mut() else { - unreachable!("limits kind was normalized to a StructValue"); - }; - - limits.fields.insert( - NVIDIA_GPU_RESOURCE_NAME.to_string(), - Value { - kind: Some(Kind::StringValue(gpu_count.to_string())), - }, - ); } /// Create a sandbox with default settings. 
@@ -1556,8 +1515,7 @@ pub async fn sandbox_create( upload: Option<&(String, Option, bool)>, keep: bool, gpu: bool, - gpu_count: Option, - resources_json: Option<&str>, + resources: SandboxResourceArgs<'_>, gpu_device: Option<&str>, editor: Option, providers: &[String], @@ -1574,7 +1532,12 @@ pub async fn sandbox_create( "--editor cannot be used with a trailing command; use `openshell sandbox connect --editor ...` after the sandbox is ready" )); } - let resources = parse_sandbox_resources_json(resources_json, gpu_count)?; + let resources = build_resource_spec(resources)?; + if resources.as_ref().is_some_and(|spec| spec.gpu_count > 0) && gpu_device.is_some() { + return Err(miette::miette!( + "--gpu-count cannot be combined with --gpu-device" + )); + } // Check port availability *before* creating the sandbox so we don't // leave an orphaned sandbox behind when the forward would fail. @@ -1610,8 +1573,9 @@ pub async fn sandbox_create( } None => None, }; - let requested_gpu = - gpu || gpu_count.is_some() || image.as_deref().is_some_and(image_requests_gpu); + let requested_gpu = gpu + || resources.as_ref().is_some_and(|spec| spec.gpu_count > 0) + || image.as_deref().is_some_and(image_requests_gpu); let inferred_types: Vec = inferred_provider_type(command).into_iter().collect(); let configured_providers = ensure_required_providers( @@ -1624,10 +1588,9 @@ pub async fn sandbox_create( let policy = load_sandbox_policy(policy)?; - let template = if image.is_some() || resources.is_some() { + let template = if image.is_some() { Some(SandboxTemplate { image: image.unwrap_or_default(), - resources, ..SandboxTemplate::default() }) } else { @@ -1640,6 +1603,7 @@ pub async fn sandbox_create( gpu_device: gpu_device.unwrap_or_default().to_string(), policy, providers: configured_providers, + resources, template, ..SandboxSpec::default() }), @@ -6109,15 +6073,15 @@ fn format_timestamp_ms(ms: i64) -> String { #[cfg(test)] mod tests { use super::{ - TlsOptions, 
dockerfile_sources_supported_for_gateway, format_endpoint, - format_gateway_select_header, format_gateway_select_items, - format_provider_attachment_table, gateway_add, gateway_auth_label, - gateway_env_override_warning, gateway_select_with, gateway_type_label, git_sync_files, - http_health_check, image_requests_gpu, import_local_package_mtls_bundle, + SandboxResourceArgs, TlsOptions, build_resource_spec, + dockerfile_sources_supported_for_gateway, format_endpoint, format_gateway_select_header, + format_gateway_select_items, format_provider_attachment_table, gateway_add, + gateway_auth_label, gateway_env_override_warning, gateway_select_with, gateway_type_label, + git_sync_files, http_health_check, image_requests_gpu, import_local_package_mtls_bundle, inferred_provider_type, package_managed_tls_dirs, parse_cli_setting_value, - parse_credential_pairs, parse_sandbox_resources_json, plaintext_gateway_is_remote, - provisioning_timeout_message, ready_false_condition_message, resolve_from, - sandbox_should_persist, service_expose_status_error, service_url_for_gateway, + parse_credential_pairs, plaintext_gateway_is_remote, provisioning_timeout_message, + ready_false_condition_message, resolve_from, sandbox_should_persist, + service_expose_status_error, service_url_for_gateway, }; use crate::TEST_ENV_LOCK; use hyper::StatusCode; @@ -6135,83 +6099,54 @@ mod tests { Provider, SandboxCondition, SandboxStatus, datamodel::v1::ObjectMeta, }; - fn prost_struct_field<'a>( - value: &'a prost_types::Struct, - field: &str, - ) -> &'a prost_types::Struct { - let value = value - .fields - .get(field) - .and_then(|value| value.kind.as_ref()) - .unwrap_or_else(|| panic!("expected {field} field")); - let prost_types::value::Kind::StructValue(value) = value else { - panic!("expected {field} to be an object"); - }; - value - } - - fn prost_string_field(value: &prost_types::Struct, field: &str) -> String { - let value = value - .fields - .get(field) - .and_then(|value| value.kind.as_ref()) - 
.unwrap_or_else(|| panic!("expected {field} field")); - let prost_types::value::Kind::StringValue(value) = value else { - panic!("expected {field} to be a string"); - }; - value.clone() - } - - #[test] - fn parse_sandbox_resources_json_rejects_empty_string() { - let err = parse_sandbox_resources_json(Some(""), None) - .expect_err("empty resources JSON should fail"); - - assert!(err.to_string().contains("--resources-json")); - assert!(err.to_string().contains("empty")); - } - #[test] - fn parse_sandbox_resources_json_rejects_invalid_json() { - let err = parse_sandbox_resources_json(Some("{bad"), None) - .expect_err("invalid resources JSON should fail"); - - assert!(err.to_string().contains("invalid --resources-json JSON")); - } - - #[test] - fn parse_sandbox_resources_json_rejects_non_object_json() { - let err = parse_sandbox_resources_json(Some(r#"["cpu"]"#), None) - .expect_err("non-object resources JSON should fail"); + fn build_resource_spec_sets_typed_fields_and_driver_config() { + let driver_config = vec!["kubernetes.resource-name=nvidia.com/mig-1g.5gb".to_string()]; + let resources = build_resource_spec(SandboxResourceArgs { + cpu_request: Some("2"), + cpu_limit: Some("4"), + memory_request: Some("8Gi"), + memory_limit: Some("16Gi"), + gpu_count: Some(4), + driver_config: &driver_config, + }) + .expect("resources should parse") + .expect("resource spec should be present"); - assert!(err.to_string().contains("--resources-json")); - assert!(err.to_string().contains("JSON object")); + assert_eq!(resources.cpu_request, "2"); + assert_eq!(resources.cpu_limit, "4"); + assert_eq!(resources.memory_request, "8Gi"); + assert_eq!(resources.memory_limit, "16Gi"); + assert_eq!(resources.gpu_count, 4); + assert_eq!( + resources.driver_config.get("kubernetes.resource-name"), + Some(&"nvidia.com/mig-1g.5gb".to_string()) + ); } #[test] - fn parse_sandbox_resources_json_injects_gpu_count_resource() { - let resources = parse_sandbox_resources_json( - 
Some(r#"{"requests":{"cpu":"2"},"limits":{"cpu":"16"}}"#), - Some(4), - ) - .expect("resources should parse") - .expect("resources should be present"); - - let requests = prost_struct_field(&resources, "requests"); - let limits = prost_struct_field(&resources, "limits"); + fn build_resource_spec_rejects_zero_gpu_count() { + let err = build_resource_spec(SandboxResourceArgs { + gpu_count: Some(0), + ..SandboxResourceArgs::default() + }) + .expect_err("zero GPU count should fail"); - assert_eq!(prost_string_field(requests, "cpu"), "2"); - assert_eq!(prost_string_field(limits, "cpu"), "16"); - assert_eq!(prost_string_field(limits, "nvidia.com/gpu"), "4"); + assert!(err.to_string().contains("--gpu-count")); + assert!(err.to_string().contains("greater than zero")); } #[test] - fn parse_sandbox_resources_json_rejects_zero_gpu_count() { - let err = - parse_sandbox_resources_json(None, Some(0)).expect_err("zero GPU count should fail"); + fn build_resource_spec_rejects_invalid_driver_config() { + let driver_config = vec!["missing-separator".to_string()]; + let err = build_resource_spec(SandboxResourceArgs { + driver_config: &driver_config, + ..SandboxResourceArgs::default() + }) + .expect_err("invalid driver config should fail"); - assert!(err.to_string().contains("--gpu-count")); - assert!(err.to_string().contains("greater than zero")); + assert!(err.to_string().contains("--resource-config")); + assert!(err.to_string().contains("KEY=VALUE")); } struct EnvVarGuard { diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index 861c8c011..4b6ec4451 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -659,30 +659,6 @@ fn test_tls(server: &TestServer) -> TlsOptions { server.tls.with_gateway_name("openshell") } -fn struct_field<'a>(value: &'a prost_types::Struct, field: &str) -> &'a 
prost_types::Struct { - let value = value - .fields - .get(field) - .and_then(|value| value.kind.as_ref()) - .unwrap_or_else(|| panic!("expected {field} field")); - let prost_types::value::Kind::StructValue(value) = value else { - panic!("expected {field} to be an object"); - }; - value -} - -fn string_field(value: &prost_types::Struct, field: &str) -> String { - let value = value - .fields - .get(field) - .and_then(|value| value.kind.as_ref()) - .unwrap_or_else(|| panic!("expected {field} field")); - let prost_types::value::Kind::StringValue(value) = value else { - panic!("expected {field} to be a string"); - }; - value.clone() -} - #[tokio::test] async fn sandbox_create_keeps_command_sessions_by_default() { let server = run_server().await; @@ -700,8 +676,7 @@ async fn sandbox_create_keeps_command_sessions_by_default() { None, true, false, - None, - None, + run::SandboxResourceArgs::default(), None, None, &[], @@ -741,8 +716,10 @@ async fn sandbox_create_gpu_count_sets_gpu_intent_and_resources() { None, true, false, - Some(2), - None, + run::SandboxResourceArgs { + gpu_count: Some(2), + ..run::SandboxResourceArgs::default() + }, None, None, &[], @@ -765,16 +742,14 @@ async fn sandbox_create_gpu_count_sets_gpu_intent_and_resources() { assert!(spec.gpu); let resources = spec - .template + .resources .as_ref() - .and_then(|template| template.resources.as_ref()) - .expect("gpu count should create template resources"); - let limits = struct_field(resources, "limits"); - assert_eq!(string_field(limits, "nvidia.com/gpu"), "2"); + .expect("gpu count should create resource spec"); + assert_eq!(resources.gpu_count, 2); } #[tokio::test] -async fn sandbox_create_resources_json_sets_template_resources() { +async fn sandbox_create_resource_spec_sets_resource_requirements() { let server = run_server().await; let fake_ssh_dir = tempfile::tempdir().unwrap(); let xdg_dir = tempfile::tempdir().unwrap(); @@ -784,14 +759,19 @@ async fn 
sandbox_create_resources_json_sets_template_resources() { run::sandbox_create( &server.endpoint, - Some("resources-json"), + Some("resource-spec"), None, "openshell", None, true, false, - None, - Some(r#"{"requests":{"cpu":"2"},"limits":{"cpu":"16"}}"#), + run::SandboxResourceArgs { + cpu_request: Some("2"), + cpu_limit: Some("4"), + memory_request: Some("8Gi"), + memory_limit: Some("16Gi"), + ..run::SandboxResourceArgs::default() + }, None, None, &[], @@ -814,14 +794,13 @@ async fn sandbox_create_resources_json_sets_template_resources() { assert!(!spec.gpu); let resources = spec - .template + .resources .as_ref() - .and_then(|template| template.resources.as_ref()) - .expect("resources JSON should create template resources"); - let requests = struct_field(resources, "requests"); - let limits = struct_field(resources, "limits"); - assert_eq!(string_field(requests, "cpu"), "2"); - assert_eq!(string_field(limits, "cpu"), "16"); + .expect("resource flags should create resource spec"); + assert_eq!(resources.cpu_request, "2"); + assert_eq!(resources.cpu_limit, "4"); + assert_eq!(resources.memory_request, "8Gi"); + assert_eq!(resources.memory_limit, "16Gi"); } #[tokio::test] @@ -841,8 +820,7 @@ async fn sandbox_create_deletes_command_sessions_with_no_keep() { None, false, false, - None, - None, + run::SandboxResourceArgs::default(), None, None, &[], @@ -885,8 +863,7 @@ async fn sandbox_create_deletes_shell_sessions_with_no_keep() { None, false, false, - None, - None, + run::SandboxResourceArgs::default(), None, None, &[], @@ -929,8 +906,7 @@ async fn sandbox_create_keeps_sandbox_with_hidden_keep_flag() { None, true, false, - None, - None, + run::SandboxResourceArgs::default(), None, None, &[], @@ -973,8 +949,7 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() { None, false, false, - None, - None, + run::SandboxResourceArgs::default(), None, None, &[], diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index 
df68d39d6..f3453f78f 100644
--- a/crates/openshell-driver-docker/src/tests.rs
+++ b/crates/openshell-driver-docker/src/tests.rs
@@ -357,6 +357,7 @@ fn docker_resource_limits_rejects_requests() {
             cpu_limit: String::new(),
             memory_request: String::new(),
             memory_limit: String::new(),
+            ..DriverResourceRequirements::default()
         }),
         platform_config: None,
     };
diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md
index f2324b757..b1e133c86 100644
--- a/crates/openshell-driver-kubernetes/README.md
+++ b/crates/openshell-driver-kubernetes/README.md
@@ -44,9 +44,10 @@ pods do not need direct external ingress for SSH.
 ## GPU Support
 
 When a sandbox requests GPU support, the driver checks node allocatable capacity
-for `nvidia.com/gpu`. A request with only `gpu=true` asks for one GPU unless the
-template already provides an explicit `limits["nvidia.com/gpu"]` value through
-the resource passthrough. The cluster must expose `nvidia.com/gpu` through the
-NVIDIA device plugin or an equivalent device-plugin implementation, and the
-sandbox image must provide the user-space libraries needed by the agent
-workload.
+for `nvidia.com/gpu`. A resource spec with `gpu_count` maps to
+`limits["nvidia.com/gpu"] = "<count>"`. A request with only `gpu=true` asks for
+one GPU unless the template already provides an explicit
+`limits["nvidia.com/gpu"]` value through the resource passthrough. The cluster
+must expose `nvidia.com/gpu` through the NVIDIA device plugin or an equivalent
+device-plugin implementation, and the sandbox image must provide the user-space
+libraries needed by the agent workload.
diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 5b2fa8984..b3dac55b9 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -1245,10 +1245,13 @@ fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option 0 { + set_gpu_limit(&mut resources, &req.gpu_count.to_string(), true); + } } if gpu { - apply_gpu_limit(&mut resources, GPU_RESOURCE_QUANTITY); + set_gpu_limit(&mut resources, GPU_RESOURCE_QUANTITY, false); } if resources.as_object().is_some_and(serde_json::Map::is_empty) { None @@ -1257,10 +1260,10 @@ fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option DriverSandboxSpec { DriverSandboxSpec { log_level: spec.log_level.clone(), environment: spec.environment.clone(), - template: spec - .template - .as_ref() - .map(driver_sandbox_template_from_public), - gpu: spec.gpu, + template: driver_sandbox_template_from_public( + spec.template.as_ref(), + spec.resources.as_ref(), + ), + gpu: spec.gpu || spec.resources.as_ref().is_some_and(|r| r.gpu_count > 0), gpu_device: spec.gpu_device.clone(), } } -fn driver_sandbox_template_from_public(template: &SandboxTemplate) -> DriverSandboxTemplate { - DriverSandboxTemplate { +fn driver_sandbox_template_from_public( + template: Option<&SandboxTemplate>, + resources: Option<&ResourceSpec>, +) -> Option { + let driver_resources = + driver_resources_from_public(resources, template.and_then(|t| t.resources.as_ref())); + + if template.is_none() && driver_resources.is_none() { + return None; + } + + let default_template = SandboxTemplate::default(); + let template = template.unwrap_or(&default_template); + + Some(DriverSandboxTemplate { image: template.image.clone(), agent_socket_path: template.agent_socket.clone(), labels: template.labels.clone(), environment: template.environment.clone(), - resources: extract_typed_resources(&template.resources), + resources: 
driver_resources, platform_config: build_platform_config(template), + }) +} + +fn driver_resources_from_public( + resources: Option<&ResourceSpec>, + template_resources: Option<&prost_types::Struct>, +) -> Option { + let mut req = extract_typed_resources(template_resources).unwrap_or_default(); + + if let Some(resources) = resources { + if !resources.cpu_request.is_empty() { + req.cpu_request.clone_from(&resources.cpu_request); + } + if !resources.cpu_limit.is_empty() { + req.cpu_limit.clone_from(&resources.cpu_limit); + } + if !resources.memory_request.is_empty() { + req.memory_request.clone_from(&resources.memory_request); + } + if !resources.memory_limit.is_empty() { + req.memory_limit.clone_from(&resources.memory_limit); + } + if resources.gpu_count > 0 { + req.gpu_count = resources.gpu_count; + } + if !resources.driver_config.is_empty() { + req.driver_config.clone_from(&resources.driver_config); + } + } + + if req.cpu_request.is_empty() + && req.cpu_limit.is_empty() + && req.memory_request.is_empty() + && req.memory_limit.is_empty() + && req.gpu_count == 0 + && req.driver_config.is_empty() + { + None + } else { + Some(req) } } @@ -1152,7 +1205,7 @@ fn driver_sandbox_template_from_public(template: &SandboxTemplate) -> DriverSand /// with the Kubernetes limits/requests shape. We pull out the well-known /// keys into the typed `DriverResourceRequirements` message. 
fn extract_typed_resources( - resources: &Option, + resources: Option<&prost_types::Struct>, ) -> Option { fn get_quantity(s: &prost_types::Struct, section: &str, key: &str) -> String { s.fields @@ -1168,13 +1221,14 @@ fn extract_typed_resources( .unwrap_or_default() } - let s = resources.as_ref()?; + let s = resources?; let req = DriverResourceRequirements { cpu_request: get_quantity(s, "requests", "cpu"), cpu_limit: get_quantity(s, "limits", "cpu"), memory_request: get_quantity(s, "requests", "memory"), memory_limit: get_quantity(s, "limits", "memory"), + ..DriverResourceRequirements::default() }; // Return None when all fields are empty so drivers can distinguish @@ -1183,6 +1237,8 @@ fn extract_typed_resources( && req.cpu_limit.is_empty() && req.memory_request.is_empty() && req.memory_limit.is_empty() + && req.gpu_count == 0 + && req.driver_config.is_empty() { None } else { @@ -2098,6 +2154,43 @@ mod tests { assert!(resources_raw.fields.contains_key("opaque_cpu")); } + #[test] + fn driver_sandbox_spec_maps_resource_spec_to_driver_resources() { + let spec = SandboxSpec { + resources: Some(ResourceSpec { + cpu_request: "2".to_string(), + cpu_limit: "4".to_string(), + memory_request: "8Gi".to_string(), + memory_limit: "16Gi".to_string(), + gpu_count: 4, + driver_config: std::iter::once(( + "kubernetes.resource-name".to_string(), + "nvidia.com/gpu".to_string(), + )) + .collect(), + }), + ..Default::default() + }; + + let driver_spec = driver_sandbox_spec_from_public(&spec); + assert!(driver_spec.gpu); + + let resources = driver_spec + .template + .as_ref() + .and_then(|template| template.resources.as_ref()) + .expect("resource spec should create driver template resources"); + assert_eq!(resources.cpu_request, "2"); + assert_eq!(resources.cpu_limit, "4"); + assert_eq!(resources.memory_request, "8Gi"); + assert_eq!(resources.memory_limit, "16Gi"); + assert_eq!(resources.gpu_count, 4); + assert_eq!( + resources.driver_config.get("kubernetes.resource-name"), + 
Some(&"nvidia.com/gpu".to_string()) + ); + } + #[test] fn rewrite_user_facing_conditions_rewrites_gpu_unschedulable_message() { let mut status = Some(SandboxStatus { diff --git a/crates/openshell-server/src/grpc/validation.rs b/crates/openshell-server/src/grpc/validation.rs index 160b7e031..424042435 100644 --- a/crates/openshell-server/src/grpc/validation.rs +++ b/crates/openshell-server/src/grpc/validation.rs @@ -9,7 +9,8 @@ #![allow(clippy::result_large_err)] // Validation returns Result<_, Status> use openshell_core::proto::{ - ExecSandboxRequest, Provider, SandboxPolicy as ProtoSandboxPolicy, SandboxTemplate, + ExecSandboxRequest, Provider, ResourceSpec, SandboxPolicy as ProtoSandboxPolicy, + SandboxTemplate, }; use prost::Message; use tonic::Status; @@ -21,6 +22,8 @@ use super::{ MAX_TEMPLATE_STRUCT_SIZE, }; +const NVIDIA_GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; + // --------------------------------------------------------------------------- // Exec request validation // --------------------------------------------------------------------------- @@ -130,6 +133,20 @@ pub(super) fn validate_sandbox_spec( if let Some(ref tmpl) = spec.template { validate_sandbox_template(tmpl)?; } + if let Some(ref resources) = spec.resources { + validate_resource_spec(resources)?; + validate_resource_template_conflicts(resources, spec.template.as_ref())?; + } + if spec + .resources + .as_ref() + .is_some_and(|resources| resources.gpu_count > 0) + && !spec.gpu_device.is_empty() + { + return Err(Status::invalid_argument( + "resources.gpu_count cannot be combined with gpu_device", + )); + } // --- spec.policy serialized size --- if let Some(ref policy) = spec.policy { @@ -144,6 +161,104 @@ pub(super) fn validate_sandbox_spec( Ok(()) } +fn validate_resource_spec(resources: &ResourceSpec) -> Result<(), Status> { + for (field, value) in [ + ("resources.cpu_request", &resources.cpu_request), + ("resources.cpu_limit", &resources.cpu_limit), + ("resources.memory_request", 
&resources.memory_request), + ("resources.memory_limit", &resources.memory_limit), + ] { + if value.len() > MAX_MAP_VALUE_LEN { + return Err(Status::invalid_argument(format!( + "{field} exceeds maximum length ({} > {MAX_MAP_VALUE_LEN})", + value.len() + ))); + } + } + + validate_string_map( + &resources.driver_config, + MAX_TEMPLATE_MAP_ENTRIES, + MAX_MAP_KEY_LEN, + MAX_MAP_VALUE_LEN, + "resources.driver_config", + ) +} + +fn validate_resource_template_conflicts( + resources: &ResourceSpec, + template: Option<&SandboxTemplate>, +) -> Result<(), Status> { + let Some(template_resources) = template.and_then(|template| template.resources.as_ref()) else { + return Ok(()); + }; + + for (field, section, key, expected) in [ + ( + "resources.cpu_request", + "requests", + "cpu", + resources.cpu_request.as_str(), + ), + ( + "resources.cpu_limit", + "limits", + "cpu", + resources.cpu_limit.as_str(), + ), + ( + "resources.memory_request", + "requests", + "memory", + resources.memory_request.as_str(), + ), + ( + "resources.memory_limit", + "limits", + "memory", + resources.memory_limit.as_str(), + ), + ] { + if !expected.is_empty() + && template_resource_string(template_resources, section, key) + .is_some_and(|actual| actual != expected) + { + return Err(Status::invalid_argument(format!( + "{field} conflicts with template.resources.{section}.{key}" + ))); + } + } + + if resources.gpu_count > 0 + && template_resource_string(template_resources, "limits", NVIDIA_GPU_RESOURCE_NAME) + .is_some_and(|actual| actual != resources.gpu_count.to_string()) + { + return Err(Status::invalid_argument( + "resources.gpu_count conflicts with template.resources.limits.nvidia.com/gpu", + )); + } + + Ok(()) +} + +fn template_resource_string<'a>( + resources: &'a prost_types::Struct, + section: &str, + key: &str, +) -> Option<&'a str> { + resources + .fields + .get(section) + .and_then(|value| match value.kind.as_ref() { + Some(prost_types::value::Kind::StructValue(section)) => 
section.fields.get(key), + _ => None, + }) + .and_then(|value| match value.kind.as_ref() { + Some(prost_types::value::Kind::StringValue(value)) => Some(value.as_str()), + _ => None, + }) +} + /// Validate template-level field sizes. fn validate_sandbox_template(tmpl: &SandboxTemplate) -> Result<(), Status> { // String fields. @@ -674,6 +789,81 @@ mod tests { assert!(validate_sandbox_spec("gpu-sandbox", &spec).is_ok()); } + #[test] + fn validate_sandbox_spec_accepts_resource_spec() { + let spec = SandboxSpec { + resources: Some(ResourceSpec { + cpu_limit: "4".to_string(), + memory_limit: "8Gi".to_string(), + gpu_count: 2, + driver_config: std::iter::once(( + "kubernetes.resource-name".to_string(), + "nvidia.com/gpu".to_string(), + )) + .collect(), + ..Default::default() + }), + ..Default::default() + }; + + assert!(validate_sandbox_spec("resource-sandbox", &spec).is_ok()); + } + + #[test] + fn validate_sandbox_spec_rejects_gpu_count_with_gpu_device() { + let spec = SandboxSpec { + gpu_device: "nvidia.com/gpu=0".to_string(), + resources: Some(ResourceSpec { + gpu_count: 2, + ..Default::default() + }), + ..Default::default() + }; + + let err = validate_sandbox_spec("resource-sandbox", &spec).unwrap_err(); + assert_eq!(err.code(), Code::InvalidArgument); + assert!(err.message().contains("gpu_count")); + assert!(err.message().contains("gpu_device")); + } + + #[test] + fn validate_sandbox_spec_rejects_conflicting_template_resources() { + use prost_types::{Struct, Value, value::Kind}; + + let spec = SandboxSpec { + resources: Some(ResourceSpec { + cpu_limit: "4".to_string(), + ..Default::default() + }), + template: Some(SandboxTemplate { + resources: Some(Struct { + fields: std::iter::once(( + "limits".to_string(), + Value { + kind: Some(Kind::StructValue(Struct { + fields: std::iter::once(( + "cpu".to_string(), + Value { + kind: Some(Kind::StringValue("2".to_string())), + }, + )) + .collect(), + })), + }, + )) + .collect(), + }), + ..Default::default() + }), + 
..Default::default() + }; + + let err = validate_sandbox_spec("resource-sandbox", &spec).unwrap_err(); + assert_eq!(err.code(), Code::InvalidArgument); + assert!(err.message().contains("resources.cpu_limit")); + assert!(err.message().contains("template.resources.limits.cpu")); + } + #[test] fn validate_sandbox_spec_accepts_empty_defaults() { assert!(validate_sandbox_spec("", &default_spec()).is_ok()); diff --git a/docs/sandboxes/manage-sandboxes.mdx b/docs/sandboxes/manage-sandboxes.mdx index 0ded3ddf3..5ec45b3bd 100644 --- a/docs/sandboxes/manage-sandboxes.mdx +++ b/docs/sandboxes/manage-sandboxes.mdx @@ -42,18 +42,27 @@ openshell sandbox create --gpu-count 4 -- claude ``` GPU counts schedule Kubernetes sandbox pods with an `nvidia.com/gpu` resource -limit. +limit. `--gpu-count` cannot be combined with `--gpu-device` because count-based +scheduling and driver-specific device selection are different allocation modes. -To pass generic resource requests and limits through to the sandbox template, -use `--resources-json` with a JSON object: +To set portable CPU and memory resource requests or limits, use the resource +flags: ```shell openshell sandbox create \ - --resources-json '{"requests":{"cpu":"2"},"limits":{"cpu":"16"}}' + --cpu-request 2 \ + --cpu-limit 4 \ + --memory-request 8Gi \ + --memory-limit 16Gi \ + -- claude ``` -The JSON is forwarded as `SandboxTemplate.resources`. Use Kubernetes resource -quantity strings for CPU, memory, and extended resources. +Resource values use Kubernetes-style quantity strings. Kubernetes-backed +gateways map these fields into pod `resources.requests` and +`resources.limits`. + +For driver-specific resource settings, pass opaque key-value entries with +`--resource-config KEY=VALUE`. The selected driver interprets those entries. For Docker-backed sandboxes, GPU injection uses Docker CDI. 
If you enable Docker CDI after the gateway starts, restart the gateway so OpenShell can detect the
diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto
index 3c4308f3f..6ed42cf4a 100644
--- a/proto/compute_driver.proto
+++ b/proto/compute_driver.proto
@@ -131,6 +131,10 @@ message DriverResourceRequirements {
   string memory_request = 3;
   // Maximum memory allowed (e.g. "512Mi", "8Gi").
   string memory_limit = 4;
+  // Number of GPUs requested. A value of 0 means no explicit GPU count.
+  uint32 gpu_count = 5;
+  // Opaque key-value configuration interpreted by the selected driver.
+  map<string, string> driver_config = 100;
 }
 
 // Raw status observed directly from the compute platform.
diff --git a/proto/openshell.proto b/proto/openshell.proto
index bb2ce6cec..5af21851a 100644
--- a/proto/openshell.proto
+++ b/proto/openshell.proto
@@ -256,6 +256,28 @@ message SandboxSpec {
   // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the
   // first available GPU.
   string gpu_device = 10;
+  // Portable resource requirements for the sandbox workload.
+  ResourceSpec resources = 11;
+}
+
+// Portable resource requirements for a sandbox workload.
+//
+// Values use Kubernetes-style quantity strings (e.g. "500m", "2", "4Gi")
+// because they are a well-known, widely-adopted notation. Drivers for
+// non-Kubernetes platforms must parse these strings into their native units.
+message ResourceSpec {
+  // Minimum CPU cores requested (e.g. "500m", "2").
+  string cpu_request = 1;
+  // Maximum CPU cores allowed (e.g. "500m", "4").
+  string cpu_limit = 2;
+  // Minimum memory requested (e.g. "256Mi", "4Gi").
+  string memory_request = 3;
+  // Maximum memory allowed (e.g. "512Mi", "8Gi").
+  string memory_limit = 4;
+  // Number of GPUs requested. A value of 0 means no explicit GPU count.
+  uint32 gpu_count = 5;
+  // Opaque key-value configuration interpreted by the selected driver.
+  map<string, string> driver_config = 100;
+}