diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index 9ab512f98..af12291a8 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -22,7 +22,7 @@ Each runtime receives a sandbox spec from the gateway and is responsible for: |---|---|---|---| | Docker | Local development with Docker available. | Container plus nested sandbox namespace. | Uses host networking so loopback gateway endpoints work from the supervisor. | | Podman | Rootless or single-machine deployments. | Container plus nested sandbox namespace. | Uses the Podman REST API, OCI image volumes, and CDI GPU devices when available. | -| Kubernetes | Cluster deployment through Helm. | Pod plus nested sandbox namespace. | Uses Kubernetes API objects, service accounts, secrets, PVC-backed workspace storage, and GPU resources. | +| Kubernetes | Cluster deployment through Helm. | Pod plus nested sandbox namespace. | Uses Kubernetes API objects, service accounts, secrets, PVC-backed workspace storage, and `nvidia.com/gpu` limits for GPU requests. | | VM | Experimental microVM isolation. | Per-sandbox libkrun VM. | Gateway spawns `openshell-driver-vm` as a subprocess over a private, state-local Unix socket. | VM runtime state paths are derived only from driver-validated sandbox IDs @@ -62,7 +62,9 @@ users. Custom sandbox images must include the agent runtime and any system dependencies, but they should not need to include the gateway. GPU-capable images must include the user-space libraries required by the workload. The -runtime still owns GPU device injection. +runtime still owns GPU device injection or resource scheduling. Kubernetes maps +portable `SandboxSpec.resources` CPU, memory, and GPU count requirements into +pod resource requests and limits when the cluster exposes those resources. 
## Deployment Shape diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 9cffb243b..542a57ab7 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -1030,6 +1030,7 @@ enum DoctorCommands { } #[derive(Subcommand, Debug)] +#[allow(clippy::large_enum_variant)] enum SandboxCommands { /// Create a sandbox. #[command(help_template = LEAF_HELP_TEMPLATE, next_help_heading = "FLAGS")] @@ -1084,10 +1085,37 @@ enum SandboxCommands { #[arg(long)] gpu: bool, + /// Request a specific number of GPUs. + /// + /// This implies --gpu. Kubernetes-backed gateways schedule pods with + /// the corresponding nvidia.com/gpu resource limit. + #[arg(long, value_name = "COUNT", value_parser = clap::value_parser!(u32).range(1..), conflicts_with = "gpu_device")] + gpu_count: Option, + + /// Minimum CPU cores requested, e.g. "500m" or "2". + #[arg(long, value_name = "QUANTITY")] + cpu_request: Option, + + /// Maximum CPU cores allowed, e.g. "2" or "4". + #[arg(long, value_name = "QUANTITY")] + cpu_limit: Option, + + /// Minimum memory requested, e.g. "512Mi" or "4Gi". + #[arg(long, value_name = "QUANTITY")] + memory_request: Option, + + /// Maximum memory allowed, e.g. "1Gi" or "8Gi". + #[arg(long, value_name = "QUANTITY")] + memory_limit: Option, + + /// Driver-specific resource configuration as KEY=VALUE. + #[arg(long = "resource-config", value_name = "KEY=VALUE")] + resource_config: Vec, + /// Target a driver-specific GPU device. Docker and Podman use CDI device IDs /// (for example "nvidia.com/gpu=0"); VM uses a PCI BDF or index. /// Only valid with --gpu. When omitted with --gpu, the driver uses its default GPU selection. - #[arg(long, requires = "gpu")] + #[arg(long, requires = "gpu", conflicts_with = "gpu_count")] gpu_device: Option, /// Provider names to attach to this sandbox. 
@@ -2364,6 +2392,12 @@ async fn main() -> Result<()> { no_keep, editor, gpu, + gpu_count, + cpu_request, + cpu_limit, + memory_request, + memory_limit, + resource_config, gpu_device, providers, policy, @@ -2430,6 +2464,14 @@ async fn main() -> Result<()> { upload_spec.as_ref(), keep, gpu, + run::SandboxResourceArgs { + cpu_request: cpu_request.as_deref(), + cpu_limit: cpu_limit.as_deref(), + memory_request: memory_request.as_deref(), + memory_limit: memory_limit.as_deref(), + gpu_count, + driver_config: &resource_config, + }, gpu_device.as_deref(), editor, &providers, @@ -3821,4 +3863,79 @@ mod tests { other => panic!("expected service delete command, got: {other:?}"), } } + + #[test] + fn sandbox_create_gpu_count_rejects_zero() { + let result = Cli::try_parse_from(["openshell", "sandbox", "create", "--gpu-count", "0"]); + assert!( + result.is_err(), + "sandbox create --gpu-count 0 should be rejected" + ); + let err = result.unwrap_err().to_string(); + assert!(err.contains("--gpu-count")); + } + + #[test] + fn sandbox_create_resource_spec_flags_parse() { + let cli = Cli::try_parse_from([ + "openshell", + "sandbox", + "create", + "--cpu-request", + "2", + "--cpu-limit", + "4", + "--memory-request", + "8Gi", + "--memory-limit", + "16Gi", + "--resource-config", + "kubernetes.resource-name=nvidia.com/gpu", + ]) + .expect("sandbox create resource flags should parse"); + + match cli.command { + Some(Commands::Sandbox { + command: + Some(SandboxCommands::Create { + cpu_request, + cpu_limit, + memory_request, + memory_limit, + resource_config, + .. + }), + .. 
+ }) => { + assert_eq!(cpu_request.as_deref(), Some("2")); + assert_eq!(cpu_limit.as_deref(), Some("4")); + assert_eq!(memory_request.as_deref(), Some("8Gi")); + assert_eq!(memory_limit.as_deref(), Some("16Gi")); + assert_eq!( + resource_config, + vec!["kubernetes.resource-name=nvidia.com/gpu".to_string()] + ); + } + other => panic!("expected sandbox create command, got: {other:?}"), + } + } + + #[test] + fn sandbox_create_gpu_count_conflicts_with_gpu_device() { + let result = Cli::try_parse_from([ + "openshell", + "sandbox", + "create", + "--gpu", + "--gpu-count", + "2", + "--gpu-device", + "nvidia.com/gpu=0", + ]); + + assert!( + result.is_err(), + "sandbox create should reject combining --gpu-count and --gpu-device" + ); + } } diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index 3205b8f68..f4d931445 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -36,7 +36,7 @@ use openshell_core::proto::{ LintProviderProfilesRequest, ListProviderProfilesRequest, ListProvidersRequest, ListSandboxPoliciesRequest, ListSandboxProvidersRequest, ListSandboxesRequest, ListServicesRequest, PolicySource, PolicyStatus, Provider, ProviderProfile, - ProviderProfileDiagnostic, ProviderProfileImportItem, RejectDraftChunkRequest, + ProviderProfileDiagnostic, ProviderProfileImportItem, RejectDraftChunkRequest, ResourceSpec, RevokeSshSessionRequest, Sandbox, SandboxPhase, SandboxPolicy, SandboxSpec, SandboxTemplate, ServiceEndpointResponse, SetClusterInferenceRequest, SettingScope, SettingValue, TcpForwardFrame, TcpForwardInit, TcpRelayTarget, UpdateConfigRequest, UpdateProviderRequest, @@ -1457,6 +1457,54 @@ async fn finalize_sandbox_create_session( session_result } +#[derive(Clone, Copy, Debug, Default)] +pub struct SandboxResourceArgs<'a> { + pub cpu_request: Option<&'a str>, + pub cpu_limit: Option<&'a str>, + pub memory_request: Option<&'a str>, + pub memory_limit: Option<&'a str>, + pub gpu_count: Option, + pub 
driver_config: &'a [String], +} + +fn build_resource_spec(args: SandboxResourceArgs<'_>) -> Result<Option<ResourceSpec>> { + let mut resources = ResourceSpec { + driver_config: parse_key_value_pairs(args.driver_config, "--resource-config")?, + ..ResourceSpec::default() + }; + + if let Some(value) = args.cpu_request { + resources.cpu_request = value.to_string(); + } + if let Some(value) = args.cpu_limit { + resources.cpu_limit = value.to_string(); + } + if let Some(value) = args.memory_request { + resources.memory_request = value.to_string(); + } + if let Some(value) = args.memory_limit { + resources.memory_limit = value.to_string(); + } + if let Some(count) = args.gpu_count { + if count == 0 { + return Err(miette::miette!("--gpu-count must be greater than zero")); + } + resources.gpu_count = count; + } + + if resources.cpu_request.is_empty() + && resources.cpu_limit.is_empty() + && resources.memory_request.is_empty() + && resources.memory_limit.is_empty() + && resources.gpu_count == 0 + && resources.driver_config.is_empty() + { + Ok(None) + } else { + Ok(Some(resources)) + } +} + +/// Create a sandbox with default settings. 
#[allow(clippy::too_many_arguments, clippy::implicit_hasher)] // user-facing CLI command; default hasher is fine pub async fn sandbox_create( @@ -1467,6 +1515,7 @@ pub async fn sandbox_create( upload: Option<&(String, Option, bool)>, keep: bool, gpu: bool, + resources: SandboxResourceArgs<'_>, gpu_device: Option<&str>, editor: Option, providers: &[String], @@ -1483,6 +1532,12 @@ pub async fn sandbox_create( "--editor cannot be used with a trailing command; use `openshell sandbox connect --editor ...` after the sandbox is ready" )); } + let resources = build_resource_spec(resources)?; + if resources.as_ref().is_some_and(|spec| spec.gpu_count > 0) && gpu_device.is_some() { + return Err(miette::miette!( + "--gpu-count cannot be combined with --gpu-device" + )); + } // Check port availability *before* creating the sandbox so we don't // leave an orphaned sandbox behind when the forward would fail. @@ -1518,7 +1573,9 @@ pub async fn sandbox_create( } None => None, }; - let requested_gpu = gpu || image.as_deref().is_some_and(image_requests_gpu); + let requested_gpu = gpu + || resources.as_ref().is_some_and(|spec| spec.gpu_count > 0) + || image.as_deref().is_some_and(image_requests_gpu); let inferred_types: Vec = inferred_provider_type(command).into_iter().collect(); let configured_providers = ensure_required_providers( @@ -1531,10 +1588,14 @@ pub async fn sandbox_create( let policy = load_sandbox_policy(policy)?; - let template = image.map(|img| SandboxTemplate { - image: img, - ..SandboxTemplate::default() - }); + let template = if image.is_some() { + Some(SandboxTemplate { + image: image.unwrap_or_default(), + ..SandboxTemplate::default() + }) + } else { + None + }; let request = CreateSandboxRequest { spec: Some(SandboxSpec { @@ -1542,6 +1603,7 @@ pub async fn sandbox_create( gpu_device: gpu_device.unwrap_or_default().to_string(), policy, providers: configured_providers, + resources, template, ..SandboxSpec::default() }), @@ -6011,11 +6073,11 @@ fn 
format_timestamp_ms(ms: i64) -> String { #[cfg(test)] mod tests { use super::{ - TlsOptions, dockerfile_sources_supported_for_gateway, format_endpoint, - format_gateway_select_header, format_gateway_select_items, - format_provider_attachment_table, gateway_add, gateway_auth_label, - gateway_env_override_warning, gateway_select_with, gateway_type_label, git_sync_files, - http_health_check, image_requests_gpu, import_local_package_mtls_bundle, + SandboxResourceArgs, TlsOptions, build_resource_spec, + dockerfile_sources_supported_for_gateway, format_endpoint, format_gateway_select_header, + format_gateway_select_items, format_provider_attachment_table, gateway_add, + gateway_auth_label, gateway_env_override_warning, gateway_select_with, gateway_type_label, + git_sync_files, http_health_check, image_requests_gpu, import_local_package_mtls_bundle, inferred_provider_type, package_managed_tls_dirs, parse_cli_setting_value, parse_credential_pairs, plaintext_gateway_is_remote, provisioning_timeout_message, ready_false_condition_message, resolve_from, sandbox_should_persist, @@ -6037,6 +6099,56 @@ mod tests { Provider, SandboxCondition, SandboxStatus, datamodel::v1::ObjectMeta, }; + #[test] + fn build_resource_spec_sets_typed_fields_and_driver_config() { + let driver_config = vec!["kubernetes.resource-name=nvidia.com/mig-1g.5gb".to_string()]; + let resources = build_resource_spec(SandboxResourceArgs { + cpu_request: Some("2"), + cpu_limit: Some("4"), + memory_request: Some("8Gi"), + memory_limit: Some("16Gi"), + gpu_count: Some(4), + driver_config: &driver_config, + }) + .expect("resources should parse") + .expect("resource spec should be present"); + + assert_eq!(resources.cpu_request, "2"); + assert_eq!(resources.cpu_limit, "4"); + assert_eq!(resources.memory_request, "8Gi"); + assert_eq!(resources.memory_limit, "16Gi"); + assert_eq!(resources.gpu_count, 4); + assert_eq!( + resources.driver_config.get("kubernetes.resource-name"), + 
Some(&"nvidia.com/mig-1g.5gb".to_string()) + ); + } + + #[test] + fn build_resource_spec_rejects_zero_gpu_count() { + let err = build_resource_spec(SandboxResourceArgs { + gpu_count: Some(0), + ..SandboxResourceArgs::default() + }) + .expect_err("zero GPU count should fail"); + + assert!(err.to_string().contains("--gpu-count")); + assert!(err.to_string().contains("greater than zero")); + } + + #[test] + fn build_resource_spec_rejects_invalid_driver_config() { + let driver_config = vec!["missing-separator".to_string()]; + let err = build_resource_spec(SandboxResourceArgs { + driver_config: &driver_config, + ..SandboxResourceArgs::default() + }) + .expect_err("invalid driver config should fail"); + + assert!(err.to_string().contains("--resource-config")); + assert!(err.to_string().contains("KEY=VALUE")); + } + struct EnvVarGuard { key: &'static str, original: Option, diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index 6e7d66d11..4b6ec4451 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -84,6 +84,7 @@ impl Drop for EnvVarGuard { #[derive(Clone, Default)] struct SandboxState { deleted_names: Arc>>>, + created_requests: Arc>>, } #[derive(Clone, Default)] @@ -107,7 +108,9 @@ impl OpenShell for TestOpenShell { &self, request: tonic::Request, ) -> Result, Status> { - let name = request.into_inner().name; + let request = request.into_inner(); + let name = request.name.clone(); + self.state.created_requests.lock().await.push(request); let sandbox_name = if name.is_empty() { "test-sandbox".to_string() } else { @@ -648,6 +651,10 @@ async fn deleted_names(server: &TestServer) -> Vec> { server.openshell.state.deleted_names.lock().await.clone() } +async fn created_requests(server: &TestServer) -> Vec { + server.openshell.state.created_requests.lock().await.clone() +} + fn 
test_tls(server: &TestServer) -> TlsOptions { server.tls.with_gateway_name("openshell") } @@ -669,6 +676,7 @@ async fn sandbox_create_keeps_command_sessions_by_default() { None, true, false, + run::SandboxResourceArgs::default(), None, None, &[], @@ -691,6 +699,110 @@ async fn sandbox_create_keeps_command_sessions_by_default() { ); } +#[tokio::test] +async fn sandbox_create_gpu_count_sets_gpu_intent_and_resources() { + let server = run_server().await; + let fake_ssh_dir = tempfile::tempdir().unwrap(); + let xdg_dir = tempfile::tempdir().unwrap(); + let _env = test_env(&fake_ssh_dir, &xdg_dir); + let tls = test_tls(&server); + install_fake_ssh(&fake_ssh_dir); + + run::sandbox_create( + &server.endpoint, + Some("gpu-count"), + None, + "openshell", + None, + true, + false, + run::SandboxResourceArgs { + gpu_count: Some(2), + ..run::SandboxResourceArgs::default() + }, + None, + None, + &[], + None, + None, + &["echo".to_string(), "OK".to_string()], + Some(false), + Some(false), + &HashMap::new(), + &tls, + ) + .await + .expect("sandbox create should succeed"); + + let requests = created_requests(&server).await; + let spec = requests + .last() + .and_then(|request| request.spec.as_ref()) + .expect("create request should include a spec"); + assert!(spec.gpu); + + let resources = spec + .resources + .as_ref() + .expect("gpu count should create resource spec"); + assert_eq!(resources.gpu_count, 2); +} + +#[tokio::test] +async fn sandbox_create_resource_spec_sets_resource_requirements() { + let server = run_server().await; + let fake_ssh_dir = tempfile::tempdir().unwrap(); + let xdg_dir = tempfile::tempdir().unwrap(); + let _env = test_env(&fake_ssh_dir, &xdg_dir); + let tls = test_tls(&server); + install_fake_ssh(&fake_ssh_dir); + + run::sandbox_create( + &server.endpoint, + Some("resource-spec"), + None, + "openshell", + None, + true, + false, + run::SandboxResourceArgs { + cpu_request: Some("2"), + cpu_limit: Some("4"), + memory_request: Some("8Gi"), + memory_limit: 
Some("16Gi"), + ..run::SandboxResourceArgs::default() + }, + None, + None, + &[], + None, + None, + &["echo".to_string(), "OK".to_string()], + Some(false), + Some(false), + &HashMap::new(), + &tls, + ) + .await + .expect("sandbox create should succeed"); + + let requests = created_requests(&server).await; + let spec = requests + .last() + .and_then(|request| request.spec.as_ref()) + .expect("create request should include a spec"); + assert!(!spec.gpu); + + let resources = spec + .resources + .as_ref() + .expect("resource flags should create resource spec"); + assert_eq!(resources.cpu_request, "2"); + assert_eq!(resources.cpu_limit, "4"); + assert_eq!(resources.memory_request, "8Gi"); + assert_eq!(resources.memory_limit, "16Gi"); +} + #[tokio::test] async fn sandbox_create_deletes_command_sessions_with_no_keep() { let server = run_server().await; @@ -708,6 +820,7 @@ async fn sandbox_create_deletes_command_sessions_with_no_keep() { None, false, false, + run::SandboxResourceArgs::default(), None, None, &[], @@ -750,6 +863,7 @@ async fn sandbox_create_deletes_shell_sessions_with_no_keep() { None, false, false, + run::SandboxResourceArgs::default(), None, None, &[], @@ -792,6 +906,7 @@ async fn sandbox_create_keeps_sandbox_with_hidden_keep_flag() { None, true, false, + run::SandboxResourceArgs::default(), None, None, &[], @@ -834,6 +949,7 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() { None, false, false, + run::SandboxResourceArgs::default(), None, None, &[], diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index df68d39d6..f3453f78f 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -357,6 +357,7 @@ fn docker_resource_limits_rejects_requests() { cpu_limit: String::new(), memory_request: String::new(), memory_limit: String::new(), + ..DriverResourceRequirements::default() }), platform_config: None, }; diff --git 
a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 4a8a8f76b..b1e133c86 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -44,6 +44,10 @@ pods do not need direct external ingress for SSH. ## GPU Support When a sandbox requests GPU support, the driver checks node allocatable capacity -for `nvidia.com/gpu` and requests one GPU resource in the workload spec. The -sandbox image must provide the user-space libraries needed by the agent -workload. +for `nvidia.com/gpu`. A resource spec with `gpu_count` maps to +`limits["nvidia.com/gpu"] = "<gpu_count>"`. A request with only `gpu=true` asks for +one GPU unless the template already provides an explicit +`limits["nvidia.com/gpu"]` value through the resource passthrough. The cluster +must expose `nvidia.com/gpu` through the NVIDIA device plugin or an equivalent +device-plugin implementation, and the sandbox image must provide the user-space +libraries needed by the agent workload. 
diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 56b73447a..b3dac55b9 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -1245,10 +1245,13 @@ fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option 0 { + set_gpu_limit(&mut resources, &req.gpu_count.to_string(), true); + } } if gpu { - apply_gpu_limit(&mut resources); + set_gpu_limit(&mut resources, GPU_RESOURCE_QUANTITY, false); } if resources.as_object().is_some_and(serde_json::Map::is_empty) { None @@ -1257,10 +1260,10 @@ fn container_resources(template: &SandboxTemplate, gpu: bool) -> Option Value { + Value { + kind: Some(Kind::StringValue(value.to_string())), + } + } + + fn struct_value(fields: impl IntoIterator) -> Value { + Value { + kind: Some(Kind::StructValue(Struct { + fields: fields + .into_iter() + .map(|(key, value)| (key.to_string(), value)) + .collect(), + })), + } + } + #[test] fn apply_required_env_always_injects_ssh_handshake_secret() { let mut env = Vec::new(); @@ -1898,6 +1921,75 @@ mod tests { ); } + #[test] + fn gpu_resource_spec_sets_gpu_resource_limit() { + use openshell_core::proto::compute::v1::DriverResourceRequirements; + + let template = SandboxTemplate { + resources: Some(DriverResourceRequirements { + gpu_count: 4, + ..Default::default() + }), + ..SandboxTemplate::default() + }; + + let pod_template = { + let params = SandboxPodParams::default(); + sandbox_template_to_k8s( + &template, + true, + &std::collections::HashMap::new(), + true, + ¶ms, + ) + }; + + let limits = &pod_template["spec"]["containers"][0]["resources"]["limits"]; + assert_eq!(limits[GPU_RESOURCE_NAME], serde_json::json!("4")); + } + + #[test] + fn gpu_sandbox_preserves_explicit_gpu_resource_limit() { + use openshell_core::proto::compute::v1::DriverResourceRequirements; + + let template = SandboxTemplate { + resources: Some(DriverResourceRequirements { + cpu_limit: 
"2".to_string(), + ..Default::default() + }), + platform_config: Some(Struct { + fields: std::iter::once(( + "resources_raw".to_string(), + struct_value([( + "limits", + struct_value([ + ("example.com/custom", string_value("7")), + (GPU_RESOURCE_NAME, string_value("2")), + ]), + )]), + )) + .collect(), + }), + ..SandboxTemplate::default() + }; + + let pod_template = { + let params = SandboxPodParams::default(); + sandbox_template_to_k8s( + &template, + true, + &std::collections::HashMap::new(), + true, + ¶ms, + ) + }; + + let limits = &pod_template["spec"]["containers"][0]["resources"]["limits"]; + assert_eq!(limits["cpu"], serde_json::json!("2")); + assert_eq!(limits["example.com/custom"], serde_json::json!("7")); + assert_eq!(limits[GPU_RESOURCE_NAME], serde_json::json!("2")); + } + #[test] fn host_aliases_injected_when_gateway_ip_set() { let pod_template = { diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index d2fd34011..8483578bb 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -24,8 +24,8 @@ use openshell_core::proto::compute::v1::{ watch_sandboxes_event, }; use openshell_core::proto::{ - PlatformEvent, Sandbox, SandboxCondition, SandboxPhase, SandboxSpec, SandboxStatus, - SandboxTemplate, SshSession, + PlatformEvent, ResourceSpec, Sandbox, SandboxCondition, SandboxPhase, SandboxSpec, + SandboxStatus, SandboxTemplate, SshSession, }; use openshell_driver_docker::DockerComputeDriver; use openshell_driver_kubernetes::{ @@ -1126,23 +1126,76 @@ fn driver_sandbox_spec_from_public(spec: &SandboxSpec) -> DriverSandboxSpec { DriverSandboxSpec { log_level: spec.log_level.clone(), environment: spec.environment.clone(), - template: spec - .template - .as_ref() - .map(driver_sandbox_template_from_public), - gpu: spec.gpu, + template: driver_sandbox_template_from_public( + spec.template.as_ref(), + spec.resources.as_ref(), + ), + gpu: spec.gpu || 
spec.resources.as_ref().is_some_and(|r| r.gpu_count > 0), gpu_device: spec.gpu_device.clone(), } } -fn driver_sandbox_template_from_public(template: &SandboxTemplate) -> DriverSandboxTemplate { - DriverSandboxTemplate { +fn driver_sandbox_template_from_public( + template: Option<&SandboxTemplate>, + resources: Option<&ResourceSpec>, +) -> Option { + let driver_resources = + driver_resources_from_public(resources, template.and_then(|t| t.resources.as_ref())); + + if template.is_none() && driver_resources.is_none() { + return None; + } + + let default_template = SandboxTemplate::default(); + let template = template.unwrap_or(&default_template); + + Some(DriverSandboxTemplate { image: template.image.clone(), agent_socket_path: template.agent_socket.clone(), labels: template.labels.clone(), environment: template.environment.clone(), - resources: extract_typed_resources(&template.resources), + resources: driver_resources, platform_config: build_platform_config(template), + }) +} + +fn driver_resources_from_public( + resources: Option<&ResourceSpec>, + template_resources: Option<&prost_types::Struct>, +) -> Option { + let mut req = extract_typed_resources(template_resources).unwrap_or_default(); + + if let Some(resources) = resources { + if !resources.cpu_request.is_empty() { + req.cpu_request.clone_from(&resources.cpu_request); + } + if !resources.cpu_limit.is_empty() { + req.cpu_limit.clone_from(&resources.cpu_limit); + } + if !resources.memory_request.is_empty() { + req.memory_request.clone_from(&resources.memory_request); + } + if !resources.memory_limit.is_empty() { + req.memory_limit.clone_from(&resources.memory_limit); + } + if resources.gpu_count > 0 { + req.gpu_count = resources.gpu_count; + } + if !resources.driver_config.is_empty() { + req.driver_config.clone_from(&resources.driver_config); + } + } + + if req.cpu_request.is_empty() + && req.cpu_limit.is_empty() + && req.memory_request.is_empty() + && req.memory_limit.is_empty() + && req.gpu_count == 0 + && 
req.driver_config.is_empty() + { + None + } else { + Some(req) } } @@ -1152,7 +1205,7 @@ fn driver_sandbox_template_from_public(template: &SandboxTemplate) -> DriverSand /// with the Kubernetes limits/requests shape. We pull out the well-known /// keys into the typed `DriverResourceRequirements` message. fn extract_typed_resources( - resources: &Option, + resources: Option<&prost_types::Struct>, ) -> Option { fn get_quantity(s: &prost_types::Struct, section: &str, key: &str) -> String { s.fields @@ -1168,13 +1221,14 @@ fn extract_typed_resources( .unwrap_or_default() } - let s = resources.as_ref()?; + let s = resources?; let req = DriverResourceRequirements { cpu_request: get_quantity(s, "requests", "cpu"), cpu_limit: get_quantity(s, "limits", "cpu"), memory_request: get_quantity(s, "requests", "memory"), memory_limit: get_quantity(s, "limits", "memory"), + ..DriverResourceRequirements::default() }; // Return None when all fields are empty so drivers can distinguish @@ -1183,6 +1237,8 @@ fn extract_typed_resources( && req.cpu_limit.is_empty() && req.memory_request.is_empty() && req.memory_limit.is_empty() + && req.gpu_count == 0 + && req.driver_config.is_empty() { None } else { @@ -2098,6 +2154,43 @@ mod tests { assert!(resources_raw.fields.contains_key("opaque_cpu")); } + #[test] + fn driver_sandbox_spec_maps_resource_spec_to_driver_resources() { + let spec = SandboxSpec { + resources: Some(ResourceSpec { + cpu_request: "2".to_string(), + cpu_limit: "4".to_string(), + memory_request: "8Gi".to_string(), + memory_limit: "16Gi".to_string(), + gpu_count: 4, + driver_config: std::iter::once(( + "kubernetes.resource-name".to_string(), + "nvidia.com/gpu".to_string(), + )) + .collect(), + }), + ..Default::default() + }; + + let driver_spec = driver_sandbox_spec_from_public(&spec); + assert!(driver_spec.gpu); + + let resources = driver_spec + .template + .as_ref() + .and_then(|template| template.resources.as_ref()) + .expect("resource spec should create driver template 
resources"); + assert_eq!(resources.cpu_request, "2"); + assert_eq!(resources.cpu_limit, "4"); + assert_eq!(resources.memory_request, "8Gi"); + assert_eq!(resources.memory_limit, "16Gi"); + assert_eq!(resources.gpu_count, 4); + assert_eq!( + resources.driver_config.get("kubernetes.resource-name"), + Some(&"nvidia.com/gpu".to_string()) + ); + } + #[test] fn rewrite_user_facing_conditions_rewrites_gpu_unschedulable_message() { let mut status = Some(SandboxStatus { diff --git a/crates/openshell-server/src/grpc/validation.rs b/crates/openshell-server/src/grpc/validation.rs index 160b7e031..424042435 100644 --- a/crates/openshell-server/src/grpc/validation.rs +++ b/crates/openshell-server/src/grpc/validation.rs @@ -9,7 +9,8 @@ #![allow(clippy::result_large_err)] // Validation returns Result<_, Status> use openshell_core::proto::{ - ExecSandboxRequest, Provider, SandboxPolicy as ProtoSandboxPolicy, SandboxTemplate, + ExecSandboxRequest, Provider, ResourceSpec, SandboxPolicy as ProtoSandboxPolicy, + SandboxTemplate, }; use prost::Message; use tonic::Status; @@ -21,6 +22,8 @@ use super::{ MAX_TEMPLATE_STRUCT_SIZE, }; +const NVIDIA_GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; + // --------------------------------------------------------------------------- // Exec request validation // --------------------------------------------------------------------------- @@ -130,6 +133,20 @@ pub(super) fn validate_sandbox_spec( if let Some(ref tmpl) = spec.template { validate_sandbox_template(tmpl)?; } + if let Some(ref resources) = spec.resources { + validate_resource_spec(resources)?; + validate_resource_template_conflicts(resources, spec.template.as_ref())?; + } + if spec + .resources + .as_ref() + .is_some_and(|resources| resources.gpu_count > 0) + && !spec.gpu_device.is_empty() + { + return Err(Status::invalid_argument( + "resources.gpu_count cannot be combined with gpu_device", + )); + } // --- spec.policy serialized size --- if let Some(ref policy) = spec.policy { @@ -144,6 
+161,104 @@ pub(super) fn validate_sandbox_spec( Ok(()) } +fn validate_resource_spec(resources: &ResourceSpec) -> Result<(), Status> { + for (field, value) in [ + ("resources.cpu_request", &resources.cpu_request), + ("resources.cpu_limit", &resources.cpu_limit), + ("resources.memory_request", &resources.memory_request), + ("resources.memory_limit", &resources.memory_limit), + ] { + if value.len() > MAX_MAP_VALUE_LEN { + return Err(Status::invalid_argument(format!( + "{field} exceeds maximum length ({} > {MAX_MAP_VALUE_LEN})", + value.len() + ))); + } + } + + validate_string_map( + &resources.driver_config, + MAX_TEMPLATE_MAP_ENTRIES, + MAX_MAP_KEY_LEN, + MAX_MAP_VALUE_LEN, + "resources.driver_config", + ) +} + +fn validate_resource_template_conflicts( + resources: &ResourceSpec, + template: Option<&SandboxTemplate>, +) -> Result<(), Status> { + let Some(template_resources) = template.and_then(|template| template.resources.as_ref()) else { + return Ok(()); + }; + + for (field, section, key, expected) in [ + ( + "resources.cpu_request", + "requests", + "cpu", + resources.cpu_request.as_str(), + ), + ( + "resources.cpu_limit", + "limits", + "cpu", + resources.cpu_limit.as_str(), + ), + ( + "resources.memory_request", + "requests", + "memory", + resources.memory_request.as_str(), + ), + ( + "resources.memory_limit", + "limits", + "memory", + resources.memory_limit.as_str(), + ), + ] { + if !expected.is_empty() + && template_resource_string(template_resources, section, key) + .is_some_and(|actual| actual != expected) + { + return Err(Status::invalid_argument(format!( + "{field} conflicts with template.resources.{section}.{key}" + ))); + } + } + + if resources.gpu_count > 0 + && template_resource_string(template_resources, "limits", NVIDIA_GPU_RESOURCE_NAME) + .is_some_and(|actual| actual != resources.gpu_count.to_string()) + { + return Err(Status::invalid_argument( + "resources.gpu_count conflicts with template.resources.limits.nvidia.com/gpu", + )); + } + + Ok(()) +} + 
+fn template_resource_string<'a>( + resources: &'a prost_types::Struct, + section: &str, + key: &str, +) -> Option<&'a str> { + resources + .fields + .get(section) + .and_then(|value| match value.kind.as_ref() { + Some(prost_types::value::Kind::StructValue(section)) => section.fields.get(key), + _ => None, + }) + .and_then(|value| match value.kind.as_ref() { + Some(prost_types::value::Kind::StringValue(value)) => Some(value.as_str()), + _ => None, + }) +} + /// Validate template-level field sizes. fn validate_sandbox_template(tmpl: &SandboxTemplate) -> Result<(), Status> { // String fields. @@ -674,6 +789,81 @@ mod tests { assert!(validate_sandbox_spec("gpu-sandbox", &spec).is_ok()); } + #[test] + fn validate_sandbox_spec_accepts_resource_spec() { + let spec = SandboxSpec { + resources: Some(ResourceSpec { + cpu_limit: "4".to_string(), + memory_limit: "8Gi".to_string(), + gpu_count: 2, + driver_config: std::iter::once(( + "kubernetes.resource-name".to_string(), + "nvidia.com/gpu".to_string(), + )) + .collect(), + ..Default::default() + }), + ..Default::default() + }; + + assert!(validate_sandbox_spec("resource-sandbox", &spec).is_ok()); + } + + #[test] + fn validate_sandbox_spec_rejects_gpu_count_with_gpu_device() { + let spec = SandboxSpec { + gpu_device: "nvidia.com/gpu=0".to_string(), + resources: Some(ResourceSpec { + gpu_count: 2, + ..Default::default() + }), + ..Default::default() + }; + + let err = validate_sandbox_spec("resource-sandbox", &spec).unwrap_err(); + assert_eq!(err.code(), Code::InvalidArgument); + assert!(err.message().contains("gpu_count")); + assert!(err.message().contains("gpu_device")); + } + + #[test] + fn validate_sandbox_spec_rejects_conflicting_template_resources() { + use prost_types::{Struct, Value, value::Kind}; + + let spec = SandboxSpec { + resources: Some(ResourceSpec { + cpu_limit: "4".to_string(), + ..Default::default() + }), + template: Some(SandboxTemplate { + resources: Some(Struct { + fields: std::iter::once(( + 
"limits".to_string(), + Value { + kind: Some(Kind::StructValue(Struct { + fields: std::iter::once(( + "cpu".to_string(), + Value { + kind: Some(Kind::StringValue("2".to_string())), + }, + )) + .collect(), + })), + }, + )) + .collect(), + }), + ..Default::default() + }), + ..Default::default() + }; + + let err = validate_sandbox_spec("resource-sandbox", &spec).unwrap_err(); + assert_eq!(err.code(), Code::InvalidArgument); + assert!(err.message().contains("resources.cpu_limit")); + assert!(err.message().contains("template.resources.limits.cpu")); + } + #[test] fn validate_sandbox_spec_accepts_empty_defaults() { assert!(validate_sandbox_spec("", &default_spec()).is_ok()); diff --git a/docs/kubernetes/setup.mdx b/docs/kubernetes/setup.mdx index 067e0d122..4adccecde 100644 --- a/docs/kubernetes/setup.mdx +++ b/docs/kubernetes/setup.mdx @@ -25,6 +25,7 @@ Make sure the following are in place before you install. | Agent Sandbox controller and CRDs | Yes | Install before the OpenShell chart. Refer to [Install Agent Sandbox](#install-agent-sandbox). | | cert-manager | No | Refer to [Managing Certificates](/kubernetes/managing-certificates). Use cert-manager only if you prefer it over the built-in PKI job. | | Kubernetes Gateway API | No | Refer to [Ingress](/kubernetes/ingress). Use it only for external access without port-forwarding. | +| NVIDIA device plugin or equivalent | No | Required only for GPU sandboxes. The cluster must expose allocatable `nvidia.com/gpu` resources for `--gpu` and `--gpu-count`. 
| ## Install Agent Sandbox diff --git a/docs/sandboxes/manage-sandboxes.mdx b/docs/sandboxes/manage-sandboxes.mdx index 3eddecec5..5ec45b3bd 100644 --- a/docs/sandboxes/manage-sandboxes.mdx +++ b/docs/sandboxes/manage-sandboxes.mdx @@ -35,6 +35,35 @@ To request GPU resources, add `--gpu`: openshell sandbox create --gpu -- claude ``` +To request multiple GPUs on a Kubernetes-backed gateway, set a GPU count: + +```shell +openshell sandbox create --gpu-count 4 -- claude +``` + +GPU counts schedule Kubernetes sandbox pods with an `nvidia.com/gpu` resource +limit. `--gpu-count` cannot be combined with `--gpu-device` because count-based +scheduling and driver-specific device selection are different allocation modes. + +To set portable CPU and memory resource requests or limits, use the resource +flags: + +```shell +openshell sandbox create \ + --cpu-request 2 \ + --cpu-limit 4 \ + --memory-request 8Gi \ + --memory-limit 16Gi \ + -- claude +``` + +Resource values use Kubernetes-style quantity strings. Kubernetes-backed +gateways map these fields into pod `resources.requests` and +`resources.limits`. + +For driver-specific resource settings, pass opaque key-value entries with +`--resource-config KEY=VALUE`. The selected driver interprets those entries. + For Docker-backed sandboxes, GPU injection uses Docker CDI. If you enable Docker CDI after the gateway starts, restart the gateway so OpenShell can detect the updated Docker daemon capability. diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index 3c4308f3f..6ed42cf4a 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -131,6 +131,10 @@ message DriverResourceRequirements { string memory_request = 3; // Maximum memory allowed (e.g. "512Mi", "8Gi"). string memory_limit = 4; + // Number of GPUs requested. A value of 0 means no explicit GPU count. + uint32 gpu_count = 5; + // Opaque key-value configuration interpreted by the selected driver. 
+  map<string, string> driver_config = 100;
}

// Raw status observed directly from the compute platform.
diff --git a/proto/openshell.proto b/proto/openshell.proto
index bb2ce6cec..5af21851a 100644
--- a/proto/openshell.proto
+++ b/proto/openshell.proto
@@ -256,6 +256,28 @@ message SandboxSpec {
   // (e.g. "0", "1"). When empty with gpu=true, the driver assigns the
   // first available GPU.
   string gpu_device = 10;
+  // Portable resource requirements for the sandbox workload.
+  ResourceSpec resources = 11;
+}
+
+// Portable resource requirements for a sandbox workload.
+//
+// Values use Kubernetes-style quantity strings (e.g. "500m", "2", "4Gi")
+// because they are a well-known, widely-adopted notation. Drivers for
+// non-Kubernetes platforms must parse these strings into their native units.
+message ResourceSpec {
+  // Minimum CPU cores requested (e.g. "500m", "2").
+  string cpu_request = 1;
+  // Maximum CPU cores allowed (e.g. "500m", "4").
+  string cpu_limit = 2;
+  // Minimum memory requested (e.g. "256Mi", "4Gi").
+  string memory_request = 3;
+  // Maximum memory allowed (e.g. "512Mi", "8Gi").
+  string memory_limit = 4;
+  // Number of GPUs requested. A value of 0 means no explicit GPU count.
+  uint32 gpu_count = 5;
+  // Opaque key-value configuration interpreted by the selected driver.
+  map<string, string> driver_config = 100;
+}