From 55357dfeaa3c4d4122128774ed928fa7a85411d6 Mon Sep 17 00:00:00 2001
From: Alexander Watson
Date: Mon, 11 May 2026 10:47:33 -0700
Subject: [PATCH 01/15] feat(policy): plumb chunk_ids and rejection_reason
 through proposal pipeline

Prereq plumbing for the agent revise-and-resubmit loop. Two narrow
additive proto changes unblock the upcoming /wait endpoint (#1092),
prover validation badge (#1097), and reject --guidance surfaces (#1098).

- SubmitPolicyAnalysisResponse: add accepted_chunk_ids so the in-sandbox
  agent gets handles to watch its proposals. Surfaced through the typed
  grpc_client wrapper and policy.local's POST /v1/proposals 202 body.
  Closes #1094.

- PolicyChunk + StoredDraftChunk + DraftChunkPayload: add
  validation_result (gateway prover verdict, populated by #1097) and
  rejection_reason (operator free-form text). Both plain strings; no
  enums, no parsing on the read path. Closes #1096.

- RejectDraftChunk now persists the existing reason field into the
  chunk's rejection_reason so it round-trips back to the agent via
  GetDraftPolicy. UndoDraftChunk clears it on the way back to pending so
  consumers cannot read a stale guidance string from a prior
  reject -> re-approve -> undo cycle.

Whole surface stays gated behind agent_policy_proposals_enabled. Two
focused tests cover the round-trip and the undo-clears guarantee.

Signed-off-by: Alexander Watson
---
 crates/openshell-sandbox/src/grpc_client.rs  |   5 +-
 .../src/mechanistic_mapper.rs                |   2 +
 crates/openshell-sandbox/src/policy_local.rs |   3 +
 crates/openshell-server/src/grpc/policy.rs   | 241 +++++++++++++++++-
 .../src/persistence/postgres.rs              |   4 +
 .../src/persistence/sqlite.rs                |   4 +
 crates/openshell-server/src/policy_store.rs  |  14 +-
 proto/openshell.proto                        |  23 ++
 8 files changed, 287 insertions(+), 9 deletions(-)

diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs
index 1cb15f929..96cce1ae8 100644
--- a/crates/openshell-sandbox/src/grpc_client.rs
+++ b/crates/openshell-sandbox/src/grpc_client.rs
@@ -332,8 +332,9 @@ impl CachedOpenShellClient {
     /// Submit denial summaries and/or agent-authored proposals for policy analysis.
     ///
     /// Returns the gateway response so callers can surface accepted/rejected
-    /// counts and rejection reasons (e.g., the `policy.local` API forwards
-    /// these to the in-sandbox agent).
+    /// counts, rejection reasons, and server-assigned `accepted_chunk_ids`
+    /// (e.g., the `policy.local` API forwards these to the in-sandbox agent
+    /// so it can watch proposal state via `GET /v1/proposals/{id}`).
     pub async fn submit_policy_analysis(
         &self,
         sandbox_name: &str,
diff --git a/crates/openshell-sandbox/src/mechanistic_mapper.rs b/crates/openshell-sandbox/src/mechanistic_mapper.rs
index cb6daa550..1df6953ed 100644
--- a/crates/openshell-sandbox/src/mechanistic_mapper.rs
+++ b/crates/openshell-sandbox/src/mechanistic_mapper.rs
@@ -230,6 +230,8 @@ pub async fn generate_proposals(summaries: &[DenialSummary]) -> Vec<PolicyChunk>
             first_seen_ms,
             last_seen_ms,
             binary: binary.clone(),
+            validation_result: String::new(),
+            rejection_reason: String::new(),
         });
     }
 
diff --git a/crates/openshell-sandbox/src/policy_local.rs b/crates/openshell-sandbox/src/policy_local.rs
index 165b0c1bd..1bbba7acc 100644
--- a/crates/openshell-sandbox/src/policy_local.rs
+++ b/crates/openshell-sandbox/src/policy_local.rs
@@ -451,6 +451,7 @@ async fn submit_proposal(ctx: &PolicyLocalContext, body: &[u8]) -> (u16, serde_j
             "accepted_chunks": response.accepted_chunks,
             "rejected_chunks": response.rejected_chunks,
             "rejection_reasons": response.rejection_reasons,
+            "accepted_chunk_ids": response.accepted_chunk_ids,
         }),
     )
 }
@@ -521,6 +522,8 @@ fn policy_chunk_from_add_rule(
         first_seen_ms: 0,
         last_seen_ms: 0,
         binary,
+        validation_result: String::new(),
+        rejection_reason: String::new(),
     })
 }
 
diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs
index 885dbc9ad..d01cccb82 100644
--- a/crates/openshell-server/src/grpc/policy.rs
+++ b/crates/openshell-server/src/grpc/policy.rs
@@ -1358,6 +1358,7 @@ pub(super) async fn handle_submit_policy_analysis(
     let mut accepted: u32 = 0;
     let mut rejected: u32 = 0;
     let mut rejection_reasons: Vec<String> = Vec::new();
+    let mut accepted_chunk_ids: Vec<String> = Vec::new();
 
     for chunk in &req.proposed_chunks {
         if chunk.rule_name.is_empty() {
@@ -1391,7 +1392,7 @@ pub(super) async fn handle_submit_policy_analysis(
             .unwrap_or_default();
 
         let record = DraftChunkRecord {
-            id: chunk_id,
+            id: chunk_id.clone(),
             sandbox_id: sandbox_id.clone(),
             draft_version,
             status: "pending".to_string(),
@@ -1419,6 +1420,8 @@ pub(super) async fn handle_submit_policy_analysis(
             } else {
                 now_ms
             },
+            validation_result: String::new(),
+            rejection_reason: String::new(),
         };
         state
             .store
@@ -1426,6 +1429,7 @@ pub(super) async fn handle_submit_policy_analysis(
             .await
             .map_err(|e| Status::internal(format!("persist draft chunk failed: {e}")))?;
         accepted += 1;
+        accepted_chunk_ids.push(chunk_id);
     }
 
     state.sandbox_watch_bus.notify(&sandbox_id);
@@ -1443,6 +1447,7 @@ pub(super) async fn handle_submit_policy_analysis(
         accepted_chunks: accepted,
         rejected_chunks: rejected,
         rejection_reasons,
+        accepted_chunk_ids,
     }))
 }
 
@@ -1559,7 +1564,7 @@ pub(super) async fn handle_approve_draft_chunk(
         current_time_ms().map_err(|e| Status::internal(format!("timestamp error: {e}")))?;
     state
         .store
-        .update_draft_chunk_status(&req.chunk_id, "approved", Some(now_ms))
+        .update_draft_chunk_status(&req.chunk_id, "approved", Some(now_ms), None)
         .await
         .map_err(|e| Status::internal(format!("update chunk status failed: {e}")))?;
 
@@ -1657,9 +1662,17 @@ pub(super) async fn handle_reject_draft_chunk(
     let now_ms =
         current_time_ms().map_err(|e| Status::internal(format!("timestamp error: {e}")))?;
 
+    // Persist the reviewer's free-form `reason` into the chunk's
+    // `rejection_reason` field so the in-sandbox agent can read it back via
+    // GetDraftPolicy / policy.local and revise the proposal.
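+    // An empty `reason` maps to `None` below, which leaves any previously
+    // stored reason in place rather than overwriting it with "".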
+    let persisted_reason = if req.reason.is_empty() {
+        None
+    } else {
+        Some(req.reason.as_str())
+    };
     state
         .store
-        .update_draft_chunk_status(&req.chunk_id, "rejected", Some(now_ms))
+        .update_draft_chunk_status(&req.chunk_id, "rejected", Some(now_ms), persisted_reason)
         .await
         .map_err(|e| Status::internal(format!("update chunk status failed: {e}")))?;
 
@@ -1741,7 +1754,7 @@ pub(super) async fn handle_approve_all_draft_chunks(
             current_time_ms().map_err(|e| Status::internal(format!("timestamp error: {e}")))?;
         state
             .store
-            .update_draft_chunk_status(&chunk.id, "approved", Some(now_ms))
+            .update_draft_chunk_status(&chunk.id, "approved", Some(now_ms), None)
             .await
             .map_err(|e| Status::internal(format!("update chunk status failed: {e}")))?;
 
@@ -1884,9 +1897,12 @@ pub(super) async fn handle_undo_draft_chunk(
 
     let (version, hash) = remove_chunk_from_policy(state, &sandbox_id, &chunk).await?;
 
+    // Clear any prior rejection_reason on the way back to "pending" so an
+    // agent reading the chunk via policy.local cannot see a stale guidance
+    // string left over from a previous reject → undo round.
     state
         .store
-        .update_draft_chunk_status(&req.chunk_id, "pending", None)
+        .update_draft_chunk_status(&req.chunk_id, "pending", None, Some(""))
         .await
         .map_err(|e| Status::internal(format!("update chunk status failed: {e}")))?;
 
@@ -2108,6 +2124,8 @@ fn draft_chunk_record_to_proto(record: &DraftChunkRecord) -> Result<PolicyChunk,
         first_seen_ms: record.first_seen_ms,
         last_seen_ms: record.last_seen_ms,
         binary: record.binary.clone(),
+        validation_result: record.validation_result.clone(),
+        rejection_reason: record.rejection_reason.clone(),
     })
 }
 
diff --git a/crates/openshell-server/src/persistence/postgres.rs b/crates/openshell-server/src/persistence/postgres.rs
--- a/crates/openshell-server/src/persistence/postgres.rs
+++ b/crates/openshell-server/src/persistence/postgres.rs
@@ -483,6 +483,7 @@ ORDER BY created_at_ms DESC
         id: &str,
         status: &str,
         decided_at_ms: Option<i64>,
+        rejection_reason: Option<&str>,
     ) -> PersistenceResult<bool> {
         let Some(mut record) = self.get_draft_chunk(id).await? else {
             return Ok(false);
         };
@@ -491,6 +492,9 @@ ORDER BY created_at_ms DESC
         record.status = status.to_string();
         record.decided_at_ms = decided_at_ms;
         record.last_seen_ms = current_time_ms()?;
+        if let Some(reason) = rejection_reason {
+            record.rejection_reason = reason.to_string();
+        }
         let payload = draft_chunk_payload_from_record(&record)?;
         let result = sqlx::query(
diff --git a/crates/openshell-server/src/persistence/sqlite.rs b/crates/openshell-server/src/persistence/sqlite.rs
index fafb07597..a1e1503e4 100644
--- a/crates/openshell-server/src/persistence/sqlite.rs
+++ b/crates/openshell-server/src/persistence/sqlite.rs
@@ -498,6 +498,7 @@ ORDER BY "created_at_ms" DESC
         id: &str,
         status: &str,
         decided_at_ms: Option<i64>,
+        rejection_reason: Option<&str>,
     ) -> PersistenceResult<bool> {
         let Some(mut record) = self.get_draft_chunk(id).await? else {
             return Ok(false);
         };
@@ -506,6 +507,9 @@ ORDER BY "created_at_ms" DESC
         record.status = status.to_string();
         record.decided_at_ms = decided_at_ms;
         record.last_seen_ms = current_time_ms()?;
+        if let Some(reason) = rejection_reason {
+            record.rejection_reason = reason.to_string();
+        }
         let payload = draft_chunk_payload_from_record(&record)?;
         let result = sqlx::query(
diff --git a/crates/openshell-server/src/policy_store.rs b/crates/openshell-server/src/policy_store.rs
index f0a43698e..2895250eb 100644
--- a/crates/openshell-server/src/policy_store.rs
+++ b/crates/openshell-server/src/policy_store.rs
@@ -64,11 +64,16 @@ pub trait PolicyStoreExt {
         status_filter: Option<&str>,
     ) -> PersistenceResult<Vec<DraftChunkRecord>>;
 
+    /// Update a draft chunk's status, optionally recording a free-form
+    /// `rejection_reason`. `Some` overwrites the stored reason (the empty
+    /// string clears it); `None` leaves any prior reason intact.
     async fn update_draft_chunk_status(
         &self,
         id: &str,
         status: &str,
         decided_at_ms: Option<i64>,
+        rejection_reason: Option<&str>,
     ) -> PersistenceResult<bool>;
 
     async fn update_draft_chunk_rule(
@@ -216,16 +221,17 @@ impl PolicyStoreExt for Store {
         id: &str,
         status: &str,
         decided_at_ms: Option<i64>,
+        rejection_reason: Option<&str>,
     ) -> PersistenceResult<bool> {
         match self {
             Self::Postgres(store) => {
                 store
-                    .update_draft_chunk_status(id, status, decided_at_ms)
+                    .update_draft_chunk_status(id, status, decided_at_ms, rejection_reason)
                    .await
             }
             Self::Sqlite(store) => {
                 store
-                    .update_draft_chunk_status(id, status, decided_at_ms)
+                    .update_draft_chunk_status(id, status, decided_at_ms, rejection_reason)
                     .await
             }
         }
@@ -325,6 +331,8 @@ pub fn draft_chunk_payload_from_record(chunk: &DraftChunkRecord) -> PersistenceR
         port: chunk.port,
         binary: chunk.binary.clone(),
         draft_version: chunk.draft_version,
+        validation_result: chunk.validation_result.clone(),
+        rejection_reason: chunk.rejection_reason.clone(),
     }
     .encode_to_vec())
 }
@@ -365,5 +373,7 @@ pub fn draft_chunk_record_from_parts(
         hit_count: i32::try_from(hit_count).unwrap_or(i32::MAX),
         first_seen_ms: created_at_ms,
         last_seen_ms: updated_at_ms,
+        validation_result: wrapper.validation_result,
+        rejection_reason: wrapper.rejection_reason,
     })
 }
diff --git a/proto/openshell.proto b/proto/openshell.proto
index 883c1576c..6aac4a245 100644
--- a/proto/openshell.proto
+++ b/proto/openshell.proto
@@ -1213,6 +1213,15 @@ message PolicyChunk {
   int64 last_seen_ms = 15;
   // Binary path that triggered the denial (denormalized for display convenience).
   string binary = 16;
+  // Validation verdict from gateway-side static checks (prover output).
+  // Free-form summary string for human consumption in the inbox card.
+  // Empty until the prover has run for this chunk.
+  string validation_result = 17;
+  // Operator-supplied free-form text accompanying a rejection. Populated
+  // when the reviewer rejects via `RejectDraftChunkRequest.reason`; surfaced
+  // back to the in-sandbox agent so it can revise the proposal.
+  // Empty for non-rejected chunks.
+  string rejection_reason = 18;
 }
 
 // Notification that the draft policy was updated.
@@ -1246,6 +1255,10 @@ message SubmitPolicyAnalysisResponse {
   uint32 rejected_chunks = 2;
   // Reasons for each rejected chunk.
   repeated string rejection_reasons = 3;
+  // Server-assigned chunk IDs for the accepted chunks, in submission order.
+  // Agents use these to watch proposal state via policy.local's
+  // GET /v1/proposals/{id} and /wait endpoints.
+  repeated string accepted_chunk_ids = 4;
 }
 
 // Get draft policy for a sandbox.
@@ -1408,6 +1421,12 @@ message DraftChunkPayload {
   string binary = 9;
   // Current draft version for the owning sandbox.
   int64 draft_version = 10;
+  // Gateway prover verdict for this chunk; empty until prover runs.
+  // Mirrors PolicyChunk.validation_result.
+  string validation_result = 11;
+  // Operator-supplied free-form rejection text; empty for non-rejected
+  // chunks. Mirrors PolicyChunk.rejection_reason.
+  string rejection_reason = 12;
 }
 
 // Internal stored policy revision row materialized from the generic objects table.
@@ -1442,4 +1461,8 @@ message StoredDraftChunk {
   int32 hit_count = 15;
   int64 first_seen_ms = 16;
   int64 last_seen_ms = 17;
+  // Gateway prover verdict; empty until the prover runs. See PolicyChunk.
+  string validation_result = 18;
+  // Operator-supplied free-form rejection text. See PolicyChunk.
+  string rejection_reason = 19;
 }

From 652081d43f622de78b466e5cd3c717e83f2e1e45 Mon Sep 17 00:00:00 2001
From: Alexander Watson
Date: Mon, 11 May 2026 11:05:32 -0700
Subject: [PATCH 02/15] feat(sandbox): add /v1/proposals/{id} and /wait
 long-poll to policy.local
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The agent feedback channel back from policy.local. Two new routes let
the in-sandbox agent learn its proposal's outcome on a single blocking
HTTP call — zero LLM tokens during the wait.

- GET /v1/proposals/{chunk_id} returns the chunk's current state in one
  gateway call.

- GET /v1/proposals/{chunk_id}/wait?timeout= blocks until the chunk
  transitions out of pending. Default 60s, clamped [1, 300]. Agent
  re-issues on timeout to extend.

Response carries the chunk's status plus the two feedback fields shipped
in the prereq commit: rejection_reason (free-form reviewer text) and
validation_result (gateway prover verdict, empty until #1097). On
timeout: same shape with timed_out: true so the agent can disambiguate
without parsing.

Wait handler short-polls GetDraftPolicy every 1s inside the request with
a tokio::time::Instant deadline. One gateway connection is opened per
request and reused across all polls, so a 60s wait does one TLS
handshake instead of sixty. A future commit can swap the loop body for a
tokio::sync::broadcast driven by a watcher task — the agent-visible
contract (URL, query, response shape) is independent of the polling
implementation.

All routes stay behind agent_policy_proposals_enabled. Closes #1092.

Signed-off-by: Alexander Watson
---
 crates/openshell-sandbox/src/grpc_client.rs  |  32 +-
 crates/openshell-sandbox/src/policy_local.rs | 323 +++++++++++++++++++
 2 files changed, 350 insertions(+), 5 deletions(-)

diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs
index 96cce1ae8..19a6831e5 100644
--- a/crates/openshell-sandbox/src/grpc_client.rs
+++ b/crates/openshell-sandbox/src/grpc_client.rs
@@ -9,10 +9,11 @@ use std::time::Duration;
 use miette::{IntoDiagnostic, Result, WrapErr};
 use openshell_core::proto::{
-    DenialSummary, GetInferenceBundleRequest, GetInferenceBundleResponse, GetSandboxConfigRequest,
-    GetSandboxProviderEnvironmentRequest, PolicySource, PolicyStatus, ReportPolicyStatusRequest,
-    SandboxPolicy as ProtoSandboxPolicy, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse,
-    UpdateConfigRequest, inference_client::InferenceClient, open_shell_client::OpenShellClient,
+    DenialSummary, GetDraftPolicyRequest, GetInferenceBundleRequest, GetInferenceBundleResponse,
+    GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, PolicyChunk, PolicySource,
+    PolicyStatus, ReportPolicyStatusRequest, SandboxPolicy as ProtoSandboxPolicy,
+    SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, UpdateConfigRequest,
+    inference_client::InferenceClient, open_shell_client::OpenShellClient,
 };
 use tonic::service::interceptor::InterceptedService;
 use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint, Identity};
@@ -339,7 +340,7 @@ impl CachedOpenShellClient {
         &self,
         sandbox_name: &str,
         summaries: Vec<DenialSummary>,
-        proposed_chunks: Vec<openshell_core::proto::PolicyChunk>,
+        proposed_chunks: Vec<PolicyChunk>,
         analysis_mode: &str,
     ) -> Result<SubmitPolicyAnalysisResponse> {
         let response = self
@@ -357,6 +358,27 @@ impl CachedOpenShellClient {
         Ok(response.into_inner())
     }
 
+    /// Fetch the current draft chunks for a sandbox. `status_filter` may be
+    /// `"pending"`, `"approved"`, `"rejected"`, or empty for all. Used by
+    /// `policy.local`'s `GET /v1/proposals/{id}` and `/wait` routes to
+    /// inspect proposal state.
+    pub async fn get_draft_policy(
+        &self,
+        sandbox_name: &str,
+        status_filter: &str,
+    ) -> Result<Vec<PolicyChunk>> {
+        let response = self
+            .client
+            .clone()
+            .get_draft_policy(GetDraftPolicyRequest {
+                name: sandbox_name.to_string(),
+                status_filter: status_filter.to_string(),
+            })
+            .await
+            .into_diagnostic()?;
+        Ok(response.into_inner().chunks)
+    }
+
     /// Report policy load status back to the server.
     pub async fn report_policy_status(
         &self,
diff --git a/crates/openshell-sandbox/src/policy_local.rs b/crates/openshell-sandbox/src/policy_local.rs
index 1bbba7acc..b363d0b74 100644
--- a/crates/openshell-sandbox/src/policy_local.rs
+++ b/crates/openshell-sandbox/src/policy_local.rs
@@ -31,6 +31,19 @@ pub const SKILL_PATH: &str = "/etc/openshell/skills/policy_advisor.md";
 pub const ROUTE_POLICY_CURRENT: &str = "/v1/policy/current";
 pub const ROUTE_DENIALS: &str = "/v1/denials";
 pub const ROUTE_PROPOSALS: &str = "/v1/proposals";
+/// Per-proposal status and long-poll routes live below this prefix:
+///   `GET /v1/proposals/{chunk_id}` — immediate status
+///   `GET /v1/proposals/{chunk_id}/wait?timeout=` — long-poll until terminal
+/// Trailing slash differentiates from the bare `POST /v1/proposals` submit.
+const ROUTE_PROPOSALS_PREFIX: &str = "/v1/proposals/";
+
+/// Long-poll bounds for `GET /v1/proposals/{id}/wait?timeout=`. The agent
+/// re-issues on timeout, so the cap is a hold ceiling, not a hard limit on
+/// how long the agent can wait overall.
+const PROPOSAL_WAIT_DEFAULT_SECS: u64 = 60;
+const PROPOSAL_WAIT_MIN_SECS: u64 = 1;
+const PROPOSAL_WAIT_MAX_SECS: u64 = 300;
+const PROPOSAL_WAIT_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1);
 
 const MAX_POLICY_LOCAL_BODY_BYTES: usize = 64 * 1024;
 /// Hard ceiling on how long a single request body read can stall. Bounds a
@@ -135,6 +148,9 @@ async fn route_request(
         ("GET", ROUTE_POLICY_CURRENT) => current_policy_response(ctx).await,
         ("GET", ROUTE_DENIALS) => recent_denials_response(ctx, query).await,
         ("POST", ROUTE_PROPOSALS) => submit_proposal(ctx, body).await,
+        ("GET", path) if path.starts_with(ROUTE_PROPOSALS_PREFIX) => {
+            proposal_state_route(ctx, path, query).await
+        }
         _ => (
             404,
             serde_json::json!({
@@ -145,6 +161,42 @@ async fn route_request(
     }
 }
 
+/// Parse `{chunk_id}` (status) or `{chunk_id}/wait` (long-poll) from the path
+/// suffix and dispatch. Empty `chunk_id` or extra segments return 404 so a
+/// malformed path cannot trigger a gateway call.
+async fn proposal_state_route(
+    ctx: &PolicyLocalContext,
+    path: &str,
+    query: &str,
+) -> (u16, serde_json::Value) {
+    let suffix = path
+        .strip_prefix(ROUTE_PROPOSALS_PREFIX)
+        .unwrap_or_default();
+    let (chunk_id, wait) = match suffix.split_once('/') {
+        Some((id, "wait")) => (id, true),
+        Some(_) => return not_found_payload(path),
+        None => (suffix, false),
+    };
+    if chunk_id.is_empty() {
+        return not_found_payload(path);
+    }
+    if wait {
+        proposal_wait_response(ctx, chunk_id, query).await
+    } else {
+        proposal_status_response(ctx, chunk_id).await
+    }
+}
+
+fn not_found_payload(path: &str) -> (u16, serde_json::Value) {
+    (
+        404,
+        serde_json::json!({
+            "error": "not_found",
+            "detail": format!("policy.local proposal sub-route not found: {path}")
+        }),
+    )
+}
+
 /// Build the `next_steps` array embedded in the L7 deny body so the agent has
 /// machine-readable pointers to this API. Centralizes the shape here to keep
 /// the deny body and the actual route table from drifting — adding or
@@ -456,6 +508,176 @@ async fn submit_proposal(ctx: &PolicyLocalContext, body: &[u8]) -> (u16, serde_j
     )
 }
 
+/// `GET /v1/proposals/{chunk_id}` — immediate state. One gateway call, no loop.
+async fn proposal_status_response(
+    ctx: &PolicyLocalContext,
+    chunk_id: &str,
+) -> (u16, serde_json::Value) {
+    let session = match open_lookup_session(ctx).await {
+        Ok(session) => session,
+        Err(err) => return err,
+    };
+    fetch_chunk_or_404(&session, chunk_id, false).await
+}
+
+/// `GET /v1/proposals/{chunk_id}/wait?timeout=` — block until terminal or
+/// timeout. Returns the chunk's current state on a status transition; on
+/// timeout, returns the still-pending state with `timed_out: true` so the
+/// agent can re-issue without ambiguity. The agent's wait costs zero LLM
+/// tokens — the tool call sits in a socket recv until we return.
+async fn proposal_wait_response(
+    ctx: &PolicyLocalContext,
+    chunk_id: &str,
+    query: &str,
+) -> (u16, serde_json::Value) {
+    let session = match open_lookup_session(ctx).await {
+        Ok(session) => session,
+        Err(err) => return err,
+    };
+    let timeout_secs = parse_timeout_query(query);
+    let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(timeout_secs);
+    loop {
+        match fetch_chunk(&session, chunk_id).await {
+            Ok(Some(chunk)) if is_terminal_status(&chunk.status) => {
+                return (200, chunk_state_payload(&chunk, false));
+            }
+            Ok(Some(chunk)) => {
+                let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
+                if remaining.is_zero() {
+                    return (200, chunk_state_payload(&chunk, true));
+                }
+                let sleep_for = std::cmp::min(remaining, PROPOSAL_WAIT_POLL_INTERVAL);
+                tokio::time::sleep(sleep_for).await;
+            }
+            Ok(None) => return chunk_not_found_payload(chunk_id),
+            Err(err) => return err,
+        }
+    }
+}
+
+fn chunk_not_found_payload(chunk_id: &str) -> (u16, serde_json::Value) {
+    (
+        404,
+        error_payload(
+            "chunk_not_found",
+            format!("chunk '{chunk_id}' is not present in this sandbox's draft policy"),
+        ),
+    )
+}
+
+async fn fetch_chunk_or_404(
+    session: &LookupSession<'_>,
+    chunk_id: &str,
+    timed_out: bool,
+) -> (u16, serde_json::Value) {
+    match fetch_chunk(session, chunk_id).await {
+        Ok(Some(chunk)) => (200, chunk_state_payload(&chunk, timed_out)),
+        Ok(None) => chunk_not_found_payload(chunk_id),
+        Err(err) => err,
+    }
+}
+
+/// Build the agent-facing response for a chunk.
+///
+/// Selection rule: include the fields the agent needs to decide what to do
+/// next on the redraft loop — identity (`chunk_id`, `status`), the proposal
+/// it submitted (`rule_name`, `binary`), and the two feedback signals
+/// (`rejection_reason` from the reviewer, `validation_result` from the
+/// gateway prover). Display-only proto fields (`hit_count`, `confidence`,
+/// `stage`, timing) are left off until a concrete agent need surfaces them.
+fn chunk_state_payload(chunk: &PolicyChunk, timed_out: bool) -> serde_json::Value {
+    let mut payload = serde_json::json!({
+        "chunk_id": chunk.id,
+        "status": chunk.status,
+        "rule_name": chunk.rule_name,
+        "binary": chunk.binary,
+        "rejection_reason": chunk.rejection_reason,
+        "validation_result": chunk.validation_result,
+    });
+    if timed_out {
+        payload["timed_out"] = serde_json::json!(true);
+    }
+    payload
+}
+
+fn is_terminal_status(status: &str) -> bool {
+    matches!(status, "approved" | "rejected")
+}
+
+/// Parse `?timeout=` from the query string. Default applies for missing
+/// or unparseable values; bounds clamp to keep the agent's hold ceiling
+/// sane. Re-issue is the right pattern for longer waits.
+fn parse_timeout_query(query: &str) -> u64 {
+    let raw = query
+        .split('&')
+        .filter_map(|kv| kv.split_once('='))
+        .find(|(k, _)| *k == "timeout")
+        .map_or("", |(_, v)| v);
+    raw.parse::<u64>()
+        .unwrap_or(PROPOSAL_WAIT_DEFAULT_SECS)
+        .clamp(PROPOSAL_WAIT_MIN_SECS, PROPOSAL_WAIT_MAX_SECS)
+}
+
+/// One connected gateway client + the validated sandbox name. Built once
+/// per request and reused for every `fetch_chunk` call in a wait loop so a
+/// 60-second wait does one TLS handshake, not sixty.
+struct LookupSession<'a> {
+    client: crate::grpc_client::CachedOpenShellClient,
+    sandbox_name: &'a str,
+}
+
+/// Validate ctx and open one gateway channel. Failures map to the canonical
+/// error payload shape used by both `/proposals/{id}` and `/wait`.
+async fn open_lookup_session(
+    ctx: &PolicyLocalContext,
+) -> std::result::Result<LookupSession<'_>, (u16, serde_json::Value)> {
+    let endpoint = ctx.gateway_endpoint.as_deref().ok_or_else(|| {
+        (
+            503,
+            error_payload(
+                "gateway_unavailable",
+                "proposal state lookup requires a gateway-connected sandbox".to_string(),
+            ),
+        )
+    })?;
+    let sandbox_name = ctx
+        .sandbox_name
+        .as_deref()
+        .map(str::trim)
+        .filter(|name| !name.is_empty())
+        .ok_or_else(|| {
+            (
+                503,
+                error_payload(
+                    "sandbox_name_unavailable",
+                    "proposal state lookup requires a sandbox name".to_string(),
+                ),
+            )
+        })?;
+    let client = crate::grpc_client::CachedOpenShellClient::connect(endpoint)
+        .await
+        .map_err(|e| (502, error_payload("gateway_connect_failed", e.to_string())))?;
+    Ok(LookupSession {
+        client,
+        sandbox_name,
+    })
+}
+
+/// One gateway call: list the sandbox's draft chunks and find the matching
+/// id. Returns `Ok(None)` only when the gateway responded successfully but
+/// no chunk in this sandbox matches.
+async fn fetch_chunk(
+    session: &LookupSession<'_>,
+    chunk_id: &str,
+) -> std::result::Result<Option<PolicyChunk>, (u16, serde_json::Value)> {
+    let chunks = session
+        .client
+        .get_draft_policy(session.sandbox_name, "")
+        .await
+        .map_err(|e| (502, error_payload("gateway_lookup_failed", e.to_string())))?;
+    Ok(chunks.into_iter().find(|c| c.id == chunk_id))
+}
+
 fn proposal_chunks_from_body(body: &[u8]) -> std::result::Result<Vec<PolicyChunk>, String> {
     let request: ProposalRequest = serde_json::from_slice(body).map_err(|e| e.to_string())?;
     if request.operations.is_empty() {
@@ -1159,4 +1381,105 @@ mod tests {
         assert_eq!(body["format"], "yaml");
         assert!(body["policy_yaml"].as_str().unwrap().contains("version: 1"));
     }
+
+    #[test]
+    fn parse_timeout_query_defaults_and_clamps() {
+        assert_eq!(parse_timeout_query(""), PROPOSAL_WAIT_DEFAULT_SECS);
+        assert_eq!(parse_timeout_query("timeout="), PROPOSAL_WAIT_DEFAULT_SECS);
+        assert_eq!(
+            parse_timeout_query("timeout=abc"),
+            PROPOSAL_WAIT_DEFAULT_SECS
+        );
+        assert_eq!(parse_timeout_query("timeout=30"), 30);
+        assert_eq!(parse_timeout_query("foo=1&timeout=45"), 45);
+        // Below floor clamps up; above ceiling clamps down.
+        assert_eq!(parse_timeout_query("timeout=0"), PROPOSAL_WAIT_MIN_SECS);
+        assert_eq!(parse_timeout_query("timeout=9999"), PROPOSAL_WAIT_MAX_SECS);
+    }
+
+    #[test]
+    fn is_terminal_status_matches_only_approved_and_rejected() {
+        assert!(!is_terminal_status("pending"));
+        assert!(is_terminal_status("approved"));
+        assert!(is_terminal_status("rejected"));
+        assert!(!is_terminal_status(""));
+    }
+
+    #[test]
+    fn chunk_state_payload_surfaces_loop_fields() {
+        let chunk = PolicyChunk {
+            id: "chunk-x".to_string(),
+            status: "rejected".to_string(),
+            rule_name: "allow_example".to_string(),
+            binary: "/usr/bin/curl".to_string(),
+            rejection_reason: "scope too broad".to_string(),
+            validation_result: "no exfil paths".to_string(),
+            ..Default::default()
+        };
+        let pending = chunk_state_payload(&chunk, false);
+        assert_eq!(pending["chunk_id"], "chunk-x");
+        assert_eq!(pending["status"], "rejected");
+        assert_eq!(pending["rejection_reason"], "scope too broad");
+        assert_eq!(pending["validation_result"], "no exfil paths");
+        // timed_out only appears when set.
+        assert!(pending.get("timed_out").is_none());
+
+        let timed = chunk_state_payload(&chunk, true);
+        assert_eq!(timed["timed_out"], true);
+    }
+
+    #[tokio::test]
+    async fn proposal_routes_reject_malformed_paths() {
+        let _guard = ProposalsFlagGuard::set(true).await;
+        let ctx = PolicyLocalContext::new(None, None, None);
+
+        // Empty chunk_id after the prefix is 404, not a wildcard list.
+        let (status, _) = route_request(&ctx, "GET", "/v1/proposals/", &[]).await;
+        assert_eq!(status, 404);
+
+        // More than one segment after the id (not "/wait") is 404, not a
+        // partial match. Prevents `/v1/proposals/abc/extra` from silently
+        // dispatching as a status lookup for "abc/extra".
+        let (status, _) = route_request(&ctx, "GET", "/v1/proposals/abc/extra", &[]).await;
+        assert_eq!(status, 404);
+
+        // Trailing path after `/wait` also 404 — must not match the wait
+        // arm as a wildcard.
+        let (status, _) = route_request(&ctx, "GET", "/v1/proposals/abc/wait/extra", &[]).await;
+        assert_eq!(status, 404);
+    }
+
+    #[tokio::test]
+    async fn proposal_status_route_returns_503_when_no_gateway() {
+        let _guard = ProposalsFlagGuard::set(true).await;
+        let ctx = PolicyLocalContext::new(None, None, Some("test-sandbox".to_string()));
+
+        let (status, body) = route_request(&ctx, "GET", "/v1/proposals/chunk-id", &[]).await;
+        assert_eq!(status, 503);
+        assert_eq!(body["error"], "gateway_unavailable");
+    }
+
+    #[tokio::test]
+    async fn proposal_wait_route_returns_503_when_no_gateway() {
+        let _guard = ProposalsFlagGuard::set(true).await;
+        let ctx = PolicyLocalContext::new(None, None, Some("test-sandbox".to_string()));
+
+        let (status, body) =
+            route_request(&ctx, "GET", "/v1/proposals/chunk-id/wait?timeout=1", &[]).await;
+        assert_eq!(status, 503);
+        assert_eq!(body["error"], "gateway_unavailable");
+    }
+
+    #[tokio::test]
+    async fn proposal_routes_return_feature_disabled_when_flag_off() {
+        let _guard = ProposalsFlagGuard::set(false).await;
+        let ctx = PolicyLocalContext::new(None, None, Some("test-sandbox".to_string()));
+
+        let (status, body) = route_request(&ctx, "GET", "/v1/proposals/abc", &[]).await;
+        assert_eq!(status, 404);
+        assert_eq!(body["error"], "feature_disabled");
+
+        let (status, _) = route_request(&ctx, "GET", "/v1/proposals/abc/wait", &[]).await;
+        assert_eq!(status, 404);
+    }
 }

From d5fe465bd3f9cc41b554d64d480ab56115170f87 Mon Sep 17 00:00:00 2001
From: Alexander Watson
Date: Mon, 11 May 2026 11:11:03 -0700
Subject: [PATCH 03/15] docs(sandbox): teach policy_advisor skill the wait +
 redraft loop

The agent-facing instructions for the feedback loop. The endpoints
exist; this is the doc that makes them usable.

policy_advisor.md gains:

- API entries for GET /v1/proposals/{chunk_id} and /wait?timeout=,
  including the field semantics (status, rejection_reason,
  validation_result, timed_out).

- A note on the submit response's accepted_chunk_ids /
  rejection_reasons split so the agent handles partial acceptance.

- Step 6 saves the chunk_ids and addresses any submit-time rejections
  before waiting.

- Step 7 walks the four wait outcomes: approved (retry, with the honest
  "may still fail" caveat), rejected (read rejection_reason AND
  validation_result; address whichever has content), still-pending with
  timed_out (re-call), non-2xx (surface, do not retry).

skills.rs gains two assertions on the skill content so a future edit
cannot drop the wait endpoint or the rejection_reason directive
silently.

Closes #1095.

Signed-off-by: Alexander Watson
---
 crates/openshell-sandbox/src/skills.rs        |  6 +++
 .../src/skills/policy_advisor.md              | 38 +++++++++++++++++--
 2 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/crates/openshell-sandbox/src/skills.rs b/crates/openshell-sandbox/src/skills.rs
index 91654699f..d2b898226 100644
--- a/crates/openshell-sandbox/src/skills.rs
+++ b/crates/openshell-sandbox/src/skills.rs
@@ -59,5 +59,11 @@ mod tests {
         assert!(content.contains("# OpenShell Policy Advisor"));
         assert!(content.contains("policy.local"));
         assert!(content.contains("addRule"));
+        // The wait-loop teaching is load-bearing for the agent feedback
+        // UX; lock the workflow language in so future skill edits cannot
+        // drop it silently. Both substrings target the directive, not the
+        // field name (which could appear in the API doc block alone).
+        assert!(content.contains("/v1/proposals/{chunk_id}/wait"));
+        assert!(content.contains("read `rejection_reason`"));
     }
 }
diff --git a/crates/openshell-sandbox/src/skills/policy_advisor.md b/crates/openshell-sandbox/src/skills/policy_advisor.md
index 57546145c..3503674b9 100644
--- a/crates/openshell-sandbox/src/skills/policy_advisor.md
+++ b/crates/openshell-sandbox/src/skills/policy_advisor.md
@@ -19,7 +19,20 @@ The sandbox-local policy API is reachable at `http://policy.local`:
   carries the timestamp, class, severity, action, host/port, binary, policy
   name, and (for denied events) a short reason. Read the lines directly; you
   do not need to parse them into structured fields.
-- `POST /v1/proposals` — submit a proposal for developer approval.
+- `POST /v1/proposals` — submit a proposal. The 202 response carries
+  `accepted_chunk_ids` (one ID per operation the gateway accepted) and
+  `rejection_reasons` (one entry per operation the gateway refused at
+  submit-time). The two arrays together account for every operation you
+  sent.
+- `GET /v1/proposals/{chunk_id}` — immediate state of one proposal.
+  Returns `status` (`pending` / `approved` / `rejected`),
+  `rejection_reason` (the reviewer's free-form note, only set on reject),
+  and `validation_result` (the gateway prover's verdict on this chunk;
+  may be empty).
+- `GET /v1/proposals/{chunk_id}/wait?timeout=` — block on this
+  proposal until the developer decides or the timeout expires. Default
+  60s, clamped [1, 300]. On timeout you get `status: "pending"` plus
+  `timed_out: true`. Use this instead of polling `/v1/proposals/{chunk_id}`.
 
 The proposal body takes an `intent_summary` and one or more `addRule`
 operations. Each `addRule` carries a complete narrow `NetworkPolicyRule`.
@@ -34,8 +47,27 @@ operations. Each `addRule` carries a complete narrow `NetworkPolicyRule`.
    when the client tunnels opaque traffic that OpenShell cannot inspect.
 5. Draft the narrowest rule: exact host, exact port, exact binary when
    known, exact method, and the smallest safe path.
-6. Submit the proposal, tell the developer what you proposed, and retry the
-   denied action only after approval.
+6. Submit the proposal, save `accepted_chunk_ids` from the response, and
+   tell the developer what you proposed. If the response also carries
+   `rejection_reasons`, the gateway refused those operations at
+   submit-time before any human review; fix them and resubmit before
+   waiting on the rest.
+7. For each accepted chunk_id, call
+   `GET /v1/proposals/{chunk_id}/wait?timeout=300` and act on the result
+   (a minimal loop sketch follows this list):
+   - `status: "approved"` — retry the original denied action. Policy
+     hot-reloads after approval, so the request should succeed. If it
+     still fails with `policy_denied`, re-read the denial — your rule
+     may not match. If it fails for any other reason, surface to the
+     user.
+   - `status: "rejected"` — read `rejection_reason` and
+     `validation_result`. `rejection_reason` is what the reviewer typed;
+     `validation_result` is the prover's verdict, which often explains
+     a reject driven by automated checks. If either has content, address
+     the specific feedback and submit a revised proposal. If both are
+     empty, draft something materially different or ask the user.
+   - `status: "pending"` with `timed_out: true` — call `/wait` again.
+   - Any non-2xx response — surface to the user; do not retry the denied
+     action without approval.
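+
+A minimal version of the wait step in (7), assuming `jq` is available
+alongside `curl` (a sketch of the loop shape, not the only valid form;
+`chunk_id` holds one entry from `accepted_chunk_ids`):
+
+```bash
+# Re-issue /wait until the proposal leaves "pending". Each call blocks
+# server-side, so the loop spends no tokens while the developer decides.
+while :; do
+  resp="$(curl -sS "http://policy.local/v1/proposals/${chunk_id}/wait?timeout=300")"
+  [ "$(printf '%s' "$resp" | jq -r '.timed_out // false')" = "true" ] || break
+done
+printf '%s\n' "$resp" | jq -r '.status, .rejection_reason, .validation_result'
+```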
 
 ## Proposal shape

From 14949e09146c8f6d5788e60285c65e9772cb1040 Mon Sep 17 00:00:00 2001
From: Alexander Watson
Date: Mon, 11 May 2026 11:28:58 -0700
Subject: [PATCH 04/15] test(policy-advisor): add end-to-end smoke for the
 agent feedback loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A focused smoke that exercises the new policy.local /wait endpoint on a
live gateway + sandbox, separate from the existing no-LLM regression
harness (which still drives the OLD retry-with-bash-loop recovery
pattern).

Two flows:

- Flow A — approve-and-retry: agent submits, /wait blocks, host runs
  `openshell rule approve`, /wait returns status=approved. Confirms the
  happy path round-trip latency.

- Flow B — reject-with-guidance: agent submits, /wait blocks, host runs
  `openshell rule reject --reason "..."`, /wait returns status=rejected
  with the exact reviewer text in rejection_reason. Confirms the
  free-form guidance contract round-trips through the agent feedback
  channel.

No GitHub credentials needed — proposals are synthetic and never trigger
outbound traffic. Both flows expect agent_policy_proposals_enabled=true
and a running gateway.

Adds three cases to sandbox-runner.sh: submit-test-proposal (no GH
deps), proposal-status, proposal-wait. The existing put-file and
submit-proposal cases used by test.sh are untouched.

Signed-off-by: Alexander Watson
---
 e2e/policy-advisor/sandbox-runner.sh |  71 +++++++++
 e2e/policy-advisor/wait-smoke.sh     | 208 +++++++++++++++++++++++++++
 2 files changed, 279 insertions(+)
 create mode 100755 e2e/policy-advisor/wait-smoke.sh

diff --git a/e2e/policy-advisor/sandbox-runner.sh b/e2e/policy-advisor/sandbox-runner.sh
index 780e85573..947d2163c 100755
--- a/e2e/policy-advisor/sandbox-runner.sh
+++ b/e2e/policy-advisor/sandbox-runner.sh
@@ -135,6 +135,77 @@
 PY
         json_status_response "$status" "$body"
         ;;
 
+    submit-test-proposal)
+        # Synthetic proposal for the wait-loop smoke. No GitHub creds needed
+        # — we never make outbound calls, the gateway just persists the
+        # chunk and the reviewer decides on it.
+        rule_id="$1"
+        body="$(mktemp)"
+        payload="$(mktemp)"
+
+        python3 - "$rule_id" > "$payload" <<'PY'
+import json
+import sys
+
+rule_id = sys.argv[1]
+payload = {
+    "intent_summary": f"Smoke test proposal {rule_id}",
+    "operations": [{
+        "addRule": {
+            "ruleName": f"smoke_{rule_id}",
+            "rule": {
+                "name": f"smoke_{rule_id}",
+                "endpoints": [{
+                    "host": "example.invalid",
+                    "port": 443,
+                    "protocol": "rest",
+                    "enforcement": "enforce",
+                    "rules": [{
+                        "allow": {"method": "GET", "path": f"/{rule_id}"}
+                    }],
+                }],
+                "binaries": [{"path": "/usr/bin/curl"}],
+            },
+        }
+    }],
+}
+print(json.dumps(payload))
+PY
+
+        status="$(curl -sS \
+            -o "$body" \
+            -w "%{http_code}" \
+            -X POST \
+            -H "Content-Type: application/json" \
+            --data-binary "@${payload}" \
+            http://policy.local/v1/proposals)"
+        json_status_response "$status" "$body"
+        ;;
+
+    proposal-status)
+        chunk_id="$1"
+        body="$(mktemp)"
+        status="$(curl -sS \
+            -o "$body" \
+            -w "%{http_code}" \
+            "http://policy.local/v1/proposals/${chunk_id}")"
+        json_status_response "$status" "$body"
+        ;;
+
+    proposal-wait)
+        chunk_id="$1"
+        timeout="${2:-60}"
+        body="$(mktemp)"
+        # No --max-time on curl: the server bounds the wait at `timeout`,
+        # which is already clamped to [1, 300] by policy.local. Let the
+        # request return naturally.
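+        # -o spools the JSON body to a temp file and -w emits only the HTTP
+        # code; json_status_response folds both into the HTTP_STATUS= line
+        # format the host-side smoke's http_status() helper parses.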
+ status="$(curl -sS \ + -o "$body" \ + -w "%{http_code}" \ + "http://policy.local/v1/proposals/${chunk_id}/wait?timeout=${timeout}")" + json_status_response "$status" "$body" + ;; + *) echo "unknown command: $cmd" >&2 exit 64 diff --git a/e2e/policy-advisor/wait-smoke.sh b/e2e/policy-advisor/wait-smoke.sh new file mode 100755 index 000000000..25ff9946e --- /dev/null +++ b/e2e/policy-advisor/wait-smoke.sh @@ -0,0 +1,208 @@ +#!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# End-to-end smoke for the agent feedback loop (#1092 + #1094 + #1096). +# +# Exercises both halves of the new policy.local /wait endpoint: +# Flow A — approve-and-retry: agent submits, /wait blocks, host approves, +# /wait returns approved. +# Flow B — reject-with-guidance: agent submits, /wait blocks, host rejects +# with --reason, /wait returns rejected + rejection_reason text. +# +# Prereqs: +# - A running gateway and the openshell CLI built from this branch. +# - Global setting agent_policy_proposals_enabled=true. +# +# No GitHub credentials needed — proposals are synthetic and never trigger +# outbound traffic. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" +RUNNER_SOURCE="${SCRIPT_DIR}/sandbox-runner.sh" + +if [[ -z "${OPENSHELL_BIN:-}" ]]; then + if [[ -x "${REPO_ROOT}/target/debug/openshell" ]]; then + OPENSHELL_BIN="${REPO_ROOT}/target/debug/openshell" + else + OPENSHELL_BIN="openshell" + fi +fi + +RUN_ID="${RUN_ID:-$(date +%Y%m%d-%H%M%S)}" +SANDBOX="${SANDBOX:-policy-wait-smoke-${RUN_ID}}" +KEEP_SANDBOX="${KEEP_SANDBOX:-0}" +WAIT_TIMEOUT="${WAIT_TIMEOUT:-30}" + +BOLD='\033[1m' +DIM='\033[2m' +CYAN='\033[36m' +GREEN='\033[32m' +RED='\033[31m' +RESET='\033[0m' + +step() { printf "\n${BOLD}${CYAN}==> %s${RESET}\n\n" "$1"; } +info() { printf " %b\n" "$*"; } +fail() { printf "\n${RED}FAIL:${RESET} %s\n" "$*" >&2; exit 1; } +ok() { printf " ${GREEN}✓${RESET} %s\n" "$*"; } + +TMP_DIR="" +SSH_CONFIG="" +SSH_HOST="" + +cleanup() { + local status=$? + if [[ "$KEEP_SANDBOX" != "1" ]]; then + "$OPENSHELL_BIN" sandbox delete "$SANDBOX" >/dev/null 2>&1 || true + fi + if [[ -n "$TMP_DIR" && $status -eq 0 ]]; then + rm -rf "$TMP_DIR" + fi +} +trap cleanup EXIT + +preflight() { + step "Preflight" + command -v jq >/dev/null || fail "jq is required" + command -v ssh >/dev/null || fail "ssh is required" + [[ -f "$RUNNER_SOURCE" ]] || fail "missing $RUNNER_SOURCE" + + local enabled + enabled="$("$OPENSHELL_BIN" settings get --global 2>/dev/null \ + | jq -r '.settings.agent_policy_proposals_enabled // ""')" + if [[ "$enabled" != "true" ]]; then + fail "agent_policy_proposals_enabled must be true. 
Run: + $OPENSHELL_BIN settings set --global --key agent_policy_proposals_enabled --value true --yes" + fi + ok "agent_policy_proposals_enabled=true" +} + +create_sandbox() { + step "Creating sandbox '${SANDBOX}'" + TMP_DIR="$(mktemp -d)" + SSH_CONFIG="${TMP_DIR}/ssh_config" + + "$OPENSHELL_BIN" sandbox delete "$SANDBOX" >/dev/null 2>&1 || true + "$OPENSHELL_BIN" sandbox create \ + --name "$SANDBOX" \ + --upload "${RUNNER_SOURCE}:/sandbox/runner.sh" \ + --no-git-ignore \ + --keep \ + --no-auto-providers \ + --no-tty \ + -- bash -lc "chmod +x /sandbox/runner.sh && echo sandbox ready" \ + | sed 's/^/ /' + + "$OPENSHELL_BIN" sandbox ssh-config "$SANDBOX" > "$SSH_CONFIG" + SSH_HOST="$(awk '/^Host / { print $2; exit }' "$SSH_CONFIG")" + [[ -n "$SSH_HOST" ]] || fail "could not parse SSH config" + + local _i + for _i in $(seq 1 30); do + if ssh -F "$SSH_CONFIG" "$SSH_HOST" true >/dev/null 2>&1; then + ok "SSH up" + return + fi + sleep 2 + done + fail "SSH connection timed out" +} + +in_sandbox() { ssh -F "$SSH_CONFIG" "$SSH_HOST" "$@"; } +http_status() { awk -F= '/^HTTP_STATUS=/ { print $2; exit }'; } +http_body() { sed '/^HTTP_STATUS=/d'; } + +submit() { + local rule_id="$1" + local out + out="$(in_sandbox /sandbox/runner.sh submit-test-proposal "$rule_id")" + [[ "$(printf '%s\n' "$out" | http_status)" == "202" ]] \ + || { printf '%s\n' "$out" >&2; fail "submit returned non-202"; } + printf '%s\n' "$out" | http_body | jq -r '.accepted_chunk_ids[0]' +} + +run_flow_a_approve() { + step "Flow A — approve-and-retry" + local chunk_id + chunk_id="$(submit "alpha")" + info "submitted, chunk_id=${DIM}${chunk_id}${RESET}" + + # Kick off /wait in background, capture wall-clock time to settle. + local wait_out_file="${TMP_DIR}/wait_a.out" + local started_at finished_at elapsed + started_at="$(date +%s.%N)" + in_sandbox /sandbox/runner.sh proposal-wait "$chunk_id" "$WAIT_TIMEOUT" \ + > "$wait_out_file" & + local wait_pid=$! + + # Brief settle so the in-sandbox wait registers before we approve. + sleep 0.3 + info "approving from host..." + "$OPENSHELL_BIN" rule approve "$SANDBOX" --chunk-id "$chunk_id" \ + | sed 's/^/ /' + + wait "$wait_pid" || fail "in-sandbox /wait exited non-zero" + finished_at="$(date +%s.%N)" + elapsed="$(awk -v s="$started_at" -v f="$finished_at" 'BEGIN { printf "%.2f", f - s }')" + + local body + body="$(http_body < "$wait_out_file")" + [[ "$(printf '%s\n' "$body" | jq -r '.status')" == "approved" ]] \ + || { printf '%s\n' "$body" >&2; fail "Flow A: /wait did not return approved"; } + [[ "$(printf '%s\n' "$body" | jq -r '.rejection_reason')" == "" ]] \ + || fail "Flow A: rejection_reason should be empty on approve" + + ok "/wait returned status=approved in ${elapsed}s" +} + +run_flow_b_reject() { + step "Flow B — reject-with-guidance" + local chunk_id + chunk_id="$(submit "beta")" + info "submitted, chunk_id=${DIM}${chunk_id}${RESET}" + + local guidance="scope to docs/ paths only, not the whole repo" + local wait_out_file="${TMP_DIR}/wait_b.out" + local started_at finished_at elapsed + started_at="$(date +%s.%N)" + in_sandbox /sandbox/runner.sh proposal-wait "$chunk_id" "$WAIT_TIMEOUT" \ + > "$wait_out_file" & + local wait_pid=$! + + sleep 0.3 + info "rejecting from host with --reason..." 
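+    # The reviewer text must round-trip byte-for-byte; the assertion below
+    # compares rejection_reason against $guidance exactly.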
+ "$OPENSHELL_BIN" rule reject "$SANDBOX" \ + --chunk-id "$chunk_id" \ + --reason "$guidance" \ + | sed 's/^/ /' + + wait "$wait_pid" || fail "in-sandbox /wait exited non-zero" + finished_at="$(date +%s.%N)" + elapsed="$(awk -v s="$started_at" -v f="$finished_at" 'BEGIN { printf "%.2f", f - s }')" + + local body received_reason + body="$(http_body < "$wait_out_file")" + [[ "$(printf '%s\n' "$body" | jq -r '.status')" == "rejected" ]] \ + || { printf '%s\n' "$body" >&2; fail "Flow B: /wait did not return rejected"; } + received_reason="$(printf '%s\n' "$body" | jq -r '.rejection_reason')" + [[ "$received_reason" == "$guidance" ]] \ + || fail "Flow B: rejection_reason mismatch. + sent: $guidance + received: $received_reason" + + ok "/wait returned status=rejected in ${elapsed}s" + ok "rejection_reason round-tripped exactly: \"${received_reason}\"" +} + +main() { + preflight + create_sandbox + run_flow_a_approve + run_flow_b_reject + step "Smoke pass" +} + +main "$@" From ad11745ca2f7ef6f11f5fdab59696a1e5a0ed236 Mon Sep 17 00:00:00 2001 From: Alexander Watson Date: Mon, 11 May 2026 15:24:47 -0700 Subject: [PATCH 05/15] fix(policy-advisor): surface real CLI errors from wait-smoke preflight The preflight piped openshell's stderr to /dev/null and relied on jq to default the missing setting key to "", but under `set -euo pipefail` a non-zero exit from openshell makes the whole pipeline fail and the command substitution exits the script silently before the intended fail() message can print. Capture stderr explicitly, check the CLI exit code, and surface the real error plus the expected fix (port-forward + gateway add + select) when the CLI cannot reach the gateway. Signed-off-by: Alexander Watson --- e2e/policy-advisor/wait-smoke.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/e2e/policy-advisor/wait-smoke.sh b/e2e/policy-advisor/wait-smoke.sh index 25ff9946e..7eea7b987 100755 --- a/e2e/policy-advisor/wait-smoke.sh +++ b/e2e/policy-advisor/wait-smoke.sh @@ -70,8 +70,21 @@ preflight() { command -v ssh >/dev/null || fail "ssh is required" [[ -f "$RUNNER_SOURCE" ]] || fail "missing $RUNNER_SOURCE" + # Capture stderr so a CLI failure (gateway unreachable, no current + # gateway configured, etc.) surfaces a real error instead of an empty + # pipeline that `set -euo pipefail` silently exits on. + local raw_settings + if ! raw_settings="$("$OPENSHELL_BIN" settings get --global 2>&1)"; then + fail "openshell could not reach the gateway. CLI output: +${raw_settings} + +If you just deployed via skaffold, you probably still need: + KUBECONFIG=kubeconfig kubectl -n openshell port-forward svc/openshell 8090:8080 & + $OPENSHELL_BIN gateway add local --endpoint http://localhost:8090 --skip-verify + $OPENSHELL_BIN gateway select local" + fi local enabled - enabled="$("$OPENSHELL_BIN" settings get --global 2>/dev/null \ + enabled="$(printf '%s' "$raw_settings" \ | jq -r '.settings.agent_policy_proposals_enabled // ""')" if [[ "$enabled" != "true" ]]; then fail "agent_policy_proposals_enabled must be true. Run: From 62d40bc478a41ade922e8469dfbb8acd53ebf794 Mon Sep 17 00:00:00 2001 From: Alexander Watson Date: Mon, 11 May 2026 15:27:34 -0700 Subject: [PATCH 06/15] fix(policy-advisor): pass --json to settings get in wait-smoke preflight `openshell settings get --global` defaults to a human-readable table; jq cannot parse it and the preflight died with a numeric-literal error. Pass --json so jq gets actual JSON. 
Also touched up the suggested recovery commands in the preflight error to
match the real CLI shape (`gateway add <endpoint> --name <name>` and the
env-var override warning).

Signed-off-by: Alexander Watson
---
 e2e/policy-advisor/wait-smoke.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/e2e/policy-advisor/wait-smoke.sh b/e2e/policy-advisor/wait-smoke.sh
index 7eea7b987..ea3206cb2 100755
--- a/e2e/policy-advisor/wait-smoke.sh
+++ b/e2e/policy-advisor/wait-smoke.sh
@@ -74,14 +74,15 @@ preflight() {
     # gateway configured, etc.) surfaces a real error instead of an empty
     # pipeline that `set -euo pipefail` silently exits on.
     local raw_settings
-    if ! raw_settings="$("$OPENSHELL_BIN" settings get --global 2>&1)"; then
+    if ! raw_settings="$("$OPENSHELL_BIN" settings get --global --json 2>&1)"; then
         fail "openshell could not reach the gateway. CLI output:
 ${raw_settings}
 
 If you just deployed via skaffold, you probably still need:
   KUBECONFIG=kubeconfig kubectl -n openshell port-forward svc/openshell 8090:8080 &
-  $OPENSHELL_BIN gateway add local --endpoint http://localhost:8090 --skip-verify
-  $OPENSHELL_BIN gateway select local"
+  $OPENSHELL_BIN gateway add http://localhost:8090 --name local
+  $OPENSHELL_BIN gateway select local
+  unset OPENSHELL_GATEWAY # if the CLI warns about it overriding"
     fi

From f2dfa8a718af1310fb8403e2d552a53a3b65ae80 Mon Sep 17 00:00:00 2001
From: Alexander Watson
Date: Mon, 11 May 2026 15:53:46 -0700
Subject: [PATCH 07/15] fix(policy): dedup draft chunks only in mechanistic
 mode; return effective id
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The smoke harness for the agent feedback loop caught a real bug in the
gateway: SubmitPolicyAnalysis's response carried a chunk_id that was
never persisted whenever the SQL ON CONFLICT path fired. Two failure
modes, both load-bearing:

- Agent-authored proposals targeting the same host/port/binary (e.g. the
  redraft-after-rejection loop) silently folded into one row and any
  RejectDraftChunk by the new chunk_id failed with "chunk not found."
  Latent since #1151, surfaced by #1094 returning chunk_ids.

- Mechanistic mode had the same class of bug — the dedup fold-in is the
  intended behavior there, but the response still advertised the
  newly-generated UUID instead of the existing row's id. Less visible
  because no current caller reads mechanistic chunk_ids back, but the
  proto contract was violated either way.

Fix in three parts:

- put_draft_chunk now takes Option<&str> dedup_key explicitly and
  returns the effective row id (via RETURNING). None binds NULL to the
  dedup_key column, which bypasses the partial-index ON CONFLICT path
  entirely. Caller-decides semantics replace store-side magic.

- handle_submit_policy_analysis picks dedup_key per chunk using an
  allowlist (only "mechanistic" dedups) and pushes the returned
  effective_id to accepted_chunk_ids. New modes default to no-dedup so a
  misconfigured caller cannot silently lose proposals.

- The two-copy draft_chunk_dedup_key helper consolidated to one
  observation_dedup_key in policy_store.rs with a doc comment.

Tests:

- agent_authored_submits_for_same_endpoint_do_not_dedup pins the
  redraft-loop contract: two intentional submissions with the same
  host/port/binary get distinct chunk_ids, both findable via
  GetDraftPolicy, both rejectable by id.
- mechanistic_submits_for_same_endpoint_dedup_into_one_chunk locks in
  the observation-mode dedup AND asserts both submits return the same
  effective_id — would have caught the deeper bug.

Proto: SubmitPolicyAnalysisRequest.analysis_mode doc updated to describe
the actual semantics (mechanistic dedups, agent_authored and unknown
modes do not).

Signed-off-by: Alexander Watson
---
 crates/openshell-server/src/grpc/policy.rs  | 233 +++++++++++++++++-
 .../src/persistence/postgres.rs             |  22 +-
 .../src/persistence/sqlite.rs               |  23 +-
 crates/openshell-server/src/policy_store.rs |  40 ++-
 proto/openshell.proto                       |   8 +-
 5 files changed, 298 insertions(+), 28 deletions(-)

diff --git a/crates/openshell-server/src/grpc/policy.rs b/crates/openshell-server/src/grpc/policy.rs
index d01cccb82..2930ed975 100644
--- a/crates/openshell-server/src/grpc/policy.rs
+++ b/crates/openshell-server/src/grpc/policy.rs
@@ -1372,7 +1372,6 @@ pub(super) async fn handle_submit_policy_analysis(
             continue;
         }
 
-        let chunk_id = uuid::Uuid::new_v4().to_string();
         let now_ms =
             current_time_ms().map_err(|e| Status::internal(format!("timestamp error: {e}")))?;
         let proposed_rule_bytes = chunk
@@ -1392,7 +1391,10 @@ pub(super) async fn handle_submit_policy_analysis(
             .unwrap_or_default();
 
         let record = DraftChunkRecord {
-            id: chunk_id.clone(),
+            // The handler proposes an id; the store may swap it for an
+            // existing row's id on dedup. Always trust `effective_id` for
+            // anything user-facing.
+            id: uuid::Uuid::new_v4().to_string(),
             sandbox_id: sandbox_id.clone(),
             draft_version,
             status: "pending".to_string(),
@@ -1423,13 +1425,20 @@ pub(super) async fn handle_submit_policy_analysis(
             validation_result: String::new(),
             rejection_reason: String::new(),
         };
-        state
+        // Mechanistic mode dedups N denials targeting the same endpoint
+        // into one chunk. All other modes (agent-authored proposals, future
+        // modes) submit each chunk as a distinct row — the redraft loop
+        // relies on it, and the conservative default for an unknown mode
+        // is to keep the proposal rather than silently fold it away.
+        let dedup_key = matches!(req.analysis_mode.as_str(), "mechanistic")
+            .then(|| crate::policy_store::observation_dedup_key(&record));
+        let effective_id = state
             .store
-            .put_draft_chunk(&record)
+            .put_draft_chunk(&record, dedup_key.as_deref())
            .await
             .map_err(|e| Status::internal(format!("persist draft chunk failed: {e}")))?;
         accepted += 1;
-        accepted_chunk_ids.push(chunk_id);
+        accepted_chunk_ids.push(effective_id);
     }
 
     state.sandbox_watch_bus.notify(&sandbox_id);
@@ -4229,6 +4238,220 @@ mod tests {
         assert!(rejected.validation_result.is_empty());
     }
 
+    /// Two agent-authored proposals targeting the same host/port/binary must
+    /// each persist as a distinct chunk. The mechanistic-mode dedup
+    /// (`host|port|binary`) is wrong for agent intent: the redraft loop
+    /// relies on the second submission landing as its own chunk so the
+    /// reviewer can decide on it independently. Regression test for the bug
+    /// where Flow B of `e2e/policy-advisor/wait-smoke.sh` saw a fresh
+    /// `chunk_id` returned from submit but `RejectDraftChunk` could not
+    /// find it because the SQL ON CONFLICT had silently kept the prior row.
+    #[tokio::test]
+    async fn agent_authored_submits_for_same_endpoint_do_not_dedup() {
+        use openshell_core::proto::{NetworkBinary, NetworkEndpoint, SandboxPhase, SandboxSpec};
+
+        let state = test_server_state().await;
+        let sandbox_name = "redraft-loop".to_string();
+        let sandbox = Sandbox {
+            metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta {
+                id: "sb-redraft".to_string(),
+                name: sandbox_name.clone(),
+                created_at_ms: 1_000_000,
+                labels: std::collections::HashMap::new(),
+            }),
+            spec: Some(SandboxSpec {
+                policy: None,
+                ..Default::default()
+            }),
+            phase: SandboxPhase::Ready as i32,
+            ..Default::default()
+        };
+        state.store.put_message(&sandbox).await.unwrap();
+
+        // Two proposals with the same host|port|binary (so the mechanistic
+        // dedup_key would collide) but distinct rule names, so a failure
+        // here points squarely at the endpoint-keyed dedup and not at any
+        // payload-level equality check.
+        let make_rule = |rule_name: &str| NetworkPolicyRule {
+            name: rule_name.to_string(),
+            endpoints: vec![NetworkEndpoint {
+                host: "api.example.com".to_string(),
+                port: 443,
+                ..Default::default()
+            }],
+            binaries: vec![NetworkBinary {
+                path: "/usr/bin/curl".to_string(),
+                ..Default::default()
+            }],
+        };
+
+        let submit_one = |rule_name: &str, rule: NetworkPolicyRule| {
+            let state = state.clone();
+            let sandbox_name = sandbox_name.clone();
+            let rule_name = rule_name.to_string();
+            async move {
+                handle_submit_policy_analysis(
+                    &state,
+                    Request::new(SubmitPolicyAnalysisRequest {
+                        name: sandbox_name,
+                        analysis_mode: "agent_authored".to_string(),
+                        proposed_chunks: vec![PolicyChunk {
+                            rule_name,
+                            proposed_rule: Some(rule),
+                            ..Default::default()
+                        }],
+                        ..Default::default()
+                    }),
+                )
+                .await
+                .unwrap()
+                .into_inner()
+            }
+        };
+
+        let first = submit_one("allow_first", make_rule("allow_first")).await;
+        let second = submit_one("allow_second", make_rule("allow_second")).await;
+
+        assert_eq!(first.accepted_chunk_ids.len(), 1);
+        assert_eq!(second.accepted_chunk_ids.len(), 1);
+        assert_ne!(
+            first.accepted_chunk_ids[0], second.accepted_chunk_ids[0],
+            "second agent-authored proposal for the same endpoint must get its own chunk_id, not dedup"
+        );
+
+        let draft = handle_get_draft_policy(
+            &state,
+            Request::new(GetDraftPolicyRequest {
+                name: sandbox_name.clone(),
+                status_filter: String::new(),
+            }),
+        )
+        .await
+        .unwrap()
+        .into_inner();
+        let ids: Vec<_> = draft.chunks.iter().map(|c| c.id.as_str()).collect();
+        assert!(
+            ids.contains(&first.accepted_chunk_ids[0].as_str())
+                && ids.contains(&second.accepted_chunk_ids[0].as_str()),
+            "both reported chunk_ids must be persisted; got: {ids:?}"
+        );
+
+        // Reject the second by id to prove the gateway can actually find
+        // what the submit response claimed to have created — this is the
+        // exact path the smoke test exercises end-to-end.
+        handle_reject_draft_chunk(
+            &state,
+            Request::new(RejectDraftChunkRequest {
+                name: sandbox_name,
+                chunk_id: second.accepted_chunk_ids[0].clone(),
+                reason: "redraft test".to_string(),
+            }),
+        )
+        .await
+        .expect("reject must find the chunk_id the submit response just promised");
+    }
+
+    /// Complement to the agent-authored test above: mechanistic-mode
+    /// submissions for the same endpoint must STILL dedup. The
+    /// observation-driven path relies on N denials folding into one chunk
+    /// instead of N near-identical chunks. Lock the behavior in so a future
+    /// change to the dedup branch doesn't accidentally also turn off
+    /// mechanistic dedup.
+    #[tokio::test]
+    async fn mechanistic_submits_for_same_endpoint_dedup_into_one_chunk() {
+        use openshell_core::proto::{NetworkBinary, NetworkEndpoint, SandboxPhase, SandboxSpec};
+
+        let state = test_server_state().await;
+        let sandbox_name = "mechanistic-dedup".to_string();
+        let sandbox = Sandbox {
+            metadata: Some(openshell_core::proto::datamodel::v1::ObjectMeta {
+                id: "sb-mech-dedup".to_string(),
+                name: sandbox_name.clone(),
+                created_at_ms: 1_000_000,
+                labels: std::collections::HashMap::new(),
+            }),
+            spec: Some(SandboxSpec {
+                policy: None,
+                ..Default::default()
+            }),
+            phase: SandboxPhase::Ready as i32,
+            ..Default::default()
+        };
+        state.store.put_message(&sandbox).await.unwrap();
+
+        let proposed_rule = NetworkPolicyRule {
+            name: "allow_example".to_string(),
+            endpoints: vec![NetworkEndpoint {
+                host: "api.example.com".to_string(),
+                port: 443,
+                ..Default::default()
+            }],
+            binaries: vec![NetworkBinary {
+                path: "/usr/bin/curl".to_string(),
+                ..Default::default()
+            }],
+        };
+        let submit_one = || {
+            let state = state.clone();
+            let sandbox_name = sandbox_name.clone();
+            let rule = proposed_rule.clone();
+            async move {
+                handle_submit_policy_analysis(
+                    &state,
+                    Request::new(SubmitPolicyAnalysisRequest {
+                        name: sandbox_name,
+                        analysis_mode: "mechanistic".to_string(),
+                        proposed_chunks: vec![PolicyChunk {
+                            rule_name: "allow_example".to_string(),
+                            proposed_rule: Some(rule),
+                            ..Default::default()
+                        }],
+                        ..Default::default()
+                    }),
+                )
+                .await
+                .unwrap()
+                .into_inner()
+            }
+        };
+        let first = submit_one().await;
+        let second = submit_one().await;
+        assert_eq!(first.accepted_chunk_ids.len(), 1);
+        assert_eq!(second.accepted_chunk_ids.len(), 1);
+
+        let draft = handle_get_draft_policy(
+            &state,
+            Request::new(GetDraftPolicyRequest {
+                name: sandbox_name,
+                status_filter: String::new(),
+            }),
+        )
+        .await
+        .unwrap()
+        .into_inner();
+        assert_eq!(
+            draft.chunks.len(),
+            1,
+            "two mechanistic submits for the same host|port|binary must dedup; got {} chunks",
+            draft.chunks.len()
+        );
+        // Both submits must report the same effective id — the id of the
+        // one row that actually exists in the DB. Before the dedup fix the
+        // second submit would return a freshly-generated UUID that was
+        // never persisted; this assertion locks the contract down.
+        let stored_id = &draft.chunks[0].id;
+        assert_eq!(
+            &first.accepted_chunk_ids[0], stored_id,
+            "first submit's reported id must match the stored chunk"
+        );
+        assert_eq!(
+            &second.accepted_chunk_ids[0], stored_id,
+            "second submit must report the same id as the first (dedup fold-in), not a fresh UUID"
+        );
+    }
+
     /// Undo of an approve must clear any `rejection_reason` left over from a
Without this, the in-sandbox agent reading chunks via
    /// `policy.local` cannot tell "pending and never rejected" from "pending
diff --git a/crates/openshell-server/src/persistence/postgres.rs b/crates/openshell-server/src/persistence/postgres.rs
index e13a02404..d9167b63b 100644
--- a/crates/openshell-server/src/persistence/postgres.rs
+++ b/crates/openshell-server/src/persistence/postgres.rs
@@ -395,9 +395,16 @@ WHERE object_type = $1
         Ok(result.rows_affected())
     }
 
-    pub async fn put_draft_chunk(&self, chunk: &DraftChunkRecord) -> PersistenceResult<()> {
+    pub async fn put_draft_chunk(
+        &self,
+        chunk: &DraftChunkRecord,
+        dedup_key: Option<&str>,
+    ) -> PersistenceResult<String> {
         let payload = draft_chunk_payload_from_record(chunk)?;
-        sqlx::query(
+        // RETURNING id gives the row's effective id whether INSERT inserted
+        // a fresh row or ON CONFLICT updated an existing one. See the
+        // matching sqlite path for the rationale.
+        let row = sqlx::query(
             r"
INSERT INTO objects (
    object_type, id, scope, status, dedup_key, hit_count, payload, created_at_ms, updated_at_ms
)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9)
ON CONFLICT (object_type, scope, dedup_key) WHERE dedup_key IS NOT NULL
DO UPDATE SET
    hit_count = objects.hit_count + EXCLUDED.hit_count,
    updated_at_ms = EXCLUDED.updated_at_ms
+RETURNING id
            ",
        )
        .bind(DRAFT_CHUNK_OBJECT_TYPE)
        .bind(&chunk.id)
        .bind(&chunk.sandbox_id)
        .bind(&chunk.status)
-        .bind(draft_chunk_dedup_key(chunk))
+        .bind(dedup_key)
        .bind(i64::from(chunk.hit_count))
        .bind(payload)
        .bind(chunk.first_seen_ms)
        .bind(chunk.last_seen_ms)
-        .execute(&self.pool)
+        .fetch_one(&self.pool)
        .await
        .map_err(|e| map_db_error(&e))?;
-        Ok(())
+        Ok(row.get::<String, _>("id"))
    }
 
    pub async fn get_draft_chunk(&self, id: &str) -> PersistenceResult<Option<DraftChunkRecord>> {
@@ -601,10 +609,6 @@ WHERE object_type = $1 AND scope = $2
    }
}
 
-fn draft_chunk_dedup_key(chunk: &DraftChunkRecord) -> String {
-    format!("{}|{}|{}", chunk.host, chunk.port, chunk.binary)
-}
-
fn row_to_object_record(row: sqlx::postgres::PgRow) -> ObjectRecord {
    let labels_jsonb: Option<serde_json::Value> = row.get("labels");
    ObjectRecord {
diff --git a/crates/openshell-server/src/persistence/sqlite.rs b/crates/openshell-server/src/persistence/sqlite.rs
index a1e1503e4..f4a886a12 100644
--- a/crates/openshell-server/src/persistence/sqlite.rs
+++ b/crates/openshell-server/src/persistence/sqlite.rs
@@ -410,9 +410,17 @@ WHERE "object_type" = ?1
         Ok(result.rows_affected())
     }
 
-    pub async fn put_draft_chunk(&self, chunk: &DraftChunkRecord) -> PersistenceResult<()> {
+    pub async fn put_draft_chunk(
+        &self,
+        chunk: &DraftChunkRecord,
+        dedup_key: Option<&str>,
+    ) -> PersistenceResult<String> {
         let payload = draft_chunk_payload_from_record(chunk)?;
-        sqlx::query(
+        // RETURNING "id" gives us the row's effective id regardless of
+        // whether INSERT inserted a fresh row or ON CONFLICT updated an
+        // existing one. Callers report this id to clients so the response
+        // can never advertise a chunk_id that isn't actually persisted.
+        let row = sqlx::query(
            r#"
INSERT INTO "objects" (
    "object_type", "id", "scope", "status", "dedup_key", "hit_count", "payload", "created_at_ms", "updated_at_ms"
)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
ON CONFLICT ("object_type", "scope", "dedup_key") WHERE "dedup_key" IS NOT NULL
DO UPDATE SET
    "hit_count" = "objects"."hit_count" + excluded."hit_count",
    "updated_at_ms" = excluded."updated_at_ms"
+RETURNING "id"
            "#,
        )
        .bind(DRAFT_CHUNK_OBJECT_TYPE)
        .bind(&chunk.id)
        .bind(&chunk.sandbox_id)
        .bind(&chunk.status)
-        .bind(draft_chunk_dedup_key(chunk))
+        .bind(dedup_key)
        .bind(i64::from(chunk.hit_count))
        .bind(payload)
        .bind(chunk.first_seen_ms)
        .bind(chunk.last_seen_ms)
-        .execute(&self.pool)
+        .fetch_one(&self.pool)
        .await
        .map_err(|e| map_db_error(&e))?;
-        Ok(())
+        Ok(row.get::<String, _>("id"))
    }
 
    pub async fn get_draft_chunk(&self, id: &str) -> PersistenceResult<Option<DraftChunkRecord>> {
@@ -616,10 +625,6 @@ WHERE "object_type" = ?1 AND "scope" = ?2
    }
}
 
-fn draft_chunk_dedup_key(chunk: &DraftChunkRecord) -> String {
-    format!("{}|{}|{}", chunk.host, chunk.port, chunk.binary)
-}
-
fn row_to_object_record(row: sqlx::sqlite::SqliteRow) -> ObjectRecord {
    ObjectRecord {
        object_type: row.get("object_type"),
diff --git a/crates/openshell-server/src/policy_store.rs b/crates/openshell-server/src/policy_store.rs
index 2895250eb..9a6333543 100644
--- a/crates/openshell-server/src/policy_store.rs
+++ b/crates/openshell-server/src/policy_store.rs
@@ -54,7 +54,25 @@ pub trait PolicyStoreExt {
         before_version: i64,
     ) -> PersistenceResult;
 
-    async fn put_draft_chunk(&self, chunk: &DraftChunkRecord) -> PersistenceResult<()>;
+    /// Persist a draft chunk. When `dedup_key` is `Some`, duplicate inserts
+    /// for the same `(sandbox, dedup_key)` fold into the existing row's
+    /// `hit_count` instead of creating a second chunk — appropriate for
+    /// observation-driven proposals from the mechanistic mapper. When
+    /// `None`, the chunk is inserted unconditionally — appropriate for
+    /// agent-authored proposals where each submission is an intentional
+    /// act and the redraft loop relies on every proposal getting its own
+    /// `chunk_id` even when the target endpoint is unchanged.
+    ///
+    /// Returns the **effective** row id. On a fresh insert that equals
+    /// `chunk.id`; on dedup fold-in it is the existing row's id. Callers
+    /// must use the returned id (not `chunk.id`) when reporting the chunk
+    /// to clients — otherwise the response advertises an id that was never
+    /// persisted.
+    async fn put_draft_chunk(
+        &self,
+        chunk: &DraftChunkRecord,
+        dedup_key: Option<&str>,
+    ) -> PersistenceResult<String>;
 
     async fn get_draft_chunk(&self, id: &str) -> PersistenceResult<Option<DraftChunkRecord>>;
 
@@ -191,10 +209,14 @@ impl PolicyStoreExt for Store {
         }
     }
 
-    async fn put_draft_chunk(&self, chunk: &DraftChunkRecord) -> PersistenceResult<()> {
+    async fn put_draft_chunk(
+        &self,
+        chunk: &DraftChunkRecord,
+        dedup_key: Option<&str>,
+    ) -> PersistenceResult<String> {
         match self {
-            Self::Postgres(store) => store.put_draft_chunk(chunk).await,
-            Self::Sqlite(store) => store.put_draft_chunk(chunk).await,
+            Self::Postgres(store) => store.put_draft_chunk(chunk, dedup_key).await,
+            Self::Sqlite(store) => store.put_draft_chunk(chunk, dedup_key).await,
         }
     }
 
@@ -307,6 +329,16 @@ pub fn policy_record_from_parts(
    })
}
 
+/// Observation-mode dedup key: `host|port|binary`. Used by the mechanistic
+/// mapper path where N denials targeting the same endpoint should fold into
+/// one chunk instead of N near-identical chunks.
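+/// Call shape on the mechanistic path (a sketch of the handler's usage
+/// above, not additional API):
+///
+/// ```ignore
+/// let key = observation_dedup_key(&record);
+/// let effective_id = store.put_draft_chunk(&record, Some(&key)).await?;
+/// ```
+///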
Agent-authored proposals
+/// pass `None` for the `dedup_key` argument to `put_draft_chunk` so each
+/// proposal lands as its own chunk regardless of target — the redraft loop
+/// depends on this.
+pub fn observation_dedup_key(chunk: &DraftChunkRecord) -> String {
+    format!("{}|{}|{}", chunk.host, chunk.port, chunk.binary)
+}
+
 pub fn draft_chunk_payload_from_record(chunk: &DraftChunkRecord) -> PersistenceResult<Vec<u8>> {
     let proposed_rule = if chunk.proposed_rule.is_empty() {
         None
diff --git a/proto/openshell.proto b/proto/openshell.proto
index 6aac4a245..1977ef7cc 100644
--- a/proto/openshell.proto
+++ b/proto/openshell.proto
@@ -1242,7 +1242,13 @@ message SubmitPolicyAnalysisRequest {
   repeated DenialSummary summaries = 1;
   // Proposed policy chunks (validated by sandbox OPA engine).
   repeated PolicyChunk proposed_chunks = 2;
-  // Analysis mode: "mechanistic" or "llm".
+  // Analysis mode. `mechanistic` is the observation-driven path from the
+  // denial aggregator — chunks targeting the same host|port|binary fold
+  // into one row with hit_count incremented. `agent_authored` is an
+  // intentional proposal from an in-sandbox agent — each submission lands
+  // as its own chunk so the redraft-after-rejection loop has a stable id
+  // to watch. Other values are treated as agent-style (no dedup) so a new
+  // mode does not silently collapse proposals.
   string analysis_mode = 3;
   // Sandbox name.
   string name = 4;

From 6e135a0a6ac9e53c9a7de2fbd0992e69c8d8d070 Mon Sep 17 00:00:00 2001
From: Alexander Watson
Date: Mon, 11 May 2026 16:41:07 -0700
Subject: [PATCH 08/15] docs(examples): retarget policy-management demo at the
 /wait endpoint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The narrated demo (examples/agent-driven-policy-management) has been the
public face of this feature since #1151. Its agent prompt told Codex to
retry the original PUT every few seconds for up to 120 seconds — a
polling workaround for the missing /wait endpoint that this branch
shipped. Update the demo to exercise /wait so the canonical reading of
the feature reflects the actual UX win.

- agent-task.md: step 4 is now "call /wait, branch on status" with the
  three outcomes spelled out (approved → retry once; rejected → read
  rejection_reason and revise or stop; pending+timed_out → re-issue
  /wait once, do NOT busy-loop or shorten the timeout). Also makes
  explicit that the demo submits one rule per proposal so
  accepted_chunk_ids[0] is the safe single id to wait on.

- demo.sh: header docstring rewritten as a six-step loop that mirrors
  the README. narrate_sandbox_workflow drops its parallel numbering and
  uses bullets (the runtime narration is the agent's sub-actions, not a
  separate decomposition of the loop). Approve step header and success
  message now reference /wait waking the agent, not "policy hot-reload
  retry."

- README.md: top-of-file flow expanded from 5 to 6 steps to include the
  /wait call and chunk_id capture; "Going further" section now describes
  both regression scripts and the boundary between them (real-GitHub
  retry vs. synthetic /wait wire test). Slow-path qualifier corrected
  from "image pull on first run" to "sandbox cold-start (SSH bring-up
  plus Codex install)".

- wait-smoke.sh header rewritten to make it unambiguous this is a
  regression, NOT a tutorial, with explicit prereq commands instead of
  prereq descriptions, and a pointer at demo.sh for the narrated story.

No code paths change; this is the readability pass.
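For reference, the wait-and-branch loop the updated agent-task.md
prescribes, as a minimal sketch. This is a Rust stand-in for what Codex
actually does with curl: it assumes reqwest (with the `blocking` and
`json` features) and serde_json as dependencies, the proposal POST body
is elided, and only response fields introduced in this series are used
(accepted_chunk_ids, status, rejection_reason, timed_out):

    use std::time::Duration;

    fn wait_on_decision() -> Result<(), Box<dyn std::error::Error>> {
        // Client timeout must outlast the server's 300s hold.
        let http = reqwest::blocking::Client::builder()
            .timeout(Duration::from_secs(320))
            .build()?;

        // Step 3: one rule per proposal, so [0] is the only id.
        let resp: serde_json::Value = http
            .post("http://policy.local/v1/proposals")
            .json(&serde_json::json!({ /* narrowest rule, per the skill doc */ }))
            .send()?
            .json()?;
        let chunk_id = resp["accepted_chunk_ids"][0]
            .as_str()
            .ok_or("submit returned no chunk id")?
            .to_owned();

        // Step 4: each /wait is one long-lived call; re-issue on timeout.
        loop {
            let state: serde_json::Value = http
                .get(format!("http://policy.local/v1/proposals/{chunk_id}/wait"))
                .query(&[("timeout", "300")])
                .send()?
                .json()?;
            match state["status"].as_str() {
                Some("approved") => return Ok(()), // retry the PUT once
                Some("rejected") => {
                    // A real agent would revise per this text and resubmit.
                    let why = state["rejection_reason"].as_str().unwrap_or("");
                    return Err(format!("rejected: {why}").into());
                }
                _ if state["timed_out"].as_bool() == Some(true) => continue,
                other => return Err(format!("unexpected status: {other:?}").into()),
            }
        }
    }

Re-issuing on timed_out (instead of shrinking the timeout) keeps the
supervisor holding one socket per decision rather than absorbing a poll
storm.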
Signed-off-by: Alexander Watson
---
 e2e/policy-advisor/wait-smoke.sh                 | 39 +++++++++++++----
 .../agent-driven-policy-management/README.md     | 34 ++++++++++++---
 .../agent-task.md                                | 28 +++++++++---
 .../agent-driven-policy-management/demo.sh       | 43 +++++++++++++------
 4 files changed, 109 insertions(+), 35 deletions(-)

diff --git a/e2e/policy-advisor/wait-smoke.sh b/e2e/policy-advisor/wait-smoke.sh
index ea3206cb2..3bd0143ed 100755
--- a/e2e/policy-advisor/wait-smoke.sh
+++ b/e2e/policy-advisor/wait-smoke.sh
@@ -3,20 +3,41 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-# End-to-end smoke for the agent feedback loop (#1092 + #1094 + #1096).
 #
-# Exercises both halves of the new policy.local /wait endpoint:
-#   Flow A — approve-and-retry: agent submits, /wait blocks, host approves,
-#            /wait returns approved.
-#   Flow B — reject-with-guidance: agent submits, /wait blocks, host rejects
-#            with --reason, /wait returns rejected + rejection_reason text.
+# Wire-contract regression for policy.local's /wait endpoint.
+#
+# This is NOT a tutorial — read examples/agent-driven-policy-management/demo.sh
+# (and its README) for the narrated end-to-end story with a real LLM agent.
+# This script is the cheap, deterministic, no-LLM check that the underlying
+# contract still holds; it exercises the gateway+supervisor integration path
+# that the unit tests can't reach, so run it alongside them.
+#
+# What it pins down:
+#   - Flow A — submit → /wait blocks → host approves → /wait returns
+#              status=approved within seconds. Proves the wait-and-wake
+#              path of the agent feedback loop.
+#   - Flow B — submit → /wait blocks → host rejects with --reason "..."
+#              → /wait returns status=rejected AND the reviewer's exact
+#              free-form text comes back in rejection_reason. Proves the
+#              revise-and-resubmit path's wire contract.
+#
+# What it deliberately doesn't do:
+#   - No LLM (Codex) — proposals are synthetic JSON crafted by curl. The
+#     real agent's prompt-following is the demo's concern, not this one.
+#   - No outbound traffic — host is `example.invalid`. We never make a real
+#     GitHub or HTTP call. Only policy.local and the gateway gRPC.
+#   - No prover badge / TUI assertions (out of scope for this regression;
+#     see the prover-validation and TUI-inbox work).
 #
 # Prereqs:
 #   - A running gateway and the openshell CLI built from this branch.
+#     The simplest local path is `mise run helm:skaffold:run`, then
+#     `kubectl -n openshell port-forward svc/openshell 8090:8080 &` and
+#     `cargo build -p openshell-cli`.
-#   - Global setting agent_policy_proposals_enabled=true.
+#   - The agent-proposals feature flag enabled. Run once:
+#       openshell settings set --global \
+#         --key agent_policy_proposals_enabled --value true --yes
 #
-# No GitHub credentials needed — proposals are synthetic and never trigger
-# outbound traffic.
+# Runs in ~10s on a warm cluster (most of which is sandbox SSH bring-up).
 
 set -euo pipefail
 
diff --git a/examples/agent-driven-policy-management/README.md b/examples/agent-driven-policy-management/README.md
index 7ff9a7780..c84687e60 100644
--- a/examples/agent-driven-policy-management/README.md
+++ b/examples/agent-driven-policy-management/README.md
@@ -11,10 +11,19 @@ Run the full agent-driven policy loop end-to-end:
    the initial policy only allows read-only access to `api.github.com`.
 3.
The agent reads `/etc/openshell/skills/policy_advisor.md`, drafts the
    narrowest rule needed, and submits it to `http://policy.local/v1/proposals`.
-4. You approve the proposal from the host with one keystroke.
-5. The sandbox hot-reloads the merged policy and the agent's retry succeeds.
-
-The whole loop usually finishes in under two minutes.
+   It saves the returned `chunk_id`.
+4. The agent calls `GET /v1/proposals/{chunk_id}/wait?timeout=300` — a single
+   HTTP request that the supervisor holds open until the developer decides.
+   This is the load-bearing UX point: the agent burns zero LLM tokens while
+   it waits; it's literally sleeping on a socket.
+5. You approve the proposal from the host with one keystroke.
+6. The agent's `/wait` returns within ~1 second of the approval. The sandbox
+   has hot-reloaded the merged policy; the agent retries the original PUT
+   once and exits.
+
+The whole loop usually finishes in under two minutes; most of that time is
+sandbox cold-start (SSH bring-up + Codex install inside the sandbox), not
+the policy round-trip itself.
 
 ## Prerequisites
 
@@ -82,6 +91,17 @@ approve based on the structured rule, not the agent's rationale.**
 
 ## Going further
 
-`e2e/policy-advisor/test.sh` runs the same loop deterministically without an
-LLM (curl + the `policy.local` API directly). Use it to validate the proxy and
-proposal pipeline when iterating on the sandbox or gateway code.
+Two LLM-less regression scripts cover adjacent slices of the same surface
+when you're iterating on the sandbox or gateway code:
+
+- `e2e/policy-advisor/test.sh` — drives the original deny-observe-approve
+  loop end-to-end against a real GitHub repo, using `curl` from inside the
+  sandbox in a retry loop until policy hot-reloads. Exercises the L7 proxy
+  enforcement, the proposal-submit path, and the merged-policy reload.
+- `e2e/policy-advisor/wait-smoke.sh` — pure wire-contract regression for the
+  `GET /v1/proposals/{id}/wait` endpoint shipped here. No LLM, no GitHub, no
+  real network traffic; just submits a synthetic proposal, blocks on
+  `/wait`, and asserts the developer's approve or `reject --reason` text
+  round-trips back into the response body. Faster (~10s), and the right
+  script to extend when changing `policy.local` or the gateway draft-chunk
+  persistence.
diff --git a/examples/agent-driven-policy-management/agent-task.md b/examples/agent-driven-policy-management/agent-task.md
index 9c7588181..e83a653bd 100644
--- a/examples/agent-driven-policy-management/agent-task.md
+++ b/examples/agent-driven-policy-management/agent-task.md
@@ -36,15 +36,29 @@ markdown file to GitHub via the GitHub Contents API.
 3. Read `/etc/openshell/skills/policy_advisor.md` and follow it. Submit the
    narrowest possible proposal to `http://policy.local/v1/proposals` — exact
    host, exact port, exact method, exact path, binary `/usr/bin/curl`. Do not
-   include query strings. Do not propose wildcard hosts.
+   include query strings. Do not propose wildcard hosts. The 202 response
+   carries `accepted_chunk_ids`; this demo submits one rule per proposal, so
+   the list always has exactly one element. Save `accepted_chunk_ids[0]`;
+   you need it for step 4.
 
-4. After submitting, retry the PUT every few seconds for up to 120 seconds.
-   The developer is approving from outside the sandbox; once approved, the
-   sandbox hot-reloads policy and the same PUT will succeed.
+4. Block on the developer's decision by calling
+   `GET http://policy.local/v1/proposals/{chunk_id}/wait?timeout=300`.
This is + a single HTTP request that the supervisor holds open until the developer + approves or rejects; do not run a polling loop yourself. -5. Stop as soon as the PUT returns HTTP 200 or 201. Print a short summary - showing whether it succeeded, plus `content.path` and `content.html_url` - from the GitHub response. Do not print the full response body. + - `status: "approved"` — retry the PUT once. Policy has hot-reloaded. + - `status: "rejected"` — read `rejection_reason`. If it has text, address + the specific feedback and submit a revised proposal (back to step 3); + otherwise stop and tell the developer you can't proceed. + - `status: "pending"` with `timed_out: true` — the supervisor returned + without a decision after the full timeout window elapsed. Immediately + re-issue the same `/wait` request once. Each `/wait` is one long-lived + HTTP call; do not sleep, do not loop with a short timeout, do not + decrease `timeout=300`. + +5. On a successful PUT (HTTP 200 or 201), print a short summary showing + `content.path` and `content.html_url` from the GitHub response. Do not + print the full response body. If anything is unclear, prefer making a narrower proposal and asking for approval again over widening the rule. diff --git a/examples/agent-driven-policy-management/demo.sh b/examples/agent-driven-policy-management/demo.sh index 6c3c60cfa..c575f6073 100755 --- a/examples/agent-driven-policy-management/demo.sh +++ b/examples/agent-driven-policy-management/demo.sh @@ -5,9 +5,25 @@ # Agent-driven policy management demo. # -# Runs the full loop: a Codex agent inside a sandbox hits an OpenShell policy -# block, reads the policy advisor skill, drafts a narrow rule via policy.local, -# the developer approves from the host, and the agent retries successfully. +# Runs the full loop end-to-end: +# +# 1. A Codex agent inside an OpenShell sandbox attempts a PUT that the L7 +# proxy denies with a structured policy_denied 403. +# 2. The agent reads /etc/openshell/skills/policy_advisor.md. +# 3. The agent submits a narrow proposal (exact host, port, method, path) +# to policy.local and saves the returned chunk_id. +# 4. The agent blocks on `GET /v1/proposals/{chunk_id}/wait` — one HTTP +# call that sleeps on a socket. THE AGENT BURNS ZERO LLM TOKENS WHILE +# IT WAITS; this is the load-bearing UX win over polling. +# 5. The developer (this script, simulating the host side) sees the pending +# proposal in `openshell rule get` and approves it. +# 6. The agent's /wait returns approved within ~1 second of the approval, +# retries the original PUT once against the hot-reloaded policy, and +# exits. +# +# The whole loop is feature-flagged behind agent_policy_proposals_enabled and +# requires no GitHub credentials beyond the repo write token already used by +# the existing demo flow. 
set -euo pipefail @@ -315,9 +331,9 @@ summarize_pending() { narrate_sandbox_workflow() { info "Inside the sandbox right now:" info "" - info " ${BOLD}[1]${RESET} agent: ${DIM}curl -X PUT https://api.github.com/repos/${DEMO_GITHUB_OWNER}/${DEMO_GITHUB_REPO}/contents/...${RESET}" - info " ${BOLD}[2]${RESET} L7 proxy denies the write and returns a structured 403 the" - info " agent can parse and act on:" + info " • agent: ${DIM}curl -X PUT https://api.github.com/repos/${DEMO_GITHUB_OWNER}/${DEMO_GITHUB_REPO}/contents/...${RESET}" + info " • L7 proxy denies the write and returns a structured 403 the" + info " agent can parse and act on:" cat < Date: Mon, 11 May 2026 16:43:31 -0700 Subject: [PATCH 09/15] fix(examples): pass --yes on demo.sh's global setting writes Global setting updates require explicit confirmation in non-interactive mode; demo.sh's enable_agent_proposals and the cleanup restore path were missing --yes and hard-failed the preflight. Pre-existing issue that surfaced now that more of the demo runs through this path. No other behavior change. Signed-off-by: Alexander Watson --- examples/agent-driven-policy-management/demo.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/agent-driven-policy-management/demo.sh b/examples/agent-driven-policy-management/demo.sh index c575f6073..96da6d8d9 100755 --- a/examples/agent-driven-policy-management/demo.sh +++ b/examples/agent-driven-policy-management/demo.sh @@ -117,11 +117,11 @@ cleanup() { # before this run. if [[ -n "${PRIOR_PROPOSALS_FLAG:-}" ]]; then if [[ "$PRIOR_PROPOSALS_FLAG" == "(unset)" ]]; then - "$OPENSHELL_BIN" settings delete --global --key agent_policy_proposals_enabled \ + "$OPENSHELL_BIN" settings delete --global --key agent_policy_proposals_enabled --yes \ >/dev/null 2>&1 || true else "$OPENSHELL_BIN" settings set --global --key agent_policy_proposals_enabled \ - --value "$PRIOR_PROPOSALS_FLAG" >/dev/null 2>&1 || true + --value "$PRIOR_PROPOSALS_FLAG" --yes >/dev/null 2>&1 || true fi fi @@ -433,7 +433,7 @@ enable_agent_proposals() { | grep -o 'true\|false' | head -1)" PRIOR_PROPOSALS_FLAG="${prior:-(unset)}" "$OPENSHELL_BIN" settings set --global \ - --key agent_policy_proposals_enabled --value true >/dev/null \ + --key agent_policy_proposals_enabled --value true --yes >/dev/null \ || fail "could not enable agent_policy_proposals_enabled globally" } From 0ed74b003d421d910d56eb4bdcd074276237a693 Mon Sep 17 00:00:00 2001 From: Alexander Watson Date: Mon, 11 May 2026 17:12:41 -0700 Subject: [PATCH 10/15] feat(sandbox): emit OCSF audit events for policy proposal lifecycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The demo's policy decision trace previously showed only the proxy enforcement story (HTTP:PUT DENIED, CONFIG:LOADED, HTTP:PUT ALLOWED). It was silent about who proposed what or who decided what — the audit-trail receipts for the agent feedback loop were missing. policy.local now emits sandbox-side OCSF events at the observation moments, into the same stream as the existing CONFIG:LOADED: - CONFIG:PROPOSED on submit_proposal acceptance. Per accepted chunk: the message names the chunk_id, target endpoint, L7 method/path, and binary so the trace correlates against the inbox card via chunk_id. - CONFIG:APPROVED on /wait observation of approved status. - CONFIG:REJECTED on /wait observation of rejected status. 
Carries the reviewer's
  free-form rejection_reason in the message AND as an unmapped field,
  both sanitized (control chars stripped, capped at 200 chars with an
  ellipsis marker). The agent still reads the raw text via
  GET /v1/proposals/{id}; sanitization is audit-side only, per AGENTS.md's
  no-secrets-in-OCSF rule.

The submit path defends the audit_summaries / accepted_chunk_ids index
pairing against a future gateway change that compresses past rejected
chunks (the proto doesn't promise 1:1 ordering with the request). Today
client-side validation makes the lengths always match; if they don't, the
pairing falls back to a generic per-id event rather than mis-attributing
a summary.

The wait handler's emit site fires once per terminal-status observation.
Multiple concurrent waiters on the same chunk would each emit one event;
that's acceptable for single-waiter-per-chunk demos, and the right place
to dedup is the SIEM.

demo.sh's trace filter now surfaces the four CONFIG: events alongside
HTTP:PUT, so the trace at the end of every run tells the full story from
deny to allow via propose -> approve.

wait-smoke.sh's prereq notes recommend redirecting kubectl port-forward
output so its "Handling connection for 8090" lines don't bleed into demo
narration.

Three new unit tests on the sandbox-side helpers — summary builder happy
path, fallback, and the rejection_reason sanitizer.

Signed-off-by: Alexander Watson
---
 crates/openshell-sandbox/src/policy_local.rs   | 215 ++++++++++++++++++
 e2e/policy-advisor/wait-smoke.sh               |   7 +-
 .../agent-driven-policy-management/demo.sh     |   9 +-
 3 files changed, 228 insertions(+), 3 deletions(-)

diff --git a/crates/openshell-sandbox/src/policy_local.rs b/crates/openshell-sandbox/src/policy_local.rs
index b363d0b74..1add4a89d 100644
--- a/crates/openshell-sandbox/src/policy_local.rs
+++ b/crates/openshell-sandbox/src/policy_local.rs
@@ -8,6 +8,7 @@ use openshell_core::proto::{
     L7Allow, L7DenyRule, L7Rule, NetworkBinary, NetworkEndpoint, NetworkPolicyRule, PolicyChunk,
     SandboxPolicy as ProtoSandboxPolicy,
 };
+use openshell_ocsf::{ConfigStateChangeBuilder, SeverityId, StateId, StatusId, ocsf_emit};
 use serde::Deserialize;
 use std::collections::HashMap;
 use std::path::{Path, PathBuf};
@@ -480,6 +481,12 @@ async fn submit_proposal(ctx: &PolicyLocalContext, body: &[u8]) -> (u16, serde_j
         }
     };
 
+    // Pre-compute the audit summaries before handing `chunks` to the
+    // gateway client (which consumes the vec). The summaries pair up with
+    // the gateway's `accepted_chunk_ids` by index for the propose events
+    // emitted after submit returns.
+    let audit_summaries: Vec<String> = chunks.iter().map(summarize_chunk_for_audit).collect();
+
     let response = match client
         .submit_policy_analysis(sandbox_name, vec![], chunks, "agent_authored")
         .await
@@ -496,6 +503,27 @@ async fn submit_proposal(ctx: &PolicyLocalContext, body: &[u8]) -> (u16, serde_j
         }
     };
 
+    // One OCSF event per accepted chunk so the audit trace in
+    // `openshell logs <sandbox>` carries the propose beat alongside the
+    // proxy deny and policy reload that bracket it.
+    //
+    // The gateway compresses its `accepted_chunk_ids` by skipping rejected
+    // chunks (`grpc/policy.rs:1357-1436`); the proto does not promise 1:1
+    // ordering against the request. Today client-side validation catches
+    // both rejection causes (missing rule_name, missing proposed_rule)
+    // before submit, so the lengths match in practice. If they don't, we
+    // can't safely pair audit_summaries by index — fall back to a generic
+    // event per accepted chunk_id rather than mis-attribute a summary.
+ let pairing_is_safe = response.accepted_chunk_ids.len() == audit_summaries.len(); + for (idx, chunk_id) in response.accepted_chunk_ids.iter().enumerate() { + let summary = if pairing_is_safe { + audit_summaries[idx].as_str() + } else { + "(summary unavailable: gateway partially accepted)" + }; + emit_policy_propose_event(chunk_id, summary); + } + ( 202, serde_json::json!({ @@ -508,6 +536,122 @@ async fn submit_proposal(ctx: &PolicyLocalContext, body: &[u8]) -> (u16, serde_j ) } +/// Emit one CONFIG:PROPOSED audit event for an agent-authored proposal that +/// the gateway just accepted. The message names the `chunk_id`, the binary, +/// and the endpoint the agent is asking to reach — what a developer needs +/// to see in the audit trace to correlate against the inbox card. +fn emit_policy_propose_event(chunk_id: &str, summary: &str) { + ocsf_emit!( + ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Other, "PROPOSED") + .unmapped("chunk_id", serde_json::json!(chunk_id)) + .message(format!( + "agent_authored proposal chunk:{chunk_id} {summary}" + )) + .build() + ); +} + +/// Emit one CONFIG:APPROVED or CONFIG:REJECTED audit event observed by the +/// `/wait` poll loop. The reviewer's free-form `rejection_reason` (if any) +/// is included verbatim so the audit trace shows what guidance the agent +/// received. +fn emit_policy_decision_event(chunk: &PolicyChunk) { + let summary = summarize_chunk_for_audit(chunk); + match chunk.status.as_str() { + "approved" => ocsf_emit!( + ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + .severity(SeverityId::Informational) + .status(StatusId::Success) + .state(StateId::Enabled, "APPROVED") + .unmapped("chunk_id", serde_json::json!(chunk.id)) + .message(format!("chunk:{} approved {summary}", chunk.id)) + .build() + ), + "rejected" => { + // The reviewer's free-form rejection_reason is opaque user + // input. The agent reads the raw text via `GET /v1/proposals/ + // {id}` to redraft; the OCSF surface (which can be shipped to + // external SIEMs per AGENTS.md) gets a sanitized copy — caps + // length and strips control characters so a stray credential + // or escape sequence cannot leak into the audit log. + let sanitized = sanitize_reason_for_audit(&chunk.rejection_reason); + let reason_display = if sanitized.is_empty() { + "(no guidance)".to_string() + } else { + format!("\"{sanitized}\"") + }; + ocsf_emit!( + ConfigStateChangeBuilder::new(crate::ocsf_ctx()) + .severity(SeverityId::Low) + .status(StatusId::Success) + .state(StateId::Disabled, "REJECTED") + .unmapped("chunk_id", serde_json::json!(chunk.id)) + .unmapped("rejection_reason", serde_json::json!(sanitized)) + .message(format!( + "chunk:{} rejected {summary} reason:{reason_display}", + chunk.id + )) + .build() + ); + } + // Caller is gated on `is_terminal_status`, so a non-terminal status + // here is a code change that broke the invariant. Warn loudly so + // the audit gap doesn't go silent. + other => tracing::warn!( + chunk_id = %chunk.id, + status = %other, + "emit_policy_decision_event called on non-terminal status; no audit event emitted" + ), + } +} + +/// Sanitize a free-form reviewer-typed string before it lands in the OCSF +/// audit surface. The agent still reads the raw text via the API — this is +/// audit-side defense only. 
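+///
+/// Worked example (hypothetical reviewer input; note control characters
+/// are stripped outright, not replaced with spaces):
+///
+/// ```ignore
+/// assert_eq!(
+///     sanitize_reason_for_audit("too broad\nscope it to one path"),
+///     "too broadscope it to one path",
+/// );
+/// ```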
+fn sanitize_reason_for_audit(raw: &str) -> String { + const MAX_CHARS: usize = 200; + let cleaned: String = raw + .chars() + .filter(|c| !c.is_control() || *c == ' ') + .take(MAX_CHARS) + .collect(); + if raw.chars().count() > MAX_CHARS { + format!("{cleaned}…") + } else { + cleaned + } +} + +/// One-line audit description of a chunk's target: binary, host, port, and +/// L7 method/path if present. Used by both the propose and approve/reject +/// audit events so the trace can be grepped by endpoint without parsing +/// JSON. +fn summarize_chunk_for_audit(chunk: &PolicyChunk) -> String { + let Some(rule) = chunk.proposed_rule.as_ref() else { + return format!("rule_name:{}", chunk.rule_name); + }; + let endpoint = rule.endpoints.first().map_or_else( + || "unknown".to_string(), + |ep| format!("{}:{}", ep.host, ep.port), + ); + let l7 = rule + .endpoints + .first() + .and_then(|ep| ep.rules.first()) + .and_then(|r| r.allow.as_ref()) + .map(|a| format!(" {} {}", a.method, a.path)) + .unwrap_or_default(); + let binary = if chunk.binary.is_empty() { + String::new() + } else { + format!(" by {}", chunk.binary) + }; + format!("on {endpoint}{l7}{binary}") +} + /// `GET /v1/proposals/{chunk_id}` — immediate state. One gateway call, no loop. async fn proposal_status_response( ctx: &PolicyLocalContext, @@ -539,6 +683,11 @@ async fn proposal_wait_response( loop { match fetch_chunk(&session, chunk_id).await { Ok(Some(chunk)) if is_terminal_status(&chunk.status) => { + // Audit beat: emit at the moment this sandbox observes the + // decision so the trace correlates with the proxy events + // bracketing the loop. Multiple waiters on the same chunk + // each fire one event — acceptable for a wakeup audit. + emit_policy_decision_event(&chunk); return (200, chunk_state_payload(&chunk, false)); } Ok(Some(chunk)) => { @@ -1482,4 +1631,70 @@ mod tests { let (status, _) = route_request(&ctx, "GET", "/v1/proposals/abc/wait", &[]).await; assert_eq!(status, 404); } + + #[test] + fn summarize_chunk_for_audit_includes_endpoint_l7_path_and_binary() { + let chunk = PolicyChunk { + id: "ignored".to_string(), + rule_name: "github_write".to_string(), + binary: "/usr/bin/curl".to_string(), + proposed_rule: Some(NetworkPolicyRule { + name: "github_write".to_string(), + endpoints: vec![NetworkEndpoint { + host: "api.github.com".to_string(), + port: 443, + rules: vec![L7Rule { + allow: Some(L7Allow { + method: "PUT".to_string(), + path: "/repos/foo/bar/contents/x.md".to_string(), + ..Default::default() + }), + }], + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }), + ..Default::default() + }; + let summary = summarize_chunk_for_audit(&chunk); + assert!(summary.contains("api.github.com:443")); + assert!(summary.contains("PUT /repos/foo/bar/contents/x.md")); + assert!(summary.contains("/usr/bin/curl")); + } + + #[test] + fn sanitize_reason_for_audit_strips_control_chars_and_caps_length() { + // Tabs and newlines are stripped; ordinary printable chars survive; + // multi-byte characters count as one char in the cap. + let raw = "line one\nline\ttwo\u{0001}\u{0007}"; + let cleaned = sanitize_reason_for_audit(raw); + assert!(!cleaned.contains('\n')); + assert!(!cleaned.contains('\t')); + assert!(!cleaned.contains('\u{0001}')); + assert!(cleaned.contains("line one")); + assert!(cleaned.contains("linetwo")); + + // Length cap with ellipsis marker so a downstream reader can tell + // the audit string is truncated. 
+ let long: String = "x".repeat(500); + let capped = sanitize_reason_for_audit(&long); + assert!(capped.chars().count() <= 201); + assert!(capped.ends_with('…')); + + // Empty input maps to empty output (caller renders "(no guidance)"). + assert_eq!(sanitize_reason_for_audit(""), ""); + } + + #[test] + fn summarize_chunk_for_audit_falls_back_to_rule_name_without_rule() { + let chunk = PolicyChunk { + rule_name: "fallback".to_string(), + proposed_rule: None, + ..Default::default() + }; + assert_eq!(summarize_chunk_for_audit(&chunk), "rule_name:fallback"); + } } diff --git a/e2e/policy-advisor/wait-smoke.sh b/e2e/policy-advisor/wait-smoke.sh index 3bd0143ed..ae9e934ff 100755 --- a/e2e/policy-advisor/wait-smoke.sh +++ b/e2e/policy-advisor/wait-smoke.sh @@ -31,8 +31,11 @@ # Prereqs: # - A running gateway and the openshell CLI built from this branch. # The simplest local path is `mise run helm:skaffold:run`, then -# `kubectl -n openshell port-forward svc/openshell 8090:8080 &` and -# `cargo build -p openshell-cli`. +# `cargo build -p openshell-cli` and start the port-forward with its +# output redirected so kubectl's "Handling connection for 8090" lines +# don't bleed into your terminal: +# KUBECONFIG=kubeconfig kubectl -n openshell \ +# port-forward svc/openshell 8090:8080 >/dev/null 2>&1 & # - The agent-proposals feature flag enabled. Run once: # openshell settings set --global \ # --key agent_policy_proposals_enabled --value true --yes diff --git a/examples/agent-driven-policy-management/demo.sh b/examples/agent-driven-policy-management/demo.sh index 96da6d8d9..af371918f 100755 --- a/examples/agent-driven-policy-management/demo.sh +++ b/examples/agent-driven-policy-management/demo.sh @@ -417,8 +417,15 @@ verify_github_write() { # OpenShell's logging — keep it as-is rather than re-formatting. show_logs() { step "Policy decision trace (OCSF)" + # Filter to the events that tell the loop's story end-to-end, ordered by + # the trace's own timestamps: + # HTTP:PUT DENIED — initial proxy enforcement + # CONFIG:PROPOSED — agent submitted a chunk to the gateway + # CONFIG:APPROVED/REJECTED — developer decided; agent's /wait woke up + # CONFIG:LOADED — supervisor hot-reloaded the merged policy + # HTTP:PUT ALLOWED — agent's retry succeeded "$OPENSHELL_BIN" logs "$DEMO_SANDBOX_NAME" --since 10m -n 200 2>&1 \ - | grep -E 'HTTP:PUT.*(DENIED|ALLOWED)|CONFIG:LOADED.*Policy reloaded' \ + | grep -E 'HTTP:PUT.*(DENIED|ALLOWED)|CONFIG:(PROPOSED|APPROVED|REJECTED|LOADED)' \ | sed 's/^/ /' || true } From e44f1a0b4c06fb15cbf61dbcb0a682c2e600714d Mon Sep 17 00:00:00 2001 From: Alexander Watson Date: Mon, 11 May 2026 18:02:54 -0700 Subject: [PATCH 11/15] feat(policy): /wait awaits local policy reload; demo auto-approves redrafts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three things in one commit, all surfaced by running the demo end-to-end against a real gateway and finding the agent had to draft a broader second proposal. 1. /wait race fix. Previously /wait returned `approved` the moment it observed the gateway's chunk status flip, but the local supervisor reloads policy on its own poll cycle (~10s in practice). The agent's retry would race the reload and hit the still-old policy, getting denied. Codex then drafted a broader rule and re-submitted — sound agent behavior, but not what /wait should provoke. Now /wait captures the local policy version at start, and after observed-approved waits for the supervisor to load a strictly-newer version before returning. 
Bounded by the caller's deadline; best-effort return if the deadline elapses without the version bumping. Two new unit tests pin the happy path and the deadline-clamped fallback. 2. demo.sh auto-approve loop. Replaces approve_when_pending + wait_for_agent with one approve_pending_until_agent_exits function that keeps watching for pending chunks and approving them until the agent process exits (or the configured timeout). Defense in depth against future redraft scenarios for any reason; today (post-fix #1) the agent should only submit one proposal per task, but we don't want to hang silently if it does submit more. 3. UX. Step headers now carry "[t+1.2s]" relative timestamps so reading the run output makes latency visible (the demo's whole point is the wait is cheap — surface that). A spin_wait helper renders an ASCII spinner during the watch loop so the demo never looks frozen on a TTY. Falls back to plain sleep on non-TTY contexts. Closes the race condition diagnosed from the trace timing where the gateway approved at t+0, sandbox observed at t+0.3s, but the supervisor didn't load v2 until t+9.4s — well after the agent had already retried and been denied. Signed-off-by: Alexander Watson --- crates/openshell-sandbox/src/policy_local.rs | 113 ++++++++++++++++++ .../agent-driven-policy-management/demo.sh | 103 ++++++++++++---- 2 files changed, 194 insertions(+), 22 deletions(-) diff --git a/crates/openshell-sandbox/src/policy_local.rs b/crates/openshell-sandbox/src/policy_local.rs index 1add4a89d..85b307de4 100644 --- a/crates/openshell-sandbox/src/policy_local.rs +++ b/crates/openshell-sandbox/src/policy_local.rs @@ -680,9 +680,28 @@ async fn proposal_wait_response( }; let timeout_secs = parse_timeout_query(query); let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); + // Baseline local policy version at /wait start. When a chunk is + // approved upstream the gateway merges it immediately, but the local + // supervisor reloads policy on its own poll cycle — without this + // baseline we can return "approved" before the local rule is active, + // and the agent's single retry races the reload. + let baseline_policy_version: u32 = ctx + .current_policy + .read() + .await + .as_ref() + .map_or(0, |p| p.version); loop { match fetch_chunk(&session, chunk_id).await { Ok(Some(chunk)) if is_terminal_status(&chunk.status) => { + if chunk.status == "approved" { + // Don't promise the agent its retry will succeed until + // the local supervisor has actually loaded the new + // policy. Bounded by the same deadline so a stuck + // supervisor cannot extend the wait beyond what the + // caller asked for. + wait_for_local_policy_bump(ctx, baseline_policy_version, deadline).await; + } // Audit beat: emit at the moment this sandbox observes the // decision so the trace correlates with the proxy events // bracketing the loop. Multiple waiters on the same chunk @@ -753,6 +772,42 @@ fn is_terminal_status(status: &str) -> bool { matches!(status, "approved" | "rejected") } +/// After a chunk is approved upstream, wait until the local supervisor has +/// loaded a policy version strictly newer than the baseline captured at the +/// start of `/wait`. Bounded by the caller-supplied deadline; returns early +/// (best-effort) if the deadline passes without the version bumping. +/// +/// The polling cadence here is faster than `PROPOSAL_WAIT_POLL_INTERVAL` +/// (which paces upstream gateway calls). 
This loop only reads in-memory +/// state, so 200ms gives a responsive handoff to the agent's retry once +/// the supervisor's own policy poll catches up. +async fn wait_for_local_policy_bump( + ctx: &PolicyLocalContext, + baseline_version: u32, + deadline: tokio::time::Instant, +) { + const TICK: std::time::Duration = std::time::Duration::from_millis(200); + loop { + let current: u32 = ctx + .current_policy + .read() + .await + .as_ref() + .map_or(0, |p| p.version); + if current > baseline_version { + return; + } + let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); + if remaining.is_zero() { + // Best effort: return approved anyway. The agent's retry may + // race the reload, but the alternative (extending past the + // caller's deadline) violates the wait contract. + return; + } + tokio::time::sleep(std::cmp::min(remaining, TICK)).await; + } +} + /// Parse `?timeout=` from the query string. Default applies for missing /// or unparseable values; bounds clamp to keep the agent's hold ceiling /// sane. Re-issue is the right pattern for longer waits. @@ -1665,6 +1720,64 @@ mod tests { assert!(summary.contains("/usr/bin/curl")); } + #[tokio::test] + async fn wait_for_local_policy_bump_returns_when_version_advances() { + let ctx = PolicyLocalContext::new( + Some(ProtoSandboxPolicy { + version: 5, + ..Default::default() + }), + None, + None, + ); + let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(2); + let bumped = { + let policy = ctx.current_policy.clone(); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + *policy.write().await = Some(ProtoSandboxPolicy { + version: 6, + ..Default::default() + }); + }) + }; + let start = tokio::time::Instant::now(); + wait_for_local_policy_bump(&ctx, 5, deadline).await; + bumped.await.unwrap(); + // Returned promptly after the bump, well before the 2s deadline. + let elapsed = start.elapsed(); + assert!( + elapsed < std::time::Duration::from_millis(800), + "should return shortly after version bumps; took {elapsed:?}" + ); + } + + #[tokio::test] + async fn wait_for_local_policy_bump_returns_at_deadline_if_no_advance() { + let ctx = PolicyLocalContext::new( + Some(ProtoSandboxPolicy { + version: 5, + ..Default::default() + }), + None, + None, + ); + let deadline = tokio::time::Instant::now() + std::time::Duration::from_millis(300); + let start = tokio::time::Instant::now(); + wait_for_local_policy_bump(&ctx, 5, deadline).await; + let elapsed = start.elapsed(); + // Best effort: returns at or shortly after the deadline rather + // than extending past it. + assert!( + elapsed >= std::time::Duration::from_millis(250), + "should wait until ~deadline; only waited {elapsed:?}" + ); + assert!( + elapsed < std::time::Duration::from_millis(800), + "should not extend past deadline by much; took {elapsed:?}" + ); + } + #[test] fn sanitize_reason_for_audit_strips_control_chars_and_caps_length() { // Tabs and newlines are stripped; ordinary printable chars survive; diff --git a/examples/agent-driven-policy-management/demo.sh b/examples/agent-driven-policy-management/demo.sh index af371918f..a0da377d0 100755 --- a/examples/agent-driven-policy-management/demo.sh +++ b/examples/agent-driven-policy-management/demo.sh @@ -73,9 +73,50 @@ RESET=$'\033[0m' AGENT_PID="" -step() { printf "\n${BOLD}${CYAN}==> %s${RESET}\n\n" "$1"; } +# Wall-clock anchor so each step header can carry a "[t+1.2s]" tag and the +# reader sees where time is going. 
`date +%s.%N` needs GNU date; stock
+# macOS date prints a literal `.N` suffix, which awk's numeric coercion
+# tolerates by reading the leading integer, so the tag degrades to
+# whole-second resolution there. (`${EPOCHREALTIME}` is no fallback either —
+# it needs bash 5, newer than macOS's stock bash 3.2.)
+DEMO_START_EPOCH="$(date +%s.%N)"
+
+elapsed() {
+  awk -v s="$DEMO_START_EPOCH" -v now="$(date +%s.%N)" \
+    'BEGIN { printf "%.1fs", now - s }'
+}
+
+step() {
+  printf "\n${BOLD}${CYAN}==> [t+%s] %s${RESET}\n\n" "$(elapsed)" "$1"
+}
 info()  { printf "  %b\n" "$*"; }
 
+# ASCII spinner for the watch-for-pending loop. Renders only on a TTY so
+# piped runs (CI, tee, etc.) stay clean. spin_wait pairs a message with a
+# bounded sleep so the spinner animates smoothly without polling faster
+# than necessary.
+SPINNER_CHARS=( '⠋' '⠙' '⠹' '⠸' '⠼' '⠴' '⠦' '⠧' '⠇' '⠏' )
+SPINNER_IDX=0
+
+spin_wait() {
+  local message="$1"
+  local duration_secs="${2:-2}"
+  if [[ ! -t 1 ]]; then
+    sleep "$duration_secs"
+    return
+  fi
+  local end=$(( SECONDS + duration_secs ))
+  while (( SECONDS < end )); do
+    printf "\r  ${DIM}%s${RESET} %s  " \
+      "${SPINNER_CHARS[SPINNER_IDX]}" "$message"
+    SPINNER_IDX=$(( (SPINNER_IDX + 1) % ${#SPINNER_CHARS[@]} ))
+    sleep 0.1
+  done
+}
+
+spin_clear() {
+  if [[ -t 1 ]]; then
+    printf "\r%*s\r" "${COLUMNS:-100}" ''
+  fi
+}
+
 # Redact host-side credentials from the agent log tail before printing on
 # failure. Codex shouldn't echo the token, but a misbehaving tool call (e.g.,
 # `curl -v`) could leak it; sanitize before showing the log.
@@ -204,9 +245,19 @@ check_gateway() {
   local raw version
   # `openshell status` colorizes labels with ANSI even when piped, so strip
   # escapes before parsing. Use NO_COLOR as a belt-and-suspenders hint for
-  # libraries that respect it.
-  raw="$(NO_COLOR=1 "$OPENSHELL_BIN" status 2>/dev/null \
-    | sed 's/\x1b\[[0-9;]*m//g')"
+  # libraries that respect it. Capture stderr explicitly so a connection
+  # failure (gateway down, port-forward died after a redeploy) surfaces a
+  # real error message instead of `set -euo pipefail` silently exiting.
+  if ! raw="$(NO_COLOR=1 "$OPENSHELL_BIN" status 2>&1)"; then
+    fail "openshell could not reach the gateway. CLI output:
+${raw}
+
+If you just redeployed, the kubectl port-forward you backgrounded earlier
+probably died with the old pod. Restart it (silenced so its noise doesn't
+bleed into the demo):
+  KUBECONFIG=kubeconfig kubectl -n openshell port-forward svc/openshell 8090:8080 >/dev/null 2>&1 &"
+  fi
+  raw="$(sed 's/\x1b\[[0-9;]*m//g' <<<"$raw")"
   version="$(awk -F': *' '/Version:/ { print $2; exit }' <<<"$raw")"
   [[ -n "$version" ]] \
     || fail "active OpenShell gateway is not reachable; start one with: openshell gateway start"
@@ -357,23 +408,36 @@ EOF
   info "${DIM}Watching for the pending draft on the gateway...${RESET}"
 }
 
-approve_when_pending() {
+approve_pending_until_agent_exits() {
   step "Waiting for the agent to draft a policy proposal"
   narrate_sandbox_workflow
 
-  local start now pending
+  local start now pending approval_count
   start="$(date +%s)"
   pending="${TMP_DIR}/pending.txt"
+  approval_count=0
   while true; do
+    # Agent finished? Drain its exit status and we're done.
     if ! kill -0 "$AGENT_PID" >/dev/null 2>&1; then
-      wait "$AGENT_PID" || true
+      spin_clear
+      if ! wait "$AGENT_PID"; then
+        AGENT_PID=""
+        fail "agent run failed"
+      fi
       AGENT_PID=""
-      fail "agent exited before a pending proposal appeared"
+      if (( approval_count == 0 )); then
+        fail "agent exited before any pending proposal appeared"
+      fi
+      info "agent exited after ${approval_count} approval(s)"
+      return
     fi
 
+    # Anything pending? Approve and keep watching — the agent may
+    # redraft if a previous proposal didn't yield the access it needed.
if "$OPENSHELL_BIN" rule get "$DEMO_SANDBOX_NAME" --status pending >"$pending" 2>/dev/null \ && grep -q "Chunk:" "$pending" && grep -q "pending" "$pending"; then + spin_clear info "" info "${GREEN}proposal received:${RESET}" summarize_pending "$pending" @@ -381,24 +445,20 @@ approve_when_pending() { step "Approving — the agent's /wait will return within ~1s" "$OPENSHELL_BIN" rule approve-all "$DEMO_SANDBOX_NAME" \ | awk '/approved/ { print " " $0 }' - return + approval_count=$((approval_count + 1)) fi now="$(date +%s)" if (( now - start >= DEMO_APPROVAL_TIMEOUT_SECS )); then - fail "timed out waiting for the agent to submit a policy proposal" + spin_clear + if (( approval_count == 0 )); then + fail "timed out waiting for the agent to submit a policy proposal" + fi + fail "agent did not exit within ${DEMO_APPROVAL_TIMEOUT_SECS}s after ${approval_count} approval(s)" fi - sleep 2 - done -} -wait_for_agent() { - if ! wait "$AGENT_PID"; then - AGENT_PID="" - fail "agent run failed" - fi - AGENT_PID="" - info "agent's /wait returned approved — single PUT retry succeeded" + spin_wait "watching for pending proposals (approved ${approval_count} so far)" 2 + done } verify_github_write() { @@ -457,8 +517,7 @@ main() { show_run_summary start_agent_sandbox - approve_when_pending - wait_for_agent + approve_pending_until_agent_exits verify_github_write show_logs From 7a9f82ac104166fe4a07a12fdccce1cc9e62511d Mon Sep 17 00:00:00 2001 From: Alexander Watson Date: Mon, 11 May 2026 22:02:05 -0700 Subject: [PATCH 12/15] fix(sandbox): /wait detects policy reload by content, not the schema version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous attempt at the /wait-after-approve race fix compared `SandboxPolicy.version` between /wait start and the policy reload — but that field is the *schema* version (constant 1), not a revision counter. Every comparison was `current(1) > baseline(1) == false`, so the wait blocked until the agent's 300s timeout regardless of whether the supervisor had actually reloaded. The demo SSH connection then timed out around the 240s mark. Diagnosed from a live run's OCSF trace: supervisor pulled v2 at +8.5s after approval (CONFIG:LOADED), but the sandbox-side CONFIG:APPROVED that my /wait emits didn't fire until +304s — exactly at the 300s deadline. Fix: compare the whole policy via prost's derived PartialEq. Any field change (network_policies map being the only one that actually mutates today) flips equality. A clone-per-200ms-tick on a few-KB proto is cheap inside the bounded wait window. Tests rewritten to match the new contract: the supervisor-reload fixture now keeps `version: 1` constant and changes `network_policies` contents, mirroring the exact failure mode from the live run. Signed-off-by: Alexander Watson --- crates/openshell-sandbox/src/policy_local.rs | 106 ++++++++++++------- 1 file changed, 69 insertions(+), 37 deletions(-) diff --git a/crates/openshell-sandbox/src/policy_local.rs b/crates/openshell-sandbox/src/policy_local.rs index 85b307de4..bf11e2640 100644 --- a/crates/openshell-sandbox/src/policy_local.rs +++ b/crates/openshell-sandbox/src/policy_local.rs @@ -680,17 +680,17 @@ async fn proposal_wait_response( }; let timeout_secs = parse_timeout_query(query); let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); - // Baseline local policy version at /wait start. When a chunk is + // Baseline local policy fingerprint at /wait start. 
When a chunk is
+    // approved upstream the gateway merges it immediately, but the local
+    // supervisor reloads policy on its own poll cycle — without this
+    // baseline we can return "approved" before the local rule is active,
+    // and the agent's single retry races the reload.
+    //
+    // We compare the whole policy value (via prost's derived `PartialEq`)
+    // rather than `SandboxPolicy.version`, which is a schema-version
+    // constant (always 1) and never bumps on a revision change. A naive
+    // version compare loops forever; a whole-value compare picks up any
+    // rule mutation, which is what reload means.
+    let baseline_fingerprint = local_policy_fingerprint(ctx).await;
     loop {
         match fetch_chunk(&session, chunk_id).await {
             Ok(Some(chunk)) if is_terminal_status(&chunk.status) => {
@@ -700,7 +700,7 @@ async fn proposal_wait_response(
                     // policy. Bounded by the same deadline so a stuck
                     // supervisor cannot extend the wait beyond what the
                     // caller asked for.
-                    wait_for_local_policy_bump(ctx, baseline_policy_version, deadline).await;
+                    wait_for_local_policy_bump(ctx, &baseline_fingerprint, deadline).await;
                 }
                 // Audit beat: emit at the moment this sandbox observes the
                 // decision so the trace correlates with the proxy events
@@ -772,10 +772,25 @@ fn is_terminal_status(status: &str) -> bool {
     matches!(status, "approved" | "rejected")
 }
 
+/// Snapshot of the local policy at a moment in time, used as a revision
+/// fingerprint by `wait_for_local_policy_bump`. Compares with `==` against
+/// a later snapshot to detect a reload — prost-generated types derive
+/// `PartialEq`, so any field change (including the `network_policies` map)
+/// flips equality.
+async fn local_policy_fingerprint(ctx: &PolicyLocalContext) -> Option<ProtoSandboxPolicy> {
+    ctx.current_policy.read().await.clone()
+}
+
 /// After a chunk is approved upstream, wait until the local supervisor has
-/// loaded a policy version strictly newer than the baseline captured at the
-/// start of `/wait`. Bounded by the caller-supplied deadline; returns early
-/// (best-effort) if the deadline passes without the version bumping.
+/// loaded a policy that differs from the baseline captured at the start of
+/// `/wait`. Bounded by the caller-supplied deadline; returns early
+/// (best-effort) if the deadline passes without a change.
+///
+/// We compare the whole policy proto rather than any single field because
+/// `SandboxPolicy.version` is a schema-version constant (always 1) and does
+/// not track revisions — only the policy *content* changes when a reload
+/// lands. Equality is via prost's derived `PartialEq`; a clone-per-tick on
+/// a few-KB struct is cheap for the bounded wait window this lives in.
 /// 
 /// The polling cadence here is faster than `PROPOSAL_WAIT_POLL_INTERVAL`
 /// (which paces upstream gateway calls). This loop only reads in-memory
 /// state, so 200ms gives a responsive handoff to the agent's retry once
 /// the supervisor's own policy poll catches up.
 async fn wait_for_local_policy_bump(
     ctx: &PolicyLocalContext,
-    baseline_version: u32,
+    baseline: &Option<ProtoSandboxPolicy>,
     deadline: tokio::time::Instant,
 ) {
     const TICK: std::time::Duration = std::time::Duration::from_millis(200);
     loop {
-        let current: u32 = ctx
-            .current_policy
-            .read()
-            .await
-            .as_ref()
-            .map_or(0, |p| p.version);
-        if current > baseline_version {
+        let current = local_policy_fingerprint(ctx).await;
+        if current.as_ref() != baseline.as_ref() {
             return;
         }
         let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
@@ -1721,50 +1731,72 @@ mod tests {
     }
 
     #[tokio::test]
-    async fn wait_for_local_policy_bump_returns_when_version_advances() {
-        let ctx = PolicyLocalContext::new(
-            Some(ProtoSandboxPolicy {
-                version: 5,
-                ..Default::default()
-            }),
-            None,
-            None,
-        );
+    async fn wait_for_local_policy_bump_returns_when_policy_content_changes() {
+        // Baseline: a policy with one network policy keyed "initial".
+        // SandboxPolicy.version stays 1 across reloads (it's a schema
+        // marker, not a revision id) — this test deliberately changes
+        // *content* without bumping the version field, which is the
+        // failure mode that surfaced in the demo (supervisor reloaded
+        // the policy but kept version=1, so the prior version-compare
+        // implementation hung until deadline).
+        let initial = ProtoSandboxPolicy {
+            version: 1,
+            network_policies: HashMap::from([(
+                "initial".to_string(),
+                NetworkPolicyRule {
+                    name: "initial".to_string(),
+                    ..Default::default()
+                },
+            )]),
+            ..Default::default()
+        };
+        let ctx = PolicyLocalContext::new(Some(initial), None, None);
+        let baseline = local_policy_fingerprint(&ctx).await;
+
         let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(2);
-        let bumped = {
+        let changed = {
             let policy = ctx.current_policy.clone();
             tokio::spawn(async move {
                 tokio::time::sleep(std::time::Duration::from_millis(100)).await;
+                // Same schema version (1), different content — exactly
+                // what the supervisor does on a real reload.
                 *policy.write().await = Some(ProtoSandboxPolicy {
-                    version: 6,
+                    version: 1,
+                    network_policies: HashMap::from([(
+                        "reloaded".to_string(),
+                        NetworkPolicyRule {
+                            name: "reloaded".to_string(),
+                            ..Default::default()
+                        },
+                    )]),
                     ..Default::default()
                 });
             })
        };
         let start = tokio::time::Instant::now();
-        wait_for_local_policy_bump(&ctx, 5, deadline).await;
-        bumped.await.unwrap();
-        // Returned promptly after the bump, well before the 2s deadline.
+        wait_for_local_policy_bump(&ctx, &baseline, deadline).await;
+        changed.await.unwrap();
         let elapsed = start.elapsed();
         assert!(
             elapsed < std::time::Duration::from_millis(800),
-            "should return shortly after version bumps; took {elapsed:?}"
+            "should return shortly after content changes; took {elapsed:?}"
         );
     }
 
     #[tokio::test]
-    async fn wait_for_local_policy_bump_returns_at_deadline_if_no_advance() {
+    async fn wait_for_local_policy_bump_returns_at_deadline_if_no_change() {
         let ctx = PolicyLocalContext::new(
             Some(ProtoSandboxPolicy {
-                version: 5,
+                version: 1,
                 ..Default::default()
             }),
             None,
             None,
         );
+        let baseline = local_policy_fingerprint(&ctx).await;
         let deadline = tokio::time::Instant::now() + std::time::Duration::from_millis(300);
         let start = tokio::time::Instant::now();
-        wait_for_local_policy_bump(&ctx, 5, deadline).await;
+        wait_for_local_policy_bump(&ctx, &baseline, deadline).await;
         let elapsed = start.elapsed();
         // Best effort: returns at or shortly after the deadline rather
         // than extending past it.
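The core of the fix, as one self-contained sketch for readers outside this
tree: `Policy` below is a hypothetical stand-in for the prost-generated
`SandboxPolicy` (the real type likewise derives `PartialEq`), and only the
shape of the two comparisons carries over.

    // Stand-in for the generated proto message; prost derives PartialEq
    // the same way, so `==` / `!=` compare full message content.
    #[derive(Clone, Debug, Default, PartialEq)]
    struct Policy {
        version: u32, // schema marker: stays 1 across reloads
        network_policies: std::collections::HashMap<String, String>,
    }

    fn main() {
        let baseline = Policy { version: 1, ..Default::default() };
        let mut reloaded = baseline.clone();
        reloaded
            .network_policies
            .insert("allow_github".into(), "api.github.com:443".into());

        // Broken check: the schema version never bumps, so this never fires.
        assert!(!(reloaded.version > baseline.version));
        // Fixed check: derived equality sees the rule-content change.
        assert_ne!(reloaded, baseline);
    }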
From bafd8ac1fbbf11464cc5ef12f03043c27e0ef050 Mon Sep 17 00:00:00 2001 From: Alexander Watson Date: Mon, 11 May 2026 22:02:15 -0700 Subject: [PATCH 13/15] fix(examples): redact tokens with python literal-string replace, not sed The sed-based redact_log in demo.sh broke when one of the auth tokens contained a character that conflicted with sed's pattern parser ("unterminated substitute pattern" on the Codex JWT). The whole log tail then blanks on failure, hiding the very failure context we're trying to surface. Switch to a python subprocess that takes the tokens via argv and does literal str.replace. No regex, no delimiter games, no truncation. Signed-off-by: Alexander Watson --- .../agent-driven-policy-management/demo.sh | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/examples/agent-driven-policy-management/demo.sh b/examples/agent-driven-policy-management/demo.sh index a0da377d0..f52ad4bd6 100755 --- a/examples/agent-driven-policy-management/demo.sh +++ b/examples/agent-driven-policy-management/demo.sh @@ -120,13 +120,25 @@ spin_clear() { # Redact host-side credentials from the agent log tail before printing on # failure. Codex shouldn't echo the token, but a misbehaving tool call (e.g., # `curl -v`) could leak it; sanitize before showing the log. +# +# Uses python's literal `str.replace` rather than sed because tokens +# (especially JWTs) can contain characters that break sed's pattern parser +# — a sed delimiter collision in one of the substitutions blanks the entire +# log tail, hiding the very failure context we're trying to surface. redact_log() { - local replacement='[redacted]' - sed \ - -e "s|${DEMO_GITHUB_TOKEN:-__no_github_token__}|${replacement}|g" \ - -e "s|${CODEX_AUTH_ACCESS_TOKEN:-__no_codex_access__}|${replacement}|g" \ - -e "s|${CODEX_AUTH_REFRESH_TOKEN:-__no_codex_refresh__}|${replacement}|g" \ - -e "s|${CODEX_AUTH_ACCOUNT_ID:-__no_codex_account__}|${replacement}|g" + python3 - \ + "${DEMO_GITHUB_TOKEN:-}" \ + "${CODEX_AUTH_ACCESS_TOKEN:-}" \ + "${CODEX_AUTH_REFRESH_TOKEN:-}" \ + "${CODEX_AUTH_ACCOUNT_ID:-}" \ + <<'PY' +import sys +tokens = [t for t in sys.argv[1:] if t] +for line in sys.stdin: + for t in tokens: + line = line.replace(t, "[redacted]") + sys.stdout.write(line) +PY } fail() { From 88afffb695a55a245a3150b7c4f708ef31621ad5 Mon Sep 17 00:00:00 2001 From: Alexander Watson Date: Tue, 12 May 2026 15:03:56 -0700 Subject: [PATCH 14/15] fix(sandbox): scope /wait reload check to the approved rule MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewer (John Myers) flagged two failure modes in the prior whole-policy fingerprint approach used by policy.local /wait: - False sleep: when the supervisor reloads between two /wait calls (the skill tells the agent to re-issue on timed_out), the new call snapshots the already-updated policy as baseline and burns the full timeout waiting for a change that never comes. - False wakeup: any unrelated reload (other agent's approval, settings change) flips the diff, but the chunk's actual rule may not be loaded yet — the agent retries and hits policy_denied for no real signal. Replace the diff with rule-coverage. New public helper openshell_policy::policy_covers_rule reuses endpoints_overlap (so it matches add_rule's merge semantics, including the fold-into-existing-key case) plus an L7 allow check on method/path (so an existing endpoint that doesn't yet contain the proposed method doesn't signal coverage). 
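In sketch form, before the mechanics land below. The types here are
hypothetical simplifications, not the real NetworkPolicyRule or
endpoints_overlap; binaries and port sets are elided.

    // Coverage asks "is my rule in effect?": every proposed endpoint must
    // be contained in some loaded endpoint, i.e. an L4 match (host/port)
    // AND every requested L7 method already present.
    struct Endpoint {
        host: String,
        port: u16,
        methods: Vec<String>, // empty = L4-only surface
    }

    fn covers(loaded: &[Endpoint], proposed: &[Endpoint]) -> bool {
        proposed.iter().all(|want| {
            loaded.iter().any(|have| {
                have.host == want.host
                    && have.port == want.port
                    && want.methods.iter().all(|m| have.methods.contains(m))
            })
        })
    }

    fn main() {
        let loaded = vec![Endpoint {
            host: "api.github.com".into(),
            port: 443,
            methods: vec!["GET".into()],
        }];
        let proposed = vec![Endpoint {
            host: "api.github.com".into(),
            port: 443,
            methods: vec!["PUT".into()],
        }];
        // L4 overlaps, but the PUT hasn't landed: no coverage (the
        // false-wakeup case). Adding "PUT" to the loaded endpoint's
        // methods flips this to true.
        assert!(!covers(&loaded, &proposed));
    }

Unlike a whole-policy diff, which answers "did anything change?", this
containment question is insensitive to unrelated reloads and to reloads
that happened before the check started, which is what closes both failure
modes above.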
Add policy_reloaded: true|false to the /wait response on approve, with a 500ms floor on the reload-wait phase so approvals arriving near the deadline still get a fair shot at reloaded=true. Update the policy_advisor skill to branch on it: reloaded=true → retry; reloaded=false → re-issue /wait once with timeout=30, then surface to user. Don't loop tightly. Tests: - 9 new unit tests in openshell-policy pinning coverage semantics (L4-only, L7 method gap, fold-into-existing-key, empty binaries). - 4 new tokio tests in policy_local mirroring John's exact scenarios. - wait-smoke.sh asserts policy_reloaded=true on Flow A. Signed-off-by: Alexander Watson --- crates/openshell-policy/src/lib.rs | 2 +- crates/openshell-policy/src/merge.rs | 365 ++++++++++++++++++ crates/openshell-sandbox/src/policy_local.rs | 326 +++++++++++----- crates/openshell-sandbox/src/skills.rs | 8 +- .../src/skills/policy_advisor.md | 21 +- e2e/policy-advisor/wait-smoke.sh | 11 +- .../agent-driven-policy-management/README.md | 19 +- .../agent-driven-policy-management/demo.sh | 57 ++- 8 files changed, 699 insertions(+), 110 deletions(-) diff --git a/crates/openshell-policy/src/lib.rs b/crates/openshell-policy/src/lib.rs index 908450111..632170c0b 100644 --- a/crates/openshell-policy/src/lib.rs +++ b/crates/openshell-policy/src/lib.rs @@ -27,7 +27,7 @@ use serde::{Deserialize, Serialize}; pub use compose::{ProviderPolicyLayer, compose_effective_policy, provider_rule_name}; pub use merge::{ PolicyMergeError, PolicyMergeOp, PolicyMergeResult, PolicyMergeWarning, generated_rule_name, - merge_policy, + merge_policy, policy_covers_rule, }; // --------------------------------------------------------------------------- diff --git a/crates/openshell-policy/src/merge.rs b/crates/openshell-policy/src/merge.rs index d99d9c216..c01445b11 100644 --- a/crates/openshell-policy/src/merge.rs +++ b/crates/openshell-policy/src/merge.rs @@ -207,6 +207,78 @@ pub struct PolicyMergeResult { pub changed: bool, } +/// Returns true iff `policy` semantically contains the rule an `AddRule` +/// merge of `proposed` would produce. +/// +/// "Contains" means: for every endpoint in `proposed`, some rule in +/// `policy.network_policies` has an endpoint with overlapping +/// host/path/port set AND containing every L7 allow (method/path) the +/// proposed endpoint requested, and that rule's binaries cover every +/// binary in `proposed`. +/// +/// The sandbox's `policy.local /wait` long-poll uses this to decide when +/// the local supervisor has actually loaded a policy that includes the +/// chunk the agent just had approved. A whole-policy hash compare is wrong +/// in both directions: it can wake the wait on unrelated reloads (false +/// wakeup) and can fail to wake when the supervisor reloaded between two +/// `/wait` calls (false sleep). This check is the property the agent +/// actually cares about — "is my rule in effect right now?". +/// +/// L4-vs-L7 split: endpoint overlap reuses `endpoints_overlap` so the +/// L4 surface (host/path/port) lines up with the `add_rule` merge — if +/// the gateway folded the chunk into an existing rule under a different +/// key, this check still returns true. 
The L7 layer is checked +/// separately because `endpoints_overlap` is intentionally L4-only: +/// without the L7 check, coverage would return true the instant the +/// supervisor reloaded *any* change to an overlapping endpoint, even +/// before the new method/path actually landed — exactly the false-wakeup +/// mode this fix exists to prevent, just one layer down. +pub fn policy_covers_rule(policy: &SandboxPolicy, proposed: &NetworkPolicyRule) -> bool { + if proposed.endpoints.is_empty() { + return false; + } + proposed.endpoints.iter().all(|target_endpoint| { + policy.network_policies.values().any(|rule| { + rule.endpoints.iter().any(|endpoint| { + endpoints_overlap(endpoint, target_endpoint) + && endpoint_l7_covers(endpoint, target_endpoint) + }) && proposed.binaries.iter().all(|target_binary| { + rule.binaries + .iter() + .any(|binary| binary.path == target_binary.path) + }) + }) + }) +} + +/// L7 coverage for a single endpoint match. If the proposed endpoint +/// declared explicit L7 allow rules (method+path), every one of them must +/// be present in the merged endpoint's `rules`. An empty `proposed.rules` +/// is treated as "L4-only" and returns true (the endpoint match alone is +/// sufficient). +/// +/// Conservative on access presets: if a merged endpoint uses +/// `access: read-write` instead of explicit rules, this returns false +/// even though the preset would permit the method at runtime. That +/// produces a one-cycle re-issue on the agent's side — preferable to a +/// false-positive coverage signal that lets the agent retry too early. +fn endpoint_l7_covers(merged: &NetworkEndpoint, proposed: &NetworkEndpoint) -> bool { + if proposed.rules.is_empty() { + return true; + } + proposed.rules.iter().all(|proposed_rule| { + let Some(proposed_allow) = proposed_rule.allow.as_ref() else { + return true; + }; + merged.rules.iter().any(|existing| { + existing.allow.as_ref().is_some_and(|existing_allow| { + existing_allow.method == proposed_allow.method + && existing_allow.path == proposed_allow.path + }) + }) + }) +} + pub fn merge_policy( policy: SandboxPolicy, operations: &[PolicyMergeOp], @@ -782,6 +854,7 @@ mod tests { use super::{ PolicyMergeError, PolicyMergeOp, PolicyMergeWarning, generated_rule_name, merge_policy, + policy_covers_rule, }; use crate::restrictive_default_policy; use openshell_core::proto::{ @@ -1187,6 +1260,298 @@ mod tests { assert!(!result.policy.network_policies.contains_key("github")); } + #[test] + fn policy_covers_rule_returns_true_when_merged_rule_present() { + let proposed = NetworkPolicyRule { + name: "agent_proposed".to_string(), + endpoints: vec![endpoint("api.github.com", 443)], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }; + + let merged = merge_policy( + restrictive_default_policy(), + &[PolicyMergeOp::AddRule { + rule_name: "allow_api_github_com_443".to_string(), + rule: proposed.clone(), + }], + ) + .expect("merge should succeed"); + + assert!(policy_covers_rule(&merged.policy, &proposed)); + } + + #[test] + fn policy_covers_rule_returns_false_when_unrelated_rule_present() { + let proposed = NetworkPolicyRule { + name: "agent_proposed".to_string(), + endpoints: vec![endpoint("api.github.com", 443)], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }; + + // Merge an *unrelated* rule for a different host. 
The proposed rule + // for api.github.com is still not present — this is John's + // "false-wakeup" case: an unrelated policy reload must not signal + // that the agent's rule is loaded. + let merged = merge_policy( + restrictive_default_policy(), + &[PolicyMergeOp::AddRule { + rule_name: "allow_api_example_com_443".to_string(), + rule: rule_with_endpoint("unrelated", "api.example.com", 443), + }], + ) + .expect("merge should succeed"); + + assert!(!policy_covers_rule(&merged.policy, &proposed)); + } + + #[test] + fn policy_covers_rule_handles_merge_into_existing_endpoint() { + // The merge logic folds a new rule into an existing rule when their + // endpoints overlap, even under a different network_policies key. + // Coverage must survive that fold — name-keyed checks would miss it. + let proposed = NetworkPolicyRule { + name: "agent_proposed".to_string(), + endpoints: vec![endpoint("api.github.com", 443)], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }; + + let mut policy = restrictive_default_policy(); + policy.network_policies.insert( + "preexisting_github".to_string(), + NetworkPolicyRule { + name: "preexisting_github".to_string(), + endpoints: vec![endpoint("api.github.com", 443)], + binaries: vec![NetworkBinary { + path: "/usr/bin/git".to_string(), + ..Default::default() + }], + }, + ); + + let merged = merge_policy( + policy, + &[PolicyMergeOp::AddRule { + rule_name: "allow_api_github_com_443".to_string(), + rule: proposed.clone(), + }], + ) + .expect("merge should succeed"); + + assert!( + !merged + .policy + .network_policies + .contains_key("allow_api_github_com_443"), + "proposed rule should have been folded into the existing key" + ); + assert!(policy_covers_rule(&merged.policy, &proposed)); + } + + #[test] + fn policy_covers_rule_returns_false_when_binary_missing() { + let proposed = NetworkPolicyRule { + name: "agent_proposed".to_string(), + endpoints: vec![endpoint("api.github.com", 443)], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }; + + // Endpoint exists in the policy but with a *different* binary. The + // agent's retry would still be denied; reload coverage should + // reflect that. + let mut policy = restrictive_default_policy(); + policy.network_policies.insert( + "existing".to_string(), + NetworkPolicyRule { + name: "existing".to_string(), + endpoints: vec![endpoint("api.github.com", 443)], + binaries: vec![NetworkBinary { + path: "/usr/bin/git".to_string(), + ..Default::default() + }], + }, + ); + + assert!(!policy_covers_rule(&policy, &proposed)); + } + + #[test] + fn policy_covers_rule_returns_false_for_empty_proposed_endpoints() { + // Defensive: a rule with no endpoints carries no signal we can match + // on, so coverage is never true. + let proposed = NetworkPolicyRule::default(); + let policy = restrictive_default_policy(); + assert!(!policy_covers_rule(&policy, &proposed)); + } + + #[test] + fn policy_covers_rule_returns_false_when_proposed_l7_method_not_loaded() { + // John's false-wakeup mode at L7: the supervisor has an + // overlapping endpoint loaded (e.g. read-only GET), but the + // chunk's proposed PUT method is not in the merged endpoint's + // rules yet. Coverage must NOT return true here, or the agent + // retries the PUT and hits another policy_denied. 
+ let proposed = NetworkPolicyRule { + name: "agent_put".to_string(), + endpoints: vec![NetworkEndpoint { + host: "api.github.com".to_string(), + port: 443, + ports: vec![443], + protocol: "rest".to_string(), + rules: vec![rest_rule("PUT", "/repos/foo/bar/contents/x.md")], + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }; + + let mut policy = restrictive_default_policy(); + policy.network_policies.insert( + "existing_readonly".to_string(), + NetworkPolicyRule { + name: "existing_readonly".to_string(), + endpoints: vec![NetworkEndpoint { + host: "api.github.com".to_string(), + port: 443, + ports: vec![443], + protocol: "rest".to_string(), + rules: vec![rest_rule("GET", "/repos/foo/bar/contents/x.md")], + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }, + ); + + assert!( + !policy_covers_rule(&policy, &proposed), + "endpoint overlaps but L7 PUT not loaded yet; must not signal coverage" + ); + } + + #[test] + fn policy_covers_rule_returns_true_after_l7_merge_lands() { + // Same setup as above, but with the proposed L7 rule merged in. + // Coverage must now return true. + let proposed = NetworkPolicyRule { + name: "agent_put".to_string(), + endpoints: vec![NetworkEndpoint { + host: "api.github.com".to_string(), + port: 443, + ports: vec![443], + protocol: "rest".to_string(), + rules: vec![rest_rule("PUT", "/repos/foo/bar/contents/x.md")], + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }; + + let mut policy = restrictive_default_policy(); + policy.network_policies.insert( + "existing".to_string(), + NetworkPolicyRule { + name: "existing".to_string(), + endpoints: vec![NetworkEndpoint { + host: "api.github.com".to_string(), + port: 443, + ports: vec![443], + protocol: "rest".to_string(), + rules: vec![ + rest_rule("GET", "/repos/foo/bar/contents/x.md"), + rest_rule("PUT", "/repos/foo/bar/contents/x.md"), + ], + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }, + ); + + assert!(policy_covers_rule(&policy, &proposed)); + } + + #[test] + fn policy_covers_rule_returns_true_for_l4_only_proposed_when_endpoint_present() { + // A chunk that targets a non-REST surface (no L7 rules) needs + // only the L4 endpoint match to be considered covered. Empty + // proposed.rules must not be treated as "no method matches". + let proposed = NetworkPolicyRule { + name: "ssh_clone".to_string(), + endpoints: vec![NetworkEndpoint { + host: "github.com".to_string(), + port: 22, + ports: vec![22], + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/git".to_string(), + ..Default::default() + }], + }; + + let merged = merge_policy( + restrictive_default_policy(), + &[PolicyMergeOp::AddRule { + rule_name: "allow_github_com_22".to_string(), + rule: proposed.clone(), + }], + ) + .expect("merge should succeed"); + + assert!(policy_covers_rule(&merged.policy, &proposed)); + } + + #[test] + fn policy_covers_rule_treats_empty_proposed_binaries_as_any_binary() { + // A proposed rule with no binaries is the "any binary" shape. + // The merged rule keeps its own binaries; coverage holds iff + // endpoint and (vacuously satisfied) binary set match. Document + // the semantics so a future reader doesn't flip it accidentally. 
+ let proposed = NetworkPolicyRule { + name: "any_binary_rule".to_string(), + endpoints: vec![endpoint("api.github.com", 443)], + binaries: vec![], + }; + + let mut policy = restrictive_default_policy(); + policy.network_policies.insert( + "existing".to_string(), + NetworkPolicyRule { + name: "existing".to_string(), + endpoints: vec![endpoint("api.github.com", 443)], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + }, + ); + + assert!( + policy_covers_rule(&policy, &proposed), + "empty proposed binaries should match any merged binary set" + ); + } + #[test] fn add_rule_without_existing_match_inserts_requested_key() { let policy = restrictive_default_policy(); diff --git a/crates/openshell-sandbox/src/policy_local.rs b/crates/openshell-sandbox/src/policy_local.rs index bf11e2640..657fd760f 100644 --- a/crates/openshell-sandbox/src/policy_local.rs +++ b/crates/openshell-sandbox/src/policy_local.rs @@ -45,6 +45,12 @@ const PROPOSAL_WAIT_DEFAULT_SECS: u64 = 60; const PROPOSAL_WAIT_MIN_SECS: u64 = 1; const PROPOSAL_WAIT_MAX_SECS: u64 = 300; const PROPOSAL_WAIT_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1); +/// Minimum window the reload-readiness phase gets after a chunk +/// terminalizes, even if the caller's deadline is shorter. Without this, +/// approvals that arrive at T-50ms always return `policy_reloaded=false` +/// and force a re-issue. 500ms is well below typical supervisor poll +/// latency but enough to cover the in-memory coverage check. +const RELOAD_WAIT_MIN_FLOOR: std::time::Duration = std::time::Duration::from_millis(500); const MAX_POLICY_LOCAL_BODY_BYTES: usize = 64 * 1024; /// Hard ceiling on how long a single request body read can stall. Bounds a @@ -680,39 +686,49 @@ async fn proposal_wait_response( }; let timeout_secs = parse_timeout_query(query); let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(timeout_secs); - // Baseline local policy fingerprint at /wait start. When a chunk is - // approved upstream the gateway merges it immediately, but the local - // supervisor reloads policy on its own poll cycle — without this - // baseline we can return "approved" before the local rule is active, - // and the agent's single retry races the reload. - // - // We compare encoded policy bytes rather than `SandboxPolicy.version`, - // which is a schema-version constant (always 1) and never bumps on a - // revision change. A naive version compare loops forever; a byte - // compare picks up any rule mutation, which is what reload means. - let baseline_fingerprint = local_policy_fingerprint(ctx).await; loop { match fetch_chunk(&session, chunk_id).await { Ok(Some(chunk)) if is_terminal_status(&chunk.status) => { - if chunk.status == "approved" { - // Don't promise the agent its retry will succeed until - // the local supervisor has actually loaded the new - // policy. Bounded by the same deadline so a stuck - // supervisor cannot extend the wait beyond what the - // caller asked for. - wait_for_local_policy_bump(ctx, &baseline_fingerprint, deadline).await; - } // Audit beat: emit at the moment this sandbox observes the // decision so the trace correlates with the proxy events // bracketing the loop. Multiple waiters on the same chunk // each fire one event — acceptable for a wakeup audit. 
emit_policy_decision_event(&chunk); - return (200, chunk_state_payload(&chunk, false)); + let policy_reloaded = if chunk.status == "approved" { + // Hold the wait until the local supervisor has loaded a + // policy that semantically contains this chunk's + // proposed rule. Reloads triggered by *other* chunks or + // settings changes do not wake us; a missing + // proposed_rule (defensive) skips the check and + // returns reloaded=false so the agent can decide. + // + // Floor the reload-wait window to RELOAD_WAIT_MIN_FLOOR + // so an approval that arrives at T-50ms still gets a + // realistic shot at seeing the reload. Worst case we + // overshoot the caller's deadline by this floor — + // preferable to returning reloaded=false on every + // short-budget call and forcing the agent to re-issue. + let reload_deadline = std::cmp::max( + deadline, + tokio::time::Instant::now() + RELOAD_WAIT_MIN_FLOOR, + ); + match chunk.proposed_rule.as_ref() { + Some(rule) => { + wait_for_local_policy_to_cover(ctx, rule, reload_deadline).await + } + None => false, + } + } else { + // Rejected: no reload semantics — the agent reads + // rejection_reason and redrafts. + false + }; + return (200, chunk_state_payload(&chunk, false, policy_reloaded)); } Ok(Some(chunk)) => { let remaining = deadline.saturating_duration_since(tokio::time::Instant::now()); if remaining.is_zero() { - return (200, chunk_state_payload(&chunk, true)); + return (200, chunk_state_payload(&chunk, true, false)); } let sleep_for = std::cmp::min(remaining, PROPOSAL_WAIT_POLL_INTERVAL); tokio::time::sleep(sleep_for).await; @@ -739,7 +755,7 @@ async fn fetch_chunk_or_404( timed_out: bool, ) -> (u16, serde_json::Value) { match fetch_chunk(session, chunk_id).await { - Ok(Some(chunk)) => (200, chunk_state_payload(&chunk, timed_out)), + Ok(Some(chunk)) => (200, chunk_state_payload(&chunk, timed_out, false)), Ok(None) => chunk_not_found_payload(chunk_id), Err(err) => err, } @@ -749,11 +765,18 @@ async fn fetch_chunk_or_404( /// /// Selection rule: include the fields the agent needs to decide what to do /// next on the redraft loop — identity (`chunk_id`, `status`), the proposal -/// it submitted (`rule_name`, `binary`), and the two feedback signals +/// it submitted (`rule_name`, `binary`), the two feedback signals /// (`rejection_reason` from the reviewer, `validation_result` from the -/// gateway prover). Display-only proto fields (`hit_count`, `confidence`, -/// `stage`, timing) are left off until a concrete agent need surfaces them. -fn chunk_state_payload(chunk: &PolicyChunk, timed_out: bool) -> serde_json::Value { +/// gateway prover), and (on /wait) `policy_reloaded` so the agent can tell +/// "approved AND the new rule is loaded — safe to retry" from "approved +/// but the supervisor hasn't reloaded yet — re-issue /wait or surface to +/// user". Display-only proto fields (`hit_count`, `confidence`, `stage`, +/// timing) are left off until a concrete agent need surfaces them. 
+fn chunk_state_payload(
+    chunk: &PolicyChunk,
+    timed_out: bool,
+    policy_reloaded: bool,
+) -> serde_json::Value {
     let mut payload = serde_json::json!({
         "chunk_id": chunk.id,
         "status": chunk.status,
@@ -765,6 +788,9 @@ fn chunk_state_payload(chunk: &PolicyChunk, timed_out: bool) -> serde_json::Value {
     if timed_out {
         payload["timed_out"] = serde_json::json!(true);
     }
+    if chunk.status == "approved" {
+        payload["policy_reloaded"] = serde_json::json!(policy_reloaded);
+    }
     payload
 }
 
@@ -772,47 +798,52 @@ fn is_terminal_status(status: &str) -> bool {
     matches!(status, "approved" | "rejected")
 }
 
-/// Snapshot of the local policy at a moment in time, used as a revision
-/// fingerprint by `wait_for_local_policy_bump`. Compares with `==` against
-/// a later snapshot to detect a reload — prost-generated types derive
-/// `PartialEq`, so any field change (including the `network_policies` map)
-/// flips equality.
-async fn local_policy_fingerprint(ctx: &PolicyLocalContext) -> Option<ProtoSandboxPolicy> {
-    ctx.current_policy.read().await.clone()
-}
-
 /// After a chunk is approved upstream, wait until the local supervisor has
-/// loaded a policy that differs from the baseline captured at the start of
-/// `/wait`. Bounded by the caller-supplied deadline; returns early
-/// (best-effort) if the deadline passes without a change.
+/// loaded a policy that semantically contains the chunk's proposed rule.
+/// Returns `true` if coverage was observed before the deadline, `false`
+/// otherwise — the caller reports that bool back to the agent as
+/// `policy_reloaded` so it can decide whether to retry immediately or
+/// re-issue `/wait`.
 ///
-/// We compare the whole policy proto rather than any single field because
-/// `SandboxPolicy.version` is a schema-version constant (always 1) and does
-/// not track revisions — only the policy *content* changes when a reload
-/// lands. Equality is via prost's derived `PartialEq`; a clone-per-tick on
-/// a few-KB struct is cheap for the bounded wait window this lives in.
+/// Why rule-coverage instead of whole-policy diff (as we used to do):
+///
+/// 1. **False sleep.** If the agent re-issues `/wait` after a `timed_out`
+///    response, the chunk may have been approved AND the supervisor may
+///    have reloaded between the two `/wait` calls. A diff-based check
+///    snapshots the already-updated policy as baseline and then waits
+///    forever for another change. The skill tells the agent to re-issue on
+///    `timed_out`, so the diff approach is broken on the happy path.
+/// 2. **False wakeup.** Any unrelated reload (another agent's approval,
+///    settings change) flips a whole-policy diff, but the chunk's actual
+///    rule may not be loaded yet. The agent retries, hits another
+///    `policy_denied`, and the revise-loop fires with no real signal to
+///    revise on.
 ///
 /// The polling cadence here is faster than `PROPOSAL_WAIT_POLL_INTERVAL`
 /// (which paces upstream gateway calls). This loop only reads in-memory
 /// state, so 200ms gives a responsive handoff to the agent's retry once
 /// the supervisor's own policy poll catches up.
-async fn wait_for_local_policy_bump(
+async fn wait_for_local_policy_to_cover(
     ctx: &PolicyLocalContext,
-    baseline: &Option<ProtoSandboxPolicy>,
+    proposed_rule: &NetworkPolicyRule,
     deadline: tokio::time::Instant,
-) {
+) -> bool {
     const TICK: std::time::Duration = std::time::Duration::from_millis(200);
     loop {
-        let current = local_policy_fingerprint(ctx).await;
-        if current.as_ref() != baseline.as_ref() {
-            return;
+        // Clone the snapshot out of the RwLock before running coverage —
+        // otherwise the read guard is held across `policy_covers_rule`'s
+        // iteration of `network_policies`, serializing a writer (supervisor
+        // reload) on the very thing we're waiting for. Clone-per-tick on
+        // a few-KB struct is cheap for the bounded wait window here.
+        let snapshot = ctx.current_policy.read().await.clone();
+        if let Some(policy) = snapshot.as_ref()
+            && openshell_policy::policy_covers_rule(policy, proposed_rule)
+        {
+            return true;
        }
         let remaining = deadline.saturating_duration_since(tokio::time::Instant::now());
         if remaining.is_zero() {
-            // Best effort: return approved anyway. The agent's retry may
-            // race the reload, but the alternative (extending past the
-            // caller's deadline) violates the wait contract.
-            return;
+            return false;
         }
         tokio::time::sleep(std::cmp::min(remaining, TICK)).await;
     }
@@ -1630,18 +1661,39 @@ mod tests {
             validation_result: "no exfil paths".to_string(),
             ..Default::default()
         };
-        let pending = chunk_state_payload(&chunk, false);
+        let pending = chunk_state_payload(&chunk, false, false);
         assert_eq!(pending["chunk_id"], "chunk-x");
         assert_eq!(pending["status"], "rejected");
         assert_eq!(pending["rejection_reason"], "scope too broad");
         assert_eq!(pending["validation_result"], "no exfil paths");
-        // timed_out only appears when set.
+        // timed_out and policy_reloaded only appear when relevant.
         assert!(pending.get("timed_out").is_none());
+        assert!(
+            pending.get("policy_reloaded").is_none(),
+            "policy_reloaded is only meaningful for approved chunks"
+        );
 
-        let timed = chunk_state_payload(&chunk, true);
+        let timed = chunk_state_payload(&chunk, true, false);
         assert_eq!(timed["timed_out"], true);
     }
 
+    #[test]
+    fn chunk_state_payload_includes_policy_reloaded_when_approved() {
+        let chunk = PolicyChunk {
+            id: "chunk-y".to_string(),
+            status: "approved".to_string(),
+            rule_name: "allow_github".to_string(),
+            binary: "/usr/bin/curl".to_string(),
+            ..Default::default()
+        };
+        let reloaded = chunk_state_payload(&chunk, false, true);
+        assert_eq!(reloaded["status"], "approved");
+        assert_eq!(reloaded["policy_reloaded"], true);
+
+        let not_reloaded = chunk_state_payload(&chunk, false, false);
+        assert_eq!(not_reloaded["policy_reloaded"], false);
+    }
+
     #[tokio::test]
     async fn proposal_routes_reject_malformed_paths() {
         let _guard = ProposalsFlagGuard::set(true).await;
@@ -1730,61 +1782,146 @@ mod tests {
         assert!(summary.contains("/usr/bin/curl"));
     }
 
+    // Helpers — synthetic proposed rule + policy with that rule already
+    // merged. Both reused across reload-readiness tests.
+ fn proposed_curl_rule_for_github() -> NetworkPolicyRule { + NetworkPolicyRule { + name: "agent_proposed".to_string(), + endpoints: vec![NetworkEndpoint { + host: "api.github.com".to_string(), + port: 443, + ports: vec![443], + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + } + } + + fn policy_with_rule(rule: NetworkPolicyRule) -> ProtoSandboxPolicy { + ProtoSandboxPolicy { + version: 1, + network_policies: HashMap::from([(rule.name.clone(), rule)]), + ..Default::default() + } + } + + #[tokio::test] + async fn wait_returns_reloaded_true_when_rule_already_loaded() { + // John's false-sleep case: the supervisor has already reloaded a + // policy containing the proposed rule before /wait starts. A + // whole-policy diff would never see another change and burn the + // full timeout. Rule-coverage must return immediately. + let proposed = proposed_curl_rule_for_github(); + let ctx = PolicyLocalContext::new(Some(policy_with_rule(proposed.clone())), None, None); + let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(2); + + let start = tokio::time::Instant::now(); + let reloaded = wait_for_local_policy_to_cover(&ctx, &proposed, deadline).await; + let elapsed = start.elapsed(); + + assert!(reloaded, "should report reloaded=true on coverage"); + assert!( + elapsed < std::time::Duration::from_millis(200), + "should return immediately, not poll-and-wait; took {elapsed:?}" + ); + } + #[tokio::test] - async fn wait_for_local_policy_bump_returns_when_policy_content_changes() { - // Baseline: a policy with one network policy keyed "initial". - // SandboxPolicy.version stays 1 across reloads (it's a schema - // marker, not a revision id) — this test deliberately changes - // *content* without bumping the version field, which is the - // failure mode that surfaced in the demo (supervisor reloaded - // the policy but kept version=1, so the prior version-compare - // implementation hung until deadline). + async fn wait_does_not_wake_on_unrelated_policy_change() { + // John's false-wakeup case: a *different* rule gets added to the + // local policy (other agent's approval, settings change, etc.). + // The agent's specific rule is still not loaded. A diff-based + // check would wake here; coverage must not. + let proposed = proposed_curl_rule_for_github(); + // Start with a policy that does NOT contain the proposed rule. let initial = ProtoSandboxPolicy { version: 1, - network_policies: HashMap::from([( - "initial".to_string(), - NetworkPolicyRule { - name: "initial".to_string(), - ..Default::default() - }, - )]), ..Default::default() }; let ctx = PolicyLocalContext::new(Some(initial), None, None); - let baseline = local_policy_fingerprint(&ctx).await; - let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(2); - let changed = { + // Concurrently, an unrelated rule lands. We must not return. 
+ let unrelated_load = { let policy = ctx.current_policy.clone(); + tokio::spawn(async move { + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + *policy.write().await = Some(policy_with_rule(NetworkPolicyRule { + name: "unrelated".to_string(), + endpoints: vec![NetworkEndpoint { + host: "api.example.com".to_string(), + port: 443, + ports: vec![443], + ..Default::default() + }], + binaries: vec![NetworkBinary { + path: "/usr/bin/curl".to_string(), + ..Default::default() + }], + })); + }) + }; + + let deadline = tokio::time::Instant::now() + std::time::Duration::from_millis(400); + let start = tokio::time::Instant::now(); + let reloaded = wait_for_local_policy_to_cover(&ctx, &proposed, deadline).await; + unrelated_load.await.unwrap(); + let elapsed = start.elapsed(); + + assert!( + !reloaded, + "must not wake on an unrelated reload; coverage was never satisfied" + ); + assert!( + elapsed >= std::time::Duration::from_millis(350), + "should have held until the deadline; only waited {elapsed:?}" + ); + } + + #[tokio::test] + async fn wait_wakes_when_matching_rule_arrives_mid_flight() { + // Sandbox starts without the rule, then a reload lands containing + // it. /wait should observe coverage and return reloaded=true. + let proposed = proposed_curl_rule_for_github(); + let ctx = PolicyLocalContext::new( + Some(ProtoSandboxPolicy { + version: 1, + ..Default::default() + }), + None, + None, + ); + + let matching_load = { + let policy = ctx.current_policy.clone(); + let target = proposed.clone(); tokio::spawn(async move { tokio::time::sleep(std::time::Duration::from_millis(100)).await; - // Same schema version (1), different content — exactly - // what the supervisor does on a real reload. - *policy.write().await = Some(ProtoSandboxPolicy { - version: 1, - network_policies: HashMap::from([( - "reloaded".to_string(), - NetworkPolicyRule { - name: "reloaded".to_string(), - ..Default::default() - }, - )]), - ..Default::default() - }); + *policy.write().await = Some(policy_with_rule(target)); }) }; + + let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(2); let start = tokio::time::Instant::now(); - wait_for_local_policy_bump(&ctx, &baseline, deadline).await; - changed.await.unwrap(); + let reloaded = wait_for_local_policy_to_cover(&ctx, &proposed, deadline).await; + matching_load.await.unwrap(); let elapsed = start.elapsed(); + + assert!(reloaded, "should report reloaded=true after coverage lands"); assert!( elapsed < std::time::Duration::from_millis(800), - "should return shortly after content changes; took {elapsed:?}" + "should return shortly after coverage; took {elapsed:?}" ); } #[tokio::test] - async fn wait_for_local_policy_bump_returns_at_deadline_if_no_change() { + async fn wait_returns_reloaded_false_at_deadline_when_no_coverage() { + // Deadline budget exhausted, the proposed rule never showed up. + // Coverage check returns false — the agent gets policy_reloaded= + // false and decides whether to retry blind or re-issue /wait. 
+        let proposed = proposed_curl_rule_for_github();
         let ctx = PolicyLocalContext::new(
             Some(ProtoSandboxPolicy {
                 version: 1,
@@ -1793,13 +1930,12 @@ mod tests {
             None,
             None,
         );
-        let baseline = local_policy_fingerprint(&ctx).await;
         let deadline = tokio::time::Instant::now() + std::time::Duration::from_millis(300);
         let start = tokio::time::Instant::now();
-        wait_for_local_policy_bump(&ctx, &baseline, deadline).await;
+        let reloaded = wait_for_local_policy_to_cover(&ctx, &proposed, deadline).await;
         let elapsed = start.elapsed();
-        // Best effort: returns at or shortly after the deadline rather
-        // than extending past it.
+
+        assert!(!reloaded);
         assert!(
             elapsed >= std::time::Duration::from_millis(250),
             "should wait until ~deadline; only waited {elapsed:?}"
diff --git a/crates/openshell-sandbox/src/skills.rs b/crates/openshell-sandbox/src/skills.rs
index d2b898226..d29d56247 100644
--- a/crates/openshell-sandbox/src/skills.rs
+++ b/crates/openshell-sandbox/src/skills.rs
@@ -61,9 +61,15 @@ mod tests {
         assert!(content.contains("addRule"));
         // The wait-loop teaching is load-bearing for the agent feedback
         // UX; lock the workflow language in so future skill edits cannot
-        // drop it silently. Both substrings target the directive, not the
+        // drop it silently. Each substring targets a directive, not the
         // field name (which could appear in the API doc block alone).
         assert!(content.contains("/v1/proposals/{chunk_id}/wait"));
         assert!(content.contains("read `rejection_reason`"));
+        // policy_reloaded distinguishes "safe to retry" from "approval
+        // landed but supervisor hasn't reloaded yet"; without both
+        // branches taught, the agent retries blind on approve+not-yet
+        // and hits policy_denied again.
+        assert!(content.contains("`policy_reloaded: true`"));
+        assert!(content.contains("`policy_reloaded: false`"));
     }
 }
diff --git a/crates/openshell-sandbox/src/skills/policy_advisor.md b/crates/openshell-sandbox/src/skills/policy_advisor.md
index 3503674b9..8ca64f977 100644
--- a/crates/openshell-sandbox/src/skills/policy_advisor.md
+++ b/crates/openshell-sandbox/src/skills/policy_advisor.md
@@ -32,7 +32,10 @@ The sandbox-local policy API is reachable at `http://policy.local`:
 - `GET /v1/proposals/{chunk_id}/wait?timeout=<secs>` — block on this proposal
   until the developer decides or the timeout expires. Default 60s, clamped
   [1, 300]. On timeout you get `status: "pending"` plus
-  `timed_out: true`. Use this instead of polling `/v1/proposals/{chunk_id}`.
+  `timed_out: true`. On approval the response also carries
+  `policy_reloaded: true|false` indicating whether the local sandbox has
+  already loaded a policy containing the approved rule. Use this endpoint
+  instead of polling `/v1/proposals/{chunk_id}`.
 
 The proposal body takes an `intent_summary` and one or more `addRule`
 operations. Each `addRule` carries a complete narrow `NetworkPolicyRule`.
@@ -54,11 +57,17 @@ operations. Each `addRule` carries a complete narrow `NetworkPolicyRule`.
    waiting on the rest.
 7. For each accepted chunk_id, call
    `GET /v1/proposals/{chunk_id}/wait?timeout=300` and act on the result:
-   - `status: "approved"` — retry the original denied action. Policy
-     hot-reloads after approval, so the request should succeed. If it
-     still fails with `policy_denied`, re-read the denial — your rule
-     may not match. If it fails for any other reason, surface to the
-     user.
+   - `status: "approved"` with `policy_reloaded: true` — retry the
+     original denied action. The merged policy is already loaded; the
+     request should succeed. 
If it still fails with `policy_denied`, + re-read the denial — your rule may not match. If it fails for any + other reason, surface to the user. + - `status: "approved"` with `policy_reloaded: false` — approval + landed but the local sandbox hasn't observed the reload within the + `/wait` window. Re-issue the same `/wait` call once with + `timeout=30`. If the second response is still + `policy_reloaded: false`, surface to the user rather than retrying + blind; do not loop tightly. - `status: "rejected"` — read `rejection_reason` and `validation_result`. `rejection_reason` is what the reviewer typed; `validation_result` is the prover's verdict, which often explains diff --git a/e2e/policy-advisor/wait-smoke.sh b/e2e/policy-advisor/wait-smoke.sh index ae9e934ff..87f135d8f 100755 --- a/e2e/policy-advisor/wait-smoke.sh +++ b/e2e/policy-advisor/wait-smoke.sh @@ -192,8 +192,15 @@ run_flow_a_approve() { || { printf '%s\n' "$body" >&2; fail "Flow A: /wait did not return approved"; } [[ "$(printf '%s\n' "$body" | jq -r '.rejection_reason')" == "" ]] \ || fail "Flow A: rejection_reason should be empty on approve" - - ok "/wait returned status=approved in ${elapsed}s" + # policy_reloaded must be present on every approved response and must + # be true once the supervisor has loaded the merged policy. A false + # here would mean the agent is being told "go ahead and retry" when + # the rule isn't actually in effect — exactly the regression John's + # review caught. + [[ "$(printf '%s\n' "$body" | jq -r '.policy_reloaded')" == "true" ]] \ + || { printf '%s\n' "$body" >&2; fail "Flow A: policy_reloaded should be true on approve"; } + + ok "/wait returned status=approved, policy_reloaded=true in ${elapsed}s" } run_flow_b_reject() { diff --git a/examples/agent-driven-policy-management/README.md b/examples/agent-driven-policy-management/README.md index c84687e60..190123cfe 100644 --- a/examples/agent-driven-policy-management/README.md +++ b/examples/agent-driven-policy-management/README.md @@ -50,6 +50,22 @@ That's the whole thing. The demo resolves your GitHub handle from `gh`, picks `openshell-policy-demo` as the repo, and writes one timestamped markdown file under `openshell-policy-advisor-demo/` per run. +## Driving it manually (real-session UX) + +```shell +DEMO_MANUAL_APPROVE=1 bash examples/agent-driven-policy-management/demo.sh +``` + +Same flow, but the script no longer auto-approves. When the agent submits a +proposal, the demo prints the exact `approve` and `reject --reason` commands +and pauses until you run one from another terminal. This is how you'd review +a coding agent's privilege ask in a real session — read the structured grant, +decide, type one command, watch the agent's `/wait` unblock within ~1s. + +Try a rejection-with-guidance to see the full revise-and-resubmit loop: +reject with `--reason "scope to docs/ paths only"` and the agent reads +`rejection_reason`, drafts a tighter proposal, and pauses again. + ## Overrides (all optional) | Env var | Default | @@ -60,7 +76,8 @@ under `openshell-policy-advisor-demo/` per run. 
| `DEMO_RUN_ID` | timestamp | | `DEMO_GITHUB_TOKEN` | falls back to `GITHUB_TOKEN`, `GH_TOKEN`, or `gh auth token` | | `DEMO_KEEP_SANDBOX` | `0` (set `1` to inspect the sandbox after the demo) | -| `DEMO_APPROVAL_TIMEOUT_SECS` | `240` | +| `DEMO_MANUAL_APPROVE` | `0` (set `1` to pause for host-side `rule approve` / `rule reject --reason`) | +| `DEMO_APPROVAL_TIMEOUT_SECS` | `240` (auto), `1800` (manual mode) | | `OPENSHELL_BIN` | `target/debug/openshell` if present, else `openshell` on `PATH` | ## What the agent sees diff --git a/examples/agent-driven-policy-management/demo.sh b/examples/agent-driven-policy-management/demo.sh index f52ad4bd6..a3e1d1836 100755 --- a/examples/agent-driven-policy-management/demo.sh +++ b/examples/agent-driven-policy-management/demo.sh @@ -51,7 +51,15 @@ DEMO_FILE_PATH="${DEMO_FILE_DIR}/${DEMO_RUN_ID}.md" DEMO_SANDBOX_NAME="${DEMO_SANDBOX_NAME:-policy-demo-${DEMO_RUN_ID}}" DEMO_CODEX_PROVIDER_NAME="${DEMO_CODEX_PROVIDER_NAME:-codex-policy-demo-${DEMO_RUN_ID}}" DEMO_GITHUB_PROVIDER_NAME="${DEMO_GITHUB_PROVIDER_NAME:-github-policy-demo-${DEMO_RUN_ID}}" -DEMO_APPROVAL_TIMEOUT_SECS="${DEMO_APPROVAL_TIMEOUT_SECS:-240}" +DEMO_MANUAL_APPROVE="${DEMO_MANUAL_APPROVE:-0}" +# Manual approvals need more headroom than the auto-approve loop — a human +# reads the proposal, thinks, and decides. Bump the default to 30 min when +# the developer is in the loop. Explicit overrides still win. +if [[ "$DEMO_MANUAL_APPROVE" == "1" ]]; then + DEMO_APPROVAL_TIMEOUT_SECS="${DEMO_APPROVAL_TIMEOUT_SECS:-1800}" +else + DEMO_APPROVAL_TIMEOUT_SECS="${DEMO_APPROVAL_TIMEOUT_SECS:-240}" +fi DEMO_KEEP_SANDBOX="${DEMO_KEEP_SANDBOX:-0}" TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/openshell-policy-demo.XXXXXX")" @@ -420,6 +428,43 @@ EOF info "${DIM}Watching for the pending draft on the gateway...${RESET}" } +# In DEMO_MANUAL_APPROVE mode, swap auto-approve for a human-in-the-loop pause. +# The agent's /wait is already parked on a socket — we just stop driving the +# decision from this script and tell the user the exact commands to run from +# their other terminal. We poll `rule get --status pending` because that's the +# durable signal: a chunk leaving the pending bucket means a decision landed, +# whether approve or reject. The outer loop handles whichever path comes next +# (agent exits cleanly after approve; agent redrafts and we see a fresh +# pending chunk after reject --reason). +approve_manually() { + local pending="$1" + local chunk_id + chunk_id="$(sed 's/\x1b\[[0-9;]*m//g' "$pending" \ + | awk '/^[[:space:]]*Chunk:/ { print $2; exit }')" + [[ -n "$chunk_id" ]] || fail "could not extract chunk_id from pending output" + local short_id="${chunk_id:0:8}" + + step "Decide from your other terminal — the agent's /wait is parked" + info "Run ONE on the host:" + info "" + info " ${BOLD}${GREEN}approve:${RESET} ${DIM}${OPENSHELL_BIN} rule approve ${DEMO_SANDBOX_NAME} --chunk-id ${chunk_id}${RESET}" + info " ${BOLD}reject:${RESET} ${DIM}${OPENSHELL_BIN} rule reject ${DEMO_SANDBOX_NAME} --chunk-id ${chunk_id} --reason \"scope this to ...\"${RESET}" + info "" + info " ${DIM}reject --reason sends free-form guidance back to the agent; it will${RESET}" + info " ${DIM}read rejection_reason, draft a revised proposal, and we'll pause again.${RESET}" + info "" + + while true; do + if ! 
"$OPENSHELL_BIN" rule get "$DEMO_SANDBOX_NAME" --status pending 2>/dev/null \ + | grep -q "$chunk_id"; then + spin_clear + info " ${GREEN}✓${RESET} decision recorded for chunk ${short_id}" + return + fi + spin_wait "awaiting your decision on chunk ${short_id}" 2 + done +} + approve_pending_until_agent_exits() { step "Waiting for the agent to draft a policy proposal" narrate_sandbox_workflow @@ -454,9 +499,13 @@ approve_pending_until_agent_exits() { info "${GREEN}proposal received:${RESET}" summarize_pending "$pending" - step "Approving — the agent's /wait will return within ~1s" - "$OPENSHELL_BIN" rule approve-all "$DEMO_SANDBOX_NAME" \ - | awk '/approved/ { print " " $0 }' + if [[ "$DEMO_MANUAL_APPROVE" == "1" ]]; then + approve_manually "$pending" + else + step "Approving — the agent's /wait will return within ~1s" + "$OPENSHELL_BIN" rule approve-all "$DEMO_SANDBOX_NAME" \ + | awk '/approved/ { print " " $0 }' + fi approval_count=$((approval_count + 1)) fi From c2632dd957275f5cffbbf49b52a796a7c54226e6 Mon Sep 17 00:00:00 2001 From: Alexander Watson Date: Tue, 12 May 2026 20:43:28 -0700 Subject: [PATCH 15/15] fix(server): make GetDraftPolicy dual-auth so /wait works under OIDC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit policy.local calls GetDraftPolicy from inside the sandbox supervisor via the sandbox gRPC client, which authenticates with the shared x-sandbox-secret. GetDraftPolicy was listed only in the Bearer-auth scope table (config:read) and was not in SANDBOX_SECRET_METHODS or DUAL_AUTH_METHODS, so OIDC-enabled gateways rejected those calls and the /wait long-poll surfaced gateway_lookup_failed. Local/no-OIDC setups happened to work because the auth check is short-circuited. Add GetDraftPolicy to DUAL_AUTH_METHODS, matching the existing GetSandboxConfig pattern (called by both CLI reviewer surfaces with Bearer and the sandbox supervisor with x-sandbox-secret). Dual-auth short-circuits the scope check for sandbox-secret callers, so the config:read entry in authz.rs continues to gate Bearer-only flows. Mirror the openshell_get_sandbox_config_is_dual_auth assertion for GetDraftPolicy. Note: ssh_handshake_secret is server-wide, not per-sandbox, so a sandbox-secret caller can today name any sandbox in a SubmitPolicyAnalysis request — and now in a GetDraftPolicy request. The exposure is symmetric with the existing SANDBOX_SECRET_METHODS pattern. Filed as a follow-up: per-sandbox secret binding, tracked separately. Signed-off-by: Alexander Watson --- crates/openshell-server/src/auth/oidc.rs | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/crates/openshell-server/src/auth/oidc.rs b/crates/openshell-server/src/auth/oidc.rs index d3b74aa81..554fa0593 100644 --- a/crates/openshell-server/src/auth/oidc.rs +++ b/crates/openshell-server/src/auth/oidc.rs @@ -56,9 +56,13 @@ const SANDBOX_SECRET_METHODS: &[&str] = &[ /// (policy/settings mutations) and the sandbox supervisor (policy sync on /// startup). `OpenShell/GetSandboxConfig` serves CLI settings reads while /// remaining compatible with sandbox-secret-authenticated callers. +/// `GetDraftPolicy` serves CLI reviewer surfaces (`openshell rule get`, +/// TUI inbox) AND the sandbox-side `policy.local /wait` long-poll that +/// blocks on the agent's proposal until the developer decides. 
const DUAL_AUTH_METHODS: &[&str] = &[ "/openshell.v1.OpenShell/UpdateConfig", "/openshell.v1.OpenShell/GetSandboxConfig", + "/openshell.v1.OpenShell/GetDraftPolicy", ]; /// Returns `true` if the method accepts either Bearer or sandbox-secret auth. @@ -495,6 +499,21 @@ mod tests { )); } + #[test] + fn openshell_get_draft_policy_is_dual_auth() { + // policy.local calls GetDraftPolicy with x-sandbox-secret (from + // inside the sandbox supervisor), and the CLI/TUI reviewer + // surfaces call it with an OIDC Bearer. Sandbox-secret-only + // would lock CLI out; Bearer-only would 401 the /wait long-poll + // in OIDC-enabled deployments. + assert!(!is_sandbox_secret_method( + "/openshell.v1.OpenShell/GetDraftPolicy" + )); + assert!(is_dual_auth_method( + "/openshell.v1.OpenShell/GetDraftPolicy" + )); + } + #[test] fn sandbox_secret_validation() { let mut headers = http::HeaderMap::new();