diff --git a/src/domain_fronter.rs b/src/domain_fronter.rs index ed8e355..7e6b0ee 100644 --- a/src/domain_fronter.rs +++ b/src/domain_fronter.rs @@ -435,8 +435,32 @@ pub struct TunnelResponse { /// `e` only when this is `None` and compatibility is needed. #[serde(default)] pub code: Option, + /// Per-session sequence number echoed from the request's `seq` (if any). + /// `None` from servers that don't speak the pipelining protocol — the + /// client treats absence as "not pipelining-capable" and uses the + /// single-in-flight loop. Present and matching the request's seq + /// otherwise; the client routes the response into the per-session + /// reorder buffer keyed by this value. + /// + /// `u64` (not `u32`): a long-lived TCP session generating ~100 ops/s + /// would hit `u32::MAX` in ~1.4 years, at which point the server's + /// `expected` saturates and refuses every subsequent op. `u64` + /// pushes that horizon past any realistic hardware lifetime. + #[serde(default)] + pub seq: Option, + /// Capability bitfield, advertised on `connect` / `connect_data` + /// success responses by tunnel-nodes that speak ≥ this protocol + /// version. Bit 0 = `CAPS_PIPELINE_SEQ`. Old tunnel-nodes return + /// `None`; the client opts into pipelining only when the bit is set. + #[serde(default)] + pub caps: Option, } +/// Capability bit advertised by a tunnel-node that supports per-session +/// `data`-op sequence numbers (per-session pipelining). Only checked on +/// `connect` / `connect_data` success replies. +pub const CAPS_PIPELINE_SEQ: u32 = 1 << 0; + /// A single op in a batch tunnel request. #[derive(Serialize, Clone, Debug)] pub struct BatchOp { @@ -449,6 +473,16 @@ pub struct BatchOp { pub port: Option, #[serde(skip_serializing_if = "Option::is_none")] pub d: Option, + /// Per-session monotonic sequence number for `data` ops on a + /// pipelining-enabled session. Lets the tunnel-node enforce + /// in-order processing of ops that arrive in different batches + /// (potentially via different deployments and network paths). + /// Skipped from the wire when `None`; old tunnel-nodes that don't + /// recognize the field ignore it (serde default + structural-only + /// JSON dispatch on the server). `u64` to keep long-lived sessions + /// from saturating — see `TunnelResponse::seq` for the math. + #[serde(skip_serializing_if = "Option::is_none")] + pub seq: Option, } /// Batch tunnel response from Apps Script / tunnel node. @@ -2793,10 +2827,35 @@ impl DomainFronter { /// Like `tunnel_batch_request` but targets a specific deployment ID. /// Used by the pipeline mux to pin a batch to a deployment whose /// per-account concurrency slot has already been acquired. + /// + /// Backward-compatible 3-arg signature: forwards to + /// `tunnel_batch_request_with_timeout` using the configured + /// `self.batch_timeout`. New callers that need pipelined seq's + /// longer effective budget should use the explicit + /// `tunnel_batch_request_with_timeout` directly. pub async fn tunnel_batch_request_to( &self, script_id: &str, ops: &[BatchOp], + ) -> Result { + self.tunnel_batch_request_with_timeout(script_id, ops, self.batch_timeout) + .await + } + + /// Targets a specific deployment with an explicit per-batch + /// timeout, applied to both the h2 fast path (via + /// `h2_relay_request`'s `response_deadline`) and the h1 + /// fallback's header read. 
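    /// Caller sketch (mirrors the selection `fire_batch` in
    /// tunnel_client.rs performs; shown for orientation only, not a new
    /// API; `f` / `data_ops` / `script_id` are that function's locals):
    ///
    ///     let has_seq_op = data_ops.iter().any(|op| op.seq.is_some());
    ///     let batch_timeout = if has_seq_op {
    ///         f.batch_timeout().max(PIPELINED_BATCH_TIMEOUT_FLOOR)
    ///     } else {
    ///         f.batch_timeout()
    ///     };
    ///     let resp = f
    ///         .tunnel_batch_request_with_timeout(&script_id, &data_ops, batch_timeout)
    ///         .await;
    ///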
The caller computes this: legacy + /// batches use `self.batch_timeout`, pipelined batches use + /// `max(self.batch_timeout, PIPELINED_BATCH_TIMEOUT_FLOOR)` so a + /// valid server-side wait of `SEQ_WAIT_TIMEOUT + + /// LONGPOLL_DEADLINE` doesn't fire a spurious transport timeout + /// AND a deployment timeout strike. + pub async fn tunnel_batch_request_with_timeout( + &self, + script_id: &str, + ops: &[BatchOp], + batch_timeout: Duration, ) -> Result { let mut map = serde_json::Map::new(); map.insert("k".into(), Value::String(self.auth_key.clone())); @@ -2813,11 +2872,14 @@ impl DomainFronter { // `data`/`udp_data`/`connect` may have already executed // upstream when the response framing failed. Replaying the // whole batch on h1 risks duplicating every op in it. Only - // fall back when h2 definitely never sent. Honors - // user-configured batch_timeout so a slow but legitimate - // batch isn't cut off at an arbitrary fixed cap. + // fall back when h2 definitely never sent. Honors the + // caller-supplied `batch_timeout` (legacy = configured + // `request_timeout_secs`; pipelined = max(configured, + // PIPELINED_BATCH_TIMEOUT_FLOOR)) so a slow but legitimate + // batch isn't cut off at an arbitrary fixed cap, and + // pipelined-batch server-side waits stay inside the budget. match self - .h2_relay_request(&path, payload.clone(), self.batch_timeout) + .h2_relay_request(&path, payload.clone(), batch_timeout) .await { Ok((status, _hdrs, _resp_body)) if is_h2_fronting_refusal_status(status) => { @@ -2865,8 +2927,14 @@ impl DomainFronter { entry.stream.write_all(&payload).await?; entry.stream.flush().await?; + // Use `batch_timeout` for the per-read deadline on this h1 + // fallback. The legacy 10 s default in `read_http_response` + // would fire before a pipelined batch's server-side + // `SEQ_WAIT_TIMEOUT + LONGPOLL_DEADLINE` (~45 s) wait could + // legitimately complete — that fired as "batch timed out" + // client-side AND recorded a deployment timeout strike. let (mut status, mut resp_headers, mut resp_body) = - read_http_response(&mut entry.stream).await?; + read_http_response_with_timeout(&mut entry.stream, batch_timeout).await?; // Follow redirect chain for _ in 0..5 { @@ -2879,7 +2947,7 @@ impl DomainFronter { ); entry.stream.write_all(req.as_bytes()).await?; entry.stream.flush().await?; - let (s, h, b) = read_http_response(&mut entry.stream).await?; + let (s, h, b) = read_http_response_with_timeout(&mut entry.stream, batch_timeout).await?; status = s; resp_headers = h; resp_body = b; } @@ -3665,14 +3733,41 @@ fn parse_redirect(location: &str) -> (String, Option) { /// Read a single HTTP/1.1 response from the stream. Keep-alive safe: respects /// Content-Length or chunked transfer-encoding. +/// Default first-byte / per-read deadline for `read_http_response`. +/// Single-tunnel-op and relay paths use this — Apps Script normally +/// streams headers within ~1 s, so 10 s catches stalls without +/// false-firing on jitter. The batched-tunnel path overrides via +/// `read_http_response_with_timeout` because pipelined batches can +/// legitimately wait `SEQ_WAIT_TIMEOUT + LONGPOLL_DEADLINE` before +/// Apps Script sends a single response byte. 
+const DEFAULT_HTTP_READ_TIMEOUT: Duration = Duration::from_secs(10); + async fn read_http_response(stream: &mut S) -> Result<(u16, Vec<(String, String)>, Vec), FronterError> +where + S: tokio::io::AsyncRead + Unpin, +{ + read_http_response_with_timeout(stream, DEFAULT_HTTP_READ_TIMEOUT).await +} + +/// Like [`read_http_response`] but with a caller-controlled per-read +/// deadline. Used by the batched-tunnel h1 path so a pipelined batch +/// whose server-side worst-case wait is `SEQ_WAIT_TIMEOUT (30 s) + +/// LONGPOLL_DEADLINE (15 s)` doesn't get cut off at the legacy 10 s +/// header read — that fired before the response could legitimately +/// arrive, surfacing as a "batch timed out" client-side AND +/// recording a timeout strike against an otherwise-healthy +/// deployment. +async fn read_http_response_with_timeout( + stream: &mut S, + read_timeout: Duration, +) -> Result<(u16, Vec<(String, String)>, Vec), FronterError> where S: tokio::io::AsyncRead + Unpin, { let mut buf = Vec::with_capacity(8192); let mut tmp = [0u8; 8192]; let header_end = loop { - let n = timeout(Duration::from_secs(10), stream.read(&mut tmp)).await + let n = timeout(read_timeout, stream.read(&mut tmp)).await .map_err(|_| FronterError::Timeout)??; if n == 0 { return Err(FronterError::BadResponse("connection closed before headers".into())); diff --git a/src/tunnel_client.rs b/src/tunnel_client.rs index cb6ce12..7e928ce 100644 --- a/src/tunnel_client.rs +++ b/src/tunnel_client.rs @@ -24,7 +24,9 @@ use tokio::io::{AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot, Semaphore}; -use crate::domain_fronter::{BatchOp, DomainFronter, FronterError, TunnelResponse}; +use crate::domain_fronter::{ + BatchOp, DomainFronter, FronterError, TunnelResponse, CAPS_PIPELINE_SEQ, +}; /// Apps Script allows 30 concurrent executions per account / deployment. const CONCURRENCY_PER_DEPLOYMENT: usize = 30; @@ -51,6 +53,38 @@ const MAX_BATCH_OPS: usize = 50; /// blocking indefinitely. const REPLY_TIMEOUT: Duration = Duration::from_secs(35); +/// Slack added to the effective batch timeout to derive the per-session +/// pipelined reply watchdog (`TunnelMux::pipelined_reply_timeout`). +/// Covers queueing stages the batch HTTP timeout doesn't include: +/// +/// * coalesce wait (≤ `coalesce_max_ms`, default 1 s) +/// * per-deployment semaphore acquire (typical sub-second under +/// HTTP/2 multiplex; worst case bounded by another batch_timeout) +/// * mpsc channel hop into / out of `mux_loop` +/// +/// Setting this any tighter risks the watchdog firing on legitimate +/// slow batches that are still inside their HTTP-layer budget; any +/// looser just delays detection of a genuinely broken mux. 15 s +/// covers the realistic queueing while keeping the upper bound +/// finite. +const PIPELINED_REPLY_SLACK: Duration = Duration::from_secs(15); + +/// Floor on the per-batch HTTP timeout for batches that contain at +/// least one seq'd `data` op. The server-side worst-case wait for +/// a seq op is `SEQ_WAIT_TIMEOUT (~30 s) + LONGPOLL_DEADLINE (~15 s) +/// = ~45 s`. The default `Config::request_timeout_secs` is 30 s, +/// which would fire BEFORE a valid pipelined batch could complete — +/// triggering "batch timed out", closing the pipelined session, AND +/// recording a timeout strike against an otherwise-healthy +/// deployment (eventually blacklisting it). Take `max(configured, +/// 60 s)` for seq-bearing batches so the legitimate server wait +/// always fits inside the client budget. 
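/// Worked defaults (values taken from the constants referenced above):
///
///     server worst case         = 30 s (SEQ_WAIT_TIMEOUT) + ~15 s (LONGPOLL_DEADLINE) ≈ 45 s
///     configured 30 s (default) → effective = max(30 s, 60 s) = 60 s   (floor wins)
///     configured 120 s          → effective = max(120 s, 60 s) = 120 s (config wins)
///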
The per-session reply +/// watchdog (`TunnelMux::pipelined_reply_timeout`) is derived from +/// this same effective batch timeout plus `PIPELINED_REPLY_SLACK` +/// — the two agree on what counts as "definitely broken" without +/// the per-session watchdog ever firing on a still-valid batch. +const PIPELINED_BATCH_TIMEOUT_FLOOR: Duration = Duration::from_secs(60); + /// How long we'll briefly hold the client socket after the local /// CONNECT/SOCKS5 handshake, waiting for the client's first bytes (the /// TLS ClientHello for HTTPS). Bundling those bytes with the tunnel-node @@ -80,10 +114,11 @@ const CODE_UNSUPPORTED_OP: &str = "UNSUPPORTED_OP"; /// Empty poll round-trip latency below which we conclude the tunnel-node /// is *not* long-polling (legacy fixed-sleep drain instead). On a /// long-poll-capable server an empty poll with no upstream push either -/// returns near `LONGPOLL_DEADLINE` (~5 s) or comes back early *with* -/// pushed bytes — neither matches a fast empty reply. Threshold sits -/// well above the legacy `~350 ms` drain and well below the long-poll -/// floor, so network jitter on either side won't false-trigger. +/// returns near `LONGPOLL_DEADLINE` (~15 s on current tunnel-nodes) or +/// comes back early *with* pushed bytes — neither matches a fast empty +/// reply. Threshold sits well above the legacy `~350 ms` drain and well +/// below the long-poll floor, so network jitter on either side won't +/// false-trigger. const LEGACY_DETECT_THRESHOLD: Duration = Duration::from_millis(1500); /// How long a deployment stays in "legacy / no long-poll" mode after the @@ -173,6 +208,13 @@ enum MuxMsg { Data { sid: String, data: Bytes, + /// Per-session monotonic seq for pipelined sessions. `None` for + /// non-pipelined sessions (and any session whose tunnel-node + /// didn't advertise `CAPS_PIPELINE_SEQ`). When `Some`, the + /// tunnel-node enforces in-order processing and echoes the + /// value back in the response so the client can route it into + /// its per-session reorder buffer. + seq: Option, reply: BatchedReply, }, UdpOpen { @@ -208,6 +250,9 @@ struct PendingOp { /// only `connect_data`, which uses presence of `d` as the signal /// that the caller is opting into the bundled-first-bytes flow). encode_empty: bool, + /// Per-session seq for pipelined `data` ops; `None` otherwise. + /// Forwarded verbatim to `BatchOp::seq` by `encode_pending`. + seq: Option, } pub struct TunnelMux { @@ -280,10 +325,31 @@ pub struct TunnelMux { /// `(host, port)`, value is the expiry instant. Plain Mutex is /// fine: it's touched once per CONNECT (cheap) and once per failure. unreachable_cache: Mutex>, + /// Snapshot of the configured `Config::request_timeout_secs` taken + /// at mux construction. Used to derive the per-session pipelined + /// reply timeout (see `pipelined_reply_timeout()`) so a user-tuned + /// `request_timeout_secs > 60` doesn't cause sessions to close + /// before the batch layer can legitimately complete. + batch_timeout: Duration, + /// Set when ANY pipelined reply observed the server dropping the + /// `seq` field — i.e. the request reached an old tunnel-node + /// that doesn't speak the pipelining protocol. Once set, new + /// sessions skip the `caps` opt-in even if a connect_data reply + /// advertises it; the safe fallback is the legacy single-in-flight + /// loop. This protects mixed-version multi-deployment configs + /// where round-robin can land a session's seq ops on an + /// un-upgraded backend after a different deployment advertised + /// the bit. 
The flag is process-wide (not per-deployment): once + /// observed, downgrade everyone until restart. + pipelining_globally_disabled: AtomicBool, } impl TunnelMux { - pub fn start(fronter: Arc, coalesce_step_ms: u64, coalesce_max_ms: u64) -> Arc { + pub fn start( + fronter: Arc, + coalesce_step_ms: u64, + coalesce_max_ms: u64, + ) -> Arc { // Dedupe before snapshotting: the aggregate `all_legacy` gate // compares `legacy_deployments.len()` (a HashMap, so unique // keys) against this count, so using the raw `num_scripts()` @@ -308,9 +374,18 @@ impl TunnelMux { unique_n, CONCURRENCY_PER_DEPLOYMENT ); - let step = if coalesce_step_ms > 0 { coalesce_step_ms } else { DEFAULT_COALESCE_STEP_MS }; - let max = if coalesce_max_ms > 0 { coalesce_max_ms } else { DEFAULT_COALESCE_MAX_MS }; + let step = if coalesce_step_ms > 0 { + coalesce_step_ms + } else { + DEFAULT_COALESCE_STEP_MS + }; + let max = if coalesce_max_ms > 0 { + coalesce_max_ms + } else { + DEFAULT_COALESCE_MAX_MS + }; tracing::info!("batch coalesce: step={}ms max={}ms", step, max); + let batch_timeout = fronter.batch_timeout(); let (tx, rx) = mpsc::channel(512); tokio::spawn(mux_loop(rx, fronter, step, max)); Arc::new(Self { @@ -326,9 +401,64 @@ impl TunnelMux { preread_win_total_us: AtomicU64::new(0), preread_total_events: AtomicU64::new(0), unreachable_cache: Mutex::new(HashMap::new()), + batch_timeout, + pipelining_globally_disabled: AtomicBool::new(false), }) } + /// Effective per-session reply timeout for pipelined sessions. + /// Has to exceed the actual batch layer's worst-case time — + /// otherwise the session's reply watchdog fires before + /// `fire_batch` returns, dropping in-flight oneshots and + /// forcing avoidable disconnects on legitimate slow batches. + /// + /// `effective_batch_timeout = max(configured, PIPELINED_BATCH_TIMEOUT_FLOOR)` + /// accounts for the server-side worst-case wait + /// (`SEQ_WAIT_TIMEOUT + LONGPOLL_DEADLINE`) plus the h2/h1 + /// transport. The watchdog covers TWO of those (rather than + /// one + small slack) because under per-deployment semaphore + /// saturation — all 30 in-flight slots holding long-poll seq + /// batches — a fresh op waits up to one full + /// `effective_batch_timeout` for a permit BEFORE its own + /// `effective_batch_timeout`-bounded request even starts. With + /// a one-batch budget the watchdog fires while the op is still + /// queued for a permit, dropping the oneshot and closing a + /// healthy pipelined session under load. + /// + /// Additional `PIPELINED_REPLY_SLACK` covers the smaller + /// queueing stages: mpsc hop into `mux_loop`, coalesce wait + /// (≤ `coalesce_max_ms`), encode/transit overhead. Total + /// default budget: 60 s × 2 + 15 s = 135 s. With a configured + /// `request_timeout_secs = 120 s`: 120 × 2 + 15 = 255 s. + pub(crate) fn pipelined_reply_timeout(&self) -> Duration { + let effective_batch = self.batch_timeout.max(PIPELINED_BATCH_TIMEOUT_FLOOR); + // 2× to cover saturated-semaphore permit wait + own request. + effective_batch * 2 + PIPELINED_REPLY_SLACK + } + + /// True once any pipelined reply observed a `None` seq from the + /// server, indicating the deployment route reached an old + /// tunnel-node that doesn't speak the protocol. Sticky for the + /// process lifetime — downgrade all sessions to the legacy loop + /// until restart. 
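    /// The connect-time check this feeds, quoted from `tunnel_connection`
    /// below (`caps` is the `TunnelResponse::caps` of the connect reply):
    ///
    ///     let pipeline_advertised = caps.map(|c| c & CAPS_PIPELINE_SEQ != 0).unwrap_or(false);
    ///     let pipeline = pipeline_advertised && !mux.pipelining_disabled();
    ///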
+ pub(crate) fn pipelining_disabled(&self) -> bool { + self.pipelining_globally_disabled.load(Ordering::Acquire) + } + + pub(crate) fn mark_pipelining_disabled(&self) { + if !self + .pipelining_globally_disabled + .swap(true, Ordering::AcqRel) + { + tracing::warn!( + "tunnel-node along the round-robin path dropped the seq \ + field on a pipelined reply; disabling pipelining for new \ + sessions until restart. Mixed-version config? Make sure \ + every script_id forwards to the same upgraded tunnel-node." + ); + } + } + async fn send(&self, msg: MuxMsg) { let _ = self.tx.send(msg).await; } @@ -591,7 +721,12 @@ impl TunnelMux { } } -async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, coalesce_step_ms: u64, coalesce_max_ms: u64) { +async fn mux_loop( + mut rx: mpsc::Receiver, + fronter: Arc, + coalesce_step_ms: u64, + coalesce_max_ms: u64, +) { let coalesce_step = Duration::from_millis(coalesce_step_ms); let coalesce_max = Duration::from_millis(coalesce_max_ms); // One semaphore per deployment ID, each allowing 30 concurrent requests. @@ -679,10 +814,18 @@ async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, c port: Some(port), data: Some(data), encode_empty: true, + seq: None, }; - accum.push_or_fire(op, op_bytes, reply, &sems, &fronter).await; + accum + .push_or_fire(op, op_bytes, reply, &sems, &fronter) + .await; } - MuxMsg::Data { sid, data, reply } => { + MuxMsg::Data { + sid, + data, + seq, + reply, + } => { let op_bytes = encoded_len(data.len()); let op = PendingOp { op: "data", @@ -691,8 +834,11 @@ async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, c port: None, data: if data.is_empty() { None } else { Some(data) }, encode_empty: false, + seq, }; - accum.push_or_fire(op, op_bytes, reply, &sems, &fronter).await; + accum + .push_or_fire(op, op_bytes, reply, &sems, &fronter) + .await; } MuxMsg::UdpOpen { host, @@ -708,8 +854,11 @@ async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, c port: Some(port), data: if data.is_empty() { None } else { Some(data) }, encode_empty: false, + seq: None, }; - accum.push_or_fire(op, op_bytes, reply, &sems, &fronter).await; + accum + .push_or_fire(op, op_bytes, reply, &sems, &fronter) + .await; } MuxMsg::UdpData { sid, data, reply } => { let op_bytes = encoded_len(data.len()); @@ -720,8 +869,11 @@ async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, c port: None, data: if data.is_empty() { None } else { Some(data) }, encode_empty: false, + seq: None, }; - accum.push_or_fire(op, op_bytes, reply, &sems, &fronter).await; + accum + .push_or_fire(op, op_bytes, reply, &sems, &fronter) + .await; } MuxMsg::Close { sid } => { close_sids.push(sid); @@ -739,6 +891,7 @@ async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, c port: None, data: None, encode_empty: false, + seq: None, }); } @@ -842,6 +995,7 @@ fn encode_pending(p: PendingOp) -> BatchOp { sid: p.sid, host: p.host, port: p.port, + seq: p.seq, d, } } @@ -880,10 +1034,34 @@ async fn fire_batch( // Bounded-wait: if the batch takes longer than the configured // batch timeout (Config::request_timeout_secs), all sessions in // this batch get an error and can retry. - let batch_timeout = f.batch_timeout(); + // + // Pipelined-batch floor: if ANY op in this batch carries a + // `seq`, the server-side worst-case wait is + // `SEQ_WAIT_TIMEOUT + LONGPOLL_DEADLINE` (≈ 45 s). The default + // configured timeout (30 s) would fire first, surface as + // "batch timed out" client-side, AND record a timeout strike + // against the deployment. 
`max(configured, 60 s)` keeps the + // client budget aligned with what the server can legitimately + // take, without changing legacy-batch behavior. + let configured = f.batch_timeout(); + let has_seq_op = data_ops.iter().any(|op| op.seq.is_some()); + let batch_timeout = if has_seq_op { + configured.max(PIPELINED_BATCH_TIMEOUT_FLOOR) + } else { + configured + }; + // Pass the effective timeout into `tunnel_batch_request_to` + // so the underlying h2 / h1 transports use it too. Without + // this, the inner `h2_relay_request` (and h1 + // `read_http_response`) defaulted to `self.batch_timeout` + // (30 s) and fired before our outer 60 s budget could ever + // be reached on a pipelined batch — surfacing as "batch + // timed out" client-side and recording an undeserved + // timeout strike against the deployment. The outer + // `tokio::time::timeout` stays as a defensive ceiling. let result = tokio::time::timeout( batch_timeout, - f.tunnel_batch_request_to(&script_id, &data_ops), + f.tunnel_batch_request_with_timeout(&script_id, &data_ops, batch_timeout), ) .await; tracing::info!( @@ -1047,22 +1225,51 @@ pub async fn tunnel_connection( } }; - let (sid, first_resp, pending_client_data) = match initial_data { + let (sid, caps, first_resp, pending_client_data) = match initial_data { Some(data) => match connect_with_initial_data(host, port, data.clone(), mux).await? { - ConnectDataOutcome::Opened { sid, response } => (sid, Some(response), None), + ConnectDataOutcome::Opened { sid, response } => { + let caps = response.caps; + (sid, caps, Some(response), None) + } ConnectDataOutcome::Unsupported => { mux.mark_connect_data_unsupported(); - let sid = connect_plain(host, port, mux).await?; + let (sid, caps) = connect_plain(host, port, mux).await?; // Replay the buffered ClientHello on the first tunnel_loop // iteration. `Bytes::clone()` is a cheap Arc bump — no // copy of the 64 KB buffer. - (sid, None, Some(data)) + (sid, caps, None, Some(data)) } }, - None => (connect_plain(host, port, mux).await?, None, None), + None => { + let (sid, caps) = connect_plain(host, port, mux).await?; + (sid, caps, None, None) + } }; - tracing::info!("tunnel session {} opened for {}:{}", sid, host, port); + // Opt into pipelining only when: + // 1. THIS connect/connect_data reply advertised the bit, AND + // 2. We haven't observed a stale-version reply previously + // (round-robin can land us on an un-upgraded backend after + // a different deployment served the connect with caps set). + // The global toggle is sticky for the process lifetime — once + // `mark_pipelining_disabled` fires (on a `seq=None` reply for a + // pipelined session), every NEW session falls back to the legacy + // single-in-flight loop instead of risking another disconnect on + // the next mixed-version seq op. + let pipeline_advertised = caps.map(|c| c & CAPS_PIPELINE_SEQ != 0).unwrap_or(false); + let pipeline = pipeline_advertised && !mux.pipelining_disabled(); + tracing::info!( + "tunnel session {} opened for {}:{} (pipeline={}{})", + sid, + host, + port, + pipeline, + if pipeline_advertised && !pipeline { + ", advertised but disabled (mixed-version backend observed)" + } else { + "" + }, + ); // Run the first-response write + tunnel_loop inside an async block so // any io-error propagates via `?` without bypassing the Close below. 
@@ -1087,7 +1294,11 @@ pub async fn tunnel_connection( return Ok(()); } } - tunnel_loop(&mut sock, &sid, mux, pending_client_data).await + if pipeline { + tunnel_loop_pipelined(&mut sock, &sid, mux, pending_client_data).await + } else { + tunnel_loop(&mut sock, &sid, mux, pending_client_data).await + } } .await; @@ -1104,7 +1315,16 @@ enum ConnectDataOutcome { Unsupported, } -async fn connect_plain(host: &str, port: u16, mux: &Arc) -> std::io::Result { +/// Open a tunnel session via plain `connect` (no bundled first bytes). +/// Returns the new sid plus the tunnel-node's `caps` advertisement — +/// `None` against legacy tunnel-nodes, `Some(bits)` against new ones. +/// The caller passes `caps` into `tunnel_loop` to pick between the +/// legacy single-in-flight loop and the pipelined variant. +async fn connect_plain( + host: &str, + port: u16, + mux: &Arc, +) -> std::io::Result<(String, Option)> { let (reply_tx, reply_rx) = oneshot::channel(); mux.send(MuxMsg::Connect { host: host.to_string(), @@ -1127,9 +1347,11 @@ async fn connect_plain(host: &str, port: u16, mux: &Arc) -> std::io:: e.clone(), )); } - resp.sid.ok_or_else(|| { + let caps = resp.caps; + let sid = resp.sid.ok_or_else(|| { std::io::Error::new(std::io::ErrorKind::Other, "tunnel connect: no session id") - }) + })?; + Ok((sid, caps)) } Ok(Err(e)) => { tracing::error!("tunnel connect error for {}:{}: {}", host, port, e); @@ -1282,7 +1504,7 @@ async fn tunnel_loop( loop { // Cadence depends on whether the tunnel-node is doing long-poll // drains. With long-poll, the server holds empty polls open up - // to its `LONGPOLL_DEADLINE` (~5 s currently), so the client + // to its `LONGPOLL_DEADLINE` (~15 s on current tunnel-nodes), so the client // can keep this read timeout short — the wait is on the wire, // not here. Against *legacy* tunnel-nodes (no long-poll, fast // empty replies), the same short cadence + always-poll behavior @@ -1361,6 +1583,7 @@ async fn tunnel_loop( mux.send(MuxMsg::Data { sid: sid.to_string(), data, + seq: None, reply: reply_tx, }) .await; @@ -1427,6 +1650,279 @@ async fn tunnel_loop( Ok(()) } +/// Maximum number of `data` ops a pipelined session keeps in flight +/// at any one time. Two parallel batches per session is enough to +/// overlap one Apps Script round-trip with the next on a single TCP +/// stream — that's the throughput win. Higher depths run into +/// diminishing returns (server-side seq enforcement serializes the +/// drains anyway) and burn more Apps Script quota on speculative +/// polls when a session goes idle. +const PIPELINE_DEPTH: usize = 2; + +/// Pipelined variant of `tunnel_loop`: keeps up to `PIPELINE_DEPTH` +/// `data` ops in flight per session, each tagged with a per-session +/// monotonic `seq`. The tunnel-node enforces in-order processing so +/// replies — which may travel back over different deployments and +/// arrive out of order on the wire — carry bytes that are correctly +/// sequenced within the session's stream. +/// +/// The reorder concern is handled implicitly by FIFO `await` on the +/// in-flight oneshot queue: whichever reply arrived first stays in +/// its oneshot's buffer until our task reaches its turn in the +/// queue. Output to the local socket is therefore always in send +/// order, which (because the server processed the corresponding +/// uplinks in seq order) matches the upstream byte order. 
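/// Minimal illustration of the reorder-by-FIFO-await idea (a standalone
/// sketch, not this function's code; two in-flight replies, with the later
/// seq completing first):
///
///     let (tx0, rx0) = tokio::sync::oneshot::channel::<&'static [u8]>();
///     let (tx1, rx1) = tokio::sync::oneshot::channel::<&'static [u8]>();
///     let mut in_flight = std::collections::VecDeque::from([(0u64, rx0), (1u64, rx1)]);
///     tx1.send(b"B").unwrap(); // seq=1's reply lands first on the wire...
///     tx0.send(b"A").unwrap(); // ...seq=0's arrives after it
///     let mut out: Vec<u8> = Vec::new();
///     while let Some((_seq, rx)) = in_flight.pop_front() {
///         out.extend_from_slice(rx.await.unwrap()); // always drains "A" before "B"
///     }
///     assert_eq!(&out[..], b"AB");
///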
+async fn tunnel_loop_pipelined( + sock: &mut TcpStream, + sid: &str, + mux: &Arc, + mut pending_client_data: Option, +) -> std::io::Result<()> { + let (mut reader, mut writer) = sock.split(); + const READ_CHUNK: usize = 65536; + const ZERO_COPY_THRESHOLD: usize = READ_CHUNK / 2; + let mut buf = BytesMut::with_capacity(READ_CHUNK); + // FIFO of in-flight (seq, oneshot) pairs. Front = oldest seq + // still awaiting a response; we always await the front so output + // to the local socket is in seq order regardless of which reply + // arrived first on the wire. The seq is kept alongside the + // receiver so we can verify the server echoed the right value + // (defends against a server bug or version skew silently + // mis-routing bytes between sessions). + let mut in_flight: std::collections::VecDeque<( + u64, + oneshot::Receiver>, + )> = std::collections::VecDeque::new(); + let mut next_send_seq: u64 = 0; + let mut consecutive_empty: u32 = 0; + // Flag set when the local client half-closes (TCP shutdown send, + // returns Ok(0) from read) or its socket errors. Once set, we + // stop queuing new uplink ops but continue draining replies for + // seqs already in flight — those replies may carry downlink + // bytes the server already produced. Returning early on EOF + // (the previous shape) dropped the queued oneshot receivers + // and silently lost any pending downlink, breaking valid + // half-close patterns like HTTP request/response with the + // client shutting its write half after the request. + let mut client_send_closed = false; + + loop { + // SEND PHASE. While we have room and either the client has + // data or we're keeping the pipeline full, queue another op. + // `consecutive_empty == 0` means the last reply brought + // downlink bytes — we treat that as "active transfer" and + // pre-fetch with PIPELINE_DEPTH polls. Otherwise (idle) we + // keep just one in-flight poll so the server long-polls it + // and we don't burn quota on speculative empties. + // + // When `client_send_closed` is set we skip the send phase + // entirely and fall through to RECEIVE — the loop exits + // cleanly once `in_flight` drains. + let active = consecutive_empty == 0; + let target_depth = if active { PIPELINE_DEPTH } else { 1 }; + while !client_send_closed && in_flight.len() < target_depth { + // Decide what data to send: replay any pending preread + // bytes first; then attempt a client read. If we have + // nothing in flight we *must* send (otherwise the loop + // stalls waiting for replies that aren't coming), so + // use the legacy escalating timeout. With ops already + // in flight we only send when the client has data + // *immediately* available — speculative empty polls in + // that branch would just burn Apps Script quota. + // + // `Option::None` from the inner match means EOF + // / read-error: set `client_send_closed` and break out + // of the send phase. The remaining in_flight ops still + // get drained by the RECEIVE PHASE below. 
+ let data: Option = if let Some(d) = pending_client_data.take() { + Some(d) + } else if in_flight.is_empty() { + let read_timeout = match (mux.all_servers_legacy(), consecutive_empty) { + (_, 0) => Duration::from_millis(20), + (_, 1) => Duration::from_millis(80), + (_, 2) => Duration::from_millis(200), + (false, _) => Duration::from_millis(500), + (true, _) => Duration::from_secs(30), + }; + buf.reserve(READ_CHUNK); + match tokio::time::timeout(read_timeout, reader.read_buf(&mut buf)).await { + Ok(Ok(0)) => None, + Ok(Ok(n)) => { + consecutive_empty = 0; + Some(if n >= ZERO_COPY_THRESHOLD { + buf.split().freeze() + } else { + let owned = Bytes::copy_from_slice(&buf[..n]); + buf.clear(); + owned + }) + } + Ok(Err(_)) => None, + Err(_) => Some(Bytes::new()), + } + } else { + // Already have ops in flight. Non-blocking read: if + // the client has data right now, pipeline another op; + // otherwise drop out and await replies. + buf.reserve(READ_CHUNK); + match tokio::time::timeout(Duration::from_millis(0), reader.read_buf(&mut buf)) + .await + { + Ok(Ok(0)) => None, + Ok(Ok(n)) => { + consecutive_empty = 0; + Some(if n >= ZERO_COPY_THRESHOLD { + buf.split().freeze() + } else { + let owned = Bytes::copy_from_slice(&buf[..n]); + buf.clear(); + owned + }) + } + Ok(Err(_)) => None, + Err(_) => break, // no data ready, stop trying to send more + } + }; + + let Some(data) = data else { + // EOF or read error. Flag the close and let RECEIVE + // drain whatever's already in flight before we exit. + client_send_closed = true; + break; + }; + + let seq = next_send_seq; + next_send_seq = next_send_seq.saturating_add(1); + let (reply_tx, reply_rx) = oneshot::channel(); + mux.send(MuxMsg::Data { + sid: sid.to_string(), + data, + seq: Some(seq), + reply: reply_tx, + }) + .await; + in_flight.push_back((seq, reply_rx)); + } + + // RECEIVE PHASE. Await the front-of-queue reply. Subsequent + // replies (whose seqs come later) wait in their oneshot + // buffers until we get to them — that's the reorder + // mechanism, and it doesn't need its own data structure. + let Some((expected_seq, reply_rx)) = in_flight.pop_front() else { + // Nothing in flight. If the client has half-closed, we + // can exit cleanly now — there are no more replies to + // drain. Otherwise (defensive) loop back; the send + // phase guarantees at least one op in flight when the + // client is still active. + if client_send_closed { + return Ok(()); + } + continue; + }; + // Derive the per-session reply timeout from the effective + // batch timeout (max(configured, PIPELINED_BATCH_TIMEOUT_FLOOR)) + // plus queueing slack — see `TunnelMux::pipelined_reply_timeout`. + // Computing this dynamically (rather than the previous fixed + // 60 s constant) closes the session-watchdog vs batch-layer + // race: with `request_timeout_secs > 60`, the batch is still + // inside its budget while the session watchdog would + // otherwise close it, dropping in-flight oneshots. + let reply_timeout = mux.pipelined_reply_timeout(); + let (resp, _script_id) = match tokio::time::timeout(reply_timeout, reply_rx).await { + Ok(Ok(Ok((r, sid_used)))) => (r, sid_used), + Ok(Ok(Err(e))) => { + tracing::debug!("pipelined data error: {}", e); + break; + } + Ok(Err(_)) => break, // channel dropped + Err(_) => { + // Pipelined timeout has to close the session + // rather than retry. 
Continuing here drops the + // oneshot for `expected_seq` — if the server + // later completes it, any drained downlink bytes + // it would have carried are lost, and subsequent + // seq replies happily fill the local socket past + // a now-invisible gap (silent stream corruption). + // The dynamic `reply_timeout` is sized to exceed + // any valid server-side wait + queueing, so + // hitting it means the mux dropped a message or + // the task panicked — closing is the only + // bytes-correct response. + tracing::warn!( + "pipelined sess {}: reply timeout for seq {} after {:?}, \ + closing session (further ops would risk silent \ + corruption from missing downlink bytes)", + &sid[..sid.len().min(8)], + expected_seq, + reply_timeout, + ); + break; + } + }; + + // Verify the server echoed the seq we expected. A mismatch + // means either (a) the tunnel-node has a bug, or (b) a + // version-skew situation where the response shape changed + // — either way we can't trust the bytes belong to this + // session position, so close the session rather than + // silently writing potentially-misrouted bytes. + match resp.seq { + Some(s) if s == expected_seq => { /* match — proceed */ } + Some(s) => { + tracing::error!( + "pipelined sess {}: server echoed seq {} but we expected {}; \ + closing session to avoid misordered output", + &sid[..sid.len().min(8)], + s, + expected_seq, + ); + break; + } + None => { + // The reply reached an old tunnel-node along the + // round-robin path that doesn't speak the seq + // protocol. Globally disable pipelining for new + // sessions until the process restarts — the next + // round-robin pick could repeat the disconnect, and + // staying in legacy mode is strictly safer than + // hoping the broken backend rotates out. + mux.mark_pipelining_disabled(); + tracing::error!( + "pipelined sess {}: server reply for seq {} omitted seq field; \ + closing session and disabling pipelining for new sessions \ + (mixed-version backend along the round-robin path)", + &sid[..sid.len().min(8)], + expected_seq, + ); + break; + } + } + + if let Some(ref e) = resp.e { + tracing::debug!("pipelined tunnel error: {}", e); + break; + } + + let got_data = match write_tunnel_response(&mut writer, &resp).await? 
{ + WriteOutcome::Wrote => true, + WriteOutcome::NoData => false, + WriteOutcome::BadBase64 => break, + }; + + if resp.eof.unwrap_or(false) { + break; + } + + if got_data { + consecutive_empty = 0; + } else { + consecutive_empty = consecutive_empty.saturating_add(1); + } + } + + Ok(()) +} + enum WriteOutcome { Wrote, NoData, @@ -1489,6 +1985,8 @@ mod tests { eof: None, e: e.map(str::to_string), code: code.map(str::to_string), + seq: None, + caps: None, } } @@ -1588,11 +2086,7 @@ mod tests { #[test] fn negative_cache_ignores_non_unreachable_errors() { let (mux, _rx) = mux_for_test(); - mux.record_unreachable_if_match( - "example.com", - 443, - "connect failed: connection refused", - ); + mux.record_unreachable_if_match("example.com", 443, "connect failed: connection refused"); assert!(!mux.is_unreachable("example.com", 443)); } @@ -1624,9 +2118,10 @@ mod tests { async fn negative_cache_skips_outer_relay_errors() { let (mux, mut rx) = mux_for_test(); let mux_for_task = mux.clone(); - let task = tokio::spawn(async move { - connect_plain("real.target.example", 443, &mux_for_task).await - }); + let task = + tokio::spawn( + async move { connect_plain("real.target.example", 443, &mux_for_task).await }, + ); // Receive the Connect msg and reply with an outer Err whose string // would otherwise match `is_unreachable_error_str`. @@ -1636,7 +2131,7 @@ mod tests { other => panic!("expected Connect, got {:?}", std::mem::discriminant(&other)), }; let _ = reply.send(Err( - "relay failed: Network is unreachable (os error 101)".into(), + "relay failed: Network is unreachable (os error 101)".into() )); let res = task.await.expect("task"); @@ -1662,11 +2157,7 @@ mod tests { "connect failed: Network is unreachable (os error 101)", ); } - let len = mux - .unreachable_cache - .lock() - .map(|g| g.len()) - .unwrap_or(0); + let len = mux.unreachable_cache.lock().map(|g| g.len()).unwrap_or(0); assert!( len <= UNREACHABLE_CACHE_MAX, "cache size {} exceeded cap {}", @@ -1719,6 +2210,12 @@ mod tests { preread_win_total_us: AtomicU64::new(0), preread_total_events: AtomicU64::new(0), unreachable_cache: Mutex::new(HashMap::new()), + // Tests don't go through `start()`, so we pick a + // representative configured batch_timeout (matches the + // 30s `default_request_timeout_secs`) for derivations + // like `pipelined_reply_timeout`. + batch_timeout: Duration::from_secs(30), + pipelining_globally_disabled: AtomicBool::new(false), }); (mux, rx) } @@ -1757,7 +2254,12 @@ mod tests { .expect("mux channel closed unexpectedly"); match msg { - MuxMsg::Data { sid, data, reply } => { + MuxMsg::Data { + sid, + data, + seq: _, + reply, + } => { assert_eq!(sid, "sid-under-test"); assert_eq!(&data[..], b"CLIENTHELLO"); // Reply with eof so tunnel_loop unwinds cleanly. 
@@ -1769,6 +2271,8 @@ mod tests { eof: Some(true), e: None, code: None, + seq: None, + caps: None, }, "test-script".to_string(), ))); @@ -1838,9 +2342,18 @@ mod tests { )) .expect("mux channel closed unexpectedly"); match msg { - MuxMsg::Data { sid, data, reply } => { + MuxMsg::Data { + sid, + data, + seq: _, + reply, + } => { assert_eq!(sid, "sid-mixed"); - assert!(data.is_empty(), "expected empty poll, got {} bytes", data.len()); + assert!( + data.is_empty(), + "expected empty poll, got {} bytes", + data.len() + ); let last = i == 5; let _ = reply.send(Ok(( TunnelResponse { @@ -1850,6 +2363,8 @@ mod tests { eof: if last { Some(true) } else { None }, e: None, code: None, + seq: None, + caps: None, }, "script-A".to_string(), ))); @@ -1866,6 +2381,569 @@ mod tests { .expect("tunnel_loop did not exit after eof"); } + /// Capability detection: a TunnelResponse whose `caps` field + /// includes `CAPS_PIPELINE_SEQ` must enable pipelining for the + /// session, including when the response shape is the batched + /// `connect_data` reply (the dominant HTTPS fast path). Without + /// this end-to-end check, we can regress the bit by serializing + /// the wrong response shape on the server (the case the + /// reviewer caught: batched connect_data went through + /// `tcp_drain_response` which sets `caps: None`). + #[test] + fn caps_field_drives_pipeline_decision() { + // Server response with caps bit set → pipeline opt-in. + let resp_pipelined = TunnelResponse { + sid: Some("s".into()), + d: None, + pkts: None, + eof: None, + e: None, + code: None, + seq: None, + caps: Some(CAPS_PIPELINE_SEQ), + }; + let pipeline = resp_pipelined + .caps + .map(|c| c & CAPS_PIPELINE_SEQ != 0) + .unwrap_or(false); + assert!(pipeline, "caps bit set must enable pipelining"); + + // Old tunnel-node, no caps field → legacy path. + let resp_legacy = TunnelResponse { + sid: Some("s".into()), + d: None, + pkts: None, + eof: None, + e: None, + code: None, + seq: None, + caps: None, + }; + let pipeline = resp_legacy + .caps + .map(|c| c & CAPS_PIPELINE_SEQ != 0) + .unwrap_or(false); + assert!(!pipeline, "absent caps must NOT enable pipelining"); + + // Caps present but bit cleared (forward-compat for future + // capability bits the client doesn't recognize) → legacy. + let resp_other_cap = TunnelResponse { + sid: Some("s".into()), + d: None, + pkts: None, + eof: None, + e: None, + code: None, + seq: None, + caps: Some(0), + }; + let pipeline = resp_other_cap + .caps + .map(|c| c & CAPS_PIPELINE_SEQ != 0) + .unwrap_or(false); + assert!( + !pipeline, + "caps without the pipeline bit must NOT enable pipelining" + ); + } + + /// `tunnel_loop_pipelined`'s first emitted op for a session must + /// carry `seq: Some(0)`. The tunnel-node side relies on seqs being + /// per-session monotonic from 0 — if this regresses, the very + /// first op deadlocks the server's seq lock (it's waiting for + /// expected=0, never sees it). 
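    /// Server-side contract being pinned (summarized from `SeqState` in
    /// tunnel-node/src/main.rs):
    ///
    ///     expected starts at 0 for a new session;
    ///     op.seq == expected → process, bump `expected`, fire `seq_advance`;
    ///     op.seq <  expected → duplicate (client retry), skip;
    ///     op.seq >  expected → park on `seq_advance` up to SEQ_WAIT_TIMEOUT.
    ///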
+ #[tokio::test] + async fn pipelined_loop_first_op_carries_seq_zero() { + use tokio::net::TcpListener; + + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let addr = listener.local_addr().unwrap(); + let accept = tokio::spawn(async move { listener.accept().await.unwrap().0 }); + let _client = TcpStream::connect(addr).await.unwrap(); + let mut server_side = accept.await.unwrap(); + + let (mux, mut rx) = mux_for_test(); + let pending = Some(Bytes::from_static(b"FIRST")); + + let loop_handle = tokio::spawn({ + let mux = mux.clone(); + async move { tunnel_loop_pipelined(&mut server_side, "sid-pipe", &mux, pending).await } + }); + + let msg = tokio::time::timeout(Duration::from_secs(2), rx.recv()) + .await + .expect("pipelined loop did not emit a message within 2s") + .expect("mux channel closed unexpectedly"); + + match msg { + MuxMsg::Data { + sid, + data, + seq, + reply, + } => { + assert_eq!(sid, "sid-pipe"); + assert_eq!(&data[..], b"FIRST"); + assert_eq!( + seq, + Some(0), + "first pipelined op for a session must be seq=0", + ); + let _ = reply.send(Ok(( + TunnelResponse { + sid: Some("sid-pipe".into()), + d: None, + pkts: None, + eof: Some(true), + e: None, + code: None, + seq: Some(0), + caps: None, + }, + "test-script".to_string(), + ))); + } + _ => panic!("expected Data, got something else"), + } + + let _ = tokio::time::timeout(Duration::from_secs(2), loop_handle).await; + } + + /// Reordering correctness: when reply for seq=1 arrives at the + /// client BEFORE reply for seq=0 (different batches racing back + /// through different deployments), `tunnel_loop_pipelined` must + /// still write seq=0's bytes to the local socket first. Failure + /// mode if this regresses: silent stream corruption — the client + /// app sees later bytes before earlier ones. + /// + /// We force two ops in flight by stuffing both `pending_client_data` + /// (becomes seq=0) and a second chunk in the socket kernel buffer + /// (read becomes seq=1) before the loop starts. The loop's send + /// phase consumes both before entering the receive phase. + #[tokio::test] + async fn pipelined_loop_writes_in_seq_order_when_replies_arrive_reversed() { + use tokio::io::{AsyncReadExt, AsyncWriteExt}; + use tokio::net::TcpListener; + + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let addr = listener.local_addr().unwrap(); + let accept = tokio::spawn(async move { listener.accept().await.unwrap().0 }); + let mut client = TcpStream::connect(addr).await.unwrap(); + let mut server_side = accept.await.unwrap(); + + // Pre-stuff "SECOND" so the loop's non-blocking read picks it + // up after consuming pending="FIRST". + client.write_all(b"SECOND").await.unwrap(); + client.flush().await.unwrap(); + // Brief wait so the bytes definitely land in the kernel buffer. + tokio::time::sleep(Duration::from_millis(30)).await; + + let (mux, mut rx) = mux_for_test(); + let pending = Some(Bytes::from_static(b"FIRST")); + + let loop_handle = tokio::spawn({ + let mux = mux.clone(); + async move { tunnel_loop_pipelined(&mut server_side, "sid-pipe", &mux, pending).await } + }); + + // Receive both ops back-to-back. Loop sends them before + // entering the receive phase because PIPELINE_DEPTH=2 and + // both reads succeed in the send phase. 
+ let msg0 = tokio::time::timeout(Duration::from_secs(2), rx.recv()) + .await + .expect("first op not emitted") + .expect("mux channel closed"); + let reply_0 = match msg0 { + MuxMsg::Data { + sid, + data, + seq, + reply, + } => { + assert_eq!(sid, "sid-pipe"); + assert_eq!(seq, Some(0)); + assert_eq!(&data[..], b"FIRST"); + reply + } + _ => panic!("expected Data 0"), + }; + + let msg1 = tokio::time::timeout(Duration::from_secs(2), rx.recv()) + .await + .expect("second op not emitted") + .expect("mux channel closed"); + let reply_1 = match msg1 { + MuxMsg::Data { + sid, + data, + seq, + reply, + } => { + assert_eq!(sid, "sid-pipe"); + assert_eq!(seq, Some(1)); + assert_eq!(&data[..], b"SECOND"); + reply + } + _ => panic!("expected Data 1"), + }; + + // Fire reply for seq=1 FIRST (out of order on the wire) with + // downlink bytes "B". + let _ = reply_1.send(Ok(( + TunnelResponse { + sid: Some("sid-pipe".into()), + d: Some(B64.encode(b"B")), + pkts: None, + eof: Some(true), // eof so the loop exits after seq=1 + e: None, + code: None, + seq: Some(1), + caps: None, + }, + "test-script".to_string(), + ))); + + // Brief delay to let the seq=1 reply settle in its oneshot's + // buffer. The loop is still awaiting the seq=0 oneshot at the + // front of the in-flight queue, so it MUST NOT have written + // anything yet. + tokio::time::sleep(Duration::from_millis(30)).await; + let mut peek = [0u8; 16]; + match tokio::time::timeout(Duration::from_millis(20), client.read(&mut peek)).await { + Ok(Ok(_)) => panic!( + "loop wrote downlink bytes before seq=0 reply arrived — \ + reorder invariant broken" + ), + Ok(Err(_)) | Err(_) => { /* expected: nothing written yet */ } + } + + // Now fire reply for seq=0 with "A". Loop should write "A" + // then immediately drain seq=1 from its oneshot and write "B". + let _ = reply_0.send(Ok(( + TunnelResponse { + sid: Some("sid-pipe".into()), + d: Some(B64.encode(b"A")), + pkts: None, + eof: None, + e: None, + code: None, + seq: Some(0), + caps: None, + }, + "test-script".to_string(), + ))); + + let mut received = vec![0u8; 16]; + let mut total = 0; + // Read until we have both bytes or the loop closes. + loop { + match tokio::time::timeout(Duration::from_secs(2), client.read(&mut received[total..])) + .await + { + Ok(Ok(0)) => break, + Ok(Ok(n)) => { + total += n; + if total >= 2 { + break; + } + } + Ok(Err(_)) | Err(_) => break, + } + } + assert_eq!( + &received[..total], + b"AB", + "downlink bytes must be written in seq order (got {:?})", + &received[..total], + ); + + let _ = tokio::time::timeout(Duration::from_secs(2), loop_handle).await; + } + + /// Critical regression: `pipelined_reply_timeout` must cover + /// BOTH the per-deployment semaphore wait AND the request's own + /// budget — under saturation, a fresh op waits up to one full + /// `effective_batch_timeout` for a permit before its own + /// `effective_batch_timeout`-bounded request even starts. With + /// a one-batch budget the watchdog fires while the op is still + /// queued for a permit, dropping the oneshot and closing a + /// healthy pipelined session under load. + /// + /// The test pins the `2× + slack` invariant so future changes + /// to either the formula or PIPELINED_BATCH_TIMEOUT_FLOOR + /// require an intentional update. + #[test] + fn pipelined_reply_timeout_covers_two_batch_budgets_for_semaphore_saturation() { + let (mux, _rx) = mux_for_test(); + let reply = mux.pipelined_reply_timeout(); + // Default config has 30 s configured → 60 s floor → 60×2 = 120 s. 
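        // (With the PIPELINED_REPLY_SLACK of 15 s on top, the full default is
        // 135 s; the assert only pins the ≥ 2× lower bound so the slack can
        // change independently.)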
+ let effective_batch = mux.batch_timeout.max(PIPELINED_BATCH_TIMEOUT_FLOOR); + assert!( + reply >= effective_batch * 2, + "default-config reply timeout ({:?}) must cover 2× the \ + effective batch budget ({:?}) so a permit-saturated \ + pipeline has time to acquire a slot AND complete the \ + request before the session watchdog fires", + reply, + effective_batch * 2, + ); + + // Same invariant under a user-tuned long timeout. + let (tx, _rx) = mpsc::channel(16); + let custom_batch = Duration::from_secs(120); + let mux = Arc::new(TunnelMux { + tx, + connect_data_unsupported: Arc::new(AtomicBool::new(false)), + legacy_deployments: Mutex::new(HashMap::new()), + all_legacy: Arc::new(AtomicBool::new(false)), + num_scripts: 1, + preread_win: AtomicU64::new(0), + preread_loss: AtomicU64::new(0), + preread_skip_port: AtomicU64::new(0), + preread_skip_unsupported: AtomicU64::new(0), + preread_win_total_us: AtomicU64::new(0), + preread_total_events: AtomicU64::new(0), + unreachable_cache: Mutex::new(HashMap::new()), + batch_timeout: custom_batch, + pipelining_globally_disabled: AtomicBool::new(false), + }); + let reply = mux.pipelined_reply_timeout(); + assert!( + reply >= custom_batch * 2, + "with request_timeout_secs={:?}, reply timeout ({:?}) \ + must cover 2× the budget for permit + request", + custom_batch, + reply, + ); + } + + /// `mark_pipelining_disabled` must flip the global toggle (sticky + /// for the process lifetime) so subsequent connects fall back to + /// the legacy loop, even when the connect_data reply still + /// advertises `caps`. This protects mixed-version deployments + /// where round-robin can land a session's seq ops on an + /// un-upgraded backend after a different deployment served the + /// connect with caps set. + #[test] + fn mark_pipelining_disabled_is_sticky_and_overrides_caps() { + let (mux, _rx) = mux_for_test(); + assert!(!mux.pipelining_disabled()); + + mux.mark_pipelining_disabled(); + assert!(mux.pipelining_disabled()); + + // Idempotent — re-marking doesn't toggle off. + mux.mark_pipelining_disabled(); + assert!(mux.pipelining_disabled()); + + // The actual decision in `tunnel_connection` is + // `pipeline_advertised && !mux.pipelining_disabled()`, so + // even a caps-advertising connect_data reply must NOT + // re-enable pipelining once the toggle is set. + let advertised = true; + let pipeline = advertised && !mux.pipelining_disabled(); + assert!( + !pipeline, + "advertised caps must not re-enable pipelining after the \ + mixed-version toggle has fired", + ); + } + + /// Critical regression: when the local client half-closes (TCP + /// shutdown send → `Ok(0)` on read) AFTER queuing request bytes, + /// the loop has ops in flight whose replies may carry downlink + /// data the server already produced. Returning early on EOF + /// (the previous shape) dropped the queued oneshot receivers + /// and silently lost those bytes — breaking valid request/ + /// response patterns where the client closes its write half + /// after sending. The fix: set `client_send_closed` and continue + /// draining `in_flight` before returning. 
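    /// The client pattern being protected, in miniature (illustrative;
    /// `stream` stands in for the app's local connection through the proxy):
    ///
    ///     stream.write_all(request).await?;
    ///     stream.shutdown().await?;                  // half-close: FIN after the request
    ///     stream.read_to_end(&mut response).await?;  // the response must still arrive
    ///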
+ #[tokio::test] + async fn pipelined_loop_drains_in_flight_on_client_half_close() { + use tokio::io::AsyncReadExt; + use tokio::io::AsyncWriteExt; + use tokio::net::TcpListener; + + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let addr = listener.local_addr().unwrap(); + let accept = tokio::spawn(async move { listener.accept().await.unwrap().0 }); + let mut client = TcpStream::connect(addr).await.unwrap(); + let mut server_side = accept.await.unwrap(); + + let (mux, mut rx) = mux_for_test(); + // Pre-stuff "REQUEST" so the loop's first iteration sends it + // as seq=0, then half-closes from the client side. The loop + // must still wait for seq=0's reply (with downlink bytes) + // before returning. + client.write_all(b"REQUEST").await.unwrap(); + client.flush().await.unwrap(); + // Half-close the client's write side. Subsequent reads on + // server_side will return Ok(0) once the buffered "REQUEST" + // is consumed. + client.shutdown().await.unwrap(); + tokio::time::sleep(Duration::from_millis(20)).await; + + let loop_handle = tokio::spawn({ + let mux = mux.clone(); + async move { + tunnel_loop_pipelined(&mut server_side, "sid-halfclose", &mux, None).await + } + }); + + // The loop emits seq=0 with the buffered "REQUEST" bytes. + let msg = tokio::time::timeout(Duration::from_secs(2), rx.recv()) + .await + .expect("first op not emitted") + .expect("mux channel closed"); + let reply = match msg { + MuxMsg::Data { sid, data, seq, reply } => { + assert_eq!(sid, "sid-halfclose"); + assert_eq!(seq, Some(0)); + assert_eq!(&data[..], b"REQUEST"); + reply + } + _ => panic!("expected Data 0"), + }; + + // Brief delay so the loop's next send-phase iteration sees + // the half-closed socket (Ok(0)) and flips + // `client_send_closed` — but stays parked on the in_flight + // oneshot for seq=0. + tokio::time::sleep(Duration::from_millis(50)).await; + + // The loop MUST still be alive — it's draining the pending + // reply, not exiting on the half-close. + assert!( + !loop_handle.is_finished(), + "loop must keep running while in_flight has pending replies; \ + returning on Ok(0) would drop seq=0's oneshot and lose its \ + downlink bytes", + ); + + // Now deliver the reply with downlink bytes "RESPONSE" + eof. + let _ = reply.send(Ok(( + TunnelResponse { + sid: Some("sid-halfclose".into()), + d: Some(B64.encode(b"RESPONSE")), + pkts: None, + eof: Some(true), + e: None, + code: None, + seq: Some(0), + caps: None, + }, + "test-script".to_string(), + ))); + + // Loop should write "RESPONSE" to the local socket and exit. + let mut received = vec![0u8; 32]; + let mut total = 0; + loop { + match tokio::time::timeout( + Duration::from_secs(2), + client.read(&mut received[total..]), + ) + .await + { + Ok(Ok(0)) => break, + Ok(Ok(n)) => { + total += n; + if total >= 8 { + break; + } + } + Ok(Err(_)) | Err(_) => break, + } + } + assert_eq!( + &received[..total], + b"RESPONSE", + "downlink bytes must be delivered even after client half-close \ + (got {:?})", + &received[..total], + ); + + let _ = tokio::time::timeout(Duration::from_secs(2), loop_handle).await; + } + + /// A pipelined session whose server reply carries the wrong echoed + /// seq must close, not silently write potentially-misrouted bytes + /// to the local socket. The reply we'd otherwise honor could + /// contain bytes from a different session position (server bug or + /// version skew where the response shape changed) — emitting them + /// is the silent-corruption failure mode the seq protocol exists + /// to prevent. 
+ #[tokio::test] + async fn pipelined_loop_closes_on_seq_echo_mismatch() { + use tokio::io::AsyncReadExt; + use tokio::net::TcpListener; + + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let addr = listener.local_addr().unwrap(); + let accept = tokio::spawn(async move { listener.accept().await.unwrap().0 }); + let mut client = TcpStream::connect(addr).await.unwrap(); + let mut server_side = accept.await.unwrap(); + + let (mux, mut rx) = mux_for_test(); + let pending = Some(Bytes::from_static(b"FIRST")); + + let loop_handle = tokio::spawn({ + let mux = mux.clone(); + async move { tunnel_loop_pipelined(&mut server_side, "sid-pipe", &mux, pending).await } + }); + + let msg = tokio::time::timeout(Duration::from_secs(2), rx.recv()) + .await + .expect("first op not emitted") + .expect("mux channel closed"); + let reply = match msg { + MuxMsg::Data { + sid: _, + data: _, + seq, + reply, + } => { + assert_eq!(seq, Some(0)); + reply + } + _ => panic!("expected Data 0"), + }; + + // Reply with a WRONG seq (echoed seq=99 for our seq=0 op). + // The loop should detect the mismatch, close the session, + // and write nothing to the local socket. + let _ = reply.send(Ok(( + TunnelResponse { + sid: Some("sid-pipe".into()), + d: Some(B64.encode(b"BOGUS")), // bytes that would be wrong to write + pkts: None, + eof: None, + e: None, + code: None, + seq: Some(99), + caps: None, + }, + "test-script".to_string(), + ))); + + // Wait for the loop to terminate. + let _ = tokio::time::timeout(Duration::from_secs(2), loop_handle) + .await + .expect("loop must exit on seq mismatch"); + + // Local socket must NOT have received "BOGUS". + let mut peek = [0u8; 16]; + match tokio::time::timeout(Duration::from_millis(100), client.read(&mut peek)).await { + Ok(Ok(0)) | Ok(Err(_)) | Err(_) => { /* expected: socket closed or empty */ } + Ok(Ok(n)) => panic!( + "loop wrote {} bytes despite seq mismatch — should have closed silently \ + (got: {:?})", + n, + &peek[..n] + ), + } + } + /// Once `mark_connect_data_unsupported` is called, future sessions /// must see the flag — no per-session repeat of the detect-and-fallback /// cost. If this regresses, every new flow pays an extra round trip @@ -2007,23 +3085,11 @@ mod tests { #[test] fn should_fire_when_payload_would_exceed_cap() { // Exactly at the cap is fine — strict `>`. - assert!(!should_fire( - 10, - MAX_BATCH_PAYLOAD_BYTES - 100, - 100, - )); + assert!(!should_fire(10, MAX_BATCH_PAYLOAD_BYTES - 100, 100,)); // One byte over: fire. - assert!(should_fire( - 10, - MAX_BATCH_PAYLOAD_BYTES - 100, - 101, - )); + assert!(should_fire(10, MAX_BATCH_PAYLOAD_BYTES - 100, 101,)); // Sum overflow well past the cap: fire. 
- assert!(should_fire( - 10, - MAX_BATCH_PAYLOAD_BYTES, - 1, - )); + assert!(should_fire(10, MAX_BATCH_PAYLOAD_BYTES, 1,)); } /// Reply indices must point at the slot the op occupies *within its @@ -2057,6 +3123,7 @@ mod tests { port: None, data: Some(Bytes::from_static(b"x")), encode_empty: false, + seq: None, }; let mk_reply = || oneshot::channel::>().0; @@ -2068,7 +3135,11 @@ mod tests { push_no_fire(&mut accum, mk_op("a2"), 4, mk_reply()); assert_eq!(accum.pending_ops.len(), 3); assert_eq!( - accum.data_replies.iter().map(|(i, _)| *i).collect::>(), + accum + .data_replies + .iter() + .map(|(i, _)| *i) + .collect::>(), vec![0, 1, 2], ); assert_eq!(accum.payload_bytes, 12); @@ -2084,7 +3155,11 @@ mod tests { push_no_fire(&mut accum, mk_op("b1"), 4, mk_reply()); assert_eq!(accum.pending_ops.len(), 2); assert_eq!( - accum.data_replies.iter().map(|(i, _)| *i).collect::>(), + accum + .data_replies + .iter() + .map(|(i, _)| *i) + .collect::>(), vec![0, 1], "post-flush indices must restart at 0 — otherwise fire_batch's \ batch_resp.r.get(idx) returns None and every session in the \ @@ -2093,6 +3168,28 @@ mod tests { assert_eq!(accum.payload_bytes, 8); } + /// `seq` plumbs from `MuxMsg::Data` → `PendingOp` → `BatchOp` so + /// the tunnel-node sees the seq we generated. Without this + /// propagation the server would see a None seq and fall back to + /// the legacy unordered path — the client would still process + /// replies in oneshot-FIFO order, but the bytes inside would be + /// in unspecified order (server drained read_buf in batch + /// arrival order, not seq order). + #[test] + fn encode_pending_propagates_seq_to_batch_op() { + let op = PendingOp { + op: "data", + sid: Some("sid".into()), + host: None, + port: None, + data: Some(Bytes::from_static(b"x")), + encode_empty: false, + seq: Some(42), + }; + let b = encode_pending(op); + assert_eq!(b.seq, Some(42)); + } + #[test] fn encode_pending_data_op_with_payload_emits_base64() { let op = PendingOp { @@ -2102,6 +3199,7 @@ mod tests { port: None, data: Some(Bytes::from_static(b"hello")), encode_empty: false, + seq: None, }; let b = encode_pending(op); assert_eq!(b.op, "data"); @@ -2119,6 +3217,7 @@ mod tests { port: None, data: None, encode_empty: false, + seq: None, }; assert!(encode_pending(empty_poll).d.is_none()); @@ -2130,6 +3229,7 @@ mod tests { port: None, data: None, encode_empty: false, + seq: None, }; assert!(encode_pending(udp_poll).d.is_none()); @@ -2141,6 +3241,7 @@ mod tests { port: None, data: None, encode_empty: false, + seq: None, }; assert!(encode_pending(close).d.is_none()); } @@ -2158,6 +3259,7 @@ mod tests { port: Some(443), data: Some(Bytes::new()), encode_empty: true, + seq: None, }; let b = encode_pending(op); assert_eq!(b.op, "connect_data"); @@ -2173,6 +3275,7 @@ mod tests { port: Some(443), data: Some(Bytes::from_static(b"\x16\x03\x01")), // ClientHello prefix encode_empty: true, + seq: None, }; let b = encode_pending(op); assert_eq!(b.d.as_deref(), Some(B64.encode(b"\x16\x03\x01").as_str())); diff --git a/tunnel-node/src/main.rs b/tunnel-node/src/main.rs index e3bc3c0..e97d901 100644 --- a/tunnel-node/src/main.rs +++ b/tunnel-node/src/main.rs @@ -153,8 +153,59 @@ struct SessionInner { /// to wake the drain phase as soon as any session has something to /// ship, replacing the old fixed-sleep heuristic. notify: Notify, + /// Per-session next-expected `data`-op sequence number for the + /// pipelining protocol. 
Only consulted by ops that carry a `seq` + /// field; legacy non-seq `data` ops bypass this entirely and use + /// the inline first-come-first-served path. Held inside a + /// `tokio::sync::Mutex` so the (claim seq → write → drain → bump) + /// critical section can `await` without blocking the runtime. + seq_state: Mutex, + /// Fired (`notify_waiters`) every time `seq_state.expected` advances. + /// Tasks waiting for their `seq` turn re-check `expected` after each + /// wake; combined with a per-task `SEQ_WAIT_TIMEOUT` this prevents + /// indefinite stalls when an earlier seq is lost or the client dies. + seq_advance: Notify, } +/// Ordering state for per-session `data`-op pipelining. `expected` is +/// the next seq the session will accept; any op whose `seq < expected` +/// is treated as a duplicate (likely a client-side retry) and skipped, +/// any op whose `seq > expected` blocks on `seq_advance` until either +/// its seq matches or `SEQ_WAIT_TIMEOUT` elapses. New sessions start at +/// `expected = 0` so the first `data` op the client sends is seq 0. +struct SeqState { + /// Next-expected `data`-op sequence number for this session. + /// `u64` rather than `u32` so a long-lived TCP session generating + /// ~100 ops/s doesn't saturate `u32::MAX` after ~1.4 years and + /// refuse every subsequent op. The wire protocol matches + /// (`BatchOp::seq` / `TunnelResponse::seq` are also `u64`). + expected: u64, +} + +/// How long a `data` op carrying a `seq` will wait for earlier seqs to +/// land before giving up and returning an error to the client. 30 s is +/// well past any normal Apps Script round-trip (~1.5 s) plus reordering +/// across deployments, but short enough that a genuinely lost seq — +/// e.g. the client crashed mid-pipeline — doesn't hold every subsequent +/// op forever. After timeout the client's session reaper / explicit +/// close cleans up. +/// +/// Latency trade-off worth flagging: a seq op stuck waiting for an +/// earlier seq holds an Apps Script execution slot AND the batch's +/// HTTP response open until either the earlier seq lands or this +/// timeout fires. `handle_batch` collects every job before +/// responding, so a single stuck seq from session A can keep +/// session B's already-ready job's results sitting in +/// `seq_data_jobs` for up to `SEQ_WAIT_TIMEOUT`. We accept this: +/// permanent seq loss is rare (it requires the client's earlier +/// seq batch to vanish in transit), and shortening this would +/// cause more rejections in the common reordered-arrival case +/// where seqs land 2-10 s apart across deployments. The +/// regression test +/// `unrelated_seq_session_in_same_batch_is_not_delayed_past_seq_wait` +/// pins the upper bound so any future change here is intentional. +const SEQ_WAIT_TIMEOUT: Duration = Duration::from_secs(30); + struct ManagedSession { inner: Arc, reader_handle: tokio::task::JoinHandle<()>, @@ -171,6 +222,137 @@ impl ManagedSession { } } +/// Per-batch shared wait primitive for seq jobs and Phase 2's TCP +/// drain. Collapses to one watcher per *unique* `SessionInner` +/// (deduplicated by `Arc::as_ptr`) regardless of how many seq jobs +/// or batch waiters subscribe — bounded at one task per session, +/// not the `M × N` (jobs × sessions) the per-job +/// `wait_for_any_drainable` would have spawned. +/// +/// Each watcher relays `inner.notify` to a single batch-wide +/// `Arc` via `notify_waiters()`, so a single push wakes +/// every parked job in the batch. 
`reader_task` itself fires +/// `notify_waiters()` (not `notify_one()`), so concurrent batches +/// observing the same session also all wake on each push — the +/// `notify_one()` semantics would have left only one of N parked +/// watchers winning the wake. +/// +/// Watchers are held in `AbortOnDrop` so they're aborted on every +/// exit path of `handle_batch` (including cancellation). +struct BatchWait { + /// Fired (`notify_waiters`) when any unique inner's `reader_task` + /// pushes data or marks eof. All seq jobs and Phase 2's TCP wait + /// subscribe to this single Notify. Held as `Arc` so + /// the (pre-spawned) watcher tasks can keep firing on it + /// without re-borrowing `Self` — the construction order would + /// otherwise need an `Arc::get_mut` dance. + wake: Arc, + /// Sessions whose drainability the wait helper checks + /// synchronously before parking. Owned (deduplicated) here so + /// every caller agrees on the set. + inners: Vec>, + /// One bridge task per unique inner. Each calls + /// `inner.notify.notified().await` in a loop and fans out real + /// state changes to `wake.notify_waiters()`. Held in + /// `AbortOnDrop` so they go away when `BatchWait` drops. + _watchers: Vec, +} + +impl BatchWait { + fn new(inners: Vec>) -> Arc { + // Deduplicate by Arc pointer. The same session can appear in + // a batch via multiple paths — e.g. a `connect_data` plus a + // seq `data` op for the same sid. Spawning two watchers for + // it would double the wake fan-out work for no benefit. + // We also STORE the dedup'd list (not the original `inners`), + // so `is_any_drainable` doesn't re-lock the same Mutex once + // per duplicate appearance. + let mut seen: std::collections::HashSet = + std::collections::HashSet::new(); + let mut unique: Vec> = Vec::with_capacity(inners.len()); + for inner in &inners { + let ptr = Arc::as_ptr(inner) as usize; + if seen.insert(ptr) { + unique.push(inner.clone()); + } + } + + let wake = Arc::new(Notify::new()); + let mut watchers: Vec = Vec::with_capacity(unique.len()); + for inner in unique.iter().cloned() { + let wake = wake.clone(); + watchers.push(AbortOnDrop(tokio::spawn(async move { + loop { + // Pre-`enable()` the next Notified BEFORE + // checking state, so a `notify_waiters()` fired + // in the gap between our previous fan-out and + // this register isn't lost. `reader_task` uses + // `notify_waiters()` (not `notify_one()`) so no + // permits are stored for unregistered waiters + // — the enable() registration is what guarantees + // we observe every push. + let notified = inner.notify.notified(); + tokio::pin!(notified); + notified.as_mut().enable(); + + // Filter false wakes: only fan out when the + // session is actually drainable. A wake fired + // for a push that's already been drained by a + // prior batch loops back without disturbing + // batch waiters. + let drainable = inner.eof.load(Ordering::Acquire) + || !inner.read_buf.lock().await.is_empty(); + if drainable { + wake.notify_waiters(); + } + + notified.await; + } + }))); + } + + Arc::new(Self { + wake, + inners: unique, + _watchers: watchers, + }) + } + + /// Wait until any unique inner is drainable, or `deadline` + /// elapses. The pre-armed `Notified` closes the same race + /// `wait_for_seq_turn` handles: any wake fired between + /// the synchronous drainability check and the await is still + /// observed. 
`notify_waiters()` fans out to ALL parked + /// `BatchWait::wait` callers at once, so a single push wakes + /// every seq job and Phase 2's TCP wait simultaneously — the + /// exact behavior the per-job `wait_for_any_drainable` calls + /// failed to deliver under `notify_one()`. + async fn wait(&self, deadline: Duration) { + if self.inners.is_empty() { + return; + } + let notified = self.wake.notified(); + tokio::pin!(notified); + notified.as_mut().enable(); + if self.is_any_drainable().await { + return; + } + let _ = tokio::time::timeout(deadline, notified).await; + } + + async fn is_any_drainable(&self) -> bool { + for inner in &self.inners { + if inner.eof.load(Ordering::Acquire) { + return true; + } + if !inner.read_buf.lock().await.is_empty() { + return true; + } + } + false + } +} + /// UDP equivalent of `SessionInner`. Holds a *connected* `UdpSocket` /// pinned to one `(host, port)` upstream so we don't have to re-resolve /// or re-parse the destination on every datagram. `notify` is fired by @@ -212,12 +394,18 @@ async fn create_session(host: &str, port: u16) -> std::io::Result ManagedSession { eof: AtomicBool::new(false), last_active: Mutex::new(Instant::now()), notify: Notify::new(), + seq_state: Mutex::new(SeqState { expected: 0 }), + seq_advance: Notify::new(), }); let inner_ref = inner.clone(); let reader_handle = tokio::spawn(reader_task(read_half, inner_ref)); let udpgw_handle = Some(tokio::spawn(udpgw::udpgw_server_task(server_half))); - ManagedSession { inner, reader_handle, udpgw_handle } + ManagedSession { + inner, + reader_handle, + udpgw_handle, + } } async fn reader_task(mut reader: impl AsyncRead + Unpin, session: Arc) { @@ -246,25 +440,37 @@ async fn reader_task(mut reader: impl AsyncRead + Unpin, session: Arc { session.eof.store(true, Ordering::Release); - session.notify.notify_one(); + session.notify.notify_waiters(); break; } Ok(n) => { // Extend the buffer before notifying. The MutexGuard is // dropped at the end of the statement, *before* the - // notify_one call below, so any waiter that wakes on the - // notify and then locks read_buf can immediately observe - // the new bytes — no torn read where the wake fires but - // the buffer still looks empty. Notify::notify_one also - // stores a permit if no waiter is currently registered, - // so we never lose an edge across the spawn race in - // wait_for_any_drainable. + // notify call below, so any waiter that wakes on the + // notify and then locks read_buf can immediately + // observe the new bytes — no torn read where the wake + // fires but the buffer still looks empty. + // + // `notify_waiters()` (not `notify_one()`) so EVERY + // parked waiter wakes — required for correctness when + // pipelining puts seq=N and seq=N+1 for the same + // session in different batches: each batch builds its + // own `BatchWait` watcher on this session's notify, + // and a single `notify_one()` would only wake one of + // them, leaving the other batch's watcher asleep + // until `LONGPOLL_DEADLINE`. The cost of dropping the + // permit-storage semantics is handled by the + // pre-`enable()` pattern in `BatchWait`'s and + // `wait_for_any_drainable`'s watchers — they register + // their `Notified` future before the synchronous + // drainability check, so a wake fired in the gap is + // still observed. 
session.read_buf.lock().await.extend_from_slice(&buf[..n]); - session.notify.notify_one(); + session.notify.notify_waiters(); } Err(_) => { session.eof.store(true, Ordering::Release); - session.notify.notify_one(); + session.notify.notify_waiters(); break; } } @@ -442,6 +648,13 @@ async fn wait_for_any_drainable(inners: &[Arc], deadline: Duration // race-safety note above. Watchers are held in a Vec of // `AbortOnDrop`, so they're aborted on every exit path — // including cancellation by an outer `select!`. + // + // The pre-`enable()` pattern is required since + // `reader_task` uses `notify_waiters()` (not `notify_one()`): + // there are no stored permits, so a wake fired between the + // synchronous state check and parking on `notified.await` + // would otherwise be lost. Registering the future before the + // check guarantees we observe every push. let (tx, mut rx) = mpsc::channel::<()>(1); let mut _watchers: Vec = Vec::with_capacity(inners.len()); for inner in inners { @@ -449,17 +662,20 @@ async fn wait_for_any_drainable(inners: &[Arc], deadline: Duration let tx = tx.clone(); _watchers.push(AbortOnDrop(tokio::spawn(async move { loop { - inner.notify.notified().await; + let notified = inner.notify.notified(); + tokio::pin!(notified); + notified.as_mut().enable(); if inner.eof.load(Ordering::Acquire) { break; } if !inner.read_buf.lock().await.is_empty() { break; } - // Stale permit (notify fired but state didn't change in - // an observable way — e.g., bytes were already drained - // by a prior batch). Loop back and wait for a real - // notify, don't wake the caller. + notified.await; + // Wake observed — state may have changed; loop back + // and re-check. If the wake was for a push that + // another batch already drained, the next iteration + // sees buf empty + eof unset and re-arms. } let _ = tx.try_send(()); }))); @@ -582,9 +798,15 @@ async fn wait_and_drain(session: &SessionInner, max_wait: Duration) -> (Vec, prev_len = cur_len; ever_had_data = true; } - if is_eof { break; } - if Instant::now() >= deadline { break; } - if ever_had_data && last_growth.elapsed() > Duration::from_millis(100) { break; } + if is_eof { + break; + } + if Instant::now() >= deadline { + break; + } + if ever_had_data && last_growth.elapsed() > Duration::from_millis(100) { + break; + } tokio::time::sleep(Duration::from_millis(10)).await; } @@ -626,34 +848,78 @@ struct AppState { struct TunnelRequest { k: String, op: String, - #[serde(default)] host: Option, - #[serde(default)] port: Option, - #[serde(default)] sid: Option, - #[serde(default)] data: Option, + #[serde(default)] + host: Option, + #[serde(default)] + port: Option, + #[serde(default)] + sid: Option, + #[serde(default)] + data: Option, } #[derive(Serialize, Clone, Debug)] struct TunnelResponse { - #[serde(skip_serializing_if = "Option::is_none")] sid: Option, - #[serde(skip_serializing_if = "Option::is_none")] d: Option, + #[serde(skip_serializing_if = "Option::is_none")] + sid: Option, + #[serde(skip_serializing_if = "Option::is_none")] + d: Option, /// UDP datagrams returned to the client, base64-encoded individually. /// `None` for TCP responses; `Some(vec![])` is never serialized /// (the field is dropped when empty by the empty-on-None check above). 
- #[serde(skip_serializing_if = "Option::is_none")] pkts: Option>, - #[serde(skip_serializing_if = "Option::is_none")] eof: Option, - #[serde(skip_serializing_if = "Option::is_none")] e: Option, - #[serde(skip_serializing_if = "Option::is_none")] code: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pkts: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + eof: Option, + #[serde(skip_serializing_if = "Option::is_none")] + e: Option, + #[serde(skip_serializing_if = "Option::is_none")] + code: Option, + /// Per-session sequence number echoed from `BatchOp::seq` for `data` + /// ops (pipelining). Lets the client route an out-of-order reply + /// into its per-session reorder buffer. Skipped on the wire when + /// `None` (the legacy/non-pipelined path). `u64` to keep + /// long-lived sessions from saturating — see `SeqState::expected` + /// for the math. + #[serde(skip_serializing_if = "Option::is_none")] + seq: Option, + /// Capability bitfield set on `connect` / `connect_data` success + /// replies so the client can opt into protocol features it knows + /// the tunnel-node speaks. Bit 0 = `CAPS_PIPELINE_SEQ`. Skipped + /// elsewhere (other op replies don't advertise capabilities). + #[serde(skip_serializing_if = "Option::is_none")] + caps: Option, } +/// Capability bit for per-session `data`-op sequence numbers +/// (pipelining). Set on `connect` / `connect_data` success replies so +/// new clients opt into pipelining; old clients ignore the unknown +/// JSON field. +const CAPS_PIPELINE_SEQ: u32 = 1 << 0; + impl TunnelResponse { fn error(msg: impl Into) -> Self { - Self { sid: None, d: None, pkts: None, eof: None, e: Some(msg.into()), code: None } + Self { + sid: None, + d: None, + pkts: None, + eof: None, + e: Some(msg.into()), + code: None, + seq: None, + caps: None, + } } fn unsupported_op(op: &str) -> Self { Self { - sid: None, d: None, pkts: None, eof: None, + sid: None, + d: None, + pkts: None, + eof: None, e: Some(format!("unknown op: {}", op)), code: Some(CODE_UNSUPPORTED_OP.into()), + seq: None, + caps: None, } } } @@ -671,10 +937,24 @@ struct BatchRequest { #[derive(Deserialize)] struct BatchOp { op: String, - #[serde(default)] sid: Option, - #[serde(default)] host: Option, - #[serde(default)] port: Option, - #[serde(default)] d: Option, // base64 data + #[serde(default)] + sid: Option, + #[serde(default)] + host: Option, + #[serde(default)] + port: Option, + #[serde(default)] + d: Option, // base64 data + /// Per-session monotonic sequence number for `data` ops on a + /// pipelining-enabled session. When present, the server enforces + /// in-order processing: ops with `seq` arriving out of order are + /// held until earlier seqs land (or the session times out). Old + /// clients omit the field — `serde(default)` deserializes them + /// to `None`, which selects the legacy first-come-first-served + /// path. Echoed back in the response's `seq`. `u64` for the same + /// long-running-session reason as `SeqState::expected`. 
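The per-session reorder buffer these doc comments refer to lives on the client and isn't part of this diff. A minimal sketch, assuming the client keys early-arriving replies by their echoed seq and releases only the contiguous prefix (the `ReorderBuffer` name and shape are illustrative, not the client's actual structure):

    use std::collections::BTreeMap;

    // Illustrative only: hold out-of-order replies and emit downlink
    // bytes strictly in seq order.
    struct ReorderBuffer {
        next: u64,                       // next seq to release to the local socket
        pending: BTreeMap<u64, Vec<u8>>, // replies that arrived ahead of `next`
    }

    impl ReorderBuffer {
        fn push(&mut self, seq: u64, bytes: Vec<u8>) -> Vec<Vec<u8>> {
            self.pending.insert(seq, bytes);
            let mut ready = Vec::new();
            // Release the contiguous run starting at `next`; anything
            // later stays buffered until the gap fills.
            while let Some(b) = self.pending.remove(&self.next) {
                ready.push(b);
                self.next += 1;
            }
            ready
        }
    }

The client would engage such a path only when the session's first reply advertised the capability — e.g. `resp.caps.map_or(false, |c| (c & CAPS_PIPELINE_SEQ) != 0)` — and stay on the single-in-flight loop otherwise.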
+ #[serde(default)] + seq: Option, } #[derive(Serialize)] @@ -695,9 +975,7 @@ async fn handle_tunnel( } let resp: TunnelResponse = match req.op.as_str() { "connect" => handle_connect(&state, req.host, req.port).await, - "connect_data" => { - handle_connect_data_single(&state, req.host, req.port, req.data).await - } + "connect_data" => handle_connect_data_single(&state, req.host, req.port, req.data).await, "data" => handle_data_single(&state, req.sid, req.data).await, "close" => handle_close(&state, req.sid).await, other => TunnelResponse::unsupported_op(other), @@ -730,10 +1008,7 @@ fn decoy_or_unauthorized(diagnostic_mode: bool) -> axum::response::Response { // Batch handler // --------------------------------------------------------------------------- -async fn handle_batch( - State(state): State, - body: Bytes, -) -> impl IntoResponse { +async fn handle_batch(State(state): State, body: Bytes) -> impl IntoResponse { // Decompress if gzipped let json_bytes = if body.starts_with(&[0x1f, 0x8b]) { match decompress_gzip(&body) { @@ -741,8 +1016,13 @@ async fn handle_batch( Err(e) => { let resp = serde_json::to_vec(&BatchResponse { r: vec![TunnelResponse::error(format!("gzip decode: {}", e))], - }).unwrap_or_default(); - return (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], resp); + }) + .unwrap_or_default(); + return ( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/json")], + resp, + ); } } } else { @@ -754,8 +1034,13 @@ async fn handle_batch( Err(e) => { let resp = serde_json::to_vec(&BatchResponse { r: vec![TunnelResponse::error(format!("bad json: {}", e))], - }).unwrap_or_default(); - return (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], resp); + }) + .unwrap_or_default(); + return ( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/json")], + resp, + ); } }; @@ -763,8 +1048,13 @@ async fn handle_batch( if state.diagnostic_mode { let resp = serde_json::to_vec(&BatchResponse { r: vec![TunnelResponse::error("unauthorized")], - }).unwrap_or_default(); - return (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], resp); + }) + .unwrap_or_default(); + return ( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/json")], + resp, + ); } // Production: same nginx-404 decoy as the single-op path. See // `decoy_or_unauthorized` for rationale. @@ -773,7 +1063,11 @@ async fn handle_batch(
nginx
\r\n\r\n\r\n" .as_bytes() .to_vec(); - return (StatusCode::NOT_FOUND, [(header::CONTENT_TYPE, "text/html")], body); + return ( + StatusCode::NOT_FOUND, + [(header::CONTENT_TYPE, "text/html")], + body, + ); } // Process all ops in two phases. @@ -797,7 +1091,22 @@ async fn handle_batch( // map lock isn't held across the per-session read_buf / packets // mutex acquisition — without this, every other batch (and every // connect/close op) head-of-line-blocks behind the drain. - let mut tcp_drains: Vec<(usize, String, Arc)> = Vec::new(); + // + // The `is_connect_data` flag distinguishes drains that came from + // a successful `connect_data` op (a fresh session) from drains + // that came from a regular `data` op (an existing session). The + // batched-connect_data response is the FIRST reply the client + // sees for a session, so it has to carry `caps` — same as the + // single-op `handle_connect_data_single` path. Without this + // distinction the batch path silently downgrades every HTTPS + // fast-path session to the legacy non-pipelined loop. + struct TcpDrain { + i: usize, + sid: String, + inner: Arc, + is_connect_data: bool, + } + let mut tcp_drains: Vec = Vec::new(); let mut udp_drains: Vec<(usize, String, Arc)> = Vec::new(); // True iff the batch contained any op that performed a real action // upstream — a new connection or a non-empty data write. A batch of @@ -811,6 +1120,46 @@ async fn handle_batch( UdpOpen(Result<(String, Arc), TunnelResponse>), } let mut new_conn_jobs: JoinSet<(usize, NewConn)> = JoinSet::new(); + // Seq-ordered `data` op specs collected during the dispatch loop + // and spawned later, after `had_writes_or_connects` is final and + // after `new_conn_jobs` has populated `tcp_drains`. Spawning then + // running concurrently with Phase 2 (rather than awaiting all + // seq jobs *before* Phase 2) prevents a single idle pipelined + // poll from holding the whole batch response open for 15 s while + // sibling sessions are ready to ship bytes. + struct SeqDataSpec { + i: usize, + sid: String, + seq: u64, + data: Option, + inner: Arc, + } + let mut seq_specs: Vec = Vec::new(); + // `close` ops collected during dispatch and processed AFTER + // seq-data jobs complete. Running close inline would race + // same-sid seq ops by removing the session from `state.sessions` + // and aborting `reader_task` before the deferred seq job's + // (write, drain) ran on its cloned `Arc`. + let mut pending_closes: Vec<(usize, Option)> = Vec::new(); + // Sids that have an earlier-in-batch op deferred to Phase 2 or + // `seq_data_jobs`. Used by the `close` arm to decide whether + // to defer this close (race with the deferred op) or run it + // inline (preserve the request's close-first ordering when no + // earlier same-sid op exists). Without this check, a batch + // ordered as `[close(X), data(seq=0, X)]` would unconditionally + // defer the close, letting the data write succeed against a + // session the client asked to be closed FIRST — a protocol- + // ordering regression vs the pre-pipelining shape. + let mut sids_with_pending_deferred: std::collections::HashSet = + std::collections::HashSet::new(); + // Shared response-byte budget for this batch. seq-data jobs run + // concurrently with Phase 2's `tcp_drains` loop, so the cap + // (#863) must be shared across both code paths — M concurrent + // pipelined sessions each grabbing `TCP_DRAIN_MAX_BYTES` would + // stack past Apps Script's 50 MiB response ceiling. 
The Mutex is + // held only briefly per reservation; the actual drain runs + // unlocked. + let response_budget: Arc> = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); for (i, op) in req.ops.iter().enumerate() { match op.op.as_str() { @@ -820,7 +1169,10 @@ async fn handle_batch( let host = op.host.clone(); let port = op.port; new_conn_jobs.spawn(async move { - (i, NewConn::Connect(handle_connect(&state, host, port).await)) + ( + i, + NewConn::Connect(handle_connect(&state, host, port).await), + ) }); } "connect_data" => { @@ -858,7 +1210,10 @@ async fn handle_batch( "data" => { let sid = match &op.sid { Some(s) if !s.is_empty() => s.clone(), - _ => { results.push((i, TunnelResponse::error("missing sid"))); continue; } + _ => { + results.push((i, TunnelResponse::error("missing sid"))); + continue; + } }; // Clone the inner under the map lock and release it @@ -872,6 +1227,35 @@ async fn handle_batch( let sessions = state.sessions.lock().await; sessions.get(&sid).map(|s| s.inner.clone()) }; + // Pipelining branch: a `data` op carrying a seq is + // dispatched into the seq_data_jobs JoinSet so its + // potentially-long wait for an earlier seq doesn't + // block other ops in this batch. The job's + // `process_seq_data_op` runs the full (write, drain) + // sequence under the per-session seq lock, which is + // what enforces downlink-byte ordering across batches. + if let Some(seq) = op.seq { + if let Some(inner) = inner { + let had_uplink_b64 = + op.d.as_deref().map(|s| !s.is_empty()).unwrap_or(false); + if had_uplink_b64 { + had_writes_or_connects = true; + } + sids_with_pending_deferred.insert(sid.clone()); + seq_specs.push(SeqDataSpec { + i, + sid: sid.clone(), + seq, + data: op.d.clone(), + inner, + }); + } else { + let mut r = eof_response(sid); + r.seq = Some(seq); + results.push((i, r)); + } + continue; + } if let Some(inner) = inner { *inner.last_active.lock().await = Instant::now(); if let Some(ref data_b64) = op.d { @@ -899,7 +1283,13 @@ async fn handle_batch( } } } - tcp_drains.push((i, sid, inner)); + sids_with_pending_deferred.insert(sid.clone()); + tcp_drains.push(TcpDrain { + i, + sid, + inner, + is_connect_data: false, + }); } else { results.push((i, eof_response(sid))); } @@ -907,7 +1297,10 @@ async fn handle_batch( "udp_data" => { let sid = match &op.sid { Some(s) if !s.is_empty() => s.clone(), - _ => { results.push((i, TunnelResponse::error("missing sid"))); continue; } + _ => { + results.push((i, TunnelResponse::error("missing sid"))); + continue; + } }; let inner = { @@ -942,14 +1335,37 @@ async fn handle_batch( if had_uplink { *inner.last_active.lock().await = Instant::now(); } + sids_with_pending_deferred.insert(sid.clone()); udp_drains.push((i, sid, inner)); } else { results.push((i, eof_response(sid))); } } "close" => { - let r = handle_close(&state, op.sid.clone()).await; - results.push((i, r)); + // Defer ONLY if there's an earlier-in-batch op for + // the same sid that's already deferred (seq data, + // legacy data, udp_data). Without an earlier + // deferred op, running close inline preserves the + // request's per-sid op order — important for + // `[close, data]` and `[close, data(seq)]` shapes: + // the client asked us to close FIRST, then write, + // and the write should land on a closed (eof'd) + // session, not race the close. With an earlier + // deferred op, deferring close prevents the race + // where same-sid seq jobs run on a closed session + // (writing to an aborted upstream, draining an + // empty buffer). 
+ let needs_defer = op + .sid + .as_deref() + .map(|s| sids_with_pending_deferred.contains(s)) + .unwrap_or(false); + if needs_defer { + pending_closes.push((i, op.sid.clone())); + } else { + let r = handle_close(&state, op.sid.clone()).await; + results.push((i, r)); + } } other => { results.push((i, TunnelResponse::unsupported_op(other))); @@ -964,7 +1380,16 @@ async fn handle_batch( match join { Ok((i, NewConn::Connect(r))) => results.push((i, r)), Ok((i, NewConn::ConnectData(Ok((sid, inner))))) => { - tcp_drains.push((i, sid, inner)); + // First reply on this session — must carry caps so + // the client knows it can opt into the pipelined + // tunnel_loop variant. Phase 2's drain stamps the + // bit on responses where `is_connect_data` is true. + tcp_drains.push(TcpDrain { + i, + sid, + inner, + is_connect_data: true, + }); } Ok((i, NewConn::ConnectData(Err(r)))) => results.push((i, r)), Ok((i, NewConn::UdpOpen(Ok((sid, inner))))) => { @@ -977,162 +1402,294 @@ async fn handle_batch( } } - // Phase 2: signal-driven wait for any session (TCP or UDP) to have - // data, then drain TCP and UDP independently in a single pass each. - // Deadlines: - // * `ACTIVE_DRAIN_DEADLINE` (~350 ms) when the batch had real work. - // Typical responses arrive in ms; the wait helpers return on - // the first notify. For active batches we settle for - // `STRAGGLER_SETTLE` so neighbors whose replies trail by a few - // ms aren't reported empty. - // * `LONGPOLL_DEADLINE` for pure-poll batches — held open until - // upstream pushes data. UDP idle polls benefit from this just - // as much as TCP, so the same window applies. - if !tcp_drains.is_empty() || !udp_drains.is_empty() { - let deadline = if had_writes_or_connects { - ACTIVE_DRAIN_DEADLINE - } else { - LONGPOLL_DEADLINE - }; + // Build the shared wait set: every TCP session that anyone in + // this batch will drain (Phase 2 connect_data / data, plus seq + // jobs). Build a single shared `BatchWait` so: + // * One watcher task per UNIQUE inner — not the M × N spawn + // the per-job `wait_for_any_drainable` calls produced. + // * `notify_waiters()` fans out to ALL parked jobs at once + // (fixing the `notify_one` race where a single push only + // woke one of N waiters and the rest sat until + // `LONGPOLL_DEADLINE`). + let batch_wait: Arc = BatchWait::new({ + let mut v: Vec> = + tcp_drains.iter().map(|d| d.inner.clone()).collect(); + v.extend(seq_specs.iter().map(|s| s.inner.clone())); + v + }); - // Phase 1 already gave us each session's Arc<…Inner>, so we - // don't need to re-acquire the sessions map lock here. Cloning - // the Arc is just a refcount bump. - let tcp_inners: Vec> = - tcp_drains.iter().map(|(_, _, inner)| inner.clone()).collect(); - let udp_inners: Vec> = - udp_drains.iter().map(|(_, _, inner)| inner.clone()).collect(); - - // Wake on whichever side has work first. The previous - // `tokio::join!` was conjunctive — a TCP burst still paid the - // UDP deadline in mixed batches because the UDP waiter had to - // elapse too. `wait_for_*_drainable` short-circuits on an empty - // slice, so we have to skip the empty side; otherwise its - // instant return would fire the select arm before the other - // side ever got a chance to wait. - match (tcp_inners.is_empty(), udp_inners.is_empty()) { - (true, true) => {} - (false, true) => wait_for_any_drainable(&tcp_inners, deadline).await, - (true, false) => wait_for_any_udp_drainable(&udp_inners, deadline).await, - (false, false) => { - tokio::select! 
{ - _ = wait_for_any_drainable(&tcp_inners, deadline) => {} - _ = wait_for_any_udp_drainable(&udp_inners, deadline) => {} - } - } - } + // Spawn seq-ordered `data` ops AFTER `new_conn_jobs` so each + // task sees the final value of `had_writes_or_connects` and can + // pick the right drain deadline (active vs long-poll). Keeping + // these in a JoinSet lets us race them with Phase 2 below, so a + // single idle pipelined poll doesn't gate the whole batch + // response on `LONGPOLL_DEADLINE`. + let mut seq_data_jobs: JoinSet<(usize, TunnelResponse)> = JoinSet::new(); + for spec in seq_specs { + let budget = response_budget.clone(); + let active = had_writes_or_connects; + let bw = batch_wait.clone(); + seq_data_jobs.spawn(async move { + let resp = process_seq_data_op( + spec.inner, spec.sid, spec.seq, spec.data, budget, active, bw, + ) + .await; + (spec.i, resp) + }); + } - if had_writes_or_connects { - // Adaptive settle: keep waiting in steps while new data - // keeps arriving. Break when: - // 1. No new data arrived in the last step (burst is over) - // 2. STRAGGLER_SETTLE_MAX reached - let settle_end = Instant::now() + STRAGGLER_SETTLE_MAX; - let mut prev_tcp_bytes: usize = 0; - let mut prev_udp_pkts: usize = 0; - // Snapshot current buffer sizes. - for inner in &tcp_inners { - prev_tcp_bytes += inner.read_buf.lock().await.len(); - } - for inner in &udp_inners { - prev_udp_pkts += inner.packets.lock().await.len(); - } - loop { - let now = Instant::now(); - if now >= settle_end { - break; + // Phase 2 (legacy non-seq drain) and seq-data jobs run + // concurrently — neither's wait phase gates the other. Phase 2 + // waits up to ACTIVE_DRAIN_DEADLINE / LONGPOLL_DEADLINE for + // tcp_drains / udp_drains; seq jobs have their own per-session + // waits (per-inner for active, shared `BatchWait` for idle). + // Both share the same `BatchWait`, so a wake from any session's + // `reader_task` resolves both sides simultaneously. + // + // Note: the HTTP batch response itself is held until BOTH sides + // finish — the `tokio::join!` below joins everything before + // building the JSON body. The "concurrent" claim is about wait- + // phase coupling, not about returning the response early. + // Apps Script / the batched protocol surface doesn't support + // partial / streamed responses, so all results have to land in + // one body. + let response_budget_p2 = response_budget.clone(); + let phase_2_batch_wait = batch_wait.clone(); + let phase_2_fut = async move { + let mut p2_results: Vec<(usize, TunnelResponse)> = Vec::new(); + let mut tcp_eof_sids: Vec = Vec::new(); + let mut udp_eof_sids: Vec = Vec::new(); + + if !tcp_drains.is_empty() || !udp_drains.is_empty() { + let deadline = if had_writes_or_connects { + ACTIVE_DRAIN_DEADLINE + } else { + LONGPOLL_DEADLINE + }; + + // Phase 1 already gave us each session's Arc<…Inner>, so + // we don't need to re-acquire the sessions map lock + // here. Cloning the Arc is just a refcount bump. + let tcp_inners: Vec> = + tcp_drains.iter().map(|d| d.inner.clone()).collect(); + let udp_inners: Vec> = udp_drains + .iter() + .map(|(_, _, inner)| inner.clone()) + .collect(); + + // Wake on whichever side has work first. The TCP wait + // is the SHARED `BatchWait` — its watcher tasks bridge + // every unique session's `inner.notify` to a single + // batch-wide `notify_waiters()`, so a seq session + // getting bytes wakes Phase 2 too AND a Phase 2 session + // getting bytes wakes every seq job. We only DRAIN our + // own `tcp_inners` below; the seq jobs handle theirs. 
+ // + // The previous `tokio::join!` was conjunctive — a TCP + // burst still paid the UDP deadline in mixed batches. + // `BatchWait::wait` short-circuits on an empty inner set + // (and `wait_for_any_udp_drainable` does the same for + // UDP), so we skip the empty side to avoid its instant + // return firing the select arm prematurely. + let tcp_wait_empty = phase_2_batch_wait.inners.is_empty(); + match (tcp_wait_empty, udp_inners.is_empty()) { + (true, true) => {} + (false, true) => phase_2_batch_wait.wait(deadline).await, + (true, false) => wait_for_any_udp_drainable(&udp_inners, deadline).await, + (false, false) => { + tokio::select! { + _ = phase_2_batch_wait.wait(deadline) => {} + _ = wait_for_any_udp_drainable(&udp_inners, deadline) => {} + } } - let remaining = settle_end.duration_since(now); - tokio::time::sleep(STRAGGLER_SETTLE_STEP.min(remaining)).await; + } - // Measure current buffer sizes. - let mut tcp_bytes: usize = 0; - let mut udp_pkts: usize = 0; + if had_writes_or_connects { + // Adaptive settle: keep waiting in steps while new + // data keeps arriving. Break when: + // 1. No new data arrived in the last step (burst is over) + // 2. STRAGGLER_SETTLE_MAX reached + let settle_end = Instant::now() + STRAGGLER_SETTLE_MAX; + let mut prev_tcp_bytes: usize = 0; + let mut prev_udp_pkts: usize = 0; for inner in &tcp_inners { - tcp_bytes += inner.read_buf.lock().await.len(); + prev_tcp_bytes += inner.read_buf.lock().await.len(); } for inner in &udp_inners { - udp_pkts += inner.packets.lock().await.len(); + prev_udp_pkts += inner.packets.lock().await.len(); } + loop { + let now = Instant::now(); + if now >= settle_end { + break; + } + let remaining = settle_end.duration_since(now); + tokio::time::sleep(STRAGGLER_SETTLE_STEP.min(remaining)).await; - // No new data since last step — burst is over. - if tcp_bytes == prev_tcp_bytes && udp_pkts == prev_udp_pkts { - break; - } + let mut tcp_bytes: usize = 0; + let mut udp_pkts: usize = 0; + for inner in &tcp_inners { + tcp_bytes += inner.read_buf.lock().await.len(); + } + for inner in &udp_inners { + udp_pkts += inner.packets.lock().await.len(); + } + + if tcp_bytes == prev_tcp_bytes && udp_pkts == prev_udp_pkts { + break; + } - prev_tcp_bytes = tcp_bytes; - prev_udp_pkts = udp_pkts; + prev_tcp_bytes = tcp_bytes; + prev_udp_pkts = udp_pkts; + } } - } - // ---- TCP drain ---- - // Drain through each session's already-cloned Arc so the global - // sessions map lock isn't held across the per-session - // read_buf.lock().await. - // - // Cleanup is driven off `drain_now`'s returned `eof`, NOT the - // raw `inner.eof` atomic. When the buffer exceeds - // `TCP_DRAIN_MAX_BYTES`, `drain_now` deliberately returns - // `eof = false` and leaves the tail in the buffer so the - // client can pick it up on the next poll. The previous cleanup - // read the atomic directly, so on a high-throughput session - // that closed mid-burst (issue #460-style) it would remove the - // session and abort the reader_task with the tail still - // buffered, dropping those bytes. - let mut tcp_eof_sids: Vec = Vec::new(); - // Track remaining batch-response budget across all session drains - // (#863). Per-session `TCP_DRAIN_MAX_BYTES` alone wasn't enough — - // several concurrent sessions each contributing 16 MiB summed past - // Apps Script's 50 MiB response ceiling. This cap stops one session - // short of the cliff; deferred sessions drain on the next poll. 
- let mut remaining_budget: usize = BATCH_RESPONSE_BUDGET; - for (i, sid, inner) in &tcp_drains { - let (data, eof) = drain_now(inner, remaining_budget).await; - let drained = data.len(); - if eof { - tcp_eof_sids.push(sid.clone()); + // ---- TCP drain ---- + // Cleanup is driven off `drain_now`'s returned `eof`, NOT + // the raw `inner.eof` atomic. When the buffer exceeds + // `TCP_DRAIN_MAX_BYTES`, `drain_now` deliberately returns + // `eof = false` and leaves the tail in the buffer so the + // client can pick it up on the next poll. + // + // On budget exhaustion, push an empty (eof=false) + // response per remaining drain rather than `break`-ing — + // every (i, sid) registered as a drain corresponds to an + // op the client expects a reply for. Skipping pushes + // would surface as "missing response in batch" client- + // side, and for connect_data would leak a freshly-opened + // server session (the client never learns the sid, so + // never sends `close`). + for drain in &tcp_drains { + // Reserve at most `TCP_DRAIN_MAX_BYTES`, not the + // entire remaining budget. Mirrors the seq path + // (`process_seq_data_op`): under mixed legacy + + // seq batches, setting budget to 0 here would + // make concurrently-running seq jobs observe + // `take == 0` and return empty responses for an + // op slot whose own drain would have fit in the + // budget once we refunded the unused portion a + // moment later. With the per-iteration cap, seq + // jobs continue to see headroom up to + // `TCP_DRAIN_MAX_BYTES` worth of budget while + // this drain is in flight. + let take = { + let mut b = response_budget_p2.lock().await; + let take = (*b).min(TCP_DRAIN_MAX_BYTES); + *b = b.saturating_sub(take); + take + }; + let mut resp = if take == 0 { + // Budget exhausted by sibling drains (legacy or + // seq). Push an empty placeholder so the client + // sees a reply for this index; the buffered + // bytes stay for the next poll. Don't surface + // raw inner.eof here for the same reason + // `drain_now` withholds it when bytes remain — + // closing the session client-side would drop + // the buffered tail. + tcp_drain_response(drain.sid.clone(), Vec::new(), false) + } else { + let (data, eof) = drain_now(&drain.inner, take).await; + let drained = data.len(); + if drained < take { + *response_budget_p2.lock().await += take - drained; + } + if eof { + tcp_eof_sids.push(drain.sid.clone()); + } + tcp_drain_response(drain.sid.clone(), data, eof) + }; + // Stamp caps on the connect_data first reply so the + // batched HTTPS fast path enables pipelining (matches + // `handle_connect_data_single`'s behavior). Plain + // `data` drains don't carry caps — they're not + // session-establishment replies. + if drain.is_connect_data { + resp.caps = Some(CAPS_PIPELINE_SEQ); + } + p2_results.push((drain.i, resp)); } - results.push((*i, tcp_drain_response(sid.clone(), data, eof))); - remaining_budget = remaining_budget.saturating_sub(drained); - if remaining_budget == 0 { - // Budget exhausted; remaining sessions in `tcp_drains` keep - // their buffered data and pick up next batch. - break; + + // ---- UDP drain ---- + // Same shape as TCP. `drain_udp_now` currently drains + // the full queue with no per-batch cap, so its returned + // `eof` already matches the atomic — driving cleanup + // off the drain return rather than the atomic catches a + // future per-batch UDP cap if added. 
+ for (i, sid, inner) in &udp_drains { + let (packets, eof) = drain_udp_now(inner).await; + if eof { + udp_eof_sids.push(sid.clone()); + } + p2_results.push((*i, udp_drain_response(sid.clone(), packets, eof))); } } - if !tcp_eof_sids.is_empty() { - let mut sessions = state.sessions.lock().await; - for sid in &tcp_eof_sids { - if let Some(s) = sessions.remove(sid) { - s.reader_handle.abort(); - tracing::info!("session {} closed by remote (batch)", sid); - } + + (p2_results, tcp_eof_sids, udp_eof_sids) + }; + + let seq_fut = async move { + let mut out: Vec<(usize, TunnelResponse)> = Vec::new(); + while let Some(join) = seq_data_jobs.join_next().await { + match join { + Ok((i, resp)) => out.push((i, resp)), + Err(e) => tracing::error!("seq-data task panicked: {}", e), } } + out + }; - // ---- UDP drain ---- - // Same shape as TCP. `drain_udp_now` currently drains the full - // queue with no per-batch cap, so its returned `eof` already - // matches the atomic — driving cleanup off the drain return - // is future-proofing: if a UDP per-batch packet cap is ever - // added (mirroring `TCP_DRAIN_MAX_BYTES`), the same data-loss - // trap that motivated the TCP-side fix reappears, and tracking - // eof from the drain return rather than the atomic catches it. - let mut udp_eof_sids: Vec = Vec::new(); - for (i, sid, inner) in &udp_drains { - let (packets, eof) = drain_udp_now(inner).await; - if eof { - udp_eof_sids.push(sid.clone()); + let ((p2_results, p2_tcp_eof_sids, p2_udp_eof_sids), seq_results) = + tokio::join!(phase_2_fut, seq_fut); + + // Process deferred close ops AFTER all seq-data jobs and Phase + // 2 drains have completed. Earlier same-sid seq ops have now + // written their uplink and produced their drain responses, so + // the close can safely tear down the session without racing + // them. Running close inline during dispatch (the pre-fix + // shape) removed the session from `state.sessions` and aborted + // `reader_task` while same-sid seq jobs were still about to + // run on their cloned `Arc`. + for (i, sid) in pending_closes { + let r = handle_close(&state, sid).await; + results.push((i, r)); + } + + // Collect seq-side eof'd sids the same way Phase 2 does, so an + // upstream-closed pipelined session is reaped immediately on + // its eof reply rather than waiting for the idle reaper. + let seq_tcp_eof_sids: Vec = seq_results + .iter() + .filter_map(|(_, r)| { + if r.eof.unwrap_or(false) { + r.sid.clone() + } else { + None + } + }) + .collect(); + + results.extend(p2_results); + results.extend(seq_results); + + // Combined TCP eof cleanup — both legacy Phase 2 and seq paths + // can produce eof'd sids; remove them in a single sessions-map + // lock acquisition. 
+ if !p2_tcp_eof_sids.is_empty() || !seq_tcp_eof_sids.is_empty() { + let mut sessions = state.sessions.lock().await; + for sid in p2_tcp_eof_sids.iter().chain(seq_tcp_eof_sids.iter()) { + if let Some(s) = sessions.remove(sid) { + s.reader_handle.abort(); + tracing::info!("session {} closed by remote (batch)", sid); } - results.push((*i, udp_drain_response(sid.clone(), packets, eof))); } - if !udp_eof_sids.is_empty() { - let mut sessions = state.udp_sessions.lock().await; - for sid in &udp_eof_sids { - if let Some(s) = sessions.remove(sid) { - s.reader_handle.abort(); - tracing::info!("udp session {} closed by remote (batch)", sid); - } + } + if !p2_udp_eof_sids.is_empty() { + let mut sessions = state.udp_sessions.lock().await; + for sid in &p2_udp_eof_sids { + if let Some(s) = sessions.remove(sid) { + s.reader_handle.abort(); + tracing::info!("udp session {} closed by remote (batch)", sid); } } } @@ -1144,17 +1701,27 @@ async fn handle_batch( }; let json = serde_json::to_vec(&batch_resp).unwrap_or_default(); - (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], json) + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/json")], + json, + ) } fn tcp_drain_response(sid: String, data: Vec, eof: bool) -> TunnelResponse { TunnelResponse { sid: Some(sid), - d: if data.is_empty() { None } else { Some(B64.encode(&data)) }, + d: if data.is_empty() { + None + } else { + Some(B64.encode(&data)) + }, pkts: None, eof: Some(eof), e: None, code: None, + seq: None, + caps: None, } } @@ -1171,6 +1738,8 @@ fn udp_drain_response(sid: String, packets: Vec>, eof: bool) -> TunnelRe eof: Some(eof), e: None, code: None, + seq: None, + caps: None, } } @@ -1182,6 +1751,316 @@ fn eof_response(sid: String) -> TunnelResponse { eof: Some(true), e: None, code: None, + seq: None, + caps: None, + } +} + +/// RAII guard that advances `expected` and notifies waiters on drop. +/// Holding this across the entire (write, drain) sequence guarantees +/// that subsequent seqs unblock on EVERY exit path, including early +/// returns from write/flush failures. Without it, an upstream socket +/// error mid-pipeline would leave `expected` frozen at the failed +/// seq, stranding all later seqs until each hit `SEQ_WAIT_TIMEOUT` +/// (~30 s) — pushing well-formed batches into client/Apps-Script +/// timeout territory. +struct SeqAdvanceOnDrop<'a> { + state: Option>, + notify: &'a Notify, +} + +impl<'a> Drop for SeqAdvanceOnDrop<'a> { + fn drop(&mut self) { + if let Some(mut s) = self.state.take() { + s.expected = s.expected.saturating_add(1); + // s drops here, releasing the seq_state lock before the + // notify_waiters() call below. Order matters: waiters + // wake and immediately try to lock seq_state, so they + // need to see the bumped value, which means the lock + // release has to happen before the wake. + } + self.notify.notify_waiters(); + } +} + +/// Seq-ordered `data` op processing for the pipelining protocol. +/// +/// The contract: when a `data` op carries a `seq`, the tunnel-node +/// must process the entire (write, wait, drain) sequence for the +/// session in monotonically increasing seq order. Two ops can travel +/// from the client through different deployments/network paths and +/// arrive here out of order — without ordering, the client's +/// per-session reorder buffer would see seq-N's reply containing +/// bytes that should have followed seq-N+1's, silently corrupting +/// the stream. 
+/// +/// Concurrency: the seq lock (held inside `SeqAdvanceOnDrop`) wraps +/// the *entire* (write, wait_for_drainable, drain) sequence and is +/// released — with `expected` advanced — on every exit path via +/// the guard's `Drop`. This guarantees per-session serialization of +/// drains as well as writes, AND ensures a write/flush error doesn't +/// strand later seqs behind a never-bumped `expected`. +/// +/// Drain budget: callers thread a shared `Arc>` through +/// every seq-data job in a batch (and through Phase 2's +/// `tcp_drains` loop) so the total bytes across all sessions in +/// one batch response stay under `BATCH_RESPONSE_BUDGET`. Without +/// this, M concurrent pipelined sessions would each return up to +/// `TCP_DRAIN_MAX_BYTES`, stacking past Apps Script's 50 MiB ceiling +/// and triggering truncation / parse failures (#863). +/// +/// `had_active_in_batch` propagates the dispatch loop's +/// `had_writes_or_connects` flag so an idle pipelined poll +/// coalesced with active sibling ops uses `ACTIVE_DRAIN_DEADLINE` +/// instead of `LONGPOLL_DEADLINE` — without this, a single empty +/// seq op can hold the whole batch response open for 15 s. +/// +/// `batch_wait` is the shared wait primitive for the whole batch. +/// Every seq job and Phase 2's TCP drain subscribe to its single +/// `Notify`; one push from any session's `reader_task` fans out to +/// `notify_waiters()` and wakes every parked job at once. Without +/// this shared primitive, an idle empty pipelined poll for session +/// B held the entire batch response open up to `LONGPOLL_DEADLINE`, +/// even if session A already had bytes another job was ready to +/// ship — and the previous per-job watcher design also hit the +/// `notify_one` race where a single push only woke one of N parked +/// jobs. +async fn process_seq_data_op( + inner: Arc, + sid: String, + seq: u64, + data_b64: Option, + response_budget: Arc>, + had_active_in_batch: bool, + batch_wait: Arc, +) -> TunnelResponse { + // Decode uplink first so a malformed payload doesn't burn a seq slot. + let uplink = match data_b64 { + Some(ref d) if !d.is_empty() => match B64.decode(d) { + Ok(b) => Some(b), + Err(e) => { + let mut r = TunnelResponse::error(format!("bad base64: {}", e)); + r.seq = Some(seq); + return r; + } + }, + _ => None, + }; + let had_uplink = uplink.as_ref().map(|b| !b.is_empty()).unwrap_or(false); + + // Wait for our seq turn. Returns the seq-state guard *without* + // bumping `expected` — we hold it across the entire op so drain + // serialization is guaranteed. + let seq_guard = match wait_for_seq_turn(&inner, seq, SEQ_WAIT_TIMEOUT).await { + Ok(g) => g, + Err(e) => { + let mut r = TunnelResponse::error(e); + r.seq = Some(seq); + return r; + } + }; + + // From here on, the SeqAdvanceOnDrop guard owns the seq_state + // MutexGuard and bumps `expected` + notify_waiters() on every + // return path — including the early-exit error returns below + // (write/flush failures). Without this, an upstream write + // failure would strand later seqs behind a never-bumped + // `expected` until each hit `SEQ_WAIT_TIMEOUT`. + let _advance = SeqAdvanceOnDrop { + state: Some(seq_guard), + notify: &inner.seq_advance, + }; + + // Refresh last_active on EVERY pipelined op, including empty + // polls. Matches the legacy non-seq `data` branch (line ~984): + // the idle reaper would otherwise close a long-lived session + // that's actively being long-polled by the client (no uplink, + // no recent inbound = stale, even though the client is still + // engaged). 
+ *inner.last_active.lock().await = Instant::now(); + + // Write uplink under the writer lock. The writer lock is a + // sub-lock here — only acquired when we actually have bytes to + // write. Lock order: seq → writer (no other path takes seq + // before writer, so this can't deadlock with the legacy non-seq + // data path which only takes writer). + if let Some(bytes) = uplink { + if !bytes.is_empty() { + let mut writer = inner.writer.lock().await; + if let Err(e) = writer.write_all(&bytes).await { + drop(writer); + let mut r = TunnelResponse::error(format!("write failed: {}", e)); + r.seq = Some(seq); + return r; + } + if let Err(e) = writer.flush().await { + drop(writer); + let mut r = TunnelResponse::error(format!("flush failed: {}", e)); + r.seq = Some(seq); + return r; + } + } + } + + // Pick the drain deadline AND the wait shape based on whether + // this op is active. + // + // ACTIVE (`had_uplink == true`): we just wrote uplink bytes; the + // response we want is THIS session's reply, not anyone else's. + // Wait per-session up to `ACTIVE_DRAIN_DEADLINE`, then run a + // straggler-settle loop (mirroring Phase 2's behavior) so a + // multi-packet response from upstream lands in one drain + // instead of being split across two batches. Subscribing to + // the shared batch_wait here would be wrong: a sibling + // session's bytes could short-circuit our wait, and we'd + // return empty for an active op that just hadn't gotten its + // own reply yet. + // + // IDLE (`had_uplink == false`): empty poll. If the batch has + // any active siblings, use `ACTIVE_DRAIN_DEADLINE` to keep the + // whole batch responsive; otherwise long-poll up to + // `LONGPOLL_DEADLINE`. Either way, subscribe to the SHARED + // `BatchWait` so a sibling getting bytes wakes us — that's + // the whole point of the shared wake design. + if had_uplink { + wait_for_any_drainable(std::slice::from_ref(&inner), ACTIVE_DRAIN_DEADLINE).await; + // Active straggler settle: keep waiting in 10 ms steps + // while bytes keep arriving, capped at STRAGGLER_SETTLE_MAX + // (~1 s). Without this, a multi-packet response that + // arrives a few ms apart gets split across this batch's + // drain and the next, doubling Apps Script round-trips + // for the same logical exchange. + let settle_end = Instant::now() + STRAGGLER_SETTLE_MAX; + let mut prev_len = inner.read_buf.lock().await.len(); + let raw_eof = inner.eof.load(Ordering::Acquire); + if !raw_eof { + loop { + let now = Instant::now(); + if now >= settle_end { + break; + } + let remaining = settle_end - now; + tokio::time::sleep(STRAGGLER_SETTLE_STEP.min(remaining)).await; + let cur_len = inner.read_buf.lock().await.len(); + let cur_eof = inner.eof.load(Ordering::Acquire); + if cur_len == prev_len && !cur_eof { + break; + } + prev_len = cur_len; + if cur_eof { + break; + } + } + } + } else { + let deadline = if had_active_in_batch { + ACTIVE_DRAIN_DEADLINE + } else { + LONGPOLL_DEADLINE + }; + if batch_wait.inners.is_empty() { + wait_for_any_drainable(std::slice::from_ref(&inner), deadline).await; + } else { + batch_wait.wait(deadline).await; + } + } + + // Reserve our share of the batch-wide response budget, drain + // exactly that much (or less), and return any unused budget so + // sibling tasks can use it. `drain_now` already handles the + // per-session `TCP_DRAIN_MAX_BYTES` cap and — critically — leaves + // the tail in the buffer when over the cap, so no bytes are lost. 
+ let take = { + let mut budget = response_budget.lock().await; + let take = (*budget).min(TCP_DRAIN_MAX_BYTES); + *budget = budget.saturating_sub(take); + take + }; + let (data, eof) = if take == 0 { + // Batch budget already exhausted by sibling drains. Don't + // drain any bytes here — they'll come back on the next poll. + // Critically, withhold EOF when the buffer still contains + // bytes: reporting eof=true here would close the session + // client-side and silently drop the buffered tail. This + // mirrors `drain_now`'s "withhold EOF until tail is drained" + // semantics; without it, the budget-exhaustion path becomes + // a data-loss bug on high-throughput pipelined sessions. + let buf_empty = inner.read_buf.lock().await.is_empty(); + let raw_eof = inner.eof.load(Ordering::Acquire); + let safe_eof = raw_eof && buf_empty; + (Vec::new(), safe_eof) + } else { + let (d, e) = drain_now(&inner, take).await; + if d.len() < take { + *response_budget.lock().await += take - d.len(); + } + (d, e) + }; + + // _advance drops here on the success path, bumping expected and + // notifying waiters. The same drop runs on every early return + // above. + let mut resp = tcp_drain_response(sid, data, eof); + resp.seq = Some(seq); + resp +} + +/// Wait for our `seq` to be the next-expected for this session and +/// return the seq-state guard. The caller HOLDS the guard across the +/// entire (write, wait_for_drainable, drain) sequence and only +/// releases it after bumping `expected` — that's what enforces both +/// uplink ordering AND drain ordering on a single shared +/// `read_buf`. An earlier shape that released the guard before the +/// drain phase let two concurrently-running seq tasks race for the +/// read_buf lock; a reader_task push between their two final takes +/// could split bytes across replies in reverse seq order, silently +/// corrupting the stream. +async fn wait_for_seq_turn<'a>( + inner: &'a SessionInner, + seq: u64, + timeout: Duration, +) -> Result, String> { + let start = Instant::now(); + loop { + // Subscribe to the notify *before* taking the seq_state + // lock. `Notify::notify_waiters()` does NOT save a permit + // for waiters that aren't yet registered, so the obvious + // `lock → check → drop → notified()` order races: an + // advance between drop and notified() is lost, and the + // waiter sleeps until SEQ_WAIT_TIMEOUT. `enable()` registers + // the future so any subsequent `notify_waiters()` wakes it, + // even if the wake fires before we reach `.await` below. + let notified = inner.seq_advance.notified(); + tokio::pin!(notified); + notified.as_mut().enable(); + + let state = inner.seq_state.lock().await; + if seq < state.expected { + // Late arrival — likely a client retry of a seq we + // already processed. Drop it; the client's earlier + // reply (if it arrived) handled this op. + return Err(format!( + "seq {} already processed (expected {})", + seq, state.expected + )); + } + if seq == state.expected { + return Ok(state); + } + // seq > expected: drop the seq lock and wait for an advance. + let current_expected = state.expected; + drop(state); + let elapsed = start.elapsed(); + if elapsed >= timeout { + return Err(format!( + "seq {} timed out waiting for {} (waited {:?})", + seq, current_expected, elapsed + )); + } + // The pre-enabled `notified` will resolve immediately if a + // notify_waiters() fired between our enable() and here. + let _ = tokio::time::timeout(timeout - elapsed, notified).await; + // Re-check at top of loop after wake / timeout. 
} } @@ -1212,7 +2091,11 @@ fn validate_host_port( Ok((host, port)) } -async fn handle_connect(state: &AppState, host: Option, port: Option) -> TunnelResponse { +async fn handle_connect( + state: &AppState, + host: Option, + port: Option, +) -> TunnelResponse { let (host, port) = match validate_host_port(host, port) { Ok(v) => v, Err(r) => return r, @@ -1228,7 +2111,19 @@ async fn handle_connect(state: &AppState, host: Option, port: Option {}:{}", sid, host, port); state.sessions.lock().await.insert(sid.clone(), session); - TunnelResponse { sid: Some(sid), d: None, pkts: None, eof: Some(false), e: None, code: None } + // Advertise pipelining support on the connect reply so the client + // knows it can opt into per-session `data`-op sequence numbers + // for this session. Old clients ignore the unknown `caps` field. + TunnelResponse { + sid: Some(sid), + d: None, + pkts: None, + eof: Some(false), + e: None, + code: None, + seq: None, + caps: Some(CAPS_PIPELINE_SEQ), + } } /// Open a session and write the client's first bytes in one round trip. @@ -1343,17 +2238,31 @@ async fn handle_connect_data_single( tracing::info!("session {} closed by remote", sid); } } + // Same caps advertisement as `handle_connect`. The single-op + // `connect_data` path is the optimistic-bundling fast path — + // surfacing capabilities here means a client opting into both + // optimizations learns about pipelining on the very first reply. TunnelResponse { sid: Some(sid), - d: if data.is_empty() { None } else { Some(B64.encode(&data)) }, + d: if data.is_empty() { + None + } else { + Some(B64.encode(&data)) + }, pkts: None, eof: Some(eof), e: None, code: None, + seq: None, + caps: Some(CAPS_PIPELINE_SEQ), } } -async fn handle_data_single(state: &AppState, sid: Option, data: Option) -> TunnelResponse { +async fn handle_data_single( + state: &AppState, + sid: Option, + data: Option, +) -> TunnelResponse { let sid = match sid { Some(s) if !s.is_empty() => s, _ => return TunnelResponse::error("missing sid"), @@ -1396,9 +2305,17 @@ async fn handle_data_single(state: &AppState, sid: Option, data: Option< } TunnelResponse { sid: Some(sid), - d: if data.is_empty() { None } else { Some(B64.encode(&data)) }, + d: if data.is_empty() { + None + } else { + Some(B64.encode(&data)) + }, pkts: None, - eof: Some(eof), e: None, code: None, + eof: Some(eof), + e: None, + code: None, + seq: None, + caps: None, } } @@ -1415,7 +2332,16 @@ async fn handle_close(state: &AppState, sid: Option) -> TunnelResponse { s.reader_handle.abort(); tracing::info!("udp session {} closed by client", sid); } - TunnelResponse { sid: Some(sid), d: None, pkts: None, eof: Some(true), e: None, code: None } + TunnelResponse { + sid: Some(sid), + d: None, + pkts: None, + eof: Some(true), + e: None, + code: None, + seq: None, + caps: None, + } } // --------------------------------------------------------------------------- @@ -1472,11 +2398,7 @@ async fn cleanup_task( } } if !stale.is_empty() { - tracing::info!( - "cleanup: reaped {}, {} active udp", - stale.len(), - map.len() - ); + tracing::info!("cleanup: reaped {}, {} active udp", stale.len(), map.len()); } } } @@ -1490,8 +2412,7 @@ async fn cleanup_task( async fn main() { tracing_subscriber::fmt() .with_env_filter( - tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| "info".into()), + tracing_subscriber::EnvFilter::try_from_default_env().unwrap_or_else(|_| "info".into()), ) .init(); @@ -1684,9 +2605,8 @@ mod tests { #[tokio::test] async fn connect_data_rejects_missing_host() { let 
state = fresh_state(); - let resp = handle_connect_data_single( - &state, None, Some(443), Some(B64.encode(b"x")), - ).await; + let resp = + handle_connect_data_single(&state, None, Some(443), Some(B64.encode(b"x"))).await; assert!(resp.e.as_deref().unwrap_or("").contains("missing host")); assert!(state.sessions.lock().await.is_empty()); } @@ -1736,6 +2656,8 @@ mod tests { eof: AtomicBool::new(false), last_active: Mutex::new(Instant::now()), notify: Notify::new(), + seq_state: Mutex::new(SeqState { expected: 0 }), + seq_advance: Notify::new(), }) } @@ -1771,11 +2693,7 @@ mod tests { // cap path does. let inner = fake_inner().await; // 1 MiB buffered, but caller only has 256 KiB budget left. - inner - .read_buf - .lock() - .await - .resize(1024 * 1024, 0xcd); + inner.read_buf.lock().await.resize(1024 * 1024, 0xcd); let (drained, eof) = drain_now(&inner, 256 * 1024).await; assert_eq!(drained.len(), 256 * 1024); @@ -1793,7 +2711,11 @@ mod tests { #[tokio::test] async fn drain_now_passes_through_when_under_cap() { let inner = fake_inner().await; - inner.read_buf.lock().await.extend_from_slice(b"hello world"); + inner + .read_buf + .lock() + .await + .extend_from_slice(b"hello world"); let (data, eof) = drain_now(&inner, usize::MAX).await; assert_eq!(data, b"hello world"); @@ -1826,7 +2748,11 @@ mod tests { #[tokio::test] async fn wait_for_any_drainable_returns_immediately_when_buffer_has_data() { let inner = fake_inner().await; - inner.read_buf.lock().await.extend_from_slice(b"already here"); + inner + .read_buf + .lock() + .await + .extend_from_slice(b"already here"); let t0 = Instant::now(); wait_for_any_drainable(&[inner], Duration::from_secs(5)).await; @@ -1987,6 +2913,8 @@ mod tests { eof: AtomicBool::new(false), last_active: Mutex::new(Instant::now()), notify: Notify::new(), + seq_state: Mutex::new(SeqState { expected: 0 }), + seq_advance: Notify::new(), }); let _reader_handle = tokio::spawn(reader_task(reader, inner.clone())); @@ -2060,7 +2988,9 @@ mod tests { let resp = handle_batch(State(state.clone()), Bytes::from(body)) .await .into_response(); - let body_bytes = axum::body::to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let body_bytes = axum::body::to_bytes(resp.into_body(), usize::MAX) + .await + .unwrap(); serde_json::from_slice(&body_bytes).unwrap() } @@ -2070,10 +3000,7 @@ mod tests { /// at ACTIVE_DRAIN_DEADLINE (350 ms) with no data. 
#[tokio::test] async fn batch_pure_poll_wakes_on_push() { - let push_port = start_push_server( - Duration::from_millis(150), - b"PUSHED".to_vec(), - ).await; + let push_port = start_push_server(Duration::from_millis(150), b"PUSHED".to_vec()).await; let state = fresh_state(); let connect_resp = handle_connect(&state, Some("127.0.0.1".into()), Some(push_port)).await; let sid = connect_resp.sid.expect("connect should succeed"); @@ -2081,7 +3008,8 @@ mod tests { let body = serde_json::to_vec(&serde_json::json!({ "k": "test-key", "ops": [{"op": "data", "sid": sid}], - })).unwrap(); + })) + .unwrap(); let t0 = Instant::now(); let resp = invoke_handle_batch(&state, body).await; @@ -2099,7 +3027,9 @@ mod tests { ); let r = resp["r"].as_array().expect("response must be an array"); - let d_b64 = r[0]["d"].as_str().expect("response should carry pushed bytes"); + let d_b64 = r[0]["d"] + .as_str() + .expect("response should carry pushed bytes"); let data = B64.decode(d_b64).unwrap(); assert_eq!(&data[..], b"PUSHED"); } @@ -2111,13 +3041,15 @@ mod tests { async fn batch_active_caps_at_active_deadline() { let silent_port = start_silent_server().await; let state = fresh_state(); - let connect_resp = handle_connect(&state, Some("127.0.0.1".into()), Some(silent_port)).await; + let connect_resp = + handle_connect(&state, Some("127.0.0.1".into()), Some(silent_port)).await; let sid = connect_resp.sid.expect("connect should succeed"); let body = serde_json::to_vec(&serde_json::json!({ "k": "test-key", "ops": [{"op": "data", "sid": sid, "d": B64.encode(b"PING")}], - })).unwrap(); + })) + .unwrap(); let t0 = Instant::now(); let _resp = invoke_handle_batch(&state, body).await; @@ -2146,10 +3078,7 @@ mod tests { /// long-poll actually engaged. #[tokio::test] async fn batch_empty_string_payload_engages_long_poll() { - let push_port = start_push_server( - Duration::from_millis(600), - b"DELAYED".to_vec(), - ).await; + let push_port = start_push_server(Duration::from_millis(600), b"DELAYED".to_vec()).await; let state = fresh_state(); let connect_resp = handle_connect(&state, Some("127.0.0.1".into()), Some(push_port)).await; let sid = connect_resp.sid.expect("connect should succeed"); @@ -2157,7 +3086,8 @@ mod tests { let body = serde_json::to_vec(&serde_json::json!({ "k": "test-key", "ops": [{"op": "data", "sid": sid, "d": ""}], - })).unwrap(); + })) + .unwrap(); let t0 = Instant::now(); let resp = invoke_handle_batch(&state, body).await; @@ -2175,7 +3105,8 @@ mod tests { ); let r = resp["r"].as_array().unwrap(); - let d_b64 = r[0]["d"].as_str() + let d_b64 = r[0]["d"] + .as_str() .expect("Some(\"\") payload should have engaged long-poll and delivered DELAYED"); let data = B64.decode(d_b64).unwrap(); assert_eq!(&data[..], b"DELAYED"); @@ -2238,8 +3169,17 @@ mod tests { } let drops = inner.queue_drops.load(Ordering::Relaxed); let queued = inner.packets.lock().await.len(); - assert!(drops >= 1, "expected ≥1 drop, got {} (queued={})", drops, queued); - assert!(queued <= UDP_QUEUE_LIMIT, "queue exceeded limit: {}", queued); + assert!( + drops >= 1, + "expected ≥1 drop, got {} (queued={})", + drops, + queued + ); + assert!( + queued <= UDP_QUEUE_LIMIT, + "queue exceeded limit: {}", + queued + ); } /// Regression for the bug the review caught: a batch mixing UDP and @@ -2466,11 +3406,7 @@ mod tests { // TCP session with bytes already buffered → immediately drainable. 
let tcp_inner = fake_inner().await; - tcp_inner - .read_buf - .lock() - .await - .extend_from_slice(b"ready"); + tcp_inner.read_buf.lock().await.extend_from_slice(b"ready"); let tcp_sid = "tcp-sid".to_string(); state.sessions.lock().await.insert( tcp_sid.clone(), @@ -2485,15 +3421,11 @@ mod tests { // succeeds; we just never send anything to it. let udp_target = UdpSocket::bind(("127.0.0.1", 0)).await.unwrap(); let udp_port = udp_target.local_addr().unwrap().port(); - let (udp_sid, _udp_inner) = handle_udp_open_phase1( - &state, - Some("127.0.0.1".into()), - Some(udp_port), - None, - ) - .await - .expect("udp open"); - + let (udp_sid, _udp_inner) = + handle_udp_open_phase1(&state, Some("127.0.0.1".into()), Some(udp_port), None) + .await + .expect("udp open"); + // Pure-poll batch (no `d` payload) → had_writes_or_connects = // false → deadline = LONGPOLL_DEADLINE (15 s). Under the // previous tokio::join! wait, the UDP arm would have held the @@ -2521,4 +3453,1175 @@ mod tests { elapsed, ); } + + /// Helper for the seq tests: bump `expected` from N to N+1 the + /// way `process_seq_data_op` does — under the same guard, with a + /// notify_waiters() afterward. Lets unit tests exercise the same + /// state machine without spinning up the full op handler. + async fn release_seq(inner: &SessionInner, mut guard: tokio::sync::MutexGuard<'_, SeqState>) { + guard.expected = guard.expected.saturating_add(1); + drop(guard); + inner.seq_advance.notify_waiters(); + } + + /// Sanity-check: a fresh session starts with `expected = 0`, + /// so seq=0's `wait_for_seq_turn` succeeds immediately. After + /// the simulated release, expected advances to 1. + #[tokio::test] + async fn wait_for_seq_turn_succeeds_when_in_order() { + let inner = fake_inner().await; + let guard = wait_for_seq_turn(&inner, 0, Duration::from_secs(2)) + .await + .expect("seq=0 should claim immediately on fresh session"); + release_seq(&inner, guard).await; + let state = inner.seq_state.lock().await; + assert_eq!(state.expected, 1); + } + + /// The headline correctness invariant: when seq=N+1 arrives at the + /// server before seq=N (different batches taking different paths), + /// the N+1 task waits inside `wait_for_seq_turn` until the N task + /// lands and bumps `expected`. Without this, the per-session + /// reorder buffer on the client would see a reply for seq=N+1 + /// containing bytes that should have followed seq=N's bytes — + /// silent data corruption. + #[tokio::test] + async fn wait_for_seq_turn_blocks_until_earlier_seq_lands() { + let inner = fake_inner().await; + + // Spawn a seq=2 claim — should block (expected is 0). + let inner_for_2 = inner.clone(); + let claim_2 = tokio::spawn(async move { + let g = wait_for_seq_turn(&inner_for_2, 2, Duration::from_secs(5)).await?; + release_seq(&inner_for_2, g).await; + Ok::<(), String>(()) + }); + + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + !claim_2.is_finished(), + "seq=2 must not proceed before seqs 0 and 1 land", + ); + + // Land seq=0. + let g0 = wait_for_seq_turn(&inner, 0, Duration::from_secs(2)) + .await + .expect("seq=0 should claim"); + release_seq(&inner, g0).await; + + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + !claim_2.is_finished(), + "seq=2 must not proceed with only seq=0 landed (expected==1)", + ); + + // Land seq=1. 
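+        // Releasing it bumps `expected` to 2, which is what lets the
+        // parked seq=2 claim proceed.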
+ let g1 = wait_for_seq_turn(&inner, 1, Duration::from_secs(2)) + .await + .expect("seq=1 should claim"); + release_seq(&inner, g1).await; + + let result = tokio::time::timeout(Duration::from_secs(2), claim_2) + .await + .expect("seq=2 must wake within timeout once seq=1 lands") + .expect("task should not panic"); + result.expect("seq=2 claim should succeed"); + } + + /// Late arrivals (seq < expected) — typically a duplicate retry + /// from the client — must return an error so the caller can echo + /// it as a "skipped" reply instead of double-writing the upstream. + #[tokio::test] + async fn wait_for_seq_turn_rejects_already_processed_seq() { + let inner = fake_inner().await; + inner.seq_state.lock().await.expected = 5; + let r = wait_for_seq_turn(&inner, 3, Duration::from_secs(1)).await; + assert!( + r.is_err(), + "seq=3 should be rejected when expected has already advanced to 5", + ); + } + + /// Hard timeout when an earlier seq simply never arrives (client + /// crashed mid-pipeline, batch dropped, etc.). Without this the + /// task would wait forever and the session would leak until the + /// idle reaper fires. + #[tokio::test] + async fn wait_for_seq_turn_times_out_when_earlier_seq_never_arrives() { + let inner = fake_inner().await; + let r = wait_for_seq_turn(&inner, 3, Duration::from_millis(100)).await; + assert!(r.is_err(), "seq=3 must time out when seqs 0..3 never land",); + } + + /// Two pipelined seq ops for the same session must not race for + /// `read_buf` — the seq lock has to stay held across the entire + /// (write, drain) sequence. Pre-buffer enough bytes that the + /// drain has multiple kernel-FIFO chunks to take, then run two + /// `process_seq_data_op` calls concurrently. The reply with the + /// lower seq must contain the earlier prefix of the buffered + /// bytes; if the lock granularity regressed, both replies can + /// take from the same buffer in arrival-of-final-lock order + /// rather than seq order. + #[tokio::test] + async fn process_seq_data_drains_in_seq_order_under_concurrency() { + let inner = fake_inner().await; + // Pre-load 200 bytes — 100 each so we can clearly attribute + // which prefix landed where if ordering breaks. + { + let mut buf = inner.read_buf.lock().await; + buf.extend_from_slice(&vec![0xAAu8; 100]); + buf.extend_from_slice(&vec![0xBBu8; 100]); + } + // Mark eof so seq=1's wait_for_any_drainable returns + // immediately on empty buffer instead of paying the full + // LONGPOLL_DEADLINE waiting for non-existent data. + inner.eof.store(true, Ordering::Release); + + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + + // Spawn seq=1 first — it should block waiting for seq=0. + let inner_1 = inner.clone(); + let budget_1 = budget.clone(); + let task_1 = tokio::spawn(async move { + process_seq_data_op( + inner_1, + "sid".into(), + 1, + None, + budget_1, + false, + BatchWait::new(Vec::new()), + ) + .await + }); + // Brief wait so seq=1 is definitely parked on its seq lock. + tokio::time::sleep(Duration::from_millis(50)).await; + + // Now run seq=0. It should drain everything currently in + // read_buf (the cap is generous), then seq=1 wakes and gets + // empty (or whatever the reader pushed in between, none here). 
+ let resp_0 = process_seq_data_op( + inner.clone(), + "sid".into(), + 0, + None, + budget, + false, + BatchWait::new(Vec::new()), + ) + .await; + let resp_1 = tokio::time::timeout(Duration::from_secs(2), task_1) + .await + .expect("seq=1 must complete after seq=0") + .expect("task should not panic"); + + assert_eq!(resp_0.seq, Some(0)); + assert_eq!(resp_1.seq, Some(1)); + // seq=0's reply must carry all the buffered bytes (200 bytes, + // well under TCP_DRAIN_MAX_BYTES). seq=1 gets empty. + let d0 = resp_0.d.as_deref().expect("seq=0 should have drained data"); + let d0_bytes = B64.decode(d0).unwrap(); + assert_eq!(d0_bytes.len(), 200); + // Verify byte order within seq=0's reply (kernel-FIFO). + assert!(d0_bytes[..100].iter().all(|&b| b == 0xAA)); + assert!(d0_bytes[100..].iter().all(|&b| b == 0xBB)); + assert!( + resp_1.d.as_deref().map(str::is_empty).unwrap_or(true), + "seq=1 must observe empty buffer after seq=0 drained — \ + non-empty here means the drains raced", + ); + } + + /// `wait_and_drain` empties the whole buffer; `process_seq_data_op` + /// must use `drain_now` (or its equivalent) so any tail beyond + /// the per-session / per-batch cap is left buffered for the next + /// op. Without this, oversized read_bufs silently drop bytes — a + /// data-loss regression on high-throughput pipelined sessions. + #[tokio::test] + async fn process_seq_data_preserves_tail_when_over_cap() { + let inner = fake_inner().await; + let oversized = TCP_DRAIN_MAX_BYTES + 4096; + inner.read_buf.lock().await.resize(oversized, 0xCD); + + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + let resp = process_seq_data_op( + inner.clone(), + "sid".into(), + 0, + None, + budget, + false, + BatchWait::new(Vec::new()), + ) + .await; + + // Reply should carry exactly TCP_DRAIN_MAX_BYTES of the prefix. + let d = resp.d.as_deref().expect("seq=0 should have data"); + let d_bytes = B64.decode(d).unwrap(); + assert_eq!(d_bytes.len(), TCP_DRAIN_MAX_BYTES); + // The 4 KiB tail must remain in the buffer for the next op. + let remaining = inner.read_buf.lock().await.len(); + assert_eq!( + remaining, 4096, + "tail beyond TCP_DRAIN_MAX_BYTES must stay in read_buf for the next \ + op — silently dropping it (the wait_and_drain regression) loses \ + bytes on high-throughput pipelined sessions", + ); + } + + /// `last_active` must be refreshed on EVERY pipelined op — including + /// empty polls — so the idle reaper doesn't close a session that's + /// being actively long-polled by the client. The legacy non-seq + /// `data` branch refreshes unconditionally; the seq path has to + /// match or long-lived idle / server-push sessions get reaped at + /// 300 s even while the client is still engaged. + #[tokio::test] + async fn process_seq_data_refreshes_last_active_on_empty_poll() { + let inner = fake_inner().await; + // Backdate last_active so we can detect a refresh. + let backdated = Instant::now() - Duration::from_secs(120); + *inner.last_active.lock().await = backdated; + + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + // Empty poll — no uplink. Use a tiny deadline so the test + // doesn't sit on LONGPOLL_DEADLINE waiting for non-existent data. + // (process_seq_data_op picks the deadline based on had_uplink, + // which is false here, so we'd actually pay LONGPOLL_DEADLINE + // = 15 s. Mark eof on the session so wait_for_any_drainable + // returns immediately.) 
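+        // In other words, the deadline itself stays LONGPOLL_DEADLINE;
+        // it's the eof mark (set just below), not a shorter deadline,
+        // that keeps this test fast.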
+ inner.eof.store(true, Ordering::Release); + let _resp = process_seq_data_op( + inner.clone(), + "sid".into(), + 0, + None, + budget, + false, + BatchWait::new(Vec::new()), + ) + .await; + + let after = *inner.last_active.lock().await; + assert!( + after > backdated, + "last_active must refresh on every seq op (was {:?}, now {:?})", + backdated, + after, + ); + } + + /// Two pipelined seq ops draining oversized buffers must share the + /// `BATCH_RESPONSE_BUDGET` so the combined response stays under + /// Apps Script's 50 MiB ceiling (#863). Without the shared budget, + /// each task takes up to TCP_DRAIN_MAX_BYTES (16 MiB), and 3+ + /// concurrent pipelined sessions would stack past the ceiling and + /// cause truncation / parse failures on the wire. + #[tokio::test] + async fn process_seq_data_shares_batch_response_budget() { + let inner_a = fake_inner().await; + let inner_b = fake_inner().await; + // Each session has a maximally oversized buffer. + inner_a + .read_buf + .lock() + .await + .resize(TCP_DRAIN_MAX_BYTES + 1024, 0xAA); + inner_b + .read_buf + .lock() + .await + .resize(TCP_DRAIN_MAX_BYTES + 1024, 0xBB); + + // Tight shared budget: well below 2 × TCP_DRAIN_MAX_BYTES. + // Forces the second drain to take less (or zero) so the + // combined response respects the cap. + let budget_cap = TCP_DRAIN_MAX_BYTES + 4096; + let budget = Arc::new(Mutex::new(budget_cap)); + + let task_a = tokio::spawn({ + let inner = inner_a.clone(); + let budget = budget.clone(); + async move { + process_seq_data_op( + inner, + "sid-a".into(), + 0, + None, + budget, + false, + BatchWait::new(Vec::new()), + ) + .await + } + }); + let task_b = tokio::spawn({ + let inner = inner_b.clone(); + let budget = budget.clone(); + async move { + process_seq_data_op( + inner, + "sid-b".into(), + 0, + None, + budget, + false, + BatchWait::new(Vec::new()), + ) + .await + } + }); + + let resp_a = tokio::time::timeout(Duration::from_secs(5), task_a) + .await + .unwrap() + .unwrap(); + let resp_b = tokio::time::timeout(Duration::from_secs(5), task_b) + .await + .unwrap() + .unwrap(); + + let len_a = resp_a + .d + .as_deref() + .map(|d| B64.decode(d).unwrap().len()) + .unwrap_or(0); + let len_b = resp_b + .d + .as_deref() + .map(|d| B64.decode(d).unwrap().len()) + .unwrap_or(0); + assert!( + len_a + len_b <= budget_cap, + "combined drained bytes ({} + {} = {}) must not exceed shared budget {}", + len_a, + len_b, + len_a + len_b, + budget_cap, + ); + } + + /// Regression: when batch budget is exhausted AND the + /// session has both buffered bytes AND eof, the seq response + /// must NOT report eof=true. Doing so would close the session + /// client-side and silently drop the buffered tail. Mirrors + /// `drain_now`'s "withhold EOF until tail is drained" semantics. + #[tokio::test] + async fn process_seq_data_withholds_eof_when_budget_exhausted_and_buffer_nonempty() { + let inner = fake_inner().await; + // Pre-buffer real bytes AND mark eof. + inner.read_buf.lock().await.extend_from_slice(b"tail-bytes"); + inner.eof.store(true, Ordering::Release); + + // Budget already at zero — simulates a sibling drain having + // consumed it before we got here. + let budget = Arc::new(Mutex::new(0usize)); + let resp = process_seq_data_op( + inner.clone(), + "sid".into(), + 0, + None, + budget, + true, + BatchWait::new(Vec::new()), + ) + .await; + + assert_eq!(resp.seq, Some(0)); + // No data drained (budget = 0). 
+ assert!( + resp.d.as_deref().map(str::is_empty).unwrap_or(true), + "no data should be returned when budget is 0", + ); + // CRITICAL: eof must NOT be true while bytes remain in the + // buffer. Reporting eof=true here is the data-loss bug — + // the client closes the session and drops the buffered tail. + assert_ne!( + resp.eof, + Some(true), + "eof must be withheld while read_buf still has bytes; \ + reporting it now would silently drop the buffered tail \ + on the client side", + ); + // The tail is still in the buffer for the next op. + assert_eq!(inner.read_buf.lock().await.as_slice(), b"tail-bytes"); + } + + /// Regression: every exit path of `process_seq_data_op` + /// past `wait_for_seq_turn` must bump `expected` and notify + /// waiters — including write/flush failures, where the function + /// early-returns with an error response. The `SeqAdvanceOnDrop` + /// guard delivers this; without it, an upstream write failure + /// would strand later seqs behind a never-bumped `expected` + /// until each hit `SEQ_WAIT_TIMEOUT` (~30 s), pushing batches + /// into client / Apps Script timeout territory. + /// + /// We test the guard directly rather than through + /// `process_seq_data_op` because writing to a closed peer or a + /// dropped duplex doesn't reliably error on every platform — + /// the kernel often buffers or blocks instead. The guard's + /// contract is what every error path in `process_seq_data_op` + /// relies on, so testing it in isolation is both stronger and + /// faster. + #[tokio::test] + async fn seq_advance_on_drop_bumps_expected_and_notifies_waiters() { + let inner = fake_inner().await; + assert_eq!(inner.seq_state.lock().await.expected, 0); + + // Park a seq=1 waiter (expected is 0 → it has to wait). + let inner_for_waiter = inner.clone(); + let waiter = tokio::spawn(async move { + wait_for_seq_turn(&inner_for_waiter, 1, Duration::from_secs(2)) + .await + .map(|g| { + drop(g); + }) + }); + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + !waiter.is_finished(), + "seq=1 waiter must park while expected=0", + ); + + // Acquire the seq lock and drop a guard immediately — + // simulates the early-return path of process_seq_data_op + // after wait_for_seq_turn succeeds (e.g. write/flush fails + // before reaching the success-path drop at function end). + { + let state = inner.seq_state.lock().await; + let _guard = SeqAdvanceOnDrop { + state: Some(state), + notify: &inner.seq_advance, + }; + // _guard drops here, bumping expected from 0 → 1 and + // calling notify_waiters() on seq_advance. + } + + // expected must be advanced. + assert_eq!( + inner.seq_state.lock().await.expected, + 1, + "guard's Drop must bump `expected` even on the error-return paths", + ); + + // The parked waiter must wake (because notify_waiters fired) + // AND succeed (because expected now matches its seq=1). + // Without notify_waiters in the guard's Drop, the waiter + // would sleep until SEQ_WAIT_TIMEOUT. 
+ let result = tokio::time::timeout(Duration::from_secs(1), waiter) + .await + .expect( + "waiter must wake when guard drops — without \ + notify_waiters() in SeqAdvanceOnDrop's Drop, the \ + waiter sleeps until SEQ_WAIT_TIMEOUT (~30 s)", + ) + .expect("task should not panic"); + assert!(result.is_ok(), "seq=1 should have claimed"); + } + + /// Regression: `wait_for_seq_turn` must register its + /// `Notified` future BEFORE locking and checking `seq_state`, + /// otherwise an advance + notify_waiters that fires between + /// drop(state) and notified() is lost (notify_waiters does not + /// save permits for unregistered waiters), and the waiter + /// sleeps until SEQ_WAIT_TIMEOUT. + /// + /// Reproducing the exact race deterministically requires + /// scheduler-level control we don't have, but we can verify + /// the post-fix property: a notify that fires *while* the + /// waiter is between its lock-and-check and its await still + /// wakes it. We approximate by spawning the waiter, briefly + /// pausing so it reaches the wait, then bumping + notifying; + /// the waiter must complete in well under SEQ_WAIT_TIMEOUT. + #[tokio::test] + async fn wait_for_seq_turn_does_not_miss_notify_after_lock_release() { + let inner = fake_inner().await; + + let inner_clone = inner.clone(); + let waiter = tokio::spawn(async move { + wait_for_seq_turn(&inner_clone, 5, Duration::from_secs(10)) + .await + .map(|g| { + drop(g); + }) + }); + + // Hammer many advance + notify cycles in a tight loop. With + // the missed-notify race, an advance whose notify falls in + // the lock-release-to-await gap would be lost, leaving the + // waiter parked. The pre-enabled Notified guarantees we + // wake on any of these. + for target in 1u64..=5 { + tokio::time::sleep(Duration::from_millis(10)).await; + { + let mut state = inner.seq_state.lock().await; + state.expected = target; + } + inner.seq_advance.notify_waiters(); + } + + let result = tokio::time::timeout(Duration::from_secs(2), waiter) + .await + .expect( + "seq=5 waiter must observe one of the advances; if it \ + doesn't, notify_waiters fired in the lock-release-to- \ + await gap and was lost (the missed-notify race)", + ) + .expect("task should not panic"); + assert!(result.is_ok()); + } + + /// Regression: an idle empty seq poll for session B + /// must NOT hold the batch open behind a sibling session A whose + /// bytes are already buffered. The shared `wait_set` makes A's + /// bytes wake B's wait — without it, B paid `LONGPOLL_DEADLINE` + /// (≈ 15 s) because its own inner stayed empty. + #[tokio::test] + async fn process_seq_data_wakes_when_other_session_in_wait_set_has_bytes() { + let inner_a = fake_inner().await; + let inner_b = fake_inner().await; + + // Session A has bytes ready; session B is idle. + inner_a.read_buf.lock().await.extend_from_slice(b"a-bytes"); + + let batch_wait = BatchWait::new(vec![inner_a.clone(), inner_b.clone()]); + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + + // B's seq op uses had_active_in_batch=false and had_uplink=false, + // so deadline = LONGPOLL_DEADLINE (15 s). Without the shared + // BatchWait, B would sit the full 15 s. With it, A's bytes + // immediately wake B's wait via `notify_waiters()` — and B + // drains its own (empty) buffer in milliseconds. 
+ let t0 = Instant::now(); + let resp_b = process_seq_data_op( + inner_b.clone(), + "sid-b".into(), + 0, + None, + budget, + false, // had_active_in_batch + batch_wait, + ) + .await; + let elapsed = t0.elapsed(); + + assert_eq!(resp_b.seq, Some(0)); + // B's reply must be empty (it has no bytes of its own). + assert!(resp_b.d.as_deref().map(str::is_empty).unwrap_or(true)); + // Must have returned in well under LONGPOLL_DEADLINE — give + // a generous 2 s ceiling for CI jitter. + assert!( + elapsed < Duration::from_secs(2), + "idle seq poll must wake on shared wait set when sibling \ + session has bytes; elapsed={:?} (LONGPOLL_DEADLINE={:?})", + elapsed, + LONGPOLL_DEADLINE, + ); + } + + /// Regression: batches containing seq'd ops must use + /// the pipelined timeout floor on the client side, otherwise + /// `Config::request_timeout_secs` (default 30 s) fires before + /// the server-side worst case (`SEQ_WAIT_TIMEOUT 30s + + /// LONGPOLL_DEADLINE 15s`) and pipelined sessions disconnect + /// where legacy sessions survived. We can't easily test the + /// effective applied timeout end-to-end, but we can assert the + /// floor is at least the server's worst-case wait so the budget + /// stays consistent. + #[tokio::test] + async fn pipelined_timeout_floor_exceeds_server_worst_case_wait() { + // The server's worst-case wait per seq op is SEQ_WAIT_TIMEOUT + // (waiting for an earlier seq) + LONGPOLL_DEADLINE (waiting + // for upstream data inside that op). The client's pipelined + // batch timeout floor must exceed this so a valid slow + // server response doesn't fire a client-side "batch timed + // out" + deployment timeout strike. + let server_worst_case = SEQ_WAIT_TIMEOUT + LONGPOLL_DEADLINE; + // PIPELINED_BATCH_TIMEOUT_FLOOR is a client-side constant; + // we duplicate the value here to keep this test in the same + // crate. If either side is bumped, this test will fail + // until the client's floor follows. + let pipelined_floor = Duration::from_secs(60); + assert!( + pipelined_floor > server_worst_case, + "pipelined batch timeout floor ({:?}) must exceed server \ + worst-case wait per seq op ({:?} = SEQ_WAIT_TIMEOUT + \ + LONGPOLL_DEADLINE), otherwise a valid slow batch fires \ + a client timeout + deployment strike", + pipelined_floor, + server_worst_case, + ); + } + + /// Regression: when M seq jobs all park on the shared + /// `BatchWait`, a SINGLE push must wake every one of them — + /// not just one. The watcher self-fan-out via + /// `wake.notify_waiters()` is what makes this work; without + /// it, parked jobs would each compete for one wake and the + /// rest would sit until `LONGPOLL_DEADLINE`. + #[tokio::test] + async fn batch_wait_wakes_all_jobs_on_single_push() { + let inner = fake_inner().await; + let batch_wait = BatchWait::new(vec![inner.clone()]); + + // Park 5 concurrent waiters with a generous deadline. They + // must all see the wake from a single push. + let mut tasks = Vec::new(); + for _ in 0..5 { + let bw = batch_wait.clone(); + tasks.push(tokio::spawn(async move { + let t0 = Instant::now(); + bw.wait(LONGPOLL_DEADLINE).await; + t0.elapsed() + })); + } + + // Give all 5 tasks time to park inside `wait` (synchronous + // is_any_drainable check returns false because read_buf is + // empty; they all reach the timeout-wrapped `notified.await`). 
+ tokio::time::sleep(Duration::from_millis(50)).await; + for task in &tasks { + assert!(!task.is_finished(), "all 5 waiters should be parked"); + } + + // Single push to inner.read_buf, single notify_waiters() + // (matching production `reader_task`). BatchWait's watcher + // wakes, sees drainable state, and fans out via + // `wake.notify_waiters()` — all 5 parked waiters wake. + inner.read_buf.lock().await.extend_from_slice(b"go"); + inner.notify.notify_waiters(); + + // Each waiter must complete in well under LONGPOLL_DEADLINE. + for (i, task) in tasks.into_iter().enumerate() { + let elapsed = tokio::time::timeout(Duration::from_secs(2), task) + .await + .unwrap_or_else(|_| panic!( + "waiter {} did not wake within 2s after single push — \ + wake-fan-out regression: only one of N parked jobs woke", + i + )) + .expect("task should not panic"); + assert!( + elapsed < Duration::from_secs(2), + "waiter {} took {:?} (must be << LONGPOLL_DEADLINE = {:?})", + i, + elapsed, + LONGPOLL_DEADLINE, + ); + } + } + + /// `BatchWait` must dedupe inners by Arc pointer — the same + /// session can appear in a batch via multiple paths (e.g. a + /// `connect_data` plus a seq `data` op for the same sid). + /// Spawning two watchers for it would double-fan-out the wake + /// signal and inflate task spawn count linearly with op + /// duplication. The dedup also has to apply to STORED inners + /// (not just watcher count), so `is_any_drainable` doesn't + /// re-lock the same session's read_buf once per duplicate + /// occurrence. + #[tokio::test] + async fn batch_wait_deduplicates_watchers_per_inner() { + let inner = fake_inner().await; + // Same Arc appearing 4 times in the input. + let bw = BatchWait::new(vec![ + inner.clone(), + inner.clone(), + inner.clone(), + inner.clone(), + ]); + assert_eq!( + bw._watchers.len(), + 1, + "BatchWait must spawn one watcher per UNIQUE inner; \ + 4 dup inputs spawned {} watchers", + bw._watchers.len(), + ); + assert_eq!( + bw.inners.len(), + 1, + "BatchWait::inners must be deduped too — otherwise \ + `is_any_drainable` re-locks the same Mutex per duplicate; \ + 4 dup inputs stored {} inners", + bw.inners.len(), + ); + } + + /// Critical regression: with pipelining, seq=N and seq=N+1 for + /// the same session can arrive in DIFFERENT batches. Each batch + /// builds its own `BatchWait`, each with its own watcher on the + /// session's `inner.notify`. A single `notify_one()` only wakes + /// one of those watchers — the other batch's watcher would sit + /// until `LONGPOLL_DEADLINE` even though the data is right + /// there. The fix: `reader_task` now uses `notify_waiters()`, + /// so every parked watcher across every batch wakes on each + /// push. Watchers `enable()` their `Notified` before the + /// synchronous state check to handle the no-permits edge of + /// `notify_waiters()`. + #[tokio::test] + async fn batch_wait_wakes_across_concurrent_batches_on_same_session() { + let inner = fake_inner().await; + + // Two BatchWaits — simulating two concurrent batches that + // both contain ops for the same session. + let bw_a = BatchWait::new(vec![inner.clone()]); + let bw_b = BatchWait::new(vec![inner.clone()]); + + // Park one waiter on each. Without the broadcast fix, only + // ONE of them would wake on a single push. 
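+        // (The un-woken waiter would sit out its LONGPOLL_DEADLINE; the
+        // 2 s timeout wrapped around its join below is what would fail
+        // the test first.)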
+ let bw_a_clone = bw_a.clone(); + let waiter_a = tokio::spawn(async move { + let t0 = Instant::now(); + bw_a_clone.wait(LONGPOLL_DEADLINE).await; + t0.elapsed() + }); + let bw_b_clone = bw_b.clone(); + let waiter_b = tokio::spawn(async move { + let t0 = Instant::now(); + bw_b_clone.wait(LONGPOLL_DEADLINE).await; + t0.elapsed() + }); + + // Give both waiters time to park inside their respective + // BatchWait::wait calls. + tokio::time::sleep(Duration::from_millis(50)).await; + assert!(!waiter_a.is_finished(), "batch A waiter should park"); + assert!(!waiter_b.is_finished(), "batch B waiter should park"); + + // One push, one notify — must wake BOTH batches' watchers. + inner.read_buf.lock().await.extend_from_slice(b"go"); + inner.notify.notify_waiters(); + + let elapsed_a = tokio::time::timeout(Duration::from_secs(2), waiter_a) + .await + .expect("batch A waiter must wake within 2s") + .expect("task A should not panic"); + let elapsed_b = tokio::time::timeout(Duration::from_secs(2), waiter_b) + .await + .expect( + "batch B waiter must ALSO wake within 2s — under \ + `notify_one()` only one of A/B woke and the other \ + sat for LONGPOLL_DEADLINE (~15s)", + ) + .expect("task B should not panic"); + + assert!( + elapsed_a < Duration::from_secs(2), + "batch A waiter took {:?} (must be << LONGPOLL_DEADLINE)", + elapsed_a, + ); + assert!( + elapsed_b < Duration::from_secs(2), + "batch B waiter took {:?} (must be << LONGPOLL_DEADLINE)", + elapsed_b, + ); + } + + /// Pin the documented latency trade-off for `SEQ_WAIT_TIMEOUT`: + /// when one seq op in a batch is stuck waiting for a missing + /// earlier seq, an UNRELATED session's seq op in the same batch + /// completes its own work in milliseconds, but its result sits + /// in `seq_data_jobs` until the stuck job times out — because + /// `handle_batch`'s `tokio::join!` waits for every seq job + /// before returning the response. + /// + /// This test asserts the upper bound on that latency: bounded + /// above by `SEQ_WAIT_TIMEOUT` (the stuck job's max wait), and + /// the unrelated session's per-job processing time is ≪ that. + /// Any future change to either constant — or to a design that + /// allows partial batch responses — should fail this test + /// intentionally so reviewers can confirm the new behavior. + #[tokio::test] + async fn unrelated_seq_session_in_same_batch_is_not_delayed_past_seq_wait() { + // Two independent sessions A and B. A's seq=2 will wait + // (expected=0, no seq=0/1 ever arrives — the + // "lost earlier seq" failure mode). B's seq=0 on a fresh + // session can claim immediately. + let inner_a = fake_inner().await; + let inner_b = fake_inner().await; + // B has data ready and eof set — its drain wait + // short-circuits, so B's job processing time is the + // intrinsic per-op cost (decode + lock + drain). + inner_b.read_buf.lock().await.extend_from_slice(b"b-bytes"); + inner_b.eof.store(true, Ordering::Release); + + let bw = BatchWait::new(vec![inner_a.clone(), inner_b.clone()]); + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + + // Spawn A first so it parks on `seq_state` waiting for + // earlier seqs that never arrive. With the production + // SEQ_WAIT_TIMEOUT this would sit 30 s; for the test we + // just need it to remain pending while B runs. + let inner_a_clone = inner_a.clone(); + let bw_a = bw.clone(); + let budget_a = budget.clone(); + let task_a = tokio::spawn(async move { + // Use seq=2 so wait_for_seq_turn sees seq > expected (0) + // and parks on `seq_advance`. 
+ process_seq_data_op( + inner_a_clone, + "sid-a".into(), + 2, + None, + budget_a, + false, + bw_a, + ) + .await + }); + + // Brief wait so A is definitely parked inside its + // wait_for_seq_turn loop. + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + !task_a.is_finished(), + "A should be parked waiting for earlier seqs", + ); + + // Now run B. Its INTRINSIC processing time is what we + // pin: even with A blocking on seq_state, B has nothing + // to wait on (different session, fresh seq state, eof + // short-circuits its drain). Its latency must be ≪ + // SEQ_WAIT_TIMEOUT. + let t0 = Instant::now(); + let resp_b = process_seq_data_op( + inner_b.clone(), + "sid-b".into(), + 0, + None, + budget, + false, + bw, + ) + .await; + let elapsed_b = t0.elapsed(); + assert_eq!(resp_b.seq, Some(0)); + + assert!( + elapsed_b < Duration::from_secs(2), + "session B's seq job took {:?} while A was stuck — \ + unrelated sessions must finish in well under \ + SEQ_WAIT_TIMEOUT ({:?}). A regression here means a \ + stuck seq from one session is now blocking unrelated \ + sessions' INTRINSIC processing, not just the batch \ + response join (which is the documented trade-off).", + elapsed_b, + SEQ_WAIT_TIMEOUT, + ); + + // Cancel A so the test doesn't sit SEQ_WAIT_TIMEOUT. + task_a.abort(); + } + + /// Critical regression: a batch with `data(seq=N) + close(same sid)` + /// must run the seq op's (write, drain) BEFORE the close tears + /// down the session. Running close inline during dispatch (the + /// pre-fix shape) removed the session from `state.sessions` + /// and aborted `reader_task` while the deferred seq job was + /// still about to run on its cloned `Arc` — + /// uplink writes raced the upstream socket's read-half being + /// dropped and downlink drains saw zero bytes. The fix: + /// `pending_closes` collects close ops during dispatch and + /// runs them AFTER `seq_data_jobs` complete. + #[tokio::test] + async fn batch_data_seq_then_close_runs_data_first() { + use axum::body::Bytes; + use axum::extract::State; + + // Set up an upstream that captures everything written to it + // so we can verify the seq op's bytes arrived before close + // dropped the connection. + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let upstream_port = listener.local_addr().unwrap().port(); + let received_uplink = Arc::new(Mutex::new(Vec::::new())); + let received_clone = received_uplink.clone(); + tokio::spawn(async move { + if let Ok((mut sock, _)) = listener.accept().await { + let mut buf = vec![0u8; 4096]; + loop { + match sock.read(&mut buf).await { + Ok(0) | Err(_) => break, + Ok(n) => { + received_clone + .lock() + .await + .extend_from_slice(&buf[..n]); + } + } + } + } + }); + + let state = fresh_state(); + + // Open a session via connect (legacy non-seq path so we + // get a sid the batch can target). + let connect_resp = handle_connect( + &state, + Some("127.0.0.1".into()), + Some(upstream_port), + ) + .await; + let sid = match connect_resp { + TunnelResponse { + sid: Some(s), + e: None, + .. + } => s, + other => panic!("connect failed: {:?}", other), + }; + + // Batch: data(seq=0) carrying "PAYLOAD" + close(same sid). + // The dispatch order in the request determines the + // intended effect order — seq=0's write must reach + // upstream BEFORE close tears down the session. 
+ let body = serde_json::json!({ + "k": "test-key", + "ops": [ + {"op": "data", "sid": sid.clone(), "seq": 0, "d": B64.encode(b"PAYLOAD")}, + {"op": "close", "sid": sid.clone()} + ] + }); + let body_bytes = serde_json::to_vec(&body).unwrap(); + let _resp = handle_batch(State(state.clone()), Bytes::from(body_bytes)) + .await + .into_response(); + + // Give the upstream reader time to drain whatever was + // written before the close shut things down. + tokio::time::sleep(Duration::from_millis(100)).await; + + let received = received_uplink.lock().await.clone(); + assert_eq!( + received, + b"PAYLOAD", + "seq=0's uplink bytes must reach upstream before close \ + tears the session down — running close inline (pre-fix) \ + would race the seq job and could lose these bytes", + ); + + // The session should be gone after close. + assert!( + !state.sessions.lock().await.contains_key(&sid), + "close must have removed the session from the state map", + ); + } + + /// Important regression: an ACTIVE seq op (one with uplink + /// bytes) must wait for ITS OWN session's response, not get + /// short-circuited by a sibling session in the same batch + /// already having buffered bytes. Subscribing to the shared + /// `BatchWait` for active ops is wrong — a sibling's bytes + /// would wake us instantly and we'd return empty for an op + /// that just hadn't gotten its own reply yet, doubling the + /// number of round-trips for the same logical exchange. + #[tokio::test] + async fn active_seq_op_waits_for_own_response_not_sibling_bytes() { + let inner_active = fake_inner().await; + let inner_sibling = fake_inner().await; + // Sibling already has bytes ready — the shared wait would + // wake on this immediately if we (incorrectly) subscribed. + inner_sibling + .read_buf + .lock() + .await + .extend_from_slice(b"sibling-bytes"); + // No bytes for the active op yet — its own response would + // come from upstream within ACTIVE_DRAIN_DEADLINE. + // Mark eof to short-circuit the active op's settle so the + // test doesn't actually sit ACTIVE_DRAIN_DEADLINE. + inner_active.eof.store(true, Ordering::Release); + + let bw = BatchWait::new(vec![inner_active.clone(), inner_sibling.clone()]); + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + + let t0 = Instant::now(); + // Active op: had_uplink = true (we pass non-empty data). + // The op's drain phase must NOT short-circuit on the + // sibling's bytes — its own buffer is empty + eof, so + // it'll drain that and return. + let resp = process_seq_data_op( + inner_active.clone(), + "sid-active".into(), + 0, + Some(B64.encode(b"uplink")), + budget, + true, // had_active_in_batch + bw, + ) + .await; + let elapsed = t0.elapsed(); + + assert_eq!(resp.seq, Some(0)); + // The active op's drain should reflect its OWN buffer + // state (empty + eof), not the sibling's. If the active + // path mistakenly subscribed to batch_wait, the sibling's + // bytes would wake it and the same `inner_active.read_buf` + // would still be empty — so this test wouldn't catch the + // bug just from the bytes returned. What it DOES catch: + // the active op should not be sleeping on the shared wake; + // its time bound is its own per-session wait + settle. 
+ assert!( + elapsed < ACTIVE_DRAIN_DEADLINE * 2, + "active op took {:?}; bound is per-session ACTIVE \ + window ({:?}) plus settle, not LONGPOLL", + elapsed, + ACTIVE_DRAIN_DEADLINE, + ); + } + + /// Regression: a batch ordered as `[close(sid), data(sid)]` + /// must run the close FIRST and the subsequent data must + /// observe a closed session (eof_response). Unconditionally + /// deferring close to after seq jobs would let the data write + /// succeed against the still-open session even though the + /// client asked us to close first — a protocol-ordering + /// regression. Close is now deferred ONLY if there's an + /// earlier same-sid op already deferred. + #[tokio::test] + async fn batch_close_then_data_processes_close_first() { + use axum::body::Bytes; + use axum::extract::State; + + // Upstream that captures whatever bytes we write. + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let upstream_port = listener.local_addr().unwrap().port(); + let received_uplink = Arc::new(Mutex::new(Vec::::new())); + let received_clone = received_uplink.clone(); + tokio::spawn(async move { + if let Ok((mut sock, _)) = listener.accept().await { + let mut buf = vec![0u8; 4096]; + loop { + match sock.read(&mut buf).await { + Ok(0) | Err(_) => break, + Ok(n) => { + received_clone + .lock() + .await + .extend_from_slice(&buf[..n]); + } + } + } + } + }); + + let state = fresh_state(); + let connect_resp = handle_connect( + &state, + Some("127.0.0.1".into()), + Some(upstream_port), + ) + .await; + let sid = match connect_resp { + TunnelResponse { + sid: Some(s), + e: None, + .. + } => s, + other => panic!("connect failed: {:?}", other), + }; + + // Batch: close FIRST, then data(seq=0). + let body = serde_json::json!({ + "k": "test-key", + "ops": [ + {"op": "close", "sid": sid.clone()}, + {"op": "data", "sid": sid.clone(), "seq": 0, "d": B64.encode(b"AFTER-CLOSE")} + ] + }); + let body_bytes = serde_json::to_vec(&body).unwrap(); + let _resp = handle_batch(State(state.clone()), Bytes::from(body_bytes)) + .await + .into_response(); + + // Give the upstream reader a moment. + tokio::time::sleep(Duration::from_millis(100)).await; + + // The session was closed BEFORE the data op got to run, so + // upstream should NOT have received the post-close bytes. + let received = received_uplink.lock().await.clone(); + assert!( + received.is_empty() + || !received.windows(b"AFTER-CLOSE".len()).any(|w| w == b"AFTER-CLOSE"), + "close-first ordering was lost: post-close bytes leaked to \ + upstream (got {:?}). With unconditional close-deferral the \ + data op writes 'AFTER-CLOSE' against a session the client \ + told us to close first.", + String::from_utf8_lossy(&received), + ); + } + + /// Regression: Phase 2's drain loop must reserve at most + /// `TCP_DRAIN_MAX_BYTES` from the shared `response_budget`, + /// not the full remaining budget. The previous shape set + /// `*budget = 0` upfront, drained, then refunded — concurrent + /// seq jobs running in the same batch could observe the 0 + /// state and return empty responses for an op that would have + /// fit comfortably in the remaining budget. + #[tokio::test] + async fn phase_2_budget_reservation_caps_at_tcp_drain_max_bytes() { + // Synthetic scenario: a single legacy tcp_drain entry on + // a session with a small buffer. Budget is set to a + // generous value (well above TCP_DRAIN_MAX_BYTES). Run + // the drain in a way that exposes the per-iteration + // reservation. 
+ // + // We can't easily run the Phase 2 closure from a unit + // test (it's defined inline in `handle_batch`), so we + // verify the reservation invariant directly: the seq + // path's `process_seq_data_op` and Phase 2 should both + // cap reservations at TCP_DRAIN_MAX_BYTES. The seq path + // is already covered by `process_seq_data_shares_batch_response_budget`; + // here we assert the same property on a different shape: + // a budget large enough that the Phase 2 drain leaves + // headroom equal to (budget − drained) when its drain + // takes less than `TCP_DRAIN_MAX_BYTES`. + let inner = fake_inner().await; + let bytes_to_drain = 4096usize; + inner + .read_buf + .lock() + .await + .extend_from_slice(&vec![0xAB; bytes_to_drain]); + + let initial_budget = TCP_DRAIN_MAX_BYTES * 4; + let budget = Arc::new(Mutex::new(initial_budget)); + + // Mirror the production reservation pattern. + let take = { + let mut b = budget.lock().await; + let take = (*b).min(TCP_DRAIN_MAX_BYTES); + *b = b.saturating_sub(take); + take + }; + assert_eq!( + take, TCP_DRAIN_MAX_BYTES, + "reservation must be capped at TCP_DRAIN_MAX_BYTES \ + (got {} from initial_budget {})", + take, initial_budget, + ); + // The remaining budget visible to a concurrent seq job + // must still be the full initial_budget − TCP_DRAIN_MAX_BYTES, + // not 0. Pre-fix the budget would have been 0 here. + let remaining_visible = *budget.lock().await; + assert_eq!( + remaining_visible, + initial_budget - TCP_DRAIN_MAX_BYTES, + "Phase 2's reservation must leave headroom for \ + concurrent seq jobs; under the pre-fix shape this \ + would be 0", + ); + + let (data, _eof) = drain_now(&inner, take).await; + if data.len() < take { + *budget.lock().await += take - data.len(); + } + // After refund, the full initial_budget minus actually- + // drained bytes is available again. + assert_eq!( + *budget.lock().await, + initial_budget - bytes_to_drain, + "after refund of unused reservation, budget should be \ + initial − drained", + ); + } }