diff --git a/src/domain_fronter.rs b/src/domain_fronter.rs index ed8e355..7e6b0ee 100644 --- a/src/domain_fronter.rs +++ b/src/domain_fronter.rs @@ -435,8 +435,32 @@ pub struct TunnelResponse { /// `e` only when this is `None` and compatibility is needed. #[serde(default)] pub code: Option, + /// Per-session sequence number echoed from the request's `seq` (if any). + /// `None` from servers that don't speak the pipelining protocol — the + /// client treats absence as "not pipelining-capable" and uses the + /// single-in-flight loop. Present and matching the request's seq + /// otherwise; the client routes the response into the per-session + /// reorder buffer keyed by this value. + /// + /// `u64` (not `u32`): a long-lived TCP session generating ~100 ops/s + /// would hit `u32::MAX` in ~1.4 years, at which point the server's + /// `expected` saturates and refuses every subsequent op. `u64` + /// pushes that horizon past any realistic hardware lifetime. + #[serde(default)] + pub seq: Option, + /// Capability bitfield, advertised on `connect` / `connect_data` + /// success responses by tunnel-nodes that speak ≥ this protocol + /// version. Bit 0 = `CAPS_PIPELINE_SEQ`. Old tunnel-nodes return + /// `None`; the client opts into pipelining only when the bit is set. + #[serde(default)] + pub caps: Option, } +/// Capability bit advertised by a tunnel-node that supports per-session +/// `data`-op sequence numbers (per-session pipelining). Only checked on +/// `connect` / `connect_data` success replies. +pub const CAPS_PIPELINE_SEQ: u32 = 1 << 0; + /// A single op in a batch tunnel request. #[derive(Serialize, Clone, Debug)] pub struct BatchOp { @@ -449,6 +473,16 @@ pub struct BatchOp { pub port: Option, #[serde(skip_serializing_if = "Option::is_none")] pub d: Option, + /// Per-session monotonic sequence number for `data` ops on a + /// pipelining-enabled session. Lets the tunnel-node enforce + /// in-order processing of ops that arrive in different batches + /// (potentially via different deployments and network paths). + /// Skipped from the wire when `None`; old tunnel-nodes that don't + /// recognize the field ignore it (serde default + structural-only + /// JSON dispatch on the server). `u64` to keep long-lived sessions + /// from saturating — see `TunnelResponse::seq` for the math. + #[serde(skip_serializing_if = "Option::is_none")] + pub seq: Option, } /// Batch tunnel response from Apps Script / tunnel node. @@ -2793,10 +2827,35 @@ impl DomainFronter { /// Like `tunnel_batch_request` but targets a specific deployment ID. /// Used by the pipeline mux to pin a batch to a deployment whose /// per-account concurrency slot has already been acquired. + /// + /// Backward-compatible 3-arg signature: forwards to + /// `tunnel_batch_request_with_timeout` using the configured + /// `self.batch_timeout`. New callers that need pipelined seq's + /// longer effective budget should use the explicit + /// `tunnel_batch_request_with_timeout` directly. pub async fn tunnel_batch_request_to( &self, script_id: &str, ops: &[BatchOp], + ) -> Result { + self.tunnel_batch_request_with_timeout(script_id, ops, self.batch_timeout) + .await + } + + /// Targets a specific deployment with an explicit per-batch + /// timeout, applied to both the h2 fast path (via + /// `h2_relay_request`'s `response_deadline`) and the h1 + /// fallback's header read. 
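    /// Caller sketch (mirrors the selection `fire_batch` in
    /// tunnel_client.rs performs; shown for orientation only, not a new
    /// API; `f` / `data_ops` / `script_id` are that function's locals):
    ///
    ///     let has_seq_op = data_ops.iter().any(|op| op.seq.is_some());
    ///     let batch_timeout = if has_seq_op {
    ///         f.batch_timeout().max(PIPELINED_BATCH_TIMEOUT_FLOOR)
    ///     } else {
    ///         f.batch_timeout()
    ///     };
    ///     let resp = f
    ///         .tunnel_batch_request_with_timeout(&script_id, &data_ops, batch_timeout)
    ///         .await;
    ///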
The caller computes this: legacy + /// batches use `self.batch_timeout`, pipelined batches use + /// `max(self.batch_timeout, PIPELINED_BATCH_TIMEOUT_FLOOR)` so a + /// valid server-side wait of `SEQ_WAIT_TIMEOUT + + /// LONGPOLL_DEADLINE` doesn't fire a spurious transport timeout + /// AND a deployment timeout strike. + pub async fn tunnel_batch_request_with_timeout( + &self, + script_id: &str, + ops: &[BatchOp], + batch_timeout: Duration, ) -> Result { let mut map = serde_json::Map::new(); map.insert("k".into(), Value::String(self.auth_key.clone())); @@ -2813,11 +2872,14 @@ impl DomainFronter { // `data`/`udp_data`/`connect` may have already executed // upstream when the response framing failed. Replaying the // whole batch on h1 risks duplicating every op in it. Only - // fall back when h2 definitely never sent. Honors - // user-configured batch_timeout so a slow but legitimate - // batch isn't cut off at an arbitrary fixed cap. + // fall back when h2 definitely never sent. Honors the + // caller-supplied `batch_timeout` (legacy = configured + // `request_timeout_secs`; pipelined = max(configured, + // PIPELINED_BATCH_TIMEOUT_FLOOR)) so a slow but legitimate + // batch isn't cut off at an arbitrary fixed cap, and + // pipelined-batch server-side waits stay inside the budget. match self - .h2_relay_request(&path, payload.clone(), self.batch_timeout) + .h2_relay_request(&path, payload.clone(), batch_timeout) .await { Ok((status, _hdrs, _resp_body)) if is_h2_fronting_refusal_status(status) => { @@ -2865,8 +2927,14 @@ impl DomainFronter { entry.stream.write_all(&payload).await?; entry.stream.flush().await?; + // Use `batch_timeout` for the per-read deadline on this h1 + // fallback. The legacy 10 s default in `read_http_response` + // would fire before a pipelined batch's server-side + // `SEQ_WAIT_TIMEOUT + LONGPOLL_DEADLINE` (~45 s) wait could + // legitimately complete — that fired as "batch timed out" + // client-side AND recorded a deployment timeout strike. let (mut status, mut resp_headers, mut resp_body) = - read_http_response(&mut entry.stream).await?; + read_http_response_with_timeout(&mut entry.stream, batch_timeout).await?; // Follow redirect chain for _ in 0..5 { @@ -2879,7 +2947,7 @@ impl DomainFronter { ); entry.stream.write_all(req.as_bytes()).await?; entry.stream.flush().await?; - let (s, h, b) = read_http_response(&mut entry.stream).await?; + let (s, h, b) = read_http_response_with_timeout(&mut entry.stream, batch_timeout).await?; status = s; resp_headers = h; resp_body = b; } @@ -3665,14 +3733,41 @@ fn parse_redirect(location: &str) -> (String, Option) { /// Read a single HTTP/1.1 response from the stream. Keep-alive safe: respects /// Content-Length or chunked transfer-encoding. +/// Default first-byte / per-read deadline for `read_http_response`. +/// Single-tunnel-op and relay paths use this — Apps Script normally +/// streams headers within ~1 s, so 10 s catches stalls without +/// false-firing on jitter. The batched-tunnel path overrides via +/// `read_http_response_with_timeout` because pipelined batches can +/// legitimately wait `SEQ_WAIT_TIMEOUT + LONGPOLL_DEADLINE` before +/// Apps Script sends a single response byte. 
+const DEFAULT_HTTP_READ_TIMEOUT: Duration = Duration::from_secs(10); + async fn read_http_response(stream: &mut S) -> Result<(u16, Vec<(String, String)>, Vec), FronterError> +where + S: tokio::io::AsyncRead + Unpin, +{ + read_http_response_with_timeout(stream, DEFAULT_HTTP_READ_TIMEOUT).await +} + +/// Like [`read_http_response`] but with a caller-controlled per-read +/// deadline. Used by the batched-tunnel h1 path so a pipelined batch +/// whose server-side worst-case wait is `SEQ_WAIT_TIMEOUT (30 s) + +/// LONGPOLL_DEADLINE (15 s)` doesn't get cut off at the legacy 10 s +/// header read — that fired before the response could legitimately +/// arrive, surfacing as a "batch timed out" client-side AND +/// recording a timeout strike against an otherwise-healthy +/// deployment. +async fn read_http_response_with_timeout( + stream: &mut S, + read_timeout: Duration, +) -> Result<(u16, Vec<(String, String)>, Vec), FronterError> where S: tokio::io::AsyncRead + Unpin, { let mut buf = Vec::with_capacity(8192); let mut tmp = [0u8; 8192]; let header_end = loop { - let n = timeout(Duration::from_secs(10), stream.read(&mut tmp)).await + let n = timeout(read_timeout, stream.read(&mut tmp)).await .map_err(|_| FronterError::Timeout)??; if n == 0 { return Err(FronterError::BadResponse("connection closed before headers".into())); diff --git a/src/tunnel_client.rs b/src/tunnel_client.rs index cb6ce12..7e928ce 100644 --- a/src/tunnel_client.rs +++ b/src/tunnel_client.rs @@ -24,7 +24,9 @@ use tokio::io::{AsyncReadExt, AsyncWrite, AsyncWriteExt}; use tokio::net::TcpStream; use tokio::sync::{mpsc, oneshot, Semaphore}; -use crate::domain_fronter::{BatchOp, DomainFronter, FronterError, TunnelResponse}; +use crate::domain_fronter::{ + BatchOp, DomainFronter, FronterError, TunnelResponse, CAPS_PIPELINE_SEQ, +}; /// Apps Script allows 30 concurrent executions per account / deployment. const CONCURRENCY_PER_DEPLOYMENT: usize = 30; @@ -51,6 +53,38 @@ const MAX_BATCH_OPS: usize = 50; /// blocking indefinitely. const REPLY_TIMEOUT: Duration = Duration::from_secs(35); +/// Slack added to the effective batch timeout to derive the per-session +/// pipelined reply watchdog (`TunnelMux::pipelined_reply_timeout`). +/// Covers queueing stages the batch HTTP timeout doesn't include: +/// +/// * coalesce wait (≤ `coalesce_max_ms`, default 1 s) +/// * per-deployment semaphore acquire (typical sub-second under +/// HTTP/2 multiplex; worst case bounded by another batch_timeout) +/// * mpsc channel hop into / out of `mux_loop` +/// +/// Setting this any tighter risks the watchdog firing on legitimate +/// slow batches that are still inside their HTTP-layer budget; any +/// looser just delays detection of a genuinely broken mux. 15 s +/// covers the realistic queueing while keeping the upper bound +/// finite. +const PIPELINED_REPLY_SLACK: Duration = Duration::from_secs(15); + +/// Floor on the per-batch HTTP timeout for batches that contain at +/// least one seq'd `data` op. The server-side worst-case wait for +/// a seq op is `SEQ_WAIT_TIMEOUT (~30 s) + LONGPOLL_DEADLINE (~15 s) +/// = ~45 s`. The default `Config::request_timeout_secs` is 30 s, +/// which would fire BEFORE a valid pipelined batch could complete — +/// triggering "batch timed out", closing the pipelined session, AND +/// recording a timeout strike against an otherwise-healthy +/// deployment (eventually blacklisting it). Take `max(configured, +/// 60 s)` for seq-bearing batches so the legitimate server wait +/// always fits inside the client budget. 
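/// Worked defaults (values taken from the constants referenced above):
///
///     server worst case         = 30 s (SEQ_WAIT_TIMEOUT) + ~15 s (LONGPOLL_DEADLINE) ≈ 45 s
///     configured 30 s (default) → effective = max(30 s, 60 s) = 60 s   (floor wins)
///     configured 120 s          → effective = max(120 s, 60 s) = 120 s (config wins)
///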
The per-session reply +/// watchdog (`TunnelMux::pipelined_reply_timeout`) is derived from +/// this same effective batch timeout plus `PIPELINED_REPLY_SLACK` +/// — the two agree on what counts as "definitely broken" without +/// the per-session watchdog ever firing on a still-valid batch. +const PIPELINED_BATCH_TIMEOUT_FLOOR: Duration = Duration::from_secs(60); + /// How long we'll briefly hold the client socket after the local /// CONNECT/SOCKS5 handshake, waiting for the client's first bytes (the /// TLS ClientHello for HTTPS). Bundling those bytes with the tunnel-node @@ -80,10 +114,11 @@ const CODE_UNSUPPORTED_OP: &str = "UNSUPPORTED_OP"; /// Empty poll round-trip latency below which we conclude the tunnel-node /// is *not* long-polling (legacy fixed-sleep drain instead). On a /// long-poll-capable server an empty poll with no upstream push either -/// returns near `LONGPOLL_DEADLINE` (~5 s) or comes back early *with* -/// pushed bytes — neither matches a fast empty reply. Threshold sits -/// well above the legacy `~350 ms` drain and well below the long-poll -/// floor, so network jitter on either side won't false-trigger. +/// returns near `LONGPOLL_DEADLINE` (~15 s on current tunnel-nodes) or +/// comes back early *with* pushed bytes — neither matches a fast empty +/// reply. Threshold sits well above the legacy `~350 ms` drain and well +/// below the long-poll floor, so network jitter on either side won't +/// false-trigger. const LEGACY_DETECT_THRESHOLD: Duration = Duration::from_millis(1500); /// How long a deployment stays in "legacy / no long-poll" mode after the @@ -173,6 +208,13 @@ enum MuxMsg { Data { sid: String, data: Bytes, + /// Per-session monotonic seq for pipelined sessions. `None` for + /// non-pipelined sessions (and any session whose tunnel-node + /// didn't advertise `CAPS_PIPELINE_SEQ`). When `Some`, the + /// tunnel-node enforces in-order processing and echoes the + /// value back in the response so the client can route it into + /// its per-session reorder buffer. + seq: Option, reply: BatchedReply, }, UdpOpen { @@ -208,6 +250,9 @@ struct PendingOp { /// only `connect_data`, which uses presence of `d` as the signal /// that the caller is opting into the bundled-first-bytes flow). encode_empty: bool, + /// Per-session seq for pipelined `data` ops; `None` otherwise. + /// Forwarded verbatim to `BatchOp::seq` by `encode_pending`. + seq: Option, } pub struct TunnelMux { @@ -280,10 +325,31 @@ pub struct TunnelMux { /// `(host, port)`, value is the expiry instant. Plain Mutex is /// fine: it's touched once per CONNECT (cheap) and once per failure. unreachable_cache: Mutex>, + /// Snapshot of the configured `Config::request_timeout_secs` taken + /// at mux construction. Used to derive the per-session pipelined + /// reply timeout (see `pipelined_reply_timeout()`) so a user-tuned + /// `request_timeout_secs > 60` doesn't cause sessions to close + /// before the batch layer can legitimately complete. + batch_timeout: Duration, + /// Set when ANY pipelined reply observed the server dropping the + /// `seq` field — i.e. the request reached an old tunnel-node + /// that doesn't speak the pipelining protocol. Once set, new + /// sessions skip the `caps` opt-in even if a connect_data reply + /// advertises it; the safe fallback is the legacy single-in-flight + /// loop. This protects mixed-version multi-deployment configs + /// where round-robin can land a session's seq ops on an + /// un-upgraded backend after a different deployment advertised + /// the bit. 
The flag is process-wide (not per-deployment): once + /// observed, downgrade everyone until restart. + pipelining_globally_disabled: AtomicBool, } impl TunnelMux { - pub fn start(fronter: Arc, coalesce_step_ms: u64, coalesce_max_ms: u64) -> Arc { + pub fn start( + fronter: Arc, + coalesce_step_ms: u64, + coalesce_max_ms: u64, + ) -> Arc { // Dedupe before snapshotting: the aggregate `all_legacy` gate // compares `legacy_deployments.len()` (a HashMap, so unique // keys) against this count, so using the raw `num_scripts()` @@ -308,9 +374,18 @@ impl TunnelMux { unique_n, CONCURRENCY_PER_DEPLOYMENT ); - let step = if coalesce_step_ms > 0 { coalesce_step_ms } else { DEFAULT_COALESCE_STEP_MS }; - let max = if coalesce_max_ms > 0 { coalesce_max_ms } else { DEFAULT_COALESCE_MAX_MS }; + let step = if coalesce_step_ms > 0 { + coalesce_step_ms + } else { + DEFAULT_COALESCE_STEP_MS + }; + let max = if coalesce_max_ms > 0 { + coalesce_max_ms + } else { + DEFAULT_COALESCE_MAX_MS + }; tracing::info!("batch coalesce: step={}ms max={}ms", step, max); + let batch_timeout = fronter.batch_timeout(); let (tx, rx) = mpsc::channel(512); tokio::spawn(mux_loop(rx, fronter, step, max)); Arc::new(Self { @@ -326,9 +401,64 @@ impl TunnelMux { preread_win_total_us: AtomicU64::new(0), preread_total_events: AtomicU64::new(0), unreachable_cache: Mutex::new(HashMap::new()), + batch_timeout, + pipelining_globally_disabled: AtomicBool::new(false), }) } + /// Effective per-session reply timeout for pipelined sessions. + /// Has to exceed the actual batch layer's worst-case time — + /// otherwise the session's reply watchdog fires before + /// `fire_batch` returns, dropping in-flight oneshots and + /// forcing avoidable disconnects on legitimate slow batches. + /// + /// `effective_batch_timeout = max(configured, PIPELINED_BATCH_TIMEOUT_FLOOR)` + /// accounts for the server-side worst-case wait + /// (`SEQ_WAIT_TIMEOUT + LONGPOLL_DEADLINE`) plus the h2/h1 + /// transport. The watchdog covers TWO of those (rather than + /// one + small slack) because under per-deployment semaphore + /// saturation — all 30 in-flight slots holding long-poll seq + /// batches — a fresh op waits up to one full + /// `effective_batch_timeout` for a permit BEFORE its own + /// `effective_batch_timeout`-bounded request even starts. With + /// a one-batch budget the watchdog fires while the op is still + /// queued for a permit, dropping the oneshot and closing a + /// healthy pipelined session under load. + /// + /// Additional `PIPELINED_REPLY_SLACK` covers the smaller + /// queueing stages: mpsc hop into `mux_loop`, coalesce wait + /// (≤ `coalesce_max_ms`), encode/transit overhead. Total + /// default budget: 60 s × 2 + 15 s = 135 s. With a configured + /// `request_timeout_secs = 120 s`: 120 × 2 + 15 = 255 s. + pub(crate) fn pipelined_reply_timeout(&self) -> Duration { + let effective_batch = self.batch_timeout.max(PIPELINED_BATCH_TIMEOUT_FLOOR); + // 2× to cover saturated-semaphore permit wait + own request. + effective_batch * 2 + PIPELINED_REPLY_SLACK + } + + /// True once any pipelined reply observed a `None` seq from the + /// server, indicating the deployment route reached an old + /// tunnel-node that doesn't speak the protocol. Sticky for the + /// process lifetime — downgrade all sessions to the legacy loop + /// until restart. 
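    /// The connect-time check this feeds, quoted from `tunnel_connection`
    /// below (`caps` is the `TunnelResponse::caps` of the connect reply):
    ///
    ///     let pipeline_advertised = caps.map(|c| c & CAPS_PIPELINE_SEQ != 0).unwrap_or(false);
    ///     let pipeline = pipeline_advertised && !mux.pipelining_disabled();
    ///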
+ pub(crate) fn pipelining_disabled(&self) -> bool { + self.pipelining_globally_disabled.load(Ordering::Acquire) + } + + pub(crate) fn mark_pipelining_disabled(&self) { + if !self + .pipelining_globally_disabled + .swap(true, Ordering::AcqRel) + { + tracing::warn!( + "tunnel-node along the round-robin path dropped the seq \ + field on a pipelined reply; disabling pipelining for new \ + sessions until restart. Mixed-version config? Make sure \ + every script_id forwards to the same upgraded tunnel-node." + ); + } + } + async fn send(&self, msg: MuxMsg) { let _ = self.tx.send(msg).await; } @@ -591,7 +721,12 @@ impl TunnelMux { } } -async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, coalesce_step_ms: u64, coalesce_max_ms: u64) { +async fn mux_loop( + mut rx: mpsc::Receiver, + fronter: Arc, + coalesce_step_ms: u64, + coalesce_max_ms: u64, +) { let coalesce_step = Duration::from_millis(coalesce_step_ms); let coalesce_max = Duration::from_millis(coalesce_max_ms); // One semaphore per deployment ID, each allowing 30 concurrent requests. @@ -679,10 +814,18 @@ async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, c port: Some(port), data: Some(data), encode_empty: true, + seq: None, }; - accum.push_or_fire(op, op_bytes, reply, &sems, &fronter).await; + accum + .push_or_fire(op, op_bytes, reply, &sems, &fronter) + .await; } - MuxMsg::Data { sid, data, reply } => { + MuxMsg::Data { + sid, + data, + seq, + reply, + } => { let op_bytes = encoded_len(data.len()); let op = PendingOp { op: "data", @@ -691,8 +834,11 @@ async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, c port: None, data: if data.is_empty() { None } else { Some(data) }, encode_empty: false, + seq, }; - accum.push_or_fire(op, op_bytes, reply, &sems, &fronter).await; + accum + .push_or_fire(op, op_bytes, reply, &sems, &fronter) + .await; } MuxMsg::UdpOpen { host, @@ -708,8 +854,11 @@ async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, c port: Some(port), data: if data.is_empty() { None } else { Some(data) }, encode_empty: false, + seq: None, }; - accum.push_or_fire(op, op_bytes, reply, &sems, &fronter).await; + accum + .push_or_fire(op, op_bytes, reply, &sems, &fronter) + .await; } MuxMsg::UdpData { sid, data, reply } => { let op_bytes = encoded_len(data.len()); @@ -720,8 +869,11 @@ async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, c port: None, data: if data.is_empty() { None } else { Some(data) }, encode_empty: false, + seq: None, }; - accum.push_or_fire(op, op_bytes, reply, &sems, &fronter).await; + accum + .push_or_fire(op, op_bytes, reply, &sems, &fronter) + .await; } MuxMsg::Close { sid } => { close_sids.push(sid); @@ -739,6 +891,7 @@ async fn mux_loop(mut rx: mpsc::Receiver, fronter: Arc, c port: None, data: None, encode_empty: false, + seq: None, }); } @@ -842,6 +995,7 @@ fn encode_pending(p: PendingOp) -> BatchOp { sid: p.sid, host: p.host, port: p.port, + seq: p.seq, d, } } @@ -880,10 +1034,34 @@ async fn fire_batch( // Bounded-wait: if the batch takes longer than the configured // batch timeout (Config::request_timeout_secs), all sessions in // this batch get an error and can retry. - let batch_timeout = f.batch_timeout(); + // + // Pipelined-batch floor: if ANY op in this batch carries a + // `seq`, the server-side worst-case wait is + // `SEQ_WAIT_TIMEOUT + LONGPOLL_DEADLINE` (≈ 45 s). The default + // configured timeout (30 s) would fire first, surface as + // "batch timed out" client-side, AND record a timeout strike + // against the deployment. 
`max(configured, 60 s)` keeps the + // client budget aligned with what the server can legitimately + // take, without changing legacy-batch behavior. + let configured = f.batch_timeout(); + let has_seq_op = data_ops.iter().any(|op| op.seq.is_some()); + let batch_timeout = if has_seq_op { + configured.max(PIPELINED_BATCH_TIMEOUT_FLOOR) + } else { + configured + }; + // Pass the effective timeout into `tunnel_batch_request_to` + // so the underlying h2 / h1 transports use it too. Without + // this, the inner `h2_relay_request` (and h1 + // `read_http_response`) defaulted to `self.batch_timeout` + // (30 s) and fired before our outer 60 s budget could ever + // be reached on a pipelined batch — surfacing as "batch + // timed out" client-side and recording an undeserved + // timeout strike against the deployment. The outer + // `tokio::time::timeout` stays as a defensive ceiling. let result = tokio::time::timeout( batch_timeout, - f.tunnel_batch_request_to(&script_id, &data_ops), + f.tunnel_batch_request_with_timeout(&script_id, &data_ops, batch_timeout), ) .await; tracing::info!( @@ -1047,22 +1225,51 @@ pub async fn tunnel_connection( } }; - let (sid, first_resp, pending_client_data) = match initial_data { + let (sid, caps, first_resp, pending_client_data) = match initial_data { Some(data) => match connect_with_initial_data(host, port, data.clone(), mux).await? { - ConnectDataOutcome::Opened { sid, response } => (sid, Some(response), None), + ConnectDataOutcome::Opened { sid, response } => { + let caps = response.caps; + (sid, caps, Some(response), None) + } ConnectDataOutcome::Unsupported => { mux.mark_connect_data_unsupported(); - let sid = connect_plain(host, port, mux).await?; + let (sid, caps) = connect_plain(host, port, mux).await?; // Replay the buffered ClientHello on the first tunnel_loop // iteration. `Bytes::clone()` is a cheap Arc bump — no // copy of the 64 KB buffer. - (sid, None, Some(data)) + (sid, caps, None, Some(data)) } }, - None => (connect_plain(host, port, mux).await?, None, None), + None => { + let (sid, caps) = connect_plain(host, port, mux).await?; + (sid, caps, None, None) + } }; - tracing::info!("tunnel session {} opened for {}:{}", sid, host, port); + // Opt into pipelining only when: + // 1. THIS connect/connect_data reply advertised the bit, AND + // 2. We haven't observed a stale-version reply previously + // (round-robin can land us on an un-upgraded backend after + // a different deployment served the connect with caps set). + // The global toggle is sticky for the process lifetime — once + // `mark_pipelining_disabled` fires (on a `seq=None` reply for a + // pipelined session), every NEW session falls back to the legacy + // single-in-flight loop instead of risking another disconnect on + // the next mixed-version seq op. + let pipeline_advertised = caps.map(|c| c & CAPS_PIPELINE_SEQ != 0).unwrap_or(false); + let pipeline = pipeline_advertised && !mux.pipelining_disabled(); + tracing::info!( + "tunnel session {} opened for {}:{} (pipeline={}{})", + sid, + host, + port, + pipeline, + if pipeline_advertised && !pipeline { + ", advertised but disabled (mixed-version backend observed)" + } else { + "" + }, + ); // Run the first-response write + tunnel_loop inside an async block so // any io-error propagates via `?` without bypassing the Close below. 
@@ -1087,7 +1294,11 @@ pub async fn tunnel_connection( return Ok(()); } } - tunnel_loop(&mut sock, &sid, mux, pending_client_data).await + if pipeline { + tunnel_loop_pipelined(&mut sock, &sid, mux, pending_client_data).await + } else { + tunnel_loop(&mut sock, &sid, mux, pending_client_data).await + } } .await; @@ -1104,7 +1315,16 @@ enum ConnectDataOutcome { Unsupported, } -async fn connect_plain(host: &str, port: u16, mux: &Arc) -> std::io::Result { +/// Open a tunnel session via plain `connect` (no bundled first bytes). +/// Returns the new sid plus the tunnel-node's `caps` advertisement — +/// `None` against legacy tunnel-nodes, `Some(bits)` against new ones. +/// The caller passes `caps` into `tunnel_loop` to pick between the +/// legacy single-in-flight loop and the pipelined variant. +async fn connect_plain( + host: &str, + port: u16, + mux: &Arc, +) -> std::io::Result<(String, Option)> { let (reply_tx, reply_rx) = oneshot::channel(); mux.send(MuxMsg::Connect { host: host.to_string(), @@ -1127,9 +1347,11 @@ async fn connect_plain(host: &str, port: u16, mux: &Arc) -> std::io:: e.clone(), )); } - resp.sid.ok_or_else(|| { + let caps = resp.caps; + let sid = resp.sid.ok_or_else(|| { std::io::Error::new(std::io::ErrorKind::Other, "tunnel connect: no session id") - }) + })?; + Ok((sid, caps)) } Ok(Err(e)) => { tracing::error!("tunnel connect error for {}:{}: {}", host, port, e); @@ -1282,7 +1504,7 @@ async fn tunnel_loop( loop { // Cadence depends on whether the tunnel-node is doing long-poll // drains. With long-poll, the server holds empty polls open up - // to its `LONGPOLL_DEADLINE` (~5 s currently), so the client + // to its `LONGPOLL_DEADLINE` (~15 s on current tunnel-nodes), so the client // can keep this read timeout short — the wait is on the wire, // not here. Against *legacy* tunnel-nodes (no long-poll, fast // empty replies), the same short cadence + always-poll behavior @@ -1361,6 +1583,7 @@ async fn tunnel_loop( mux.send(MuxMsg::Data { sid: sid.to_string(), data, + seq: None, reply: reply_tx, }) .await; @@ -1427,6 +1650,279 @@ async fn tunnel_loop( Ok(()) } +/// Maximum number of `data` ops a pipelined session keeps in flight +/// at any one time. Two parallel batches per session is enough to +/// overlap one Apps Script round-trip with the next on a single TCP +/// stream — that's the throughput win. Higher depths run into +/// diminishing returns (server-side seq enforcement serializes the +/// drains anyway) and burn more Apps Script quota on speculative +/// polls when a session goes idle. +const PIPELINE_DEPTH: usize = 2; + +/// Pipelined variant of `tunnel_loop`: keeps up to `PIPELINE_DEPTH` +/// `data` ops in flight per session, each tagged with a per-session +/// monotonic `seq`. The tunnel-node enforces in-order processing so +/// replies — which may travel back over different deployments and +/// arrive out of order on the wire — carry bytes that are correctly +/// sequenced within the session's stream. +/// +/// The reorder concern is handled implicitly by FIFO `await` on the +/// in-flight oneshot queue: whichever reply arrived first stays in +/// its oneshot's buffer until our task reaches its turn in the +/// queue. Output to the local socket is therefore always in send +/// order, which (because the server processed the corresponding +/// uplinks in seq order) matches the upstream byte order. 
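/// Minimal illustration of the reorder-by-FIFO-await idea (a standalone
/// sketch, not this function's code; two in-flight replies, with the later
/// seq completing first):
///
///     let (tx0, rx0) = tokio::sync::oneshot::channel::<&'static [u8]>();
///     let (tx1, rx1) = tokio::sync::oneshot::channel::<&'static [u8]>();
///     let mut in_flight = std::collections::VecDeque::from([(0u64, rx0), (1u64, rx1)]);
///     tx1.send(b"B").unwrap(); // seq=1's reply lands first on the wire...
///     tx0.send(b"A").unwrap(); // ...seq=0's arrives after it
///     let mut out: Vec<u8> = Vec::new();
///     while let Some((_seq, rx)) = in_flight.pop_front() {
///         out.extend_from_slice(rx.await.unwrap()); // always drains "A" before "B"
///     }
///     assert_eq!(&out[..], b"AB");
///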
+async fn tunnel_loop_pipelined( + sock: &mut TcpStream, + sid: &str, + mux: &Arc, + mut pending_client_data: Option, +) -> std::io::Result<()> { + let (mut reader, mut writer) = sock.split(); + const READ_CHUNK: usize = 65536; + const ZERO_COPY_THRESHOLD: usize = READ_CHUNK / 2; + let mut buf = BytesMut::with_capacity(READ_CHUNK); + // FIFO of in-flight (seq, oneshot) pairs. Front = oldest seq + // still awaiting a response; we always await the front so output + // to the local socket is in seq order regardless of which reply + // arrived first on the wire. The seq is kept alongside the + // receiver so we can verify the server echoed the right value + // (defends against a server bug or version skew silently + // mis-routing bytes between sessions). + let mut in_flight: std::collections::VecDeque<( + u64, + oneshot::Receiver>, + )> = std::collections::VecDeque::new(); + let mut next_send_seq: u64 = 0; + let mut consecutive_empty: u32 = 0; + // Flag set when the local client half-closes (TCP shutdown send, + // returns Ok(0) from read) or its socket errors. Once set, we + // stop queuing new uplink ops but continue draining replies for + // seqs already in flight — those replies may carry downlink + // bytes the server already produced. Returning early on EOF + // (the previous shape) dropped the queued oneshot receivers + // and silently lost any pending downlink, breaking valid + // half-close patterns like HTTP request/response with the + // client shutting its write half after the request. + let mut client_send_closed = false; + + loop { + // SEND PHASE. While we have room and either the client has + // data or we're keeping the pipeline full, queue another op. + // `consecutive_empty == 0` means the last reply brought + // downlink bytes — we treat that as "active transfer" and + // pre-fetch with PIPELINE_DEPTH polls. Otherwise (idle) we + // keep just one in-flight poll so the server long-polls it + // and we don't burn quota on speculative empties. + // + // When `client_send_closed` is set we skip the send phase + // entirely and fall through to RECEIVE — the loop exits + // cleanly once `in_flight` drains. + let active = consecutive_empty == 0; + let target_depth = if active { PIPELINE_DEPTH } else { 1 }; + while !client_send_closed && in_flight.len() < target_depth { + // Decide what data to send: replay any pending preread + // bytes first; then attempt a client read. If we have + // nothing in flight we *must* send (otherwise the loop + // stalls waiting for replies that aren't coming), so + // use the legacy escalating timeout. With ops already + // in flight we only send when the client has data + // *immediately* available — speculative empty polls in + // that branch would just burn Apps Script quota. + // + // `Option::None` from the inner match means EOF + // / read-error: set `client_send_closed` and break out + // of the send phase. The remaining in_flight ops still + // get drained by the RECEIVE PHASE below. 
+ let data: Option = if let Some(d) = pending_client_data.take() { + Some(d) + } else if in_flight.is_empty() { + let read_timeout = match (mux.all_servers_legacy(), consecutive_empty) { + (_, 0) => Duration::from_millis(20), + (_, 1) => Duration::from_millis(80), + (_, 2) => Duration::from_millis(200), + (false, _) => Duration::from_millis(500), + (true, _) => Duration::from_secs(30), + }; + buf.reserve(READ_CHUNK); + match tokio::time::timeout(read_timeout, reader.read_buf(&mut buf)).await { + Ok(Ok(0)) => None, + Ok(Ok(n)) => { + consecutive_empty = 0; + Some(if n >= ZERO_COPY_THRESHOLD { + buf.split().freeze() + } else { + let owned = Bytes::copy_from_slice(&buf[..n]); + buf.clear(); + owned + }) + } + Ok(Err(_)) => None, + Err(_) => Some(Bytes::new()), + } + } else { + // Already have ops in flight. Non-blocking read: if + // the client has data right now, pipeline another op; + // otherwise drop out and await replies. + buf.reserve(READ_CHUNK); + match tokio::time::timeout(Duration::from_millis(0), reader.read_buf(&mut buf)) + .await + { + Ok(Ok(0)) => None, + Ok(Ok(n)) => { + consecutive_empty = 0; + Some(if n >= ZERO_COPY_THRESHOLD { + buf.split().freeze() + } else { + let owned = Bytes::copy_from_slice(&buf[..n]); + buf.clear(); + owned + }) + } + Ok(Err(_)) => None, + Err(_) => break, // no data ready, stop trying to send more + } + }; + + let Some(data) = data else { + // EOF or read error. Flag the close and let RECEIVE + // drain whatever's already in flight before we exit. + client_send_closed = true; + break; + }; + + let seq = next_send_seq; + next_send_seq = next_send_seq.saturating_add(1); + let (reply_tx, reply_rx) = oneshot::channel(); + mux.send(MuxMsg::Data { + sid: sid.to_string(), + data, + seq: Some(seq), + reply: reply_tx, + }) + .await; + in_flight.push_back((seq, reply_rx)); + } + + // RECEIVE PHASE. Await the front-of-queue reply. Subsequent + // replies (whose seqs come later) wait in their oneshot + // buffers until we get to them — that's the reorder + // mechanism, and it doesn't need its own data structure. + let Some((expected_seq, reply_rx)) = in_flight.pop_front() else { + // Nothing in flight. If the client has half-closed, we + // can exit cleanly now — there are no more replies to + // drain. Otherwise (defensive) loop back; the send + // phase guarantees at least one op in flight when the + // client is still active. + if client_send_closed { + return Ok(()); + } + continue; + }; + // Derive the per-session reply timeout from the effective + // batch timeout (max(configured, PIPELINED_BATCH_TIMEOUT_FLOOR)) + // plus queueing slack — see `TunnelMux::pipelined_reply_timeout`. + // Computing this dynamically (rather than the previous fixed + // 60 s constant) closes the session-watchdog vs batch-layer + // race: with `request_timeout_secs > 60`, the batch is still + // inside its budget while the session watchdog would + // otherwise close it, dropping in-flight oneshots. + let reply_timeout = mux.pipelined_reply_timeout(); + let (resp, _script_id) = match tokio::time::timeout(reply_timeout, reply_rx).await { + Ok(Ok(Ok((r, sid_used)))) => (r, sid_used), + Ok(Ok(Err(e))) => { + tracing::debug!("pipelined data error: {}", e); + break; + } + Ok(Err(_)) => break, // channel dropped + Err(_) => { + // Pipelined timeout has to close the session + // rather than retry. 
Continuing here drops the + // oneshot for `expected_seq` — if the server + // later completes it, any drained downlink bytes + // it would have carried are lost, and subsequent + // seq replies happily fill the local socket past + // a now-invisible gap (silent stream corruption). + // The dynamic `reply_timeout` is sized to exceed + // any valid server-side wait + queueing, so + // hitting it means the mux dropped a message or + // the task panicked — closing is the only + // bytes-correct response. + tracing::warn!( + "pipelined sess {}: reply timeout for seq {} after {:?}, \ + closing session (further ops would risk silent \ + corruption from missing downlink bytes)", + &sid[..sid.len().min(8)], + expected_seq, + reply_timeout, + ); + break; + } + }; + + // Verify the server echoed the seq we expected. A mismatch + // means either (a) the tunnel-node has a bug, or (b) a + // version-skew situation where the response shape changed + // — either way we can't trust the bytes belong to this + // session position, so close the session rather than + // silently writing potentially-misrouted bytes. + match resp.seq { + Some(s) if s == expected_seq => { /* match — proceed */ } + Some(s) => { + tracing::error!( + "pipelined sess {}: server echoed seq {} but we expected {}; \ + closing session to avoid misordered output", + &sid[..sid.len().min(8)], + s, + expected_seq, + ); + break; + } + None => { + // The reply reached an old tunnel-node along the + // round-robin path that doesn't speak the seq + // protocol. Globally disable pipelining for new + // sessions until the process restarts — the next + // round-robin pick could repeat the disconnect, and + // staying in legacy mode is strictly safer than + // hoping the broken backend rotates out. + mux.mark_pipelining_disabled(); + tracing::error!( + "pipelined sess {}: server reply for seq {} omitted seq field; \ + closing session and disabling pipelining for new sessions \ + (mixed-version backend along the round-robin path)", + &sid[..sid.len().min(8)], + expected_seq, + ); + break; + } + } + + if let Some(ref e) = resp.e { + tracing::debug!("pipelined tunnel error: {}", e); + break; + } + + let got_data = match write_tunnel_response(&mut writer, &resp).await? 
{ + WriteOutcome::Wrote => true, + WriteOutcome::NoData => false, + WriteOutcome::BadBase64 => break, + }; + + if resp.eof.unwrap_or(false) { + break; + } + + if got_data { + consecutive_empty = 0; + } else { + consecutive_empty = consecutive_empty.saturating_add(1); + } + } + + Ok(()) +} + enum WriteOutcome { Wrote, NoData, @@ -1489,6 +1985,8 @@ mod tests { eof: None, e: e.map(str::to_string), code: code.map(str::to_string), + seq: None, + caps: None, } } @@ -1588,11 +2086,7 @@ mod tests { #[test] fn negative_cache_ignores_non_unreachable_errors() { let (mux, _rx) = mux_for_test(); - mux.record_unreachable_if_match( - "example.com", - 443, - "connect failed: connection refused", - ); + mux.record_unreachable_if_match("example.com", 443, "connect failed: connection refused"); assert!(!mux.is_unreachable("example.com", 443)); } @@ -1624,9 +2118,10 @@ mod tests { async fn negative_cache_skips_outer_relay_errors() { let (mux, mut rx) = mux_for_test(); let mux_for_task = mux.clone(); - let task = tokio::spawn(async move { - connect_plain("real.target.example", 443, &mux_for_task).await - }); + let task = + tokio::spawn( + async move { connect_plain("real.target.example", 443, &mux_for_task).await }, + ); // Receive the Connect msg and reply with an outer Err whose string // would otherwise match `is_unreachable_error_str`. @@ -1636,7 +2131,7 @@ mod tests { other => panic!("expected Connect, got {:?}", std::mem::discriminant(&other)), }; let _ = reply.send(Err( - "relay failed: Network is unreachable (os error 101)".into(), + "relay failed: Network is unreachable (os error 101)".into() )); let res = task.await.expect("task"); @@ -1662,11 +2157,7 @@ mod tests { "connect failed: Network is unreachable (os error 101)", ); } - let len = mux - .unreachable_cache - .lock() - .map(|g| g.len()) - .unwrap_or(0); + let len = mux.unreachable_cache.lock().map(|g| g.len()).unwrap_or(0); assert!( len <= UNREACHABLE_CACHE_MAX, "cache size {} exceeded cap {}", @@ -1719,6 +2210,12 @@ mod tests { preread_win_total_us: AtomicU64::new(0), preread_total_events: AtomicU64::new(0), unreachable_cache: Mutex::new(HashMap::new()), + // Tests don't go through `start()`, so we pick a + // representative configured batch_timeout (matches the + // 30s `default_request_timeout_secs`) for derivations + // like `pipelined_reply_timeout`. + batch_timeout: Duration::from_secs(30), + pipelining_globally_disabled: AtomicBool::new(false), }); (mux, rx) } @@ -1757,7 +2254,12 @@ mod tests { .expect("mux channel closed unexpectedly"); match msg { - MuxMsg::Data { sid, data, reply } => { + MuxMsg::Data { + sid, + data, + seq: _, + reply, + } => { assert_eq!(sid, "sid-under-test"); assert_eq!(&data[..], b"CLIENTHELLO"); // Reply with eof so tunnel_loop unwinds cleanly. 
@@ -1769,6 +2271,8 @@ mod tests { eof: Some(true), e: None, code: None, + seq: None, + caps: None, }, "test-script".to_string(), ))); @@ -1838,9 +2342,18 @@ mod tests { )) .expect("mux channel closed unexpectedly"); match msg { - MuxMsg::Data { sid, data, reply } => { + MuxMsg::Data { + sid, + data, + seq: _, + reply, + } => { assert_eq!(sid, "sid-mixed"); - assert!(data.is_empty(), "expected empty poll, got {} bytes", data.len()); + assert!( + data.is_empty(), + "expected empty poll, got {} bytes", + data.len() + ); let last = i == 5; let _ = reply.send(Ok(( TunnelResponse { @@ -1850,6 +2363,8 @@ mod tests { eof: if last { Some(true) } else { None }, e: None, code: None, + seq: None, + caps: None, }, "script-A".to_string(), ))); @@ -1866,6 +2381,569 @@ mod tests { .expect("tunnel_loop did not exit after eof"); } + /// Capability detection: a TunnelResponse whose `caps` field + /// includes `CAPS_PIPELINE_SEQ` must enable pipelining for the + /// session, including when the response shape is the batched + /// `connect_data` reply (the dominant HTTPS fast path). Without + /// this end-to-end check, we can regress the bit by serializing + /// the wrong response shape on the server (the case the + /// reviewer caught: batched connect_data went through + /// `tcp_drain_response` which sets `caps: None`). + #[test] + fn caps_field_drives_pipeline_decision() { + // Server response with caps bit set → pipeline opt-in. + let resp_pipelined = TunnelResponse { + sid: Some("s".into()), + d: None, + pkts: None, + eof: None, + e: None, + code: None, + seq: None, + caps: Some(CAPS_PIPELINE_SEQ), + }; + let pipeline = resp_pipelined + .caps + .map(|c| c & CAPS_PIPELINE_SEQ != 0) + .unwrap_or(false); + assert!(pipeline, "caps bit set must enable pipelining"); + + // Old tunnel-node, no caps field → legacy path. + let resp_legacy = TunnelResponse { + sid: Some("s".into()), + d: None, + pkts: None, + eof: None, + e: None, + code: None, + seq: None, + caps: None, + }; + let pipeline = resp_legacy + .caps + .map(|c| c & CAPS_PIPELINE_SEQ != 0) + .unwrap_or(false); + assert!(!pipeline, "absent caps must NOT enable pipelining"); + + // Caps present but bit cleared (forward-compat for future + // capability bits the client doesn't recognize) → legacy. + let resp_other_cap = TunnelResponse { + sid: Some("s".into()), + d: None, + pkts: None, + eof: None, + e: None, + code: None, + seq: None, + caps: Some(0), + }; + let pipeline = resp_other_cap + .caps + .map(|c| c & CAPS_PIPELINE_SEQ != 0) + .unwrap_or(false); + assert!( + !pipeline, + "caps without the pipeline bit must NOT enable pipelining" + ); + } + + /// `tunnel_loop_pipelined`'s first emitted op for a session must + /// carry `seq: Some(0)`. The tunnel-node side relies on seqs being + /// per-session monotonic from 0 — if this regresses, the very + /// first op deadlocks the server's seq lock (it's waiting for + /// expected=0, never sees it). 
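    /// Server-side contract being pinned (summarized from `SeqState` in
    /// tunnel-node/src/main.rs):
    ///
    ///     expected starts at 0 for a new session;
    ///     op.seq == expected → process, bump `expected`, fire `seq_advance`;
    ///     op.seq <  expected → duplicate (client retry), skip;
    ///     op.seq >  expected → park on `seq_advance` up to SEQ_WAIT_TIMEOUT.
    ///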
+ #[tokio::test] + async fn pipelined_loop_first_op_carries_seq_zero() { + use tokio::net::TcpListener; + + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let addr = listener.local_addr().unwrap(); + let accept = tokio::spawn(async move { listener.accept().await.unwrap().0 }); + let _client = TcpStream::connect(addr).await.unwrap(); + let mut server_side = accept.await.unwrap(); + + let (mux, mut rx) = mux_for_test(); + let pending = Some(Bytes::from_static(b"FIRST")); + + let loop_handle = tokio::spawn({ + let mux = mux.clone(); + async move { tunnel_loop_pipelined(&mut server_side, "sid-pipe", &mux, pending).await } + }); + + let msg = tokio::time::timeout(Duration::from_secs(2), rx.recv()) + .await + .expect("pipelined loop did not emit a message within 2s") + .expect("mux channel closed unexpectedly"); + + match msg { + MuxMsg::Data { + sid, + data, + seq, + reply, + } => { + assert_eq!(sid, "sid-pipe"); + assert_eq!(&data[..], b"FIRST"); + assert_eq!( + seq, + Some(0), + "first pipelined op for a session must be seq=0", + ); + let _ = reply.send(Ok(( + TunnelResponse { + sid: Some("sid-pipe".into()), + d: None, + pkts: None, + eof: Some(true), + e: None, + code: None, + seq: Some(0), + caps: None, + }, + "test-script".to_string(), + ))); + } + _ => panic!("expected Data, got something else"), + } + + let _ = tokio::time::timeout(Duration::from_secs(2), loop_handle).await; + } + + /// Reordering correctness: when reply for seq=1 arrives at the + /// client BEFORE reply for seq=0 (different batches racing back + /// through different deployments), `tunnel_loop_pipelined` must + /// still write seq=0's bytes to the local socket first. Failure + /// mode if this regresses: silent stream corruption — the client + /// app sees later bytes before earlier ones. + /// + /// We force two ops in flight by stuffing both `pending_client_data` + /// (becomes seq=0) and a second chunk in the socket kernel buffer + /// (read becomes seq=1) before the loop starts. The loop's send + /// phase consumes both before entering the receive phase. + #[tokio::test] + async fn pipelined_loop_writes_in_seq_order_when_replies_arrive_reversed() { + use tokio::io::{AsyncReadExt, AsyncWriteExt}; + use tokio::net::TcpListener; + + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let addr = listener.local_addr().unwrap(); + let accept = tokio::spawn(async move { listener.accept().await.unwrap().0 }); + let mut client = TcpStream::connect(addr).await.unwrap(); + let mut server_side = accept.await.unwrap(); + + // Pre-stuff "SECOND" so the loop's non-blocking read picks it + // up after consuming pending="FIRST". + client.write_all(b"SECOND").await.unwrap(); + client.flush().await.unwrap(); + // Brief wait so the bytes definitely land in the kernel buffer. + tokio::time::sleep(Duration::from_millis(30)).await; + + let (mux, mut rx) = mux_for_test(); + let pending = Some(Bytes::from_static(b"FIRST")); + + let loop_handle = tokio::spawn({ + let mux = mux.clone(); + async move { tunnel_loop_pipelined(&mut server_side, "sid-pipe", &mux, pending).await } + }); + + // Receive both ops back-to-back. Loop sends them before + // entering the receive phase because PIPELINE_DEPTH=2 and + // both reads succeed in the send phase. 
+ let msg0 = tokio::time::timeout(Duration::from_secs(2), rx.recv()) + .await + .expect("first op not emitted") + .expect("mux channel closed"); + let reply_0 = match msg0 { + MuxMsg::Data { + sid, + data, + seq, + reply, + } => { + assert_eq!(sid, "sid-pipe"); + assert_eq!(seq, Some(0)); + assert_eq!(&data[..], b"FIRST"); + reply + } + _ => panic!("expected Data 0"), + }; + + let msg1 = tokio::time::timeout(Duration::from_secs(2), rx.recv()) + .await + .expect("second op not emitted") + .expect("mux channel closed"); + let reply_1 = match msg1 { + MuxMsg::Data { + sid, + data, + seq, + reply, + } => { + assert_eq!(sid, "sid-pipe"); + assert_eq!(seq, Some(1)); + assert_eq!(&data[..], b"SECOND"); + reply + } + _ => panic!("expected Data 1"), + }; + + // Fire reply for seq=1 FIRST (out of order on the wire) with + // downlink bytes "B". + let _ = reply_1.send(Ok(( + TunnelResponse { + sid: Some("sid-pipe".into()), + d: Some(B64.encode(b"B")), + pkts: None, + eof: Some(true), // eof so the loop exits after seq=1 + e: None, + code: None, + seq: Some(1), + caps: None, + }, + "test-script".to_string(), + ))); + + // Brief delay to let the seq=1 reply settle in its oneshot's + // buffer. The loop is still awaiting the seq=0 oneshot at the + // front of the in-flight queue, so it MUST NOT have written + // anything yet. + tokio::time::sleep(Duration::from_millis(30)).await; + let mut peek = [0u8; 16]; + match tokio::time::timeout(Duration::from_millis(20), client.read(&mut peek)).await { + Ok(Ok(_)) => panic!( + "loop wrote downlink bytes before seq=0 reply arrived — \ + reorder invariant broken" + ), + Ok(Err(_)) | Err(_) => { /* expected: nothing written yet */ } + } + + // Now fire reply for seq=0 with "A". Loop should write "A" + // then immediately drain seq=1 from its oneshot and write "B". + let _ = reply_0.send(Ok(( + TunnelResponse { + sid: Some("sid-pipe".into()), + d: Some(B64.encode(b"A")), + pkts: None, + eof: None, + e: None, + code: None, + seq: Some(0), + caps: None, + }, + "test-script".to_string(), + ))); + + let mut received = vec![0u8; 16]; + let mut total = 0; + // Read until we have both bytes or the loop closes. + loop { + match tokio::time::timeout(Duration::from_secs(2), client.read(&mut received[total..])) + .await + { + Ok(Ok(0)) => break, + Ok(Ok(n)) => { + total += n; + if total >= 2 { + break; + } + } + Ok(Err(_)) | Err(_) => break, + } + } + assert_eq!( + &received[..total], + b"AB", + "downlink bytes must be written in seq order (got {:?})", + &received[..total], + ); + + let _ = tokio::time::timeout(Duration::from_secs(2), loop_handle).await; + } + + /// Critical regression: `pipelined_reply_timeout` must cover + /// BOTH the per-deployment semaphore wait AND the request's own + /// budget — under saturation, a fresh op waits up to one full + /// `effective_batch_timeout` for a permit before its own + /// `effective_batch_timeout`-bounded request even starts. With + /// a one-batch budget the watchdog fires while the op is still + /// queued for a permit, dropping the oneshot and closing a + /// healthy pipelined session under load. + /// + /// The test pins the `2× + slack` invariant so future changes + /// to either the formula or PIPELINED_BATCH_TIMEOUT_FLOOR + /// require an intentional update. + #[test] + fn pipelined_reply_timeout_covers_two_batch_budgets_for_semaphore_saturation() { + let (mux, _rx) = mux_for_test(); + let reply = mux.pipelined_reply_timeout(); + // Default config has 30 s configured → 60 s floor → 60×2 = 120 s. 
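        // (With the PIPELINED_REPLY_SLACK of 15 s on top, the full default is
        // 135 s; the assert only pins the ≥ 2× lower bound so the slack can
        // change independently.)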
+ let effective_batch = mux.batch_timeout.max(PIPELINED_BATCH_TIMEOUT_FLOOR); + assert!( + reply >= effective_batch * 2, + "default-config reply timeout ({:?}) must cover 2× the \ + effective batch budget ({:?}) so a permit-saturated \ + pipeline has time to acquire a slot AND complete the \ + request before the session watchdog fires", + reply, + effective_batch * 2, + ); + + // Same invariant under a user-tuned long timeout. + let (tx, _rx) = mpsc::channel(16); + let custom_batch = Duration::from_secs(120); + let mux = Arc::new(TunnelMux { + tx, + connect_data_unsupported: Arc::new(AtomicBool::new(false)), + legacy_deployments: Mutex::new(HashMap::new()), + all_legacy: Arc::new(AtomicBool::new(false)), + num_scripts: 1, + preread_win: AtomicU64::new(0), + preread_loss: AtomicU64::new(0), + preread_skip_port: AtomicU64::new(0), + preread_skip_unsupported: AtomicU64::new(0), + preread_win_total_us: AtomicU64::new(0), + preread_total_events: AtomicU64::new(0), + unreachable_cache: Mutex::new(HashMap::new()), + batch_timeout: custom_batch, + pipelining_globally_disabled: AtomicBool::new(false), + }); + let reply = mux.pipelined_reply_timeout(); + assert!( + reply >= custom_batch * 2, + "with request_timeout_secs={:?}, reply timeout ({:?}) \ + must cover 2× the budget for permit + request", + custom_batch, + reply, + ); + } + + /// `mark_pipelining_disabled` must flip the global toggle (sticky + /// for the process lifetime) so subsequent connects fall back to + /// the legacy loop, even when the connect_data reply still + /// advertises `caps`. This protects mixed-version deployments + /// where round-robin can land a session's seq ops on an + /// un-upgraded backend after a different deployment served the + /// connect with caps set. + #[test] + fn mark_pipelining_disabled_is_sticky_and_overrides_caps() { + let (mux, _rx) = mux_for_test(); + assert!(!mux.pipelining_disabled()); + + mux.mark_pipelining_disabled(); + assert!(mux.pipelining_disabled()); + + // Idempotent — re-marking doesn't toggle off. + mux.mark_pipelining_disabled(); + assert!(mux.pipelining_disabled()); + + // The actual decision in `tunnel_connection` is + // `pipeline_advertised && !mux.pipelining_disabled()`, so + // even a caps-advertising connect_data reply must NOT + // re-enable pipelining once the toggle is set. + let advertised = true; + let pipeline = advertised && !mux.pipelining_disabled(); + assert!( + !pipeline, + "advertised caps must not re-enable pipelining after the \ + mixed-version toggle has fired", + ); + } + + /// Critical regression: when the local client half-closes (TCP + /// shutdown send → `Ok(0)` on read) AFTER queuing request bytes, + /// the loop has ops in flight whose replies may carry downlink + /// data the server already produced. Returning early on EOF + /// (the previous shape) dropped the queued oneshot receivers + /// and silently lost those bytes — breaking valid request/ + /// response patterns where the client closes its write half + /// after sending. The fix: set `client_send_closed` and continue + /// draining `in_flight` before returning. 
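    /// The client pattern being protected, in miniature (illustrative;
    /// `stream` stands in for the app's local connection through the proxy):
    ///
    ///     stream.write_all(request).await?;
    ///     stream.shutdown().await?;                  // half-close: FIN after the request
    ///     stream.read_to_end(&mut response).await?;  // the response must still arrive
    ///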
+ #[tokio::test] + async fn pipelined_loop_drains_in_flight_on_client_half_close() { + use tokio::io::AsyncReadExt; + use tokio::io::AsyncWriteExt; + use tokio::net::TcpListener; + + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let addr = listener.local_addr().unwrap(); + let accept = tokio::spawn(async move { listener.accept().await.unwrap().0 }); + let mut client = TcpStream::connect(addr).await.unwrap(); + let mut server_side = accept.await.unwrap(); + + let (mux, mut rx) = mux_for_test(); + // Pre-stuff "REQUEST" so the loop's first iteration sends it + // as seq=0, then half-closes from the client side. The loop + // must still wait for seq=0's reply (with downlink bytes) + // before returning. + client.write_all(b"REQUEST").await.unwrap(); + client.flush().await.unwrap(); + // Half-close the client's write side. Subsequent reads on + // server_side will return Ok(0) once the buffered "REQUEST" + // is consumed. + client.shutdown().await.unwrap(); + tokio::time::sleep(Duration::from_millis(20)).await; + + let loop_handle = tokio::spawn({ + let mux = mux.clone(); + async move { + tunnel_loop_pipelined(&mut server_side, "sid-halfclose", &mux, None).await + } + }); + + // The loop emits seq=0 with the buffered "REQUEST" bytes. + let msg = tokio::time::timeout(Duration::from_secs(2), rx.recv()) + .await + .expect("first op not emitted") + .expect("mux channel closed"); + let reply = match msg { + MuxMsg::Data { sid, data, seq, reply } => { + assert_eq!(sid, "sid-halfclose"); + assert_eq!(seq, Some(0)); + assert_eq!(&data[..], b"REQUEST"); + reply + } + _ => panic!("expected Data 0"), + }; + + // Brief delay so the loop's next send-phase iteration sees + // the half-closed socket (Ok(0)) and flips + // `client_send_closed` — but stays parked on the in_flight + // oneshot for seq=0. + tokio::time::sleep(Duration::from_millis(50)).await; + + // The loop MUST still be alive — it's draining the pending + // reply, not exiting on the half-close. + assert!( + !loop_handle.is_finished(), + "loop must keep running while in_flight has pending replies; \ + returning on Ok(0) would drop seq=0's oneshot and lose its \ + downlink bytes", + ); + + // Now deliver the reply with downlink bytes "RESPONSE" + eof. + let _ = reply.send(Ok(( + TunnelResponse { + sid: Some("sid-halfclose".into()), + d: Some(B64.encode(b"RESPONSE")), + pkts: None, + eof: Some(true), + e: None, + code: None, + seq: Some(0), + caps: None, + }, + "test-script".to_string(), + ))); + + // Loop should write "RESPONSE" to the local socket and exit. + let mut received = vec![0u8; 32]; + let mut total = 0; + loop { + match tokio::time::timeout( + Duration::from_secs(2), + client.read(&mut received[total..]), + ) + .await + { + Ok(Ok(0)) => break, + Ok(Ok(n)) => { + total += n; + if total >= 8 { + break; + } + } + Ok(Err(_)) | Err(_) => break, + } + } + assert_eq!( + &received[..total], + b"RESPONSE", + "downlink bytes must be delivered even after client half-close \ + (got {:?})", + &received[..total], + ); + + let _ = tokio::time::timeout(Duration::from_secs(2), loop_handle).await; + } + + /// A pipelined session whose server reply carries the wrong echoed + /// seq must close, not silently write potentially-misrouted bytes + /// to the local socket. The reply we'd otherwise honor could + /// contain bytes from a different session position (server bug or + /// version skew where the response shape changed) — emitting them + /// is the silent-corruption failure mode the seq protocol exists + /// to prevent. 
+ #[tokio::test] + async fn pipelined_loop_closes_on_seq_echo_mismatch() { + use tokio::io::AsyncReadExt; + use tokio::net::TcpListener; + + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let addr = listener.local_addr().unwrap(); + let accept = tokio::spawn(async move { listener.accept().await.unwrap().0 }); + let mut client = TcpStream::connect(addr).await.unwrap(); + let mut server_side = accept.await.unwrap(); + + let (mux, mut rx) = mux_for_test(); + let pending = Some(Bytes::from_static(b"FIRST")); + + let loop_handle = tokio::spawn({ + let mux = mux.clone(); + async move { tunnel_loop_pipelined(&mut server_side, "sid-pipe", &mux, pending).await } + }); + + let msg = tokio::time::timeout(Duration::from_secs(2), rx.recv()) + .await + .expect("first op not emitted") + .expect("mux channel closed"); + let reply = match msg { + MuxMsg::Data { + sid: _, + data: _, + seq, + reply, + } => { + assert_eq!(seq, Some(0)); + reply + } + _ => panic!("expected Data 0"), + }; + + // Reply with a WRONG seq (echoed seq=99 for our seq=0 op). + // The loop should detect the mismatch, close the session, + // and write nothing to the local socket. + let _ = reply.send(Ok(( + TunnelResponse { + sid: Some("sid-pipe".into()), + d: Some(B64.encode(b"BOGUS")), // bytes that would be wrong to write + pkts: None, + eof: None, + e: None, + code: None, + seq: Some(99), + caps: None, + }, + "test-script".to_string(), + ))); + + // Wait for the loop to terminate. + let _ = tokio::time::timeout(Duration::from_secs(2), loop_handle) + .await + .expect("loop must exit on seq mismatch"); + + // Local socket must NOT have received "BOGUS". + let mut peek = [0u8; 16]; + match tokio::time::timeout(Duration::from_millis(100), client.read(&mut peek)).await { + Ok(Ok(0)) | Ok(Err(_)) | Err(_) => { /* expected: socket closed or empty */ } + Ok(Ok(n)) => panic!( + "loop wrote {} bytes despite seq mismatch — should have closed silently \ + (got: {:?})", + n, + &peek[..n] + ), + } + } + /// Once `mark_connect_data_unsupported` is called, future sessions /// must see the flag — no per-session repeat of the detect-and-fallback /// cost. If this regresses, every new flow pays an extra round trip @@ -2007,23 +3085,11 @@ mod tests { #[test] fn should_fire_when_payload_would_exceed_cap() { // Exactly at the cap is fine — strict `>`. - assert!(!should_fire( - 10, - MAX_BATCH_PAYLOAD_BYTES - 100, - 100, - )); + assert!(!should_fire(10, MAX_BATCH_PAYLOAD_BYTES - 100, 100,)); // One byte over: fire. - assert!(should_fire( - 10, - MAX_BATCH_PAYLOAD_BYTES - 100, - 101, - )); + assert!(should_fire(10, MAX_BATCH_PAYLOAD_BYTES - 100, 101,)); // Sum overflow well past the cap: fire. 
- assert!(should_fire( - 10, - MAX_BATCH_PAYLOAD_BYTES, - 1, - )); + assert!(should_fire(10, MAX_BATCH_PAYLOAD_BYTES, 1,)); } /// Reply indices must point at the slot the op occupies *within its @@ -2057,6 +3123,7 @@ mod tests { port: None, data: Some(Bytes::from_static(b"x")), encode_empty: false, + seq: None, }; let mk_reply = || oneshot::channel::>().0; @@ -2068,7 +3135,11 @@ mod tests { push_no_fire(&mut accum, mk_op("a2"), 4, mk_reply()); assert_eq!(accum.pending_ops.len(), 3); assert_eq!( - accum.data_replies.iter().map(|(i, _)| *i).collect::>(), + accum + .data_replies + .iter() + .map(|(i, _)| *i) + .collect::>(), vec![0, 1, 2], ); assert_eq!(accum.payload_bytes, 12); @@ -2084,7 +3155,11 @@ mod tests { push_no_fire(&mut accum, mk_op("b1"), 4, mk_reply()); assert_eq!(accum.pending_ops.len(), 2); assert_eq!( - accum.data_replies.iter().map(|(i, _)| *i).collect::>(), + accum + .data_replies + .iter() + .map(|(i, _)| *i) + .collect::>(), vec![0, 1], "post-flush indices must restart at 0 — otherwise fire_batch's \ batch_resp.r.get(idx) returns None and every session in the \ @@ -2093,6 +3168,28 @@ mod tests { assert_eq!(accum.payload_bytes, 8); } + /// `seq` plumbs from `MuxMsg::Data` → `PendingOp` → `BatchOp` so + /// the tunnel-node sees the seq we generated. Without this + /// propagation the server would see a None seq and fall back to + /// the legacy unordered path — the client would still process + /// replies in oneshot-FIFO order, but the bytes inside would be + /// in unspecified order (server drained read_buf in batch + /// arrival order, not seq order). + #[test] + fn encode_pending_propagates_seq_to_batch_op() { + let op = PendingOp { + op: "data", + sid: Some("sid".into()), + host: None, + port: None, + data: Some(Bytes::from_static(b"x")), + encode_empty: false, + seq: Some(42), + }; + let b = encode_pending(op); + assert_eq!(b.seq, Some(42)); + } + #[test] fn encode_pending_data_op_with_payload_emits_base64() { let op = PendingOp { @@ -2102,6 +3199,7 @@ mod tests { port: None, data: Some(Bytes::from_static(b"hello")), encode_empty: false, + seq: None, }; let b = encode_pending(op); assert_eq!(b.op, "data"); @@ -2119,6 +3217,7 @@ mod tests { port: None, data: None, encode_empty: false, + seq: None, }; assert!(encode_pending(empty_poll).d.is_none()); @@ -2130,6 +3229,7 @@ mod tests { port: None, data: None, encode_empty: false, + seq: None, }; assert!(encode_pending(udp_poll).d.is_none()); @@ -2141,6 +3241,7 @@ mod tests { port: None, data: None, encode_empty: false, + seq: None, }; assert!(encode_pending(close).d.is_none()); } @@ -2158,6 +3259,7 @@ mod tests { port: Some(443), data: Some(Bytes::new()), encode_empty: true, + seq: None, }; let b = encode_pending(op); assert_eq!(b.op, "connect_data"); @@ -2173,6 +3275,7 @@ mod tests { port: Some(443), data: Some(Bytes::from_static(b"\x16\x03\x01")), // ClientHello prefix encode_empty: true, + seq: None, }; let b = encode_pending(op); assert_eq!(b.d.as_deref(), Some(B64.encode(b"\x16\x03\x01").as_str())); diff --git a/tunnel-node/src/main.rs b/tunnel-node/src/main.rs index e3bc3c0..e97d901 100644 --- a/tunnel-node/src/main.rs +++ b/tunnel-node/src/main.rs @@ -153,8 +153,59 @@ struct SessionInner { /// to wake the drain phase as soon as any session has something to /// ship, replacing the old fixed-sleep heuristic. notify: Notify, + /// Per-session next-expected `data`-op sequence number for the + /// pipelining protocol. 
Only consulted by ops that carry a `seq` + /// field; legacy non-seq `data` ops bypass this entirely and use + /// the inline first-come-first-served path. Held inside a + /// `tokio::sync::Mutex` so the (claim seq → write → drain → bump) + /// critical section can `await` without blocking the runtime. + seq_state: Mutex, + /// Fired (`notify_waiters`) every time `seq_state.expected` advances. + /// Tasks waiting for their `seq` turn re-check `expected` after each + /// wake; combined with a per-task `SEQ_WAIT_TIMEOUT` this prevents + /// indefinite stalls when an earlier seq is lost or the client dies. + seq_advance: Notify, } +/// Ordering state for per-session `data`-op pipelining. `expected` is +/// the next seq the session will accept; any op whose `seq < expected` +/// is treated as a duplicate (likely a client-side retry) and skipped, +/// any op whose `seq > expected` blocks on `seq_advance` until either +/// its seq matches or `SEQ_WAIT_TIMEOUT` elapses. New sessions start at +/// `expected = 0` so the first `data` op the client sends is seq 0. +struct SeqState { + /// Next-expected `data`-op sequence number for this session. + /// `u64` rather than `u32` so a long-lived TCP session generating + /// ~100 ops/s doesn't saturate `u32::MAX` after ~1.4 years and + /// refuse every subsequent op. The wire protocol matches + /// (`BatchOp::seq` / `TunnelResponse::seq` are also `u64`). + expected: u64, +} + +/// How long a `data` op carrying a `seq` will wait for earlier seqs to +/// land before giving up and returning an error to the client. 30 s is +/// well past any normal Apps Script round-trip (~1.5 s) plus reordering +/// across deployments, but short enough that a genuinely lost seq — +/// e.g. the client crashed mid-pipeline — doesn't hold every subsequent +/// op forever. After timeout the client's session reaper / explicit +/// close cleans up. +/// +/// Latency trade-off worth flagging: a seq op stuck waiting for an +/// earlier seq holds an Apps Script execution slot AND the batch's +/// HTTP response open until either the earlier seq lands or this +/// timeout fires. `handle_batch` collects every job before +/// responding, so a single stuck seq from session A can keep +/// session B's already-ready job's results sitting in +/// `seq_data_jobs` for up to `SEQ_WAIT_TIMEOUT`. We accept this: +/// permanent seq loss is rare (it requires the client's earlier +/// seq batch to vanish in transit), and shortening this would +/// cause more rejections in the common reordered-arrival case +/// where seqs land 2-10 s apart across deployments. The +/// regression test +/// `unrelated_seq_session_in_same_batch_is_not_delayed_past_seq_wait` +/// pins the upper bound so any future change here is intentional. +const SEQ_WAIT_TIMEOUT: Duration = Duration::from_secs(30); + struct ManagedSession { inner: Arc, reader_handle: tokio::task::JoinHandle<()>, @@ -171,6 +222,137 @@ impl ManagedSession { } } +/// Per-batch shared wait primitive for seq jobs and Phase 2's TCP +/// drain. Collapses to one watcher per *unique* `SessionInner` +/// (deduplicated by `Arc::as_ptr`) regardless of how many seq jobs +/// or batch waiters subscribe — bounded at one task per session, +/// not the `M × N` (jobs × sessions) the per-job +/// `wait_for_any_drainable` would have spawned. +/// +/// Each watcher relays `inner.notify` to a single batch-wide +/// `Arc` via `notify_waiters()`, so a single push wakes +/// every parked job in the batch. 
`reader_task` itself fires +/// `notify_waiters()` (not `notify_one()`), so concurrent batches +/// observing the same session also all wake on each push — the +/// `notify_one()` semantics would have left only one of N parked +/// watchers winning the wake. +/// +/// Watchers are held in `AbortOnDrop` so they're aborted on every +/// exit path of `handle_batch` (including cancellation). +struct BatchWait { + /// Fired (`notify_waiters`) when any unique inner's `reader_task` + /// pushes data or marks eof. All seq jobs and Phase 2's TCP wait + /// subscribe to this single Notify. Held as `Arc` so + /// the (pre-spawned) watcher tasks can keep firing on it + /// without re-borrowing `Self` — the construction order would + /// otherwise need an `Arc::get_mut` dance. + wake: Arc, + /// Sessions whose drainability the wait helper checks + /// synchronously before parking. Owned (deduplicated) here so + /// every caller agrees on the set. + inners: Vec>, + /// One bridge task per unique inner. Each calls + /// `inner.notify.notified().await` in a loop and fans out real + /// state changes to `wake.notify_waiters()`. Held in + /// `AbortOnDrop` so they go away when `BatchWait` drops. + _watchers: Vec, +} + +impl BatchWait { + fn new(inners: Vec>) -> Arc { + // Deduplicate by Arc pointer. The same session can appear in + // a batch via multiple paths — e.g. a `connect_data` plus a + // seq `data` op for the same sid. Spawning two watchers for + // it would double the wake fan-out work for no benefit. + // We also STORE the dedup'd list (not the original `inners`), + // so `is_any_drainable` doesn't re-lock the same Mutex once + // per duplicate appearance. + let mut seen: std::collections::HashSet = + std::collections::HashSet::new(); + let mut unique: Vec> = Vec::with_capacity(inners.len()); + for inner in &inners { + let ptr = Arc::as_ptr(inner) as usize; + if seen.insert(ptr) { + unique.push(inner.clone()); + } + } + + let wake = Arc::new(Notify::new()); + let mut watchers: Vec = Vec::with_capacity(unique.len()); + for inner in unique.iter().cloned() { + let wake = wake.clone(); + watchers.push(AbortOnDrop(tokio::spawn(async move { + loop { + // Pre-`enable()` the next Notified BEFORE + // checking state, so a `notify_waiters()` fired + // in the gap between our previous fan-out and + // this register isn't lost. `reader_task` uses + // `notify_waiters()` (not `notify_one()`) so no + // permits are stored for unregistered waiters + // — the enable() registration is what guarantees + // we observe every push. + let notified = inner.notify.notified(); + tokio::pin!(notified); + notified.as_mut().enable(); + + // Filter false wakes: only fan out when the + // session is actually drainable. A wake fired + // for a push that's already been drained by a + // prior batch loops back without disturbing + // batch waiters. + let drainable = inner.eof.load(Ordering::Acquire) + || !inner.read_buf.lock().await.is_empty(); + if drainable { + wake.notify_waiters(); + } + + notified.await; + } + }))); + } + + Arc::new(Self { + wake, + inners: unique, + _watchers: watchers, + }) + } + + /// Wait until any unique inner is drainable, or `deadline` + /// elapses. The pre-armed `Notified` closes the same race + /// `wait_for_seq_turn` handles: any wake fired between + /// the synchronous drainability check and the await is still + /// observed. 
`notify_waiters()` fans out to ALL parked + /// `BatchWait::wait` callers at once, so a single push wakes + /// every seq job and Phase 2's TCP wait simultaneously — the + /// exact behavior the per-job `wait_for_any_drainable` calls + /// failed to deliver under `notify_one()`. + async fn wait(&self, deadline: Duration) { + if self.inners.is_empty() { + return; + } + let notified = self.wake.notified(); + tokio::pin!(notified); + notified.as_mut().enable(); + if self.is_any_drainable().await { + return; + } + let _ = tokio::time::timeout(deadline, notified).await; + } + + async fn is_any_drainable(&self) -> bool { + for inner in &self.inners { + if inner.eof.load(Ordering::Acquire) { + return true; + } + if !inner.read_buf.lock().await.is_empty() { + return true; + } + } + false + } +} + /// UDP equivalent of `SessionInner`. Holds a *connected* `UdpSocket` /// pinned to one `(host, port)` upstream so we don't have to re-resolve /// or re-parse the destination on every datagram. `notify` is fired by @@ -212,12 +394,18 @@ async fn create_session(host: &str, port: u16) -> std::io::Result ManagedSession { eof: AtomicBool::new(false), last_active: Mutex::new(Instant::now()), notify: Notify::new(), + seq_state: Mutex::new(SeqState { expected: 0 }), + seq_advance: Notify::new(), }); let inner_ref = inner.clone(); let reader_handle = tokio::spawn(reader_task(read_half, inner_ref)); let udpgw_handle = Some(tokio::spawn(udpgw::udpgw_server_task(server_half))); - ManagedSession { inner, reader_handle, udpgw_handle } + ManagedSession { + inner, + reader_handle, + udpgw_handle, + } } async fn reader_task(mut reader: impl AsyncRead + Unpin, session: Arc) { @@ -246,25 +440,37 @@ async fn reader_task(mut reader: impl AsyncRead + Unpin, session: Arc { session.eof.store(true, Ordering::Release); - session.notify.notify_one(); + session.notify.notify_waiters(); break; } Ok(n) => { // Extend the buffer before notifying. The MutexGuard is // dropped at the end of the statement, *before* the - // notify_one call below, so any waiter that wakes on the - // notify and then locks read_buf can immediately observe - // the new bytes — no torn read where the wake fires but - // the buffer still looks empty. Notify::notify_one also - // stores a permit if no waiter is currently registered, - // so we never lose an edge across the spawn race in - // wait_for_any_drainable. + // notify call below, so any waiter that wakes on the + // notify and then locks read_buf can immediately + // observe the new bytes — no torn read where the wake + // fires but the buffer still looks empty. + // + // `notify_waiters()` (not `notify_one()`) so EVERY + // parked waiter wakes — required for correctness when + // pipelining puts seq=N and seq=N+1 for the same + // session in different batches: each batch builds its + // own `BatchWait` watcher on this session's notify, + // and a single `notify_one()` would only wake one of + // them, leaving the other batch's watcher asleep + // until `LONGPOLL_DEADLINE`. The cost of dropping the + // permit-storage semantics is handled by the + // pre-`enable()` pattern in `BatchWait`'s and + // `wait_for_any_drainable`'s watchers — they register + // their `Notified` future before the synchronous + // drainability check, so a wake fired in the gap is + // still observed. 
session.read_buf.lock().await.extend_from_slice(&buf[..n]); - session.notify.notify_one(); + session.notify.notify_waiters(); } Err(_) => { session.eof.store(true, Ordering::Release); - session.notify.notify_one(); + session.notify.notify_waiters(); break; } } @@ -442,6 +648,13 @@ async fn wait_for_any_drainable(inners: &[Arc], deadline: Duration // race-safety note above. Watchers are held in a Vec of // `AbortOnDrop`, so they're aborted on every exit path — // including cancellation by an outer `select!`. + // + // The pre-`enable()` pattern is required since + // `reader_task` uses `notify_waiters()` (not `notify_one()`): + // there are no stored permits, so a wake fired between the + // synchronous state check and parking on `notified.await` + // would otherwise be lost. Registering the future before the + // check guarantees we observe every push. let (tx, mut rx) = mpsc::channel::<()>(1); let mut _watchers: Vec = Vec::with_capacity(inners.len()); for inner in inners { @@ -449,17 +662,20 @@ async fn wait_for_any_drainable(inners: &[Arc], deadline: Duration let tx = tx.clone(); _watchers.push(AbortOnDrop(tokio::spawn(async move { loop { - inner.notify.notified().await; + let notified = inner.notify.notified(); + tokio::pin!(notified); + notified.as_mut().enable(); if inner.eof.load(Ordering::Acquire) { break; } if !inner.read_buf.lock().await.is_empty() { break; } - // Stale permit (notify fired but state didn't change in - // an observable way — e.g., bytes were already drained - // by a prior batch). Loop back and wait for a real - // notify, don't wake the caller. + notified.await; + // Wake observed — state may have changed; loop back + // and re-check. If the wake was for a push that + // another batch already drained, the next iteration + // sees buf empty + eof unset and re-arms. } let _ = tx.try_send(()); }))); @@ -582,9 +798,15 @@ async fn wait_and_drain(session: &SessionInner, max_wait: Duration) -> (Vec, prev_len = cur_len; ever_had_data = true; } - if is_eof { break; } - if Instant::now() >= deadline { break; } - if ever_had_data && last_growth.elapsed() > Duration::from_millis(100) { break; } + if is_eof { + break; + } + if Instant::now() >= deadline { + break; + } + if ever_had_data && last_growth.elapsed() > Duration::from_millis(100) { + break; + } tokio::time::sleep(Duration::from_millis(10)).await; } @@ -626,34 +848,78 @@ struct AppState { struct TunnelRequest { k: String, op: String, - #[serde(default)] host: Option, - #[serde(default)] port: Option, - #[serde(default)] sid: Option, - #[serde(default)] data: Option, + #[serde(default)] + host: Option, + #[serde(default)] + port: Option, + #[serde(default)] + sid: Option, + #[serde(default)] + data: Option, } #[derive(Serialize, Clone, Debug)] struct TunnelResponse { - #[serde(skip_serializing_if = "Option::is_none")] sid: Option, - #[serde(skip_serializing_if = "Option::is_none")] d: Option, + #[serde(skip_serializing_if = "Option::is_none")] + sid: Option, + #[serde(skip_serializing_if = "Option::is_none")] + d: Option, /// UDP datagrams returned to the client, base64-encoded individually. /// `None` for TCP responses; `Some(vec![])` is never serialized /// (the field is dropped when empty by the empty-on-None check above). 
- #[serde(skip_serializing_if = "Option::is_none")] pkts: Option>, - #[serde(skip_serializing_if = "Option::is_none")] eof: Option, - #[serde(skip_serializing_if = "Option::is_none")] e: Option, - #[serde(skip_serializing_if = "Option::is_none")] code: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pkts: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + eof: Option, + #[serde(skip_serializing_if = "Option::is_none")] + e: Option, + #[serde(skip_serializing_if = "Option::is_none")] + code: Option, + /// Per-session sequence number echoed from `BatchOp::seq` for `data` + /// ops (pipelining). Lets the client route an out-of-order reply + /// into its per-session reorder buffer. Skipped on the wire when + /// `None` (the legacy/non-pipelined path). `u64` to keep + /// long-lived sessions from saturating — see `SeqState::expected` + /// for the math. + #[serde(skip_serializing_if = "Option::is_none")] + seq: Option, + /// Capability bitfield set on `connect` / `connect_data` success + /// replies so the client can opt into protocol features it knows + /// the tunnel-node speaks. Bit 0 = `CAPS_PIPELINE_SEQ`. Skipped + /// elsewhere (other op replies don't advertise capabilities). + #[serde(skip_serializing_if = "Option::is_none")] + caps: Option, } +/// Capability bit for per-session `data`-op sequence numbers +/// (pipelining). Set on `connect` / `connect_data` success replies so +/// new clients opt into pipelining; old clients ignore the unknown +/// JSON field. +const CAPS_PIPELINE_SEQ: u32 = 1 << 0; + impl TunnelResponse { fn error(msg: impl Into) -> Self { - Self { sid: None, d: None, pkts: None, eof: None, e: Some(msg.into()), code: None } + Self { + sid: None, + d: None, + pkts: None, + eof: None, + e: Some(msg.into()), + code: None, + seq: None, + caps: None, + } } fn unsupported_op(op: &str) -> Self { Self { - sid: None, d: None, pkts: None, eof: None, + sid: None, + d: None, + pkts: None, + eof: None, e: Some(format!("unknown op: {}", op)), code: Some(CODE_UNSUPPORTED_OP.into()), + seq: None, + caps: None, } } } @@ -671,10 +937,24 @@ struct BatchRequest { #[derive(Deserialize)] struct BatchOp { op: String, - #[serde(default)] sid: Option, - #[serde(default)] host: Option, - #[serde(default)] port: Option, - #[serde(default)] d: Option, // base64 data + #[serde(default)] + sid: Option, + #[serde(default)] + host: Option, + #[serde(default)] + port: Option, + #[serde(default)] + d: Option, // base64 data + /// Per-session monotonic sequence number for `data` ops on a + /// pipelining-enabled session. When present, the server enforces + /// in-order processing: ops with `seq` arriving out of order are + /// held until earlier seqs land (or the session times out). Old + /// clients omit the field — `serde(default)` deserializes them + /// to `None`, which selects the legacy first-come-first-served + /// path. Echoed back in the response's `seq`. `u64` for the same + /// long-running-session reason as `SeqState::expected`. 
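The per-session reorder buffer these doc comments refer to lives on the client and isn't part of this diff. A minimal sketch, assuming the client keys early-arriving replies by their echoed seq and releases only the contiguous prefix (the `ReorderBuffer` name and shape are illustrative, not the client's actual structure):

    use std::collections::BTreeMap;

    // Illustrative only: hold out-of-order replies and emit downlink
    // bytes strictly in seq order.
    struct ReorderBuffer {
        next: u64,                       // next seq to release to the local socket
        pending: BTreeMap<u64, Vec<u8>>, // replies that arrived ahead of `next`
    }

    impl ReorderBuffer {
        fn push(&mut self, seq: u64, bytes: Vec<u8>) -> Vec<Vec<u8>> {
            self.pending.insert(seq, bytes);
            let mut ready = Vec::new();
            // Release the contiguous run starting at `next`; anything
            // later stays buffered until the gap fills.
            while let Some(b) = self.pending.remove(&self.next) {
                ready.push(b);
                self.next += 1;
            }
            ready
        }
    }

The client would engage such a path only when the session's first reply advertised the capability — e.g. `resp.caps.map_or(false, |c| (c & CAPS_PIPELINE_SEQ) != 0)` — and stay on the single-in-flight loop otherwise.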
+ #[serde(default)] + seq: Option, } #[derive(Serialize)] @@ -695,9 +975,7 @@ async fn handle_tunnel( } let resp: TunnelResponse = match req.op.as_str() { "connect" => handle_connect(&state, req.host, req.port).await, - "connect_data" => { - handle_connect_data_single(&state, req.host, req.port, req.data).await - } + "connect_data" => handle_connect_data_single(&state, req.host, req.port, req.data).await, "data" => handle_data_single(&state, req.sid, req.data).await, "close" => handle_close(&state, req.sid).await, other => TunnelResponse::unsupported_op(other), @@ -730,10 +1008,7 @@ fn decoy_or_unauthorized(diagnostic_mode: bool) -> axum::response::Response { // Batch handler // --------------------------------------------------------------------------- -async fn handle_batch( - State(state): State, - body: Bytes, -) -> impl IntoResponse { +async fn handle_batch(State(state): State, body: Bytes) -> impl IntoResponse { // Decompress if gzipped let json_bytes = if body.starts_with(&[0x1f, 0x8b]) { match decompress_gzip(&body) { @@ -741,8 +1016,13 @@ async fn handle_batch( Err(e) => { let resp = serde_json::to_vec(&BatchResponse { r: vec![TunnelResponse::error(format!("gzip decode: {}", e))], - }).unwrap_or_default(); - return (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], resp); + }) + .unwrap_or_default(); + return ( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/json")], + resp, + ); } } } else { @@ -754,8 +1034,13 @@ async fn handle_batch( Err(e) => { let resp = serde_json::to_vec(&BatchResponse { r: vec![TunnelResponse::error(format!("bad json: {}", e))], - }).unwrap_or_default(); - return (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], resp); + }) + .unwrap_or_default(); + return ( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/json")], + resp, + ); } }; @@ -763,8 +1048,13 @@ async fn handle_batch( if state.diagnostic_mode { let resp = serde_json::to_vec(&BatchResponse { r: vec![TunnelResponse::error("unauthorized")], - }).unwrap_or_default(); - return (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], resp); + }) + .unwrap_or_default(); + return ( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/json")], + resp, + ); } // Production: same nginx-404 decoy as the single-op path. See // `decoy_or_unauthorized` for rationale. @@ -773,7 +1063,11 @@ async fn handle_batch(
nginx
\r\n\r\n\r\n" .as_bytes() .to_vec(); - return (StatusCode::NOT_FOUND, [(header::CONTENT_TYPE, "text/html")], body); + return ( + StatusCode::NOT_FOUND, + [(header::CONTENT_TYPE, "text/html")], + body, + ); } // Process all ops in two phases. @@ -797,7 +1091,22 @@ async fn handle_batch( // map lock isn't held across the per-session read_buf / packets // mutex acquisition — without this, every other batch (and every // connect/close op) head-of-line-blocks behind the drain. - let mut tcp_drains: Vec<(usize, String, Arc)> = Vec::new(); + // + // The `is_connect_data` flag distinguishes drains that came from + // a successful `connect_data` op (a fresh session) from drains + // that came from a regular `data` op (an existing session). The + // batched-connect_data response is the FIRST reply the client + // sees for a session, so it has to carry `caps` — same as the + // single-op `handle_connect_data_single` path. Without this + // distinction the batch path silently downgrades every HTTPS + // fast-path session to the legacy non-pipelined loop. + struct TcpDrain { + i: usize, + sid: String, + inner: Arc, + is_connect_data: bool, + } + let mut tcp_drains: Vec = Vec::new(); let mut udp_drains: Vec<(usize, String, Arc)> = Vec::new(); // True iff the batch contained any op that performed a real action // upstream — a new connection or a non-empty data write. A batch of @@ -811,6 +1120,46 @@ async fn handle_batch( UdpOpen(Result<(String, Arc), TunnelResponse>), } let mut new_conn_jobs: JoinSet<(usize, NewConn)> = JoinSet::new(); + // Seq-ordered `data` op specs collected during the dispatch loop + // and spawned later, after `had_writes_or_connects` is final and + // after `new_conn_jobs` has populated `tcp_drains`. Spawning then + // running concurrently with Phase 2 (rather than awaiting all + // seq jobs *before* Phase 2) prevents a single idle pipelined + // poll from holding the whole batch response open for 15 s while + // sibling sessions are ready to ship bytes. + struct SeqDataSpec { + i: usize, + sid: String, + seq: u64, + data: Option, + inner: Arc, + } + let mut seq_specs: Vec = Vec::new(); + // `close` ops collected during dispatch and processed AFTER + // seq-data jobs complete. Running close inline would race + // same-sid seq ops by removing the session from `state.sessions` + // and aborting `reader_task` before the deferred seq job's + // (write, drain) ran on its cloned `Arc`. + let mut pending_closes: Vec<(usize, Option)> = Vec::new(); + // Sids that have an earlier-in-batch op deferred to Phase 2 or + // `seq_data_jobs`. Used by the `close` arm to decide whether + // to defer this close (race with the deferred op) or run it + // inline (preserve the request's close-first ordering when no + // earlier same-sid op exists). Without this check, a batch + // ordered as `[close(X), data(seq=0, X)]` would unconditionally + // defer the close, letting the data write succeed against a + // session the client asked to be closed FIRST — a protocol- + // ordering regression vs the pre-pipelining shape. + let mut sids_with_pending_deferred: std::collections::HashSet = + std::collections::HashSet::new(); + // Shared response-byte budget for this batch. seq-data jobs run + // concurrently with Phase 2's `tcp_drains` loop, so the cap + // (#863) must be shared across both code paths — M concurrent + // pipelined sessions each grabbing `TCP_DRAIN_MAX_BYTES` would + // stack past Apps Script's 50 MiB response ceiling. 
The Mutex is + // held only briefly per reservation; the actual drain runs + // unlocked. + let response_budget: Arc> = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); for (i, op) in req.ops.iter().enumerate() { match op.op.as_str() { @@ -820,7 +1169,10 @@ async fn handle_batch( let host = op.host.clone(); let port = op.port; new_conn_jobs.spawn(async move { - (i, NewConn::Connect(handle_connect(&state, host, port).await)) + ( + i, + NewConn::Connect(handle_connect(&state, host, port).await), + ) }); } "connect_data" => { @@ -858,7 +1210,10 @@ async fn handle_batch( "data" => { let sid = match &op.sid { Some(s) if !s.is_empty() => s.clone(), - _ => { results.push((i, TunnelResponse::error("missing sid"))); continue; } + _ => { + results.push((i, TunnelResponse::error("missing sid"))); + continue; + } }; // Clone the inner under the map lock and release it @@ -872,6 +1227,35 @@ async fn handle_batch( let sessions = state.sessions.lock().await; sessions.get(&sid).map(|s| s.inner.clone()) }; + // Pipelining branch: a `data` op carrying a seq is + // dispatched into the seq_data_jobs JoinSet so its + // potentially-long wait for an earlier seq doesn't + // block other ops in this batch. The job's + // `process_seq_data_op` runs the full (write, drain) + // sequence under the per-session seq lock, which is + // what enforces downlink-byte ordering across batches. + if let Some(seq) = op.seq { + if let Some(inner) = inner { + let had_uplink_b64 = + op.d.as_deref().map(|s| !s.is_empty()).unwrap_or(false); + if had_uplink_b64 { + had_writes_or_connects = true; + } + sids_with_pending_deferred.insert(sid.clone()); + seq_specs.push(SeqDataSpec { + i, + sid: sid.clone(), + seq, + data: op.d.clone(), + inner, + }); + } else { + let mut r = eof_response(sid); + r.seq = Some(seq); + results.push((i, r)); + } + continue; + } if let Some(inner) = inner { *inner.last_active.lock().await = Instant::now(); if let Some(ref data_b64) = op.d { @@ -899,7 +1283,13 @@ async fn handle_batch( } } } - tcp_drains.push((i, sid, inner)); + sids_with_pending_deferred.insert(sid.clone()); + tcp_drains.push(TcpDrain { + i, + sid, + inner, + is_connect_data: false, + }); } else { results.push((i, eof_response(sid))); } @@ -907,7 +1297,10 @@ async fn handle_batch( "udp_data" => { let sid = match &op.sid { Some(s) if !s.is_empty() => s.clone(), - _ => { results.push((i, TunnelResponse::error("missing sid"))); continue; } + _ => { + results.push((i, TunnelResponse::error("missing sid"))); + continue; + } }; let inner = { @@ -942,14 +1335,37 @@ async fn handle_batch( if had_uplink { *inner.last_active.lock().await = Instant::now(); } + sids_with_pending_deferred.insert(sid.clone()); udp_drains.push((i, sid, inner)); } else { results.push((i, eof_response(sid))); } } "close" => { - let r = handle_close(&state, op.sid.clone()).await; - results.push((i, r)); + // Defer ONLY if there's an earlier-in-batch op for + // the same sid that's already deferred (seq data, + // legacy data, udp_data). Without an earlier + // deferred op, running close inline preserves the + // request's per-sid op order — important for + // `[close, data]` and `[close, data(seq)]` shapes: + // the client asked us to close FIRST, then write, + // and the write should land on a closed (eof'd) + // session, not race the close. With an earlier + // deferred op, deferring close prevents the race + // where same-sid seq jobs run on a closed session + // (writing to an aborted upstream, draining an + // empty buffer). 
+ let needs_defer = op + .sid + .as_deref() + .map(|s| sids_with_pending_deferred.contains(s)) + .unwrap_or(false); + if needs_defer { + pending_closes.push((i, op.sid.clone())); + } else { + let r = handle_close(&state, op.sid.clone()).await; + results.push((i, r)); + } } other => { results.push((i, TunnelResponse::unsupported_op(other))); @@ -964,7 +1380,16 @@ async fn handle_batch( match join { Ok((i, NewConn::Connect(r))) => results.push((i, r)), Ok((i, NewConn::ConnectData(Ok((sid, inner))))) => { - tcp_drains.push((i, sid, inner)); + // First reply on this session — must carry caps so + // the client knows it can opt into the pipelined + // tunnel_loop variant. Phase 2's drain stamps the + // bit on responses where `is_connect_data` is true. + tcp_drains.push(TcpDrain { + i, + sid, + inner, + is_connect_data: true, + }); } Ok((i, NewConn::ConnectData(Err(r)))) => results.push((i, r)), Ok((i, NewConn::UdpOpen(Ok((sid, inner))))) => { @@ -977,162 +1402,294 @@ async fn handle_batch( } } - // Phase 2: signal-driven wait for any session (TCP or UDP) to have - // data, then drain TCP and UDP independently in a single pass each. - // Deadlines: - // * `ACTIVE_DRAIN_DEADLINE` (~350 ms) when the batch had real work. - // Typical responses arrive in ms; the wait helpers return on - // the first notify. For active batches we settle for - // `STRAGGLER_SETTLE` so neighbors whose replies trail by a few - // ms aren't reported empty. - // * `LONGPOLL_DEADLINE` for pure-poll batches — held open until - // upstream pushes data. UDP idle polls benefit from this just - // as much as TCP, so the same window applies. - if !tcp_drains.is_empty() || !udp_drains.is_empty() { - let deadline = if had_writes_or_connects { - ACTIVE_DRAIN_DEADLINE - } else { - LONGPOLL_DEADLINE - }; + // Build the shared wait set: every TCP session that anyone in + // this batch will drain (Phase 2 connect_data / data, plus seq + // jobs). Build a single shared `BatchWait` so: + // * One watcher task per UNIQUE inner — not the M × N spawn + // the per-job `wait_for_any_drainable` calls produced. + // * `notify_waiters()` fans out to ALL parked jobs at once + // (fixing the `notify_one` race where a single push only + // woke one of N waiters and the rest sat until + // `LONGPOLL_DEADLINE`). + let batch_wait: Arc = BatchWait::new({ + let mut v: Vec> = + tcp_drains.iter().map(|d| d.inner.clone()).collect(); + v.extend(seq_specs.iter().map(|s| s.inner.clone())); + v + }); - // Phase 1 already gave us each session's Arc<…Inner>, so we - // don't need to re-acquire the sessions map lock here. Cloning - // the Arc is just a refcount bump. - let tcp_inners: Vec> = - tcp_drains.iter().map(|(_, _, inner)| inner.clone()).collect(); - let udp_inners: Vec> = - udp_drains.iter().map(|(_, _, inner)| inner.clone()).collect(); - - // Wake on whichever side has work first. The previous - // `tokio::join!` was conjunctive — a TCP burst still paid the - // UDP deadline in mixed batches because the UDP waiter had to - // elapse too. `wait_for_*_drainable` short-circuits on an empty - // slice, so we have to skip the empty side; otherwise its - // instant return would fire the select arm before the other - // side ever got a chance to wait. - match (tcp_inners.is_empty(), udp_inners.is_empty()) { - (true, true) => {} - (false, true) => wait_for_any_drainable(&tcp_inners, deadline).await, - (true, false) => wait_for_any_udp_drainable(&udp_inners, deadline).await, - (false, false) => { - tokio::select! 
{ - _ = wait_for_any_drainable(&tcp_inners, deadline) => {} - _ = wait_for_any_udp_drainable(&udp_inners, deadline) => {} - } - } - } + // Spawn seq-ordered `data` ops AFTER `new_conn_jobs` so each + // task sees the final value of `had_writes_or_connects` and can + // pick the right drain deadline (active vs long-poll). Keeping + // these in a JoinSet lets us race them with Phase 2 below, so a + // single idle pipelined poll doesn't gate the whole batch + // response on `LONGPOLL_DEADLINE`. + let mut seq_data_jobs: JoinSet<(usize, TunnelResponse)> = JoinSet::new(); + for spec in seq_specs { + let budget = response_budget.clone(); + let active = had_writes_or_connects; + let bw = batch_wait.clone(); + seq_data_jobs.spawn(async move { + let resp = process_seq_data_op( + spec.inner, spec.sid, spec.seq, spec.data, budget, active, bw, + ) + .await; + (spec.i, resp) + }); + } - if had_writes_or_connects { - // Adaptive settle: keep waiting in steps while new data - // keeps arriving. Break when: - // 1. No new data arrived in the last step (burst is over) - // 2. STRAGGLER_SETTLE_MAX reached - let settle_end = Instant::now() + STRAGGLER_SETTLE_MAX; - let mut prev_tcp_bytes: usize = 0; - let mut prev_udp_pkts: usize = 0; - // Snapshot current buffer sizes. - for inner in &tcp_inners { - prev_tcp_bytes += inner.read_buf.lock().await.len(); - } - for inner in &udp_inners { - prev_udp_pkts += inner.packets.lock().await.len(); - } - loop { - let now = Instant::now(); - if now >= settle_end { - break; + // Phase 2 (legacy non-seq drain) and seq-data jobs run + // concurrently — neither's wait phase gates the other. Phase 2 + // waits up to ACTIVE_DRAIN_DEADLINE / LONGPOLL_DEADLINE for + // tcp_drains / udp_drains; seq jobs have their own per-session + // waits (per-inner for active, shared `BatchWait` for idle). + // Both share the same `BatchWait`, so a wake from any session's + // `reader_task` resolves both sides simultaneously. + // + // Note: the HTTP batch response itself is held until BOTH sides + // finish — the `tokio::join!` below joins everything before + // building the JSON body. The "concurrent" claim is about wait- + // phase coupling, not about returning the response early. + // Apps Script / the batched protocol surface doesn't support + // partial / streamed responses, so all results have to land in + // one body. + let response_budget_p2 = response_budget.clone(); + let phase_2_batch_wait = batch_wait.clone(); + let phase_2_fut = async move { + let mut p2_results: Vec<(usize, TunnelResponse)> = Vec::new(); + let mut tcp_eof_sids: Vec = Vec::new(); + let mut udp_eof_sids: Vec = Vec::new(); + + if !tcp_drains.is_empty() || !udp_drains.is_empty() { + let deadline = if had_writes_or_connects { + ACTIVE_DRAIN_DEADLINE + } else { + LONGPOLL_DEADLINE + }; + + // Phase 1 already gave us each session's Arc<…Inner>, so + // we don't need to re-acquire the sessions map lock + // here. Cloning the Arc is just a refcount bump. + let tcp_inners: Vec> = + tcp_drains.iter().map(|d| d.inner.clone()).collect(); + let udp_inners: Vec> = udp_drains + .iter() + .map(|(_, _, inner)| inner.clone()) + .collect(); + + // Wake on whichever side has work first. The TCP wait + // is the SHARED `BatchWait` — its watcher tasks bridge + // every unique session's `inner.notify` to a single + // batch-wide `notify_waiters()`, so a seq session + // getting bytes wakes Phase 2 too AND a Phase 2 session + // getting bytes wakes every seq job. We only DRAIN our + // own `tcp_inners` below; the seq jobs handle theirs. 
+ // + // The previous `tokio::join!` was conjunctive — a TCP + // burst still paid the UDP deadline in mixed batches. + // `BatchWait::wait` short-circuits on an empty inner set + // (and `wait_for_any_udp_drainable` does the same for + // UDP), so we skip the empty side to avoid its instant + // return firing the select arm prematurely. + let tcp_wait_empty = phase_2_batch_wait.inners.is_empty(); + match (tcp_wait_empty, udp_inners.is_empty()) { + (true, true) => {} + (false, true) => phase_2_batch_wait.wait(deadline).await, + (true, false) => wait_for_any_udp_drainable(&udp_inners, deadline).await, + (false, false) => { + tokio::select! { + _ = phase_2_batch_wait.wait(deadline) => {} + _ = wait_for_any_udp_drainable(&udp_inners, deadline) => {} + } } - let remaining = settle_end.duration_since(now); - tokio::time::sleep(STRAGGLER_SETTLE_STEP.min(remaining)).await; + } - // Measure current buffer sizes. - let mut tcp_bytes: usize = 0; - let mut udp_pkts: usize = 0; + if had_writes_or_connects { + // Adaptive settle: keep waiting in steps while new + // data keeps arriving. Break when: + // 1. No new data arrived in the last step (burst is over) + // 2. STRAGGLER_SETTLE_MAX reached + let settle_end = Instant::now() + STRAGGLER_SETTLE_MAX; + let mut prev_tcp_bytes: usize = 0; + let mut prev_udp_pkts: usize = 0; for inner in &tcp_inners { - tcp_bytes += inner.read_buf.lock().await.len(); + prev_tcp_bytes += inner.read_buf.lock().await.len(); } for inner in &udp_inners { - udp_pkts += inner.packets.lock().await.len(); + prev_udp_pkts += inner.packets.lock().await.len(); } + loop { + let now = Instant::now(); + if now >= settle_end { + break; + } + let remaining = settle_end.duration_since(now); + tokio::time::sleep(STRAGGLER_SETTLE_STEP.min(remaining)).await; - // No new data since last step — burst is over. - if tcp_bytes == prev_tcp_bytes && udp_pkts == prev_udp_pkts { - break; - } + let mut tcp_bytes: usize = 0; + let mut udp_pkts: usize = 0; + for inner in &tcp_inners { + tcp_bytes += inner.read_buf.lock().await.len(); + } + for inner in &udp_inners { + udp_pkts += inner.packets.lock().await.len(); + } + + if tcp_bytes == prev_tcp_bytes && udp_pkts == prev_udp_pkts { + break; + } - prev_tcp_bytes = tcp_bytes; - prev_udp_pkts = udp_pkts; + prev_tcp_bytes = tcp_bytes; + prev_udp_pkts = udp_pkts; + } } - } - // ---- TCP drain ---- - // Drain through each session's already-cloned Arc so the global - // sessions map lock isn't held across the per-session - // read_buf.lock().await. - // - // Cleanup is driven off `drain_now`'s returned `eof`, NOT the - // raw `inner.eof` atomic. When the buffer exceeds - // `TCP_DRAIN_MAX_BYTES`, `drain_now` deliberately returns - // `eof = false` and leaves the tail in the buffer so the - // client can pick it up on the next poll. The previous cleanup - // read the atomic directly, so on a high-throughput session - // that closed mid-burst (issue #460-style) it would remove the - // session and abort the reader_task with the tail still - // buffered, dropping those bytes. - let mut tcp_eof_sids: Vec = Vec::new(); - // Track remaining batch-response budget across all session drains - // (#863). Per-session `TCP_DRAIN_MAX_BYTES` alone wasn't enough — - // several concurrent sessions each contributing 16 MiB summed past - // Apps Script's 50 MiB response ceiling. This cap stops one session - // short of the cliff; deferred sessions drain on the next poll. 
- let mut remaining_budget: usize = BATCH_RESPONSE_BUDGET; - for (i, sid, inner) in &tcp_drains { - let (data, eof) = drain_now(inner, remaining_budget).await; - let drained = data.len(); - if eof { - tcp_eof_sids.push(sid.clone()); + // ---- TCP drain ---- + // Cleanup is driven off `drain_now`'s returned `eof`, NOT + // the raw `inner.eof` atomic. When the buffer exceeds + // `TCP_DRAIN_MAX_BYTES`, `drain_now` deliberately returns + // `eof = false` and leaves the tail in the buffer so the + // client can pick it up on the next poll. + // + // On budget exhaustion, push an empty (eof=false) + // response per remaining drain rather than `break`-ing — + // every (i, sid) registered as a drain corresponds to an + // op the client expects a reply for. Skipping pushes + // would surface as "missing response in batch" client- + // side, and for connect_data would leak a freshly-opened + // server session (the client never learns the sid, so + // never sends `close`). + for drain in &tcp_drains { + // Reserve at most `TCP_DRAIN_MAX_BYTES`, not the + // entire remaining budget. Mirrors the seq path + // (`process_seq_data_op`): under mixed legacy + + // seq batches, setting budget to 0 here would + // make concurrently-running seq jobs observe + // `take == 0` and return empty responses for an + // op slot whose own drain would have fit in the + // budget once we refunded the unused portion a + // moment later. With the per-iteration cap, seq + // jobs continue to see headroom up to + // `TCP_DRAIN_MAX_BYTES` worth of budget while + // this drain is in flight. + let take = { + let mut b = response_budget_p2.lock().await; + let take = (*b).min(TCP_DRAIN_MAX_BYTES); + *b = b.saturating_sub(take); + take + }; + let mut resp = if take == 0 { + // Budget exhausted by sibling drains (legacy or + // seq). Push an empty placeholder so the client + // sees a reply for this index; the buffered + // bytes stay for the next poll. Don't surface + // raw inner.eof here for the same reason + // `drain_now` withholds it when bytes remain — + // closing the session client-side would drop + // the buffered tail. + tcp_drain_response(drain.sid.clone(), Vec::new(), false) + } else { + let (data, eof) = drain_now(&drain.inner, take).await; + let drained = data.len(); + if drained < take { + *response_budget_p2.lock().await += take - drained; + } + if eof { + tcp_eof_sids.push(drain.sid.clone()); + } + tcp_drain_response(drain.sid.clone(), data, eof) + }; + // Stamp caps on the connect_data first reply so the + // batched HTTPS fast path enables pipelining (matches + // `handle_connect_data_single`'s behavior). Plain + // `data` drains don't carry caps — they're not + // session-establishment replies. + if drain.is_connect_data { + resp.caps = Some(CAPS_PIPELINE_SEQ); + } + p2_results.push((drain.i, resp)); } - results.push((*i, tcp_drain_response(sid.clone(), data, eof))); - remaining_budget = remaining_budget.saturating_sub(drained); - if remaining_budget == 0 { - // Budget exhausted; remaining sessions in `tcp_drains` keep - // their buffered data and pick up next batch. - break; + + // ---- UDP drain ---- + // Same shape as TCP. `drain_udp_now` currently drains + // the full queue with no per-batch cap, so its returned + // `eof` already matches the atomic — driving cleanup + // off the drain return rather than the atomic catches a + // future per-batch UDP cap if added. 
+ for (i, sid, inner) in &udp_drains { + let (packets, eof) = drain_udp_now(inner).await; + if eof { + udp_eof_sids.push(sid.clone()); + } + p2_results.push((*i, udp_drain_response(sid.clone(), packets, eof))); } } - if !tcp_eof_sids.is_empty() { - let mut sessions = state.sessions.lock().await; - for sid in &tcp_eof_sids { - if let Some(s) = sessions.remove(sid) { - s.reader_handle.abort(); - tracing::info!("session {} closed by remote (batch)", sid); - } + + (p2_results, tcp_eof_sids, udp_eof_sids) + }; + + let seq_fut = async move { + let mut out: Vec<(usize, TunnelResponse)> = Vec::new(); + while let Some(join) = seq_data_jobs.join_next().await { + match join { + Ok((i, resp)) => out.push((i, resp)), + Err(e) => tracing::error!("seq-data task panicked: {}", e), } } + out + }; - // ---- UDP drain ---- - // Same shape as TCP. `drain_udp_now` currently drains the full - // queue with no per-batch cap, so its returned `eof` already - // matches the atomic — driving cleanup off the drain return - // is future-proofing: if a UDP per-batch packet cap is ever - // added (mirroring `TCP_DRAIN_MAX_BYTES`), the same data-loss - // trap that motivated the TCP-side fix reappears, and tracking - // eof from the drain return rather than the atomic catches it. - let mut udp_eof_sids: Vec = Vec::new(); - for (i, sid, inner) in &udp_drains { - let (packets, eof) = drain_udp_now(inner).await; - if eof { - udp_eof_sids.push(sid.clone()); + let ((p2_results, p2_tcp_eof_sids, p2_udp_eof_sids), seq_results) = + tokio::join!(phase_2_fut, seq_fut); + + // Process deferred close ops AFTER all seq-data jobs and Phase + // 2 drains have completed. Earlier same-sid seq ops have now + // written their uplink and produced their drain responses, so + // the close can safely tear down the session without racing + // them. Running close inline during dispatch (the pre-fix + // shape) removed the session from `state.sessions` and aborted + // `reader_task` while same-sid seq jobs were still about to + // run on their cloned `Arc`. + for (i, sid) in pending_closes { + let r = handle_close(&state, sid).await; + results.push((i, r)); + } + + // Collect seq-side eof'd sids the same way Phase 2 does, so an + // upstream-closed pipelined session is reaped immediately on + // its eof reply rather than waiting for the idle reaper. + let seq_tcp_eof_sids: Vec = seq_results + .iter() + .filter_map(|(_, r)| { + if r.eof.unwrap_or(false) { + r.sid.clone() + } else { + None + } + }) + .collect(); + + results.extend(p2_results); + results.extend(seq_results); + + // Combined TCP eof cleanup — both legacy Phase 2 and seq paths + // can produce eof'd sids; remove them in a single sessions-map + // lock acquisition. 
+ if !p2_tcp_eof_sids.is_empty() || !seq_tcp_eof_sids.is_empty() { + let mut sessions = state.sessions.lock().await; + for sid in p2_tcp_eof_sids.iter().chain(seq_tcp_eof_sids.iter()) { + if let Some(s) = sessions.remove(sid) { + s.reader_handle.abort(); + tracing::info!("session {} closed by remote (batch)", sid); } - results.push((*i, udp_drain_response(sid.clone(), packets, eof))); } - if !udp_eof_sids.is_empty() { - let mut sessions = state.udp_sessions.lock().await; - for sid in &udp_eof_sids { - if let Some(s) = sessions.remove(sid) { - s.reader_handle.abort(); - tracing::info!("udp session {} closed by remote (batch)", sid); - } + } + if !p2_udp_eof_sids.is_empty() { + let mut sessions = state.udp_sessions.lock().await; + for sid in &p2_udp_eof_sids { + if let Some(s) = sessions.remove(sid) { + s.reader_handle.abort(); + tracing::info!("udp session {} closed by remote (batch)", sid); } } } @@ -1144,17 +1701,27 @@ async fn handle_batch( }; let json = serde_json::to_vec(&batch_resp).unwrap_or_default(); - (StatusCode::OK, [(header::CONTENT_TYPE, "application/json")], json) + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/json")], + json, + ) } fn tcp_drain_response(sid: String, data: Vec, eof: bool) -> TunnelResponse { TunnelResponse { sid: Some(sid), - d: if data.is_empty() { None } else { Some(B64.encode(&data)) }, + d: if data.is_empty() { + None + } else { + Some(B64.encode(&data)) + }, pkts: None, eof: Some(eof), e: None, code: None, + seq: None, + caps: None, } } @@ -1171,6 +1738,8 @@ fn udp_drain_response(sid: String, packets: Vec>, eof: bool) -> TunnelRe eof: Some(eof), e: None, code: None, + seq: None, + caps: None, } } @@ -1182,6 +1751,316 @@ fn eof_response(sid: String) -> TunnelResponse { eof: Some(true), e: None, code: None, + seq: None, + caps: None, + } +} + +/// RAII guard that advances `expected` and notifies waiters on drop. +/// Holding this across the entire (write, drain) sequence guarantees +/// that subsequent seqs unblock on EVERY exit path, including early +/// returns from write/flush failures. Without it, an upstream socket +/// error mid-pipeline would leave `expected` frozen at the failed +/// seq, stranding all later seqs until each hit `SEQ_WAIT_TIMEOUT` +/// (~30 s) — pushing well-formed batches into client/Apps-Script +/// timeout territory. +struct SeqAdvanceOnDrop<'a> { + state: Option>, + notify: &'a Notify, +} + +impl<'a> Drop for SeqAdvanceOnDrop<'a> { + fn drop(&mut self) { + if let Some(mut s) = self.state.take() { + s.expected = s.expected.saturating_add(1); + // s drops here, releasing the seq_state lock before the + // notify_waiters() call below. Order matters: waiters + // wake and immediately try to lock seq_state, so they + // need to see the bumped value, which means the lock + // release has to happen before the wake. + } + self.notify.notify_waiters(); + } +} + +/// Seq-ordered `data` op processing for the pipelining protocol. +/// +/// The contract: when a `data` op carries a `seq`, the tunnel-node +/// must process the entire (write, wait, drain) sequence for the +/// session in monotonically increasing seq order. Two ops can travel +/// from the client through different deployments/network paths and +/// arrive here out of order — without ordering, the client's +/// per-session reorder buffer would see seq-N's reply containing +/// bytes that should have followed seq-N+1's, silently corrupting +/// the stream. 
+/// +/// Concurrency: the seq lock (held inside `SeqAdvanceOnDrop`) wraps +/// the *entire* (write, wait_for_drainable, drain) sequence and is +/// released — with `expected` advanced — on every exit path via +/// the guard's `Drop`. This guarantees per-session serialization of +/// drains as well as writes, AND ensures a write/flush error doesn't +/// strand later seqs behind a never-bumped `expected`. +/// +/// Drain budget: callers thread a shared `Arc>` through +/// every seq-data job in a batch (and through Phase 2's +/// `tcp_drains` loop) so the total bytes across all sessions in +/// one batch response stay under `BATCH_RESPONSE_BUDGET`. Without +/// this, M concurrent pipelined sessions would each return up to +/// `TCP_DRAIN_MAX_BYTES`, stacking past Apps Script's 50 MiB ceiling +/// and triggering truncation / parse failures (#863). +/// +/// `had_active_in_batch` propagates the dispatch loop's +/// `had_writes_or_connects` flag so an idle pipelined poll +/// coalesced with active sibling ops uses `ACTIVE_DRAIN_DEADLINE` +/// instead of `LONGPOLL_DEADLINE` — without this, a single empty +/// seq op can hold the whole batch response open for 15 s. +/// +/// `batch_wait` is the shared wait primitive for the whole batch. +/// Every seq job and Phase 2's TCP drain subscribe to its single +/// `Notify`; one push from any session's `reader_task` fans out to +/// `notify_waiters()` and wakes every parked job at once. Without +/// this shared primitive, an idle empty pipelined poll for session +/// B held the entire batch response open up to `LONGPOLL_DEADLINE`, +/// even if session A already had bytes another job was ready to +/// ship — and the previous per-job watcher design also hit the +/// `notify_one` race where a single push only woke one of N parked +/// jobs. +async fn process_seq_data_op( + inner: Arc, + sid: String, + seq: u64, + data_b64: Option, + response_budget: Arc>, + had_active_in_batch: bool, + batch_wait: Arc, +) -> TunnelResponse { + // Decode uplink first so a malformed payload doesn't burn a seq slot. + let uplink = match data_b64 { + Some(ref d) if !d.is_empty() => match B64.decode(d) { + Ok(b) => Some(b), + Err(e) => { + let mut r = TunnelResponse::error(format!("bad base64: {}", e)); + r.seq = Some(seq); + return r; + } + }, + _ => None, + }; + let had_uplink = uplink.as_ref().map(|b| !b.is_empty()).unwrap_or(false); + + // Wait for our seq turn. Returns the seq-state guard *without* + // bumping `expected` — we hold it across the entire op so drain + // serialization is guaranteed. + let seq_guard = match wait_for_seq_turn(&inner, seq, SEQ_WAIT_TIMEOUT).await { + Ok(g) => g, + Err(e) => { + let mut r = TunnelResponse::error(e); + r.seq = Some(seq); + return r; + } + }; + + // From here on, the SeqAdvanceOnDrop guard owns the seq_state + // MutexGuard and bumps `expected` + notify_waiters() on every + // return path — including the early-exit error returns below + // (write/flush failures). Without this, an upstream write + // failure would strand later seqs behind a never-bumped + // `expected` until each hit `SEQ_WAIT_TIMEOUT`. + let _advance = SeqAdvanceOnDrop { + state: Some(seq_guard), + notify: &inner.seq_advance, + }; + + // Refresh last_active on EVERY pipelined op, including empty + // polls. Matches the legacy non-seq `data` branch (line ~984): + // the idle reaper would otherwise close a long-lived session + // that's actively being long-polled by the client (no uplink, + // no recent inbound = stale, even though the client is still + // engaged). 
+ *inner.last_active.lock().await = Instant::now(); + + // Write uplink under the writer lock. The writer lock is a + // sub-lock here — only acquired when we actually have bytes to + // write. Lock order: seq → writer (no other path takes seq + // before writer, so this can't deadlock with the legacy non-seq + // data path which only takes writer). + if let Some(bytes) = uplink { + if !bytes.is_empty() { + let mut writer = inner.writer.lock().await; + if let Err(e) = writer.write_all(&bytes).await { + drop(writer); + let mut r = TunnelResponse::error(format!("write failed: {}", e)); + r.seq = Some(seq); + return r; + } + if let Err(e) = writer.flush().await { + drop(writer); + let mut r = TunnelResponse::error(format!("flush failed: {}", e)); + r.seq = Some(seq); + return r; + } + } + } + + // Pick the drain deadline AND the wait shape based on whether + // this op is active. + // + // ACTIVE (`had_uplink == true`): we just wrote uplink bytes; the + // response we want is THIS session's reply, not anyone else's. + // Wait per-session up to `ACTIVE_DRAIN_DEADLINE`, then run a + // straggler-settle loop (mirroring Phase 2's behavior) so a + // multi-packet response from upstream lands in one drain + // instead of being split across two batches. Subscribing to + // the shared batch_wait here would be wrong: a sibling + // session's bytes could short-circuit our wait, and we'd + // return empty for an active op that just hadn't gotten its + // own reply yet. + // + // IDLE (`had_uplink == false`): empty poll. If the batch has + // any active siblings, use `ACTIVE_DRAIN_DEADLINE` to keep the + // whole batch responsive; otherwise long-poll up to + // `LONGPOLL_DEADLINE`. Either way, subscribe to the SHARED + // `BatchWait` so a sibling getting bytes wakes us — that's + // the whole point of the shared wake design. + if had_uplink { + wait_for_any_drainable(std::slice::from_ref(&inner), ACTIVE_DRAIN_DEADLINE).await; + // Active straggler settle: keep waiting in 10 ms steps + // while bytes keep arriving, capped at STRAGGLER_SETTLE_MAX + // (~1 s). Without this, a multi-packet response that + // arrives a few ms apart gets split across this batch's + // drain and the next, doubling Apps Script round-trips + // for the same logical exchange. + let settle_end = Instant::now() + STRAGGLER_SETTLE_MAX; + let mut prev_len = inner.read_buf.lock().await.len(); + let raw_eof = inner.eof.load(Ordering::Acquire); + if !raw_eof { + loop { + let now = Instant::now(); + if now >= settle_end { + break; + } + let remaining = settle_end - now; + tokio::time::sleep(STRAGGLER_SETTLE_STEP.min(remaining)).await; + let cur_len = inner.read_buf.lock().await.len(); + let cur_eof = inner.eof.load(Ordering::Acquire); + if cur_len == prev_len && !cur_eof { + break; + } + prev_len = cur_len; + if cur_eof { + break; + } + } + } + } else { + let deadline = if had_active_in_batch { + ACTIVE_DRAIN_DEADLINE + } else { + LONGPOLL_DEADLINE + }; + if batch_wait.inners.is_empty() { + wait_for_any_drainable(std::slice::from_ref(&inner), deadline).await; + } else { + batch_wait.wait(deadline).await; + } + } + + // Reserve our share of the batch-wide response budget, drain + // exactly that much (or less), and return any unused budget so + // sibling tasks can use it. `drain_now` already handles the + // per-session `TCP_DRAIN_MAX_BYTES` cap and — critically — leaves + // the tail in the buffer when over the cap, so no bytes are lost. 
+ let take = { + let mut budget = response_budget.lock().await; + let take = (*budget).min(TCP_DRAIN_MAX_BYTES); + *budget = budget.saturating_sub(take); + take + }; + let (data, eof) = if take == 0 { + // Batch budget already exhausted by sibling drains. Don't + // drain any bytes here — they'll come back on the next poll. + // Critically, withhold EOF when the buffer still contains + // bytes: reporting eof=true here would close the session + // client-side and silently drop the buffered tail. This + // mirrors `drain_now`'s "withhold EOF until tail is drained" + // semantics; without it, the budget-exhaustion path becomes + // a data-loss bug on high-throughput pipelined sessions. + let buf_empty = inner.read_buf.lock().await.is_empty(); + let raw_eof = inner.eof.load(Ordering::Acquire); + let safe_eof = raw_eof && buf_empty; + (Vec::new(), safe_eof) + } else { + let (d, e) = drain_now(&inner, take).await; + if d.len() < take { + *response_budget.lock().await += take - d.len(); + } + (d, e) + }; + + // _advance drops here on the success path, bumping expected and + // notifying waiters. The same drop runs on every early return + // above. + let mut resp = tcp_drain_response(sid, data, eof); + resp.seq = Some(seq); + resp +} + +/// Wait for our `seq` to be the next-expected for this session and +/// return the seq-state guard. The caller HOLDS the guard across the +/// entire (write, wait_for_drainable, drain) sequence and only +/// releases it after bumping `expected` — that's what enforces both +/// uplink ordering AND drain ordering on a single shared +/// `read_buf`. An earlier shape that released the guard before the +/// drain phase let two concurrently-running seq tasks race for the +/// read_buf lock; a reader_task push between their two final takes +/// could split bytes across replies in reverse seq order, silently +/// corrupting the stream. +async fn wait_for_seq_turn<'a>( + inner: &'a SessionInner, + seq: u64, + timeout: Duration, +) -> Result, String> { + let start = Instant::now(); + loop { + // Subscribe to the notify *before* taking the seq_state + // lock. `Notify::notify_waiters()` does NOT save a permit + // for waiters that aren't yet registered, so the obvious + // `lock → check → drop → notified()` order races: an + // advance between drop and notified() is lost, and the + // waiter sleeps until SEQ_WAIT_TIMEOUT. `enable()` registers + // the future so any subsequent `notify_waiters()` wakes it, + // even if the wake fires before we reach `.await` below. + let notified = inner.seq_advance.notified(); + tokio::pin!(notified); + notified.as_mut().enable(); + + let state = inner.seq_state.lock().await; + if seq < state.expected { + // Late arrival — likely a client retry of a seq we + // already processed. Drop it; the client's earlier + // reply (if it arrived) handled this op. + return Err(format!( + "seq {} already processed (expected {})", + seq, state.expected + )); + } + if seq == state.expected { + return Ok(state); + } + // seq > expected: drop the seq lock and wait for an advance. + let current_expected = state.expected; + drop(state); + let elapsed = start.elapsed(); + if elapsed >= timeout { + return Err(format!( + "seq {} timed out waiting for {} (waited {:?})", + seq, current_expected, elapsed + )); + } + // The pre-enabled `notified` will resolve immediately if a + // notify_waiters() fired between our enable() and here. + let _ = tokio::time::timeout(timeout - elapsed, notified).await; + // Re-check at top of loop after wake / timeout. 
} } @@ -1212,7 +2091,11 @@ fn validate_host_port( Ok((host, port)) } -async fn handle_connect(state: &AppState, host: Option, port: Option) -> TunnelResponse { +async fn handle_connect( + state: &AppState, + host: Option, + port: Option, +) -> TunnelResponse { let (host, port) = match validate_host_port(host, port) { Ok(v) => v, Err(r) => return r, @@ -1228,7 +2111,19 @@ async fn handle_connect(state: &AppState, host: Option, port: Option {}:{}", sid, host, port); state.sessions.lock().await.insert(sid.clone(), session); - TunnelResponse { sid: Some(sid), d: None, pkts: None, eof: Some(false), e: None, code: None } + // Advertise pipelining support on the connect reply so the client + // knows it can opt into per-session `data`-op sequence numbers + // for this session. Old clients ignore the unknown `caps` field. + TunnelResponse { + sid: Some(sid), + d: None, + pkts: None, + eof: Some(false), + e: None, + code: None, + seq: None, + caps: Some(CAPS_PIPELINE_SEQ), + } } /// Open a session and write the client's first bytes in one round trip. @@ -1343,17 +2238,31 @@ async fn handle_connect_data_single( tracing::info!("session {} closed by remote", sid); } } + // Same caps advertisement as `handle_connect`. The single-op + // `connect_data` path is the optimistic-bundling fast path — + // surfacing capabilities here means a client opting into both + // optimizations learns about pipelining on the very first reply. TunnelResponse { sid: Some(sid), - d: if data.is_empty() { None } else { Some(B64.encode(&data)) }, + d: if data.is_empty() { + None + } else { + Some(B64.encode(&data)) + }, pkts: None, eof: Some(eof), e: None, code: None, + seq: None, + caps: Some(CAPS_PIPELINE_SEQ), } } -async fn handle_data_single(state: &AppState, sid: Option, data: Option) -> TunnelResponse { +async fn handle_data_single( + state: &AppState, + sid: Option, + data: Option, +) -> TunnelResponse { let sid = match sid { Some(s) if !s.is_empty() => s, _ => return TunnelResponse::error("missing sid"), @@ -1396,9 +2305,17 @@ async fn handle_data_single(state: &AppState, sid: Option, data: Option< } TunnelResponse { sid: Some(sid), - d: if data.is_empty() { None } else { Some(B64.encode(&data)) }, + d: if data.is_empty() { + None + } else { + Some(B64.encode(&data)) + }, pkts: None, - eof: Some(eof), e: None, code: None, + eof: Some(eof), + e: None, + code: None, + seq: None, + caps: None, } } @@ -1415,7 +2332,16 @@ async fn handle_close(state: &AppState, sid: Option) -> TunnelResponse { s.reader_handle.abort(); tracing::info!("udp session {} closed by client", sid); } - TunnelResponse { sid: Some(sid), d: None, pkts: None, eof: Some(true), e: None, code: None } + TunnelResponse { + sid: Some(sid), + d: None, + pkts: None, + eof: Some(true), + e: None, + code: None, + seq: None, + caps: None, + } } // --------------------------------------------------------------------------- @@ -1472,11 +2398,7 @@ async fn cleanup_task( } } if !stale.is_empty() { - tracing::info!( - "cleanup: reaped {}, {} active udp", - stale.len(), - map.len() - ); + tracing::info!("cleanup: reaped {}, {} active udp", stale.len(), map.len()); } } } @@ -1490,8 +2412,7 @@ async fn cleanup_task( async fn main() { tracing_subscriber::fmt() .with_env_filter( - tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| "info".into()), + tracing_subscriber::EnvFilter::try_from_default_env().unwrap_or_else(|_| "info".into()), ) .init(); @@ -1684,9 +2605,8 @@ mod tests { #[tokio::test] async fn connect_data_rejects_missing_host() { let 
state = fresh_state(); - let resp = handle_connect_data_single( - &state, None, Some(443), Some(B64.encode(b"x")), - ).await; + let resp = + handle_connect_data_single(&state, None, Some(443), Some(B64.encode(b"x"))).await; assert!(resp.e.as_deref().unwrap_or("").contains("missing host")); assert!(state.sessions.lock().await.is_empty()); } @@ -1736,6 +2656,8 @@ mod tests { eof: AtomicBool::new(false), last_active: Mutex::new(Instant::now()), notify: Notify::new(), + seq_state: Mutex::new(SeqState { expected: 0 }), + seq_advance: Notify::new(), }) } @@ -1771,11 +2693,7 @@ mod tests { // cap path does. let inner = fake_inner().await; // 1 MiB buffered, but caller only has 256 KiB budget left. - inner - .read_buf - .lock() - .await - .resize(1024 * 1024, 0xcd); + inner.read_buf.lock().await.resize(1024 * 1024, 0xcd); let (drained, eof) = drain_now(&inner, 256 * 1024).await; assert_eq!(drained.len(), 256 * 1024); @@ -1793,7 +2711,11 @@ mod tests { #[tokio::test] async fn drain_now_passes_through_when_under_cap() { let inner = fake_inner().await; - inner.read_buf.lock().await.extend_from_slice(b"hello world"); + inner + .read_buf + .lock() + .await + .extend_from_slice(b"hello world"); let (data, eof) = drain_now(&inner, usize::MAX).await; assert_eq!(data, b"hello world"); @@ -1826,7 +2748,11 @@ mod tests { #[tokio::test] async fn wait_for_any_drainable_returns_immediately_when_buffer_has_data() { let inner = fake_inner().await; - inner.read_buf.lock().await.extend_from_slice(b"already here"); + inner + .read_buf + .lock() + .await + .extend_from_slice(b"already here"); let t0 = Instant::now(); wait_for_any_drainable(&[inner], Duration::from_secs(5)).await; @@ -1987,6 +2913,8 @@ mod tests { eof: AtomicBool::new(false), last_active: Mutex::new(Instant::now()), notify: Notify::new(), + seq_state: Mutex::new(SeqState { expected: 0 }), + seq_advance: Notify::new(), }); let _reader_handle = tokio::spawn(reader_task(reader, inner.clone())); @@ -2060,7 +2988,9 @@ mod tests { let resp = handle_batch(State(state.clone()), Bytes::from(body)) .await .into_response(); - let body_bytes = axum::body::to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let body_bytes = axum::body::to_bytes(resp.into_body(), usize::MAX) + .await + .unwrap(); serde_json::from_slice(&body_bytes).unwrap() } @@ -2070,10 +3000,7 @@ mod tests { /// at ACTIVE_DRAIN_DEADLINE (350 ms) with no data. 
#[tokio::test] async fn batch_pure_poll_wakes_on_push() { - let push_port = start_push_server( - Duration::from_millis(150), - b"PUSHED".to_vec(), - ).await; + let push_port = start_push_server(Duration::from_millis(150), b"PUSHED".to_vec()).await; let state = fresh_state(); let connect_resp = handle_connect(&state, Some("127.0.0.1".into()), Some(push_port)).await; let sid = connect_resp.sid.expect("connect should succeed"); @@ -2081,7 +3008,8 @@ mod tests { let body = serde_json::to_vec(&serde_json::json!({ "k": "test-key", "ops": [{"op": "data", "sid": sid}], - })).unwrap(); + })) + .unwrap(); let t0 = Instant::now(); let resp = invoke_handle_batch(&state, body).await; @@ -2099,7 +3027,9 @@ mod tests { ); let r = resp["r"].as_array().expect("response must be an array"); - let d_b64 = r[0]["d"].as_str().expect("response should carry pushed bytes"); + let d_b64 = r[0]["d"] + .as_str() + .expect("response should carry pushed bytes"); let data = B64.decode(d_b64).unwrap(); assert_eq!(&data[..], b"PUSHED"); } @@ -2111,13 +3041,15 @@ mod tests { async fn batch_active_caps_at_active_deadline() { let silent_port = start_silent_server().await; let state = fresh_state(); - let connect_resp = handle_connect(&state, Some("127.0.0.1".into()), Some(silent_port)).await; + let connect_resp = + handle_connect(&state, Some("127.0.0.1".into()), Some(silent_port)).await; let sid = connect_resp.sid.expect("connect should succeed"); let body = serde_json::to_vec(&serde_json::json!({ "k": "test-key", "ops": [{"op": "data", "sid": sid, "d": B64.encode(b"PING")}], - })).unwrap(); + })) + .unwrap(); let t0 = Instant::now(); let _resp = invoke_handle_batch(&state, body).await; @@ -2146,10 +3078,7 @@ mod tests { /// long-poll actually engaged. #[tokio::test] async fn batch_empty_string_payload_engages_long_poll() { - let push_port = start_push_server( - Duration::from_millis(600), - b"DELAYED".to_vec(), - ).await; + let push_port = start_push_server(Duration::from_millis(600), b"DELAYED".to_vec()).await; let state = fresh_state(); let connect_resp = handle_connect(&state, Some("127.0.0.1".into()), Some(push_port)).await; let sid = connect_resp.sid.expect("connect should succeed"); @@ -2157,7 +3086,8 @@ mod tests { let body = serde_json::to_vec(&serde_json::json!({ "k": "test-key", "ops": [{"op": "data", "sid": sid, "d": ""}], - })).unwrap(); + })) + .unwrap(); let t0 = Instant::now(); let resp = invoke_handle_batch(&state, body).await; @@ -2175,7 +3105,8 @@ mod tests { ); let r = resp["r"].as_array().unwrap(); - let d_b64 = r[0]["d"].as_str() + let d_b64 = r[0]["d"] + .as_str() .expect("Some(\"\") payload should have engaged long-poll and delivered DELAYED"); let data = B64.decode(d_b64).unwrap(); assert_eq!(&data[..], b"DELAYED"); @@ -2238,8 +3169,17 @@ mod tests { } let drops = inner.queue_drops.load(Ordering::Relaxed); let queued = inner.packets.lock().await.len(); - assert!(drops >= 1, "expected ≥1 drop, got {} (queued={})", drops, queued); - assert!(queued <= UDP_QUEUE_LIMIT, "queue exceeded limit: {}", queued); + assert!( + drops >= 1, + "expected ≥1 drop, got {} (queued={})", + drops, + queued + ); + assert!( + queued <= UDP_QUEUE_LIMIT, + "queue exceeded limit: {}", + queued + ); } /// Regression for the bug the review caught: a batch mixing UDP and @@ -2466,11 +3406,7 @@ mod tests { // TCP session with bytes already buffered → immediately drainable. 
let tcp_inner = fake_inner().await; - tcp_inner - .read_buf - .lock() - .await - .extend_from_slice(b"ready"); + tcp_inner.read_buf.lock().await.extend_from_slice(b"ready"); let tcp_sid = "tcp-sid".to_string(); state.sessions.lock().await.insert( tcp_sid.clone(), @@ -2485,15 +3421,11 @@ mod tests { // succeeds; we just never send anything to it. let udp_target = UdpSocket::bind(("127.0.0.1", 0)).await.unwrap(); let udp_port = udp_target.local_addr().unwrap().port(); - let (udp_sid, _udp_inner) = handle_udp_open_phase1( - &state, - Some("127.0.0.1".into()), - Some(udp_port), - None, - ) - .await - .expect("udp open"); - + let (udp_sid, _udp_inner) = + handle_udp_open_phase1(&state, Some("127.0.0.1".into()), Some(udp_port), None) + .await + .expect("udp open"); + // Pure-poll batch (no `d` payload) → had_writes_or_connects = // false → deadline = LONGPOLL_DEADLINE (15 s). Under the // previous tokio::join! wait, the UDP arm would have held the @@ -2521,4 +3453,1175 @@ mod tests { elapsed, ); } + + /// Helper for the seq tests: bump `expected` from N to N+1 the + /// way `process_seq_data_op` does — under the same guard, with a + /// notify_waiters() afterward. Lets unit tests exercise the same + /// state machine without spinning up the full op handler. + async fn release_seq(inner: &SessionInner, mut guard: tokio::sync::MutexGuard<'_, SeqState>) { + guard.expected = guard.expected.saturating_add(1); + drop(guard); + inner.seq_advance.notify_waiters(); + } + + /// Sanity-check: a fresh session starts with `expected = 0`, + /// so seq=0's `wait_for_seq_turn` succeeds immediately. After + /// the simulated release, expected advances to 1. + #[tokio::test] + async fn wait_for_seq_turn_succeeds_when_in_order() { + let inner = fake_inner().await; + let guard = wait_for_seq_turn(&inner, 0, Duration::from_secs(2)) + .await + .expect("seq=0 should claim immediately on fresh session"); + release_seq(&inner, guard).await; + let state = inner.seq_state.lock().await; + assert_eq!(state.expected, 1); + } + + /// The headline correctness invariant: when seq=N+1 arrives at the + /// server before seq=N (different batches taking different paths), + /// the N+1 task waits inside `wait_for_seq_turn` until the N task + /// lands and bumps `expected`. Without this, the per-session + /// reorder buffer on the client would see a reply for seq=N+1 + /// containing bytes that should have followed seq=N's bytes — + /// silent data corruption. + #[tokio::test] + async fn wait_for_seq_turn_blocks_until_earlier_seq_lands() { + let inner = fake_inner().await; + + // Spawn a seq=2 claim — should block (expected is 0). + let inner_for_2 = inner.clone(); + let claim_2 = tokio::spawn(async move { + let g = wait_for_seq_turn(&inner_for_2, 2, Duration::from_secs(5)).await?; + release_seq(&inner_for_2, g).await; + Ok::<(), String>(()) + }); + + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + !claim_2.is_finished(), + "seq=2 must not proceed before seqs 0 and 1 land", + ); + + // Land seq=0. + let g0 = wait_for_seq_turn(&inner, 0, Duration::from_secs(2)) + .await + .expect("seq=0 should claim"); + release_seq(&inner, g0).await; + + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + !claim_2.is_finished(), + "seq=2 must not proceed with only seq=0 landed (expected==1)", + ); + + // Land seq=1. 
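+        // Releasing it bumps `expected` to 2, which is what lets the
+        // parked seq=2 claim proceed.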
+ let g1 = wait_for_seq_turn(&inner, 1, Duration::from_secs(2)) + .await + .expect("seq=1 should claim"); + release_seq(&inner, g1).await; + + let result = tokio::time::timeout(Duration::from_secs(2), claim_2) + .await + .expect("seq=2 must wake within timeout once seq=1 lands") + .expect("task should not panic"); + result.expect("seq=2 claim should succeed"); + } + + /// Late arrivals (seq < expected) — typically a duplicate retry + /// from the client — must return an error so the caller can echo + /// it as a "skipped" reply instead of double-writing the upstream. + #[tokio::test] + async fn wait_for_seq_turn_rejects_already_processed_seq() { + let inner = fake_inner().await; + inner.seq_state.lock().await.expected = 5; + let r = wait_for_seq_turn(&inner, 3, Duration::from_secs(1)).await; + assert!( + r.is_err(), + "seq=3 should be rejected when expected has already advanced to 5", + ); + } + + /// Hard timeout when an earlier seq simply never arrives (client + /// crashed mid-pipeline, batch dropped, etc.). Without this the + /// task would wait forever and the session would leak until the + /// idle reaper fires. + #[tokio::test] + async fn wait_for_seq_turn_times_out_when_earlier_seq_never_arrives() { + let inner = fake_inner().await; + let r = wait_for_seq_turn(&inner, 3, Duration::from_millis(100)).await; + assert!(r.is_err(), "seq=3 must time out when seqs 0..3 never land",); + } + + /// Two pipelined seq ops for the same session must not race for + /// `read_buf` — the seq lock has to stay held across the entire + /// (write, drain) sequence. Pre-buffer enough bytes that the + /// drain has multiple kernel-FIFO chunks to take, then run two + /// `process_seq_data_op` calls concurrently. The reply with the + /// lower seq must contain the earlier prefix of the buffered + /// bytes; if the lock granularity regressed, both replies can + /// take from the same buffer in arrival-of-final-lock order + /// rather than seq order. + #[tokio::test] + async fn process_seq_data_drains_in_seq_order_under_concurrency() { + let inner = fake_inner().await; + // Pre-load 200 bytes — 100 each so we can clearly attribute + // which prefix landed where if ordering breaks. + { + let mut buf = inner.read_buf.lock().await; + buf.extend_from_slice(&vec![0xAAu8; 100]); + buf.extend_from_slice(&vec![0xBBu8; 100]); + } + // Mark eof so seq=1's wait_for_any_drainable returns + // immediately on empty buffer instead of paying the full + // LONGPOLL_DEADLINE waiting for non-existent data. + inner.eof.store(true, Ordering::Release); + + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + + // Spawn seq=1 first — it should block waiting for seq=0. + let inner_1 = inner.clone(); + let budget_1 = budget.clone(); + let task_1 = tokio::spawn(async move { + process_seq_data_op( + inner_1, + "sid".into(), + 1, + None, + budget_1, + false, + BatchWait::new(Vec::new()), + ) + .await + }); + // Brief wait so seq=1 is definitely parked on its seq lock. + tokio::time::sleep(Duration::from_millis(50)).await; + + // Now run seq=0. It should drain everything currently in + // read_buf (the cap is generous), then seq=1 wakes and gets + // empty (or whatever the reader pushed in between, none here). 
+ let resp_0 = process_seq_data_op( + inner.clone(), + "sid".into(), + 0, + None, + budget, + false, + BatchWait::new(Vec::new()), + ) + .await; + let resp_1 = tokio::time::timeout(Duration::from_secs(2), task_1) + .await + .expect("seq=1 must complete after seq=0") + .expect("task should not panic"); + + assert_eq!(resp_0.seq, Some(0)); + assert_eq!(resp_1.seq, Some(1)); + // seq=0's reply must carry all the buffered bytes (200 bytes, + // well under TCP_DRAIN_MAX_BYTES). seq=1 gets empty. + let d0 = resp_0.d.as_deref().expect("seq=0 should have drained data"); + let d0_bytes = B64.decode(d0).unwrap(); + assert_eq!(d0_bytes.len(), 200); + // Verify byte order within seq=0's reply (kernel-FIFO). + assert!(d0_bytes[..100].iter().all(|&b| b == 0xAA)); + assert!(d0_bytes[100..].iter().all(|&b| b == 0xBB)); + assert!( + resp_1.d.as_deref().map(str::is_empty).unwrap_or(true), + "seq=1 must observe empty buffer after seq=0 drained — \ + non-empty here means the drains raced", + ); + } + + /// `wait_and_drain` empties the whole buffer; `process_seq_data_op` + /// must use `drain_now` (or its equivalent) so any tail beyond + /// the per-session / per-batch cap is left buffered for the next + /// op. Without this, oversized read_bufs silently drop bytes — a + /// data-loss regression on high-throughput pipelined sessions. + #[tokio::test] + async fn process_seq_data_preserves_tail_when_over_cap() { + let inner = fake_inner().await; + let oversized = TCP_DRAIN_MAX_BYTES + 4096; + inner.read_buf.lock().await.resize(oversized, 0xCD); + + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + let resp = process_seq_data_op( + inner.clone(), + "sid".into(), + 0, + None, + budget, + false, + BatchWait::new(Vec::new()), + ) + .await; + + // Reply should carry exactly TCP_DRAIN_MAX_BYTES of the prefix. + let d = resp.d.as_deref().expect("seq=0 should have data"); + let d_bytes = B64.decode(d).unwrap(); + assert_eq!(d_bytes.len(), TCP_DRAIN_MAX_BYTES); + // The 4 KiB tail must remain in the buffer for the next op. + let remaining = inner.read_buf.lock().await.len(); + assert_eq!( + remaining, 4096, + "tail beyond TCP_DRAIN_MAX_BYTES must stay in read_buf for the next \ + op — silently dropping it (the wait_and_drain regression) loses \ + bytes on high-throughput pipelined sessions", + ); + } + + /// `last_active` must be refreshed on EVERY pipelined op — including + /// empty polls — so the idle reaper doesn't close a session that's + /// being actively long-polled by the client. The legacy non-seq + /// `data` branch refreshes unconditionally; the seq path has to + /// match or long-lived idle / server-push sessions get reaped at + /// 300 s even while the client is still engaged. + #[tokio::test] + async fn process_seq_data_refreshes_last_active_on_empty_poll() { + let inner = fake_inner().await; + // Backdate last_active so we can detect a refresh. + let backdated = Instant::now() - Duration::from_secs(120); + *inner.last_active.lock().await = backdated; + + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + // Empty poll — no uplink. Use a tiny deadline so the test + // doesn't sit on LONGPOLL_DEADLINE waiting for non-existent data. + // (process_seq_data_op picks the deadline based on had_uplink, + // which is false here, so we'd actually pay LONGPOLL_DEADLINE + // = 15 s. Mark eof on the session so wait_for_any_drainable + // returns immediately.) 
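+        // In other words, the deadline itself stays LONGPOLL_DEADLINE;
+        // it's the eof mark (set just below), not a shorter deadline,
+        // that keeps this test fast.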
+ inner.eof.store(true, Ordering::Release); + let _resp = process_seq_data_op( + inner.clone(), + "sid".into(), + 0, + None, + budget, + false, + BatchWait::new(Vec::new()), + ) + .await; + + let after = *inner.last_active.lock().await; + assert!( + after > backdated, + "last_active must refresh on every seq op (was {:?}, now {:?})", + backdated, + after, + ); + } + + /// Two pipelined seq ops draining oversized buffers must share the + /// `BATCH_RESPONSE_BUDGET` so the combined response stays under + /// Apps Script's 50 MiB ceiling (#863). Without the shared budget, + /// each task takes up to TCP_DRAIN_MAX_BYTES (16 MiB), and 3+ + /// concurrent pipelined sessions would stack past the ceiling and + /// cause truncation / parse failures on the wire. + #[tokio::test] + async fn process_seq_data_shares_batch_response_budget() { + let inner_a = fake_inner().await; + let inner_b = fake_inner().await; + // Each session has a maximally oversized buffer. + inner_a + .read_buf + .lock() + .await + .resize(TCP_DRAIN_MAX_BYTES + 1024, 0xAA); + inner_b + .read_buf + .lock() + .await + .resize(TCP_DRAIN_MAX_BYTES + 1024, 0xBB); + + // Tight shared budget: well below 2 × TCP_DRAIN_MAX_BYTES. + // Forces the second drain to take less (or zero) so the + // combined response respects the cap. + let budget_cap = TCP_DRAIN_MAX_BYTES + 4096; + let budget = Arc::new(Mutex::new(budget_cap)); + + let task_a = tokio::spawn({ + let inner = inner_a.clone(); + let budget = budget.clone(); + async move { + process_seq_data_op( + inner, + "sid-a".into(), + 0, + None, + budget, + false, + BatchWait::new(Vec::new()), + ) + .await + } + }); + let task_b = tokio::spawn({ + let inner = inner_b.clone(); + let budget = budget.clone(); + async move { + process_seq_data_op( + inner, + "sid-b".into(), + 0, + None, + budget, + false, + BatchWait::new(Vec::new()), + ) + .await + } + }); + + let resp_a = tokio::time::timeout(Duration::from_secs(5), task_a) + .await + .unwrap() + .unwrap(); + let resp_b = tokio::time::timeout(Duration::from_secs(5), task_b) + .await + .unwrap() + .unwrap(); + + let len_a = resp_a + .d + .as_deref() + .map(|d| B64.decode(d).unwrap().len()) + .unwrap_or(0); + let len_b = resp_b + .d + .as_deref() + .map(|d| B64.decode(d).unwrap().len()) + .unwrap_or(0); + assert!( + len_a + len_b <= budget_cap, + "combined drained bytes ({} + {} = {}) must not exceed shared budget {}", + len_a, + len_b, + len_a + len_b, + budget_cap, + ); + } + + /// Regression: when batch budget is exhausted AND the + /// session has both buffered bytes AND eof, the seq response + /// must NOT report eof=true. Doing so would close the session + /// client-side and silently drop the buffered tail. Mirrors + /// `drain_now`'s "withhold EOF until tail is drained" semantics. + #[tokio::test] + async fn process_seq_data_withholds_eof_when_budget_exhausted_and_buffer_nonempty() { + let inner = fake_inner().await; + // Pre-buffer real bytes AND mark eof. + inner.read_buf.lock().await.extend_from_slice(b"tail-bytes"); + inner.eof.store(true, Ordering::Release); + + // Budget already at zero — simulates a sibling drain having + // consumed it before we got here. + let budget = Arc::new(Mutex::new(0usize)); + let resp = process_seq_data_op( + inner.clone(), + "sid".into(), + 0, + None, + budget, + true, + BatchWait::new(Vec::new()), + ) + .await; + + assert_eq!(resp.seq, Some(0)); + // No data drained (budget = 0). 
+ assert!( + resp.d.as_deref().map(str::is_empty).unwrap_or(true), + "no data should be returned when budget is 0", + ); + // CRITICAL: eof must NOT be true while bytes remain in the + // buffer. Reporting eof=true here is the data-loss bug — + // the client closes the session and drops the buffered tail. + assert_ne!( + resp.eof, + Some(true), + "eof must be withheld while read_buf still has bytes; \ + reporting it now would silently drop the buffered tail \ + on the client side", + ); + // The tail is still in the buffer for the next op. + assert_eq!(inner.read_buf.lock().await.as_slice(), b"tail-bytes"); + } + + /// Regression: every exit path of `process_seq_data_op` + /// past `wait_for_seq_turn` must bump `expected` and notify + /// waiters — including write/flush failures, where the function + /// early-returns with an error response. The `SeqAdvanceOnDrop` + /// guard delivers this; without it, an upstream write failure + /// would strand later seqs behind a never-bumped `expected` + /// until each hit `SEQ_WAIT_TIMEOUT` (~30 s), pushing batches + /// into client / Apps Script timeout territory. + /// + /// We test the guard directly rather than through + /// `process_seq_data_op` because writing to a closed peer or a + /// dropped duplex doesn't reliably error on every platform — + /// the kernel often buffers or blocks instead. The guard's + /// contract is what every error path in `process_seq_data_op` + /// relies on, so testing it in isolation is both stronger and + /// faster. + #[tokio::test] + async fn seq_advance_on_drop_bumps_expected_and_notifies_waiters() { + let inner = fake_inner().await; + assert_eq!(inner.seq_state.lock().await.expected, 0); + + // Park a seq=1 waiter (expected is 0 → it has to wait). + let inner_for_waiter = inner.clone(); + let waiter = tokio::spawn(async move { + wait_for_seq_turn(&inner_for_waiter, 1, Duration::from_secs(2)) + .await + .map(|g| { + drop(g); + }) + }); + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + !waiter.is_finished(), + "seq=1 waiter must park while expected=0", + ); + + // Acquire the seq lock and drop a guard immediately — + // simulates the early-return path of process_seq_data_op + // after wait_for_seq_turn succeeds (e.g. write/flush fails + // before reaching the success-path drop at function end). + { + let state = inner.seq_state.lock().await; + let _guard = SeqAdvanceOnDrop { + state: Some(state), + notify: &inner.seq_advance, + }; + // _guard drops here, bumping expected from 0 → 1 and + // calling notify_waiters() on seq_advance. + } + + // expected must be advanced. + assert_eq!( + inner.seq_state.lock().await.expected, + 1, + "guard's Drop must bump `expected` even on the error-return paths", + ); + + // The parked waiter must wake (because notify_waiters fired) + // AND succeed (because expected now matches its seq=1). + // Without notify_waiters in the guard's Drop, the waiter + // would sleep until SEQ_WAIT_TIMEOUT. 
+ let result = tokio::time::timeout(Duration::from_secs(1), waiter) + .await + .expect( + "waiter must wake when guard drops — without \ + notify_waiters() in SeqAdvanceOnDrop's Drop, the \ + waiter sleeps until SEQ_WAIT_TIMEOUT (~30 s)", + ) + .expect("task should not panic"); + assert!(result.is_ok(), "seq=1 should have claimed"); + } + + /// Regression: `wait_for_seq_turn` must register its + /// `Notified` future BEFORE locking and checking `seq_state`, + /// otherwise an advance + notify_waiters that fires between + /// drop(state) and notified() is lost (notify_waiters does not + /// save permits for unregistered waiters), and the waiter + /// sleeps until SEQ_WAIT_TIMEOUT. + /// + /// Reproducing the exact race deterministically requires + /// scheduler-level control we don't have, but we can verify + /// the post-fix property: a notify that fires *while* the + /// waiter is between its lock-and-check and its await still + /// wakes it. We approximate by spawning the waiter, briefly + /// pausing so it reaches the wait, then bumping + notifying; + /// the waiter must complete in well under SEQ_WAIT_TIMEOUT. + #[tokio::test] + async fn wait_for_seq_turn_does_not_miss_notify_after_lock_release() { + let inner = fake_inner().await; + + let inner_clone = inner.clone(); + let waiter = tokio::spawn(async move { + wait_for_seq_turn(&inner_clone, 5, Duration::from_secs(10)) + .await + .map(|g| { + drop(g); + }) + }); + + // Hammer many advance + notify cycles in a tight loop. With + // the missed-notify race, an advance whose notify falls in + // the lock-release-to-await gap would be lost, leaving the + // waiter parked. The pre-enabled Notified guarantees we + // wake on any of these. + for target in 1u64..=5 { + tokio::time::sleep(Duration::from_millis(10)).await; + { + let mut state = inner.seq_state.lock().await; + state.expected = target; + } + inner.seq_advance.notify_waiters(); + } + + let result = tokio::time::timeout(Duration::from_secs(2), waiter) + .await + .expect( + "seq=5 waiter must observe one of the advances; if it \ + doesn't, notify_waiters fired in the lock-release-to- \ + await gap and was lost (the missed-notify race)", + ) + .expect("task should not panic"); + assert!(result.is_ok()); + } + + /// Regression: an idle empty seq poll for session B + /// must NOT hold the batch open behind a sibling session A whose + /// bytes are already buffered. The shared `wait_set` makes A's + /// bytes wake B's wait — without it, B paid `LONGPOLL_DEADLINE` + /// (≈ 15 s) because its own inner stayed empty. + #[tokio::test] + async fn process_seq_data_wakes_when_other_session_in_wait_set_has_bytes() { + let inner_a = fake_inner().await; + let inner_b = fake_inner().await; + + // Session A has bytes ready; session B is idle. + inner_a.read_buf.lock().await.extend_from_slice(b"a-bytes"); + + let batch_wait = BatchWait::new(vec![inner_a.clone(), inner_b.clone()]); + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + + // B's seq op uses had_active_in_batch=false and had_uplink=false, + // so deadline = LONGPOLL_DEADLINE (15 s). Without the shared + // BatchWait, B would sit the full 15 s. With it, A's bytes + // immediately wake B's wait via `notify_waiters()` — and B + // drains its own (empty) buffer in milliseconds. 
+ let t0 = Instant::now(); + let resp_b = process_seq_data_op( + inner_b.clone(), + "sid-b".into(), + 0, + None, + budget, + false, // had_active_in_batch + batch_wait, + ) + .await; + let elapsed = t0.elapsed(); + + assert_eq!(resp_b.seq, Some(0)); + // B's reply must be empty (it has no bytes of its own). + assert!(resp_b.d.as_deref().map(str::is_empty).unwrap_or(true)); + // Must have returned in well under LONGPOLL_DEADLINE — give + // a generous 2 s ceiling for CI jitter. + assert!( + elapsed < Duration::from_secs(2), + "idle seq poll must wake on shared wait set when sibling \ + session has bytes; elapsed={:?} (LONGPOLL_DEADLINE={:?})", + elapsed, + LONGPOLL_DEADLINE, + ); + } + + /// Regression: batches containing seq'd ops must use + /// the pipelined timeout floor on the client side, otherwise + /// `Config::request_timeout_secs` (default 30 s) fires before + /// the server-side worst case (`SEQ_WAIT_TIMEOUT 30s + + /// LONGPOLL_DEADLINE 15s`) and pipelined sessions disconnect + /// where legacy sessions survived. We can't easily test the + /// effective applied timeout end-to-end, but we can assert the + /// floor is at least the server's worst-case wait so the budget + /// stays consistent. + #[tokio::test] + async fn pipelined_timeout_floor_exceeds_server_worst_case_wait() { + // The server's worst-case wait per seq op is SEQ_WAIT_TIMEOUT + // (waiting for an earlier seq) + LONGPOLL_DEADLINE (waiting + // for upstream data inside that op). The client's pipelined + // batch timeout floor must exceed this so a valid slow + // server response doesn't fire a client-side "batch timed + // out" + deployment timeout strike. + let server_worst_case = SEQ_WAIT_TIMEOUT + LONGPOLL_DEADLINE; + // PIPELINED_BATCH_TIMEOUT_FLOOR is a client-side constant; + // we duplicate the value here to keep this test in the same + // crate. If either side is bumped, this test will fail + // until the client's floor follows. + let pipelined_floor = Duration::from_secs(60); + assert!( + pipelined_floor > server_worst_case, + "pipelined batch timeout floor ({:?}) must exceed server \ + worst-case wait per seq op ({:?} = SEQ_WAIT_TIMEOUT + \ + LONGPOLL_DEADLINE), otherwise a valid slow batch fires \ + a client timeout + deployment strike", + pipelined_floor, + server_worst_case, + ); + } + + /// Regression: when M seq jobs all park on the shared + /// `BatchWait`, a SINGLE push must wake every one of them — + /// not just one. The watcher self-fan-out via + /// `wake.notify_waiters()` is what makes this work; without + /// it, parked jobs would each compete for one wake and the + /// rest would sit until `LONGPOLL_DEADLINE`. + #[tokio::test] + async fn batch_wait_wakes_all_jobs_on_single_push() { + let inner = fake_inner().await; + let batch_wait = BatchWait::new(vec![inner.clone()]); + + // Park 5 concurrent waiters with a generous deadline. They + // must all see the wake from a single push. + let mut tasks = Vec::new(); + for _ in 0..5 { + let bw = batch_wait.clone(); + tasks.push(tokio::spawn(async move { + let t0 = Instant::now(); + bw.wait(LONGPOLL_DEADLINE).await; + t0.elapsed() + })); + } + + // Give all 5 tasks time to park inside `wait` (synchronous + // is_any_drainable check returns false because read_buf is + // empty; they all reach the timeout-wrapped `notified.await`). 
+ tokio::time::sleep(Duration::from_millis(50)).await; + for task in &tasks { + assert!(!task.is_finished(), "all 5 waiters should be parked"); + } + + // Single push to inner.read_buf, single notify_waiters() + // (matching production `reader_task`). BatchWait's watcher + // wakes, sees drainable state, and fans out via + // `wake.notify_waiters()` — all 5 parked waiters wake. + inner.read_buf.lock().await.extend_from_slice(b"go"); + inner.notify.notify_waiters(); + + // Each waiter must complete in well under LONGPOLL_DEADLINE. + for (i, task) in tasks.into_iter().enumerate() { + let elapsed = tokio::time::timeout(Duration::from_secs(2), task) + .await + .unwrap_or_else(|_| panic!( + "waiter {} did not wake within 2s after single push — \ + wake-fan-out regression: only one of N parked jobs woke", + i + )) + .expect("task should not panic"); + assert!( + elapsed < Duration::from_secs(2), + "waiter {} took {:?} (must be << LONGPOLL_DEADLINE = {:?})", + i, + elapsed, + LONGPOLL_DEADLINE, + ); + } + } + + /// `BatchWait` must dedupe inners by Arc pointer — the same + /// session can appear in a batch via multiple paths (e.g. a + /// `connect_data` plus a seq `data` op for the same sid). + /// Spawning two watchers for it would double-fan-out the wake + /// signal and inflate task spawn count linearly with op + /// duplication. The dedup also has to apply to STORED inners + /// (not just watcher count), so `is_any_drainable` doesn't + /// re-lock the same session's read_buf once per duplicate + /// occurrence. + #[tokio::test] + async fn batch_wait_deduplicates_watchers_per_inner() { + let inner = fake_inner().await; + // Same Arc appearing 4 times in the input. + let bw = BatchWait::new(vec![ + inner.clone(), + inner.clone(), + inner.clone(), + inner.clone(), + ]); + assert_eq!( + bw._watchers.len(), + 1, + "BatchWait must spawn one watcher per UNIQUE inner; \ + 4 dup inputs spawned {} watchers", + bw._watchers.len(), + ); + assert_eq!( + bw.inners.len(), + 1, + "BatchWait::inners must be deduped too — otherwise \ + `is_any_drainable` re-locks the same Mutex per duplicate; \ + 4 dup inputs stored {} inners", + bw.inners.len(), + ); + } + + /// Critical regression: with pipelining, seq=N and seq=N+1 for + /// the same session can arrive in DIFFERENT batches. Each batch + /// builds its own `BatchWait`, each with its own watcher on the + /// session's `inner.notify`. A single `notify_one()` only wakes + /// one of those watchers — the other batch's watcher would sit + /// until `LONGPOLL_DEADLINE` even though the data is right + /// there. The fix: `reader_task` now uses `notify_waiters()`, + /// so every parked watcher across every batch wakes on each + /// push. Watchers `enable()` their `Notified` before the + /// synchronous state check to handle the no-permits edge of + /// `notify_waiters()`. + #[tokio::test] + async fn batch_wait_wakes_across_concurrent_batches_on_same_session() { + let inner = fake_inner().await; + + // Two BatchWaits — simulating two concurrent batches that + // both contain ops for the same session. + let bw_a = BatchWait::new(vec![inner.clone()]); + let bw_b = BatchWait::new(vec![inner.clone()]); + + // Park one waiter on each. Without the broadcast fix, only + // ONE of them would wake on a single push. 
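+        // (The un-woken waiter would sit out its LONGPOLL_DEADLINE; the
+        // 2 s timeout wrapped around its join below is what would fail
+        // the test first.)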
+ let bw_a_clone = bw_a.clone(); + let waiter_a = tokio::spawn(async move { + let t0 = Instant::now(); + bw_a_clone.wait(LONGPOLL_DEADLINE).await; + t0.elapsed() + }); + let bw_b_clone = bw_b.clone(); + let waiter_b = tokio::spawn(async move { + let t0 = Instant::now(); + bw_b_clone.wait(LONGPOLL_DEADLINE).await; + t0.elapsed() + }); + + // Give both waiters time to park inside their respective + // BatchWait::wait calls. + tokio::time::sleep(Duration::from_millis(50)).await; + assert!(!waiter_a.is_finished(), "batch A waiter should park"); + assert!(!waiter_b.is_finished(), "batch B waiter should park"); + + // One push, one notify — must wake BOTH batches' watchers. + inner.read_buf.lock().await.extend_from_slice(b"go"); + inner.notify.notify_waiters(); + + let elapsed_a = tokio::time::timeout(Duration::from_secs(2), waiter_a) + .await + .expect("batch A waiter must wake within 2s") + .expect("task A should not panic"); + let elapsed_b = tokio::time::timeout(Duration::from_secs(2), waiter_b) + .await + .expect( + "batch B waiter must ALSO wake within 2s — under \ + `notify_one()` only one of A/B woke and the other \ + sat for LONGPOLL_DEADLINE (~15s)", + ) + .expect("task B should not panic"); + + assert!( + elapsed_a < Duration::from_secs(2), + "batch A waiter took {:?} (must be << LONGPOLL_DEADLINE)", + elapsed_a, + ); + assert!( + elapsed_b < Duration::from_secs(2), + "batch B waiter took {:?} (must be << LONGPOLL_DEADLINE)", + elapsed_b, + ); + } + + /// Pin the documented latency trade-off for `SEQ_WAIT_TIMEOUT`: + /// when one seq op in a batch is stuck waiting for a missing + /// earlier seq, an UNRELATED session's seq op in the same batch + /// completes its own work in milliseconds, but its result sits + /// in `seq_data_jobs` until the stuck job times out — because + /// `handle_batch`'s `tokio::join!` waits for every seq job + /// before returning the response. + /// + /// This test asserts the upper bound on that latency: bounded + /// above by `SEQ_WAIT_TIMEOUT` (the stuck job's max wait), and + /// the unrelated session's per-job processing time is ≪ that. + /// Any future change to either constant — or to a design that + /// allows partial batch responses — should fail this test + /// intentionally so reviewers can confirm the new behavior. + #[tokio::test] + async fn unrelated_seq_session_in_same_batch_is_not_delayed_past_seq_wait() { + // Two independent sessions A and B. A's seq=2 will wait + // (expected=0, no seq=0/1 ever arrives — the + // "lost earlier seq" failure mode). B's seq=0 on a fresh + // session can claim immediately. + let inner_a = fake_inner().await; + let inner_b = fake_inner().await; + // B has data ready and eof set — its drain wait + // short-circuits, so B's job processing time is the + // intrinsic per-op cost (decode + lock + drain). + inner_b.read_buf.lock().await.extend_from_slice(b"b-bytes"); + inner_b.eof.store(true, Ordering::Release); + + let bw = BatchWait::new(vec![inner_a.clone(), inner_b.clone()]); + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + + // Spawn A first so it parks on `seq_state` waiting for + // earlier seqs that never arrive. With the production + // SEQ_WAIT_TIMEOUT this would sit 30 s; for the test we + // just need it to remain pending while B runs. + let inner_a_clone = inner_a.clone(); + let bw_a = bw.clone(); + let budget_a = budget.clone(); + let task_a = tokio::spawn(async move { + // Use seq=2 so wait_for_seq_turn sees seq > expected (0) + // and parks on `seq_advance`. 
+ process_seq_data_op( + inner_a_clone, + "sid-a".into(), + 2, + None, + budget_a, + false, + bw_a, + ) + .await + }); + + // Brief wait so A is definitely parked inside its + // wait_for_seq_turn loop. + tokio::time::sleep(Duration::from_millis(50)).await; + assert!( + !task_a.is_finished(), + "A should be parked waiting for earlier seqs", + ); + + // Now run B. Its INTRINSIC processing time is what we + // pin: even with A blocking on seq_state, B has nothing + // to wait on (different session, fresh seq state, eof + // short-circuits its drain). Its latency must be ≪ + // SEQ_WAIT_TIMEOUT. + let t0 = Instant::now(); + let resp_b = process_seq_data_op( + inner_b.clone(), + "sid-b".into(), + 0, + None, + budget, + false, + bw, + ) + .await; + let elapsed_b = t0.elapsed(); + assert_eq!(resp_b.seq, Some(0)); + + assert!( + elapsed_b < Duration::from_secs(2), + "session B's seq job took {:?} while A was stuck — \ + unrelated sessions must finish in well under \ + SEQ_WAIT_TIMEOUT ({:?}). A regression here means a \ + stuck seq from one session is now blocking unrelated \ + sessions' INTRINSIC processing, not just the batch \ + response join (which is the documented trade-off).", + elapsed_b, + SEQ_WAIT_TIMEOUT, + ); + + // Cancel A so the test doesn't sit SEQ_WAIT_TIMEOUT. + task_a.abort(); + } + + /// Critical regression: a batch with `data(seq=N) + close(same sid)` + /// must run the seq op's (write, drain) BEFORE the close tears + /// down the session. Running close inline during dispatch (the + /// pre-fix shape) removed the session from `state.sessions` + /// and aborted `reader_task` while the deferred seq job was + /// still about to run on its cloned `Arc` — + /// uplink writes raced the upstream socket's read-half being + /// dropped and downlink drains saw zero bytes. The fix: + /// `pending_closes` collects close ops during dispatch and + /// runs them AFTER `seq_data_jobs` complete. + #[tokio::test] + async fn batch_data_seq_then_close_runs_data_first() { + use axum::body::Bytes; + use axum::extract::State; + + // Set up an upstream that captures everything written to it + // so we can verify the seq op's bytes arrived before close + // dropped the connection. + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let upstream_port = listener.local_addr().unwrap().port(); + let received_uplink = Arc::new(Mutex::new(Vec::::new())); + let received_clone = received_uplink.clone(); + tokio::spawn(async move { + if let Ok((mut sock, _)) = listener.accept().await { + let mut buf = vec![0u8; 4096]; + loop { + match sock.read(&mut buf).await { + Ok(0) | Err(_) => break, + Ok(n) => { + received_clone + .lock() + .await + .extend_from_slice(&buf[..n]); + } + } + } + } + }); + + let state = fresh_state(); + + // Open a session via connect (legacy non-seq path so we + // get a sid the batch can target). + let connect_resp = handle_connect( + &state, + Some("127.0.0.1".into()), + Some(upstream_port), + ) + .await; + let sid = match connect_resp { + TunnelResponse { + sid: Some(s), + e: None, + .. + } => s, + other => panic!("connect failed: {:?}", other), + }; + + // Batch: data(seq=0) carrying "PAYLOAD" + close(same sid). + // The dispatch order in the request determines the + // intended effect order — seq=0's write must reach + // upstream BEFORE close tears down the session. 
+ let body = serde_json::json!({ + "k": "test-key", + "ops": [ + {"op": "data", "sid": sid.clone(), "seq": 0, "d": B64.encode(b"PAYLOAD")}, + {"op": "close", "sid": sid.clone()} + ] + }); + let body_bytes = serde_json::to_vec(&body).unwrap(); + let _resp = handle_batch(State(state.clone()), Bytes::from(body_bytes)) + .await + .into_response(); + + // Give the upstream reader time to drain whatever was + // written before the close shut things down. + tokio::time::sleep(Duration::from_millis(100)).await; + + let received = received_uplink.lock().await.clone(); + assert_eq!( + received, + b"PAYLOAD", + "seq=0's uplink bytes must reach upstream before close \ + tears the session down — running close inline (pre-fix) \ + would race the seq job and could lose these bytes", + ); + + // The session should be gone after close. + assert!( + !state.sessions.lock().await.contains_key(&sid), + "close must have removed the session from the state map", + ); + } + + /// Important regression: an ACTIVE seq op (one with uplink + /// bytes) must wait for ITS OWN session's response, not get + /// short-circuited by a sibling session in the same batch + /// already having buffered bytes. Subscribing to the shared + /// `BatchWait` for active ops is wrong — a sibling's bytes + /// would wake us instantly and we'd return empty for an op + /// that just hadn't gotten its own reply yet, doubling the + /// number of round-trips for the same logical exchange. + #[tokio::test] + async fn active_seq_op_waits_for_own_response_not_sibling_bytes() { + let inner_active = fake_inner().await; + let inner_sibling = fake_inner().await; + // Sibling already has bytes ready — the shared wait would + // wake on this immediately if we (incorrectly) subscribed. + inner_sibling + .read_buf + .lock() + .await + .extend_from_slice(b"sibling-bytes"); + // No bytes for the active op yet — its own response would + // come from upstream within ACTIVE_DRAIN_DEADLINE. + // Mark eof to short-circuit the active op's settle so the + // test doesn't actually sit ACTIVE_DRAIN_DEADLINE. + inner_active.eof.store(true, Ordering::Release); + + let bw = BatchWait::new(vec![inner_active.clone(), inner_sibling.clone()]); + let budget = Arc::new(Mutex::new(BATCH_RESPONSE_BUDGET)); + + let t0 = Instant::now(); + // Active op: had_uplink = true (we pass non-empty data). + // The op's drain phase must NOT short-circuit on the + // sibling's bytes — its own buffer is empty + eof, so + // it'll drain that and return. + let resp = process_seq_data_op( + inner_active.clone(), + "sid-active".into(), + 0, + Some(B64.encode(b"uplink")), + budget, + true, // had_active_in_batch + bw, + ) + .await; + let elapsed = t0.elapsed(); + + assert_eq!(resp.seq, Some(0)); + // The active op's drain should reflect its OWN buffer + // state (empty + eof), not the sibling's. If the active + // path mistakenly subscribed to batch_wait, the sibling's + // bytes would wake it and the same `inner_active.read_buf` + // would still be empty — so this test wouldn't catch the + // bug just from the bytes returned. What it DOES catch: + // the active op should not be sleeping on the shared wake; + // its time bound is its own per-session wait + settle. 
+ assert!( + elapsed < ACTIVE_DRAIN_DEADLINE * 2, + "active op took {:?}; bound is per-session ACTIVE \ + window ({:?}) plus settle, not LONGPOLL", + elapsed, + ACTIVE_DRAIN_DEADLINE, + ); + } + + /// Regression: a batch ordered as `[close(sid), data(sid)]` + /// must run the close FIRST and the subsequent data must + /// observe a closed session (eof_response). Unconditionally + /// deferring close to after seq jobs would let the data write + /// succeed against the still-open session even though the + /// client asked us to close first — a protocol-ordering + /// regression. Close is now deferred ONLY if there's an + /// earlier same-sid op already deferred. + #[tokio::test] + async fn batch_close_then_data_processes_close_first() { + use axum::body::Bytes; + use axum::extract::State; + + // Upstream that captures whatever bytes we write. + let listener = TcpListener::bind(("127.0.0.1", 0)).await.unwrap(); + let upstream_port = listener.local_addr().unwrap().port(); + let received_uplink = Arc::new(Mutex::new(Vec::::new())); + let received_clone = received_uplink.clone(); + tokio::spawn(async move { + if let Ok((mut sock, _)) = listener.accept().await { + let mut buf = vec![0u8; 4096]; + loop { + match sock.read(&mut buf).await { + Ok(0) | Err(_) => break, + Ok(n) => { + received_clone + .lock() + .await + .extend_from_slice(&buf[..n]); + } + } + } + } + }); + + let state = fresh_state(); + let connect_resp = handle_connect( + &state, + Some("127.0.0.1".into()), + Some(upstream_port), + ) + .await; + let sid = match connect_resp { + TunnelResponse { + sid: Some(s), + e: None, + .. + } => s, + other => panic!("connect failed: {:?}", other), + }; + + // Batch: close FIRST, then data(seq=0). + let body = serde_json::json!({ + "k": "test-key", + "ops": [ + {"op": "close", "sid": sid.clone()}, + {"op": "data", "sid": sid.clone(), "seq": 0, "d": B64.encode(b"AFTER-CLOSE")} + ] + }); + let body_bytes = serde_json::to_vec(&body).unwrap(); + let _resp = handle_batch(State(state.clone()), Bytes::from(body_bytes)) + .await + .into_response(); + + // Give the upstream reader a moment. + tokio::time::sleep(Duration::from_millis(100)).await; + + // The session was closed BEFORE the data op got to run, so + // upstream should NOT have received the post-close bytes. + let received = received_uplink.lock().await.clone(); + assert!( + received.is_empty() + || !received.windows(b"AFTER-CLOSE".len()).any(|w| w == b"AFTER-CLOSE"), + "close-first ordering was lost: post-close bytes leaked to \ + upstream (got {:?}). With unconditional close-deferral the \ + data op writes 'AFTER-CLOSE' against a session the client \ + told us to close first.", + String::from_utf8_lossy(&received), + ); + } + + /// Regression: Phase 2's drain loop must reserve at most + /// `TCP_DRAIN_MAX_BYTES` from the shared `response_budget`, + /// not the full remaining budget. The previous shape set + /// `*budget = 0` upfront, drained, then refunded — concurrent + /// seq jobs running in the same batch could observe the 0 + /// state and return empty responses for an op that would have + /// fit comfortably in the remaining budget. + #[tokio::test] + async fn phase_2_budget_reservation_caps_at_tcp_drain_max_bytes() { + // Synthetic scenario: a single legacy tcp_drain entry on + // a session with a small buffer. Budget is set to a + // generous value (well above TCP_DRAIN_MAX_BYTES). Run + // the drain in a way that exposes the per-iteration + // reservation. 
+ // + // We can't easily run the Phase 2 closure from a unit + // test (it's defined inline in `handle_batch`), so we + // verify the reservation invariant directly: the seq + // path's `process_seq_data_op` and Phase 2 should both + // cap reservations at TCP_DRAIN_MAX_BYTES. The seq path + // is already covered by `process_seq_data_shares_batch_response_budget`; + // here we assert the same property on a different shape: + // a budget large enough that the Phase 2 drain leaves + // headroom equal to (budget − drained) when its drain + // takes less than `TCP_DRAIN_MAX_BYTES`. + let inner = fake_inner().await; + let bytes_to_drain = 4096usize; + inner + .read_buf + .lock() + .await + .extend_from_slice(&vec![0xAB; bytes_to_drain]); + + let initial_budget = TCP_DRAIN_MAX_BYTES * 4; + let budget = Arc::new(Mutex::new(initial_budget)); + + // Mirror the production reservation pattern. + let take = { + let mut b = budget.lock().await; + let take = (*b).min(TCP_DRAIN_MAX_BYTES); + *b = b.saturating_sub(take); + take + }; + assert_eq!( + take, TCP_DRAIN_MAX_BYTES, + "reservation must be capped at TCP_DRAIN_MAX_BYTES \ + (got {} from initial_budget {})", + take, initial_budget, + ); + // The remaining budget visible to a concurrent seq job + // must still be the full initial_budget − TCP_DRAIN_MAX_BYTES, + // not 0. Pre-fix the budget would have been 0 here. + let remaining_visible = *budget.lock().await; + assert_eq!( + remaining_visible, + initial_budget - TCP_DRAIN_MAX_BYTES, + "Phase 2's reservation must leave headroom for \ + concurrent seq jobs; under the pre-fix shape this \ + would be 0", + ); + + let (data, _eof) = drain_now(&inner, take).await; + if data.len() < take { + *budget.lock().await += take - data.len(); + } + // After refund, the full initial_budget minus actually- + // drained bytes is available again. + assert_eq!( + *budget.lock().await, + initial_budget - bytes_to_drain, + "after refund of unused reservation, budget should be \ + initial − drained", + ); + } }