stablekernel · joshua-temple · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/.github/workflows/build-cli.yaml b/.github/workflows/build-cli.yaml
@@ -64,6 +64,21 @@ jobs:
           cache: true
           cache-dependency-path: e2e/go.sum
 
+      - name: Widen Docker network address pool
+        # Each scenario creates its own docker network, and the daemon's
+        # default address pool is small. Even with synchronous per-scenario
+        # network teardown, brief cleanup lag under serial load can leave the
+        # pool short and fail a later scenario at setup with "all predefined
+        # address pools have been fully subnetted". Carving /24 subnets out of
+        # a 10.99.0.0/16 base yields 256 networks of headroom, far more than the
+        # few in flight at once. NOTE: tee overwrites any existing daemon.json.
+        run: |
+          sudo mkdir -p /etc/docker
+          echo '{"default-address-pools":[{"base":"10.99.0.0/16","size":24}]}' | sudo tee /etc/docker/daemon.json
+          sudo systemctl restart docker
+          # Wait for the daemon to come back before the tests start.
+          timeout 60 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'
+
       - name: Run E2E tests
         working-directory: e2e
         # Run scenarios serially (-parallel 1). The 4-core / ~7.9GB runner is

diff --git a/e2e/harness/harness.go b/e2e/harness/harness.go
@@ -858,7 +858,18 @@ func (h *Harness) getProjectRoot() (string, error) {
 	}
 }
 
-// Cleanup terminates all containers
+// Cleanup terminates all containers and removes the scenario's docker network.
+//
+// Network removal is synchronous and verified, not fire-and-forget: each
+// scenario's network takes a subnet (a /16, or a /24 under the CI daemon
+// config) from the daemon's address pool, and across the serial scenario suite
+// leaked networks drain that pool until a late scenario cannot allocate one
+// and dies at setup ("all predefined address pools have been fully
+// subnetted"). The retry layer defers Cleanup per attempt, so every attempt -
+// including a failed one - releases its network here. Because the act and
+// gitea containers detach from the network during their own teardown, Remove
+// can briefly observe an "active endpoints" error; a short bounded retry lets
+// that settle so the network count returns to baseline instead of growing.
 func (h *Harness) Cleanup() {
 	ctx := context.Background()
 	if h.act != nil {
@@ -868,6 +879,52 @@ func (h *Harness) Cleanup() {
 		_ = h.gitea.Terminate(ctx)
 	}
 	if h.network != nil {
-		_ = h.network.Remove(ctx)
+		h.removeNetwork(ctx)
+	}
+}
+
+// removeNetwork removes the scenario network synchronously, making up to five
+// attempts, and logs (rather than swallows) a terminal failure so a genuine
+// leak is visible in the test output. It never fails the test itself.
+//
+// act runs each job by spawning a NESTED container over the docker socket and
+// attaching it to this network. Those job containers are act's children, not
+// testcontainers-managed, so terminating the act runner does not reap them;
+// one can outlive the runner and hold the network open, failing Remove with an
+// "active endpoints" error. The first attempt is a plain Remove; before every
+// retry we force-remove any container still attached to the network, after a
+// 500ms pause that lets the detach settle. This keeps the network count flat
+// across the suite instead of leaking one network per lingering job container.
+func (h *Harness) removeNetwork(ctx context.Context) {
+	const attempts = 5
+	var err error
+	for i := 0; i < attempts; i++ {
+		if i > 0 {
+			h.disconnectNetworkContainers(ctx) // retries only: evict lingering containers first
+		}
+		if err = h.network.Remove(ctx); err == nil {
+			return
+		}
+		time.Sleep(500 * time.Millisecond) // NOTE(review): also sleeps after the final failed attempt
+	}
+	if h.t != nil { // only log when a test handle is present
+		h.t.Logf("warning: failed to remove docker network %q after %d attempts: %v",
+			h.networkName, attempts, err)
+	}
+}
+
+// disconnectNetworkContainers force-removes (docker rm -f), not merely
+// disconnects, every container that docker network inspect reports as still
+// attached to the scenario network. It targets act's nested job containers,
+// which are not testcontainers-managed and so survive the act runner's
+// termination. Best effort: inspect and rm errors are ignored, never returned.
+func (h *Harness) disconnectNetworkContainers(ctx context.Context) {
+	out, err := exec.CommandContext(ctx, "docker", "network", "inspect",
+		"--format", "{{range .Containers}}{{.Name}} {{end}}", h.networkName).Output()
+	if err != nil {
+		return // inspect failed (e.g. network already gone): nothing to evict
+	}
+	for _, name := range strings.Fields(string(out)) { // template emits space-separated names
+		_ = exec.CommandContext(ctx, "docker", "rm", "-f", name).Run()
+	}
+}
diff --git a/e2e/harness/scenario_retry.go b/e2e/harness/scenario_retry.go
@@ -3,15 +3,32 @@ package harness
 import (
 	"context"
 	"fmt"
+	"os/exec"
 	"testing"
+	"time"
 )
 
 // scenarioRetryAttempts bounds how many times a single multi-step scenario is
 // run end to end. Each attempt runs against a fresh gitea repo and fresh act
 // containers, so a retry starts from a clean slate with no partial mutation
 // carried over. Only transient act/docker execution failures consume an
 // attempt; real assertion or job-level failures fail on the first attempt.
-const scenarioRetryAttempts = 3
+//
+// Five attempts gives contention-driven transients a couple more chances under
+// heavy CI load: the recovery logs show several scenarios passing on attempt 2
+// or 3, so the mechanism works and the extra headroom covers the slowest tail.
+const scenarioRetryAttempts = 5
+
+// scenarioRetryBackoff is the pause between scenario attempts. It lets a burst
+// of container/docker contention subside before the next clean-slate attempt
+// rather than retrying instantly into the same pressure. It is a var, not a
+// const, so unit tests can zero it to avoid real sleeps.
+var scenarioRetryBackoff = 5 * time.Second
+
+// pruneBetweenAttempts reclaims orphaned docker networks between attempts. It
+// is a var so unit tests can replace it with a no-op (and so the production
+// path stays a single, testable seam over the docker CLI).
+var pruneBetweenAttempts = pruneDockerNetworks
 
 // logger is the minimal logging surface a scenario attempt needs. *testing.T
 // satisfies it, and unit tests can supply a fake to assert on retry logging.
@@ -48,6 +65,17 @@ func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt
 		if n < scenarioRetryAttempts {
 			log.Logf("scenario %q: transient act/docker execution failure on attempt %d/%d, retrying from a clean slate: %v",
 				name, n, scenarioRetryAttempts, err)
+			// Reclaim any docker networks orphaned by the failed attempt and
+			// pause so contention can subside before the next clean-slate
+			// attempt. Both are best effort: the attempt's own Cleanup already
+			// removes its network, and a prune failure must not mask the
+			// scenario result.
+			pruneBetweenAttempts(ctx)
+			select {
+			case <-ctx.Done():
+				return ctx.Err()
+			case <-time.After(scenarioRetryBackoff):
+			}
 			continue
 		}
 		log.Logf("scenario %q: exhausted %d attempts; last failure was transient: %v",
@@ -56,6 +84,14 @@ func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt
 	return fmt.Errorf("scenario %q failed after %d attempts: %w", name, scenarioRetryAttempts, lastErr)
 }
 
+// pruneDockerNetworks runs `docker network prune -f` between scenario
+// attempts so a network orphaned by a failed attempt cannot accumulate and
+// exhaust the daemon's address pool. The prune is daemon-wide, not scoped to
+// this suite, and its error is deliberately ignored: it is a defensive reclaim.
+func pruneDockerNetworks(ctx context.Context) {
+	_ = exec.CommandContext(ctx, "docker", "network", "prune", "-f").Run()
+}
+
 // RunMultiStepScenario runs a whole multi-step scenario with a bounded
 // scenario-level retry on transient act/docker execution failures. Each attempt
 // builds a fresh harness (new docker network, gitea container + repo, act

diff --git a/e2e/harness/scenario_retry_test.go b/e2e/harness/scenario_retry_test.go
@@ -36,6 +36,26 @@ func transientErr() error {
 	return fmt.Errorf("promote workflow failed: workflow execution failed: %w", errTransientWorkflow)
 }
 
+// fastRetries zeroes scenarioRetryBackoff and swaps pruneBetweenAttempts for a
+// counting stub for the duration of a test, so the retry loop neither sleeps
+// nor shells out to docker, and restores both originals via t.Cleanup. The
+// returned pointer reports how many times the prune seam fired so callers can
+// assert prune-between-attempts behaviour. It mutates package-level globals
+// without locking, so a test using this must not call t.Parallel.
+func fastRetries(t *testing.T) *int {
+	t.Helper()
+	prevBackoff := scenarioRetryBackoff
+	prevPrune := pruneBetweenAttempts
+	prunes := 0
+	scenarioRetryBackoff = 0
+	pruneBetweenAttempts = func(context.Context) { prunes++ }
+	t.Cleanup(func() {
+		scenarioRetryBackoff = prevBackoff
+		pruneBetweenAttempts = prevPrune
+	})
+	return &prunes
+}
+
 func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) {
 	t.Parallel()
 
@@ -57,7 +77,7 @@ func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) {
 }
 
 func TestRunScenarioWithRetry_RecoversAfterTransient(t *testing.T) {
-	t.Parallel()
+	prunes := fastRetries(t)
 
 	log := &fakeLogger{}
 	calls := 0
@@ -74,6 +94,11 @@ func TestRunScenarioWithRetry_RecoversAfterTransient(t *testing.T) {
 	if calls != 2 {
 		t.Fatalf("attempts = %d, want 2", calls)
 	}
+	// One failed attempt preceded the recovery, so the prune-between-attempts
+	// seam must have fired exactly once.
+	if *prunes != 1 {
+		t.Fatalf("prune-between-attempts fired %d times, want 1", *prunes)
+	}
 	if !log.contains("transient act/docker execution failure on attempt 1") {
 		t.Fatalf("expected a transient-retry log line, got: %v", log.lines)
 	}
@@ -130,7 +155,7 @@ func TestRunScenarioWithRetry_RealJobFailureNotRetried(t *testing.T) {
 }
 
 func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T) {
-	t.Parallel()
+	prunes := fastRetries(t)
 
 	log := &fakeLogger{}
 	calls := 0
@@ -144,6 +169,10 @@ func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T
 	if calls != scenarioRetryAttempts {
 		t.Fatalf("attempts = %d, want %d", calls, scenarioRetryAttempts)
 	}
+	// A prune runs between attempts but not after the final one.
+	if *prunes != scenarioRetryAttempts-1 {
+		t.Fatalf("prune-between-attempts fired %d times, want %d", *prunes, scenarioRetryAttempts-1)
+	}
 	if !IsTransientWorkflowError(err) {
 		t.Fatalf("final error should still wrap the transient sentinel: %v", err)
 	}