From cf5a154df0b20aef52f62db1705bae0b33a78ebc Mon Sep 17 00:00:00 2001
From: Joshua Temple <joshua.temple@stablekernel.com>
Date: Thu, 11 Jun 2026 22:14:45 -0400
Subject: [PATCH 1/4] fix(e2e): remove docker network leak with synchronous
 verified teardown

Signed-off-by: Joshua Temple <joshua.temple@stablekernel.com>
---
 e2e/harness/harness.go | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

diff --git a/e2e/harness/harness.go b/e2e/harness/harness.go
index dda8032..71d40c6 100644
--- a/e2e/harness/harness.go
+++ b/e2e/harness/harness.go
@@ -858,7 +858,18 @@ func (h *Harness) getProjectRoot() (string, error) {
 	}
 }
 
-// Cleanup terminates all containers
+// Cleanup terminates all containers and removes the scenario's docker network.
+//
+// Network removal is synchronous and verified, not fire-and-forget: a single
+// scenario allocates a /16 (or /24) from the daemon's address pool, and across
+// the serial scenario suite a leaked network drains that pool until a late
+// scenario cannot allocate one and dies at setup ("all predefined address
+// pools have been fully subnetted"). The retry layer defers Cleanup per
+// attempt, so every attempt - including a failed one - releases its network
+// here. Because the act and gitea containers detach from the network during
+// their own teardown, Remove can briefly observe an "active endpoints" error;
+// a short bounded retry lets that detach settle so the count returns to
+// baseline rather than growing monotonically.
 func (h *Harness) Cleanup() {
 	ctx := context.Background()
 	if h.act != nil {
@@ -868,6 +879,25 @@ func (h *Harness) Cleanup() {
 		_ = h.gitea.Terminate(ctx)
 	}
 	if h.network != nil {
-		_ = h.network.Remove(ctx)
+		h.removeNetwork(ctx)
+	}
+}
+
+// removeNetwork removes the scenario network, waiting for and checking the
+// result. It retries briefly so a container that is still detaching cannot
+// leave the network behind, and logs (rather than swallows) a terminal
+// failure so a genuine leak is visible in the test output.
+func (h *Harness) removeNetwork(ctx context.Context) {
+	const attempts = 5
+	var err error
+	for i := 0; i < attempts; i++ {
+		if err = h.network.Remove(ctx); err == nil {
+			return
+		}
+		time.Sleep(500 * time.Millisecond)
+	}
+	if h.t != nil {
+		h.t.Logf("warning: failed to remove docker network %q after %d attempts: %v",
+			h.networkName, attempts, err)
 	}
 }

From 0091d7b1e02221fae1cae3842dc6ebe12caa4378 Mon Sep 17 00:00:00 2001
From: Joshua Temple <joshua.temple@stablekernel.com>
Date: Thu, 11 Jun 2026 22:14:45 -0400
Subject: [PATCH 2/4] fix(e2e): raise scenario retry cap to 5 with backoff and
 network prune

Signed-off-by: Joshua Temple <joshua.temple@stablekernel.com>
---
 e2e/harness/scenario_retry.go      | 38 +++++++++++++++++++++++++++++-
 e2e/harness/scenario_retry_test.go | 33 ++++++++++++++++++++++++--
 2 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/e2e/harness/scenario_retry.go b/e2e/harness/scenario_retry.go
index f9c7876..0440a15 100644
--- a/e2e/harness/scenario_retry.go
+++ b/e2e/harness/scenario_retry.go
@@ -3,7 +3,9 @@ package harness
 import (
 	"context"
 	"fmt"
+	"os/exec"
 	"testing"
+	"time"
 )
 
 // scenarioRetryAttempts bounds how many times a single multi-step scenario is
@@ -11,7 +13,22 @@ import (
 // containers, so a retry starts from a clean slate with no partial mutation
 // carried over. Only transient act/docker execution failures consume an
 // attempt; real assertion or job-level failures fail on the first attempt.
-const scenarioRetryAttempts = 3
+//
+// Five attempts gives contention-driven transients a couple more chances under
+// heavy CI load: the recovery logs show several scenarios passing on attempt 2
+// or 3, so the mechanism works and the extra headroom covers the slowest tail.
+const scenarioRetryAttempts = 5
+
+// scenarioRetryBackoff is the pause between scenario attempts. It lets a burst
+// of container/docker contention subside before the next clean-slate attempt
+// rather than retrying instantly into the same pressure. It is a var, not a
+// const, so unit tests can zero it to avoid real sleeps.
+var scenarioRetryBackoff = 5 * time.Second
+
+// pruneBetweenAttempts reclaims orphaned docker networks between attempts. It
+// is a var so unit tests can replace it with a no-op (and so the production
+// path stays a single, testable seam over the docker CLI).
+var pruneBetweenAttempts = pruneDockerNetworks
 
 // logger is the minimal logging surface a scenario attempt needs. *testing.T
 // satisfies it, and unit tests can supply a fake to assert on retry logging.
@@ -48,6 +65,17 @@ func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt
 		if n < scenarioRetryAttempts {
 			log.Logf("scenario %q: transient act/docker execution failure on attempt %d/%d, retrying from a clean slate: %v",
 				name, n, scenarioRetryAttempts, err)
+			// Reclaim any docker networks orphaned by the failed attempt and
+			// pause so contention can subside before the next clean-slate
+			// attempt. Both are best effort: the attempt's own Cleanup already
+			// removes its network, and a prune failure must not mask the
+			// scenario result.
+			pruneBetweenAttempts(ctx)
+			select {
+			case <-ctx.Done():
+				return ctx.Err()
+			case <-time.After(scenarioRetryBackoff):
+			}
 			continue
 		}
 		log.Logf("scenario %q: exhausted %d attempts; last failure was transient: %v",
@@ -56,6 +84,14 @@ func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt
 	return fmt.Errorf("scenario %q failed after %d attempts: %w", name, scenarioRetryAttempts, lastErr)
 }
 
+// pruneDockerNetworks best-effort reclaims unused docker networks between
+// scenario attempts so a network orphaned by a failed attempt cannot
+// accumulate and exhaust the daemon's address pool. Any error is intentionally
+// ignored: this is a defensive reclaim, not a correctness requirement.
+func pruneDockerNetworks(ctx context.Context) {
+	_ = exec.CommandContext(ctx, "docker", "network", "prune", "-f").Run()
+}
+
 // RunMultiStepScenario runs a whole multi-step scenario with a bounded
 // scenario-level retry on transient act/docker execution failures. Each attempt
 // builds a fresh harness (new docker network, gitea container + repo, act
diff --git a/e2e/harness/scenario_retry_test.go b/e2e/harness/scenario_retry_test.go
index e146d46..afb5c46 100644
--- a/e2e/harness/scenario_retry_test.go
+++ b/e2e/harness/scenario_retry_test.go
@@ -36,6 +36,26 @@ func transientErr() error {
 	return fmt.Errorf("promote workflow failed: workflow execution failed: %w", errTransientWorkflow)
 }
 
+// fastRetries removes the inter-attempt pause and stubs out the docker network
+// prune for the duration of a test so the retry loop neither sleeps nor shells
+// out to docker in unit tests, and restores both on cleanup. It records how
+// many times the prune seam fired so callers can assert prune-between-attempts
+// behaviour. These globals are shared, so a test using this must not run in
+// parallel.
+func fastRetries(t *testing.T) *int {
+	t.Helper()
+	prevBackoff := scenarioRetryBackoff
+	prevPrune := pruneBetweenAttempts
+	prunes := 0
+	scenarioRetryBackoff = 0
+	pruneBetweenAttempts = func(context.Context) { prunes++ }
+	t.Cleanup(func() {
+		scenarioRetryBackoff = prevBackoff
+		pruneBetweenAttempts = prevPrune
+	})
+	return &prunes
+}
+
 func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) {
 	t.Parallel()
 
@@ -57,7 +77,7 @@ func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) {
 }
 
 func TestRunScenarioWithRetry_RecoversAfterTransient(t *testing.T) {
-	t.Parallel()
+	prunes := fastRetries(t)
 
 	log := &fakeLogger{}
 	calls := 0
@@ -74,6 +94,11 @@ func TestRunScenarioWithRetry_RecoversAfterTransient(t *testing.T) {
 	if calls != 2 {
 		t.Fatalf("attempts = %d, want 2", calls)
 	}
+	// One failed attempt preceded the recovery, so the prune-between-attempts
+	// seam must have fired exactly once.
+	if *prunes != 1 {
+		t.Fatalf("prune-between-attempts fired %d times, want 1", *prunes)
+	}
 	if !log.contains("transient act/docker execution failure on attempt 1") {
 		t.Fatalf("expected a transient-retry log line, got: %v", log.lines)
 	}
@@ -130,7 +155,7 @@ func TestRunScenarioWithRetry_RealJobFailureNotRetried(t *testing.T) {
 }
 
 func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T) {
-	t.Parallel()
+	prunes := fastRetries(t)
 
 	log := &fakeLogger{}
 	calls := 0
@@ -144,6 +169,10 @@ func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T
 	if calls != scenarioRetryAttempts {
 		t.Fatalf("attempts = %d, want %d", calls, scenarioRetryAttempts)
 	}
+	// A prune runs between attempts but not after the final one.
+	if *prunes != scenarioRetryAttempts-1 {
+		t.Fatalf("prune-between-attempts fired %d times, want %d", *prunes, scenarioRetryAttempts-1)
+	}
 	if !IsTransientWorkflowError(err) {
 		t.Fatalf("final error should still wrap the transient sentinel: %v", err)
 	}

From d7c137ffb91fb5c34786c56565ce48b402d5e5ff Mon Sep 17 00:00:00 2001
From: Joshua Temple <joshua.temple@stablekernel.com>
Date: Thu, 11 Jun 2026 22:14:45 -0400
Subject: [PATCH 3/4] ci: widen docker address pool for e2e network headroom

Signed-off-by: Joshua Temple <joshua.temple@stablekernel.com>
---
 .github/workflows/build-cli.yaml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/.github/workflows/build-cli.yaml b/.github/workflows/build-cli.yaml
index c77c686..b293385 100644
--- a/.github/workflows/build-cli.yaml
+++ b/.github/workflows/build-cli.yaml
@@ -64,6 +64,21 @@ jobs:
           cache: true
           cache-dependency-path: e2e/go.sum
 
+      - name: Widen Docker network address pool
+        # Each scenario creates its own docker network, and the daemon's
+        # default address pool is small. Even with synchronous per-scenario
+        # network teardown, brief cleanup lag under serial load can leave the
+        # pool short and fail a later scenario at setup with "all predefined
+        # address pools have been fully subnetted". Carving /24 subnets out of
+        # a 10.99.0.0/16 base yields 256 networks of headroom, well above the
+        # handful in flight at once, so cleanup lag can never exhaust the pool.
+        run: |
+          sudo mkdir -p /etc/docker
+          echo '{"default-address-pools":[{"base":"10.99.0.0/16","size":24}]}' | sudo tee /etc/docker/daemon.json
+          sudo systemctl restart docker
+          # Wait for the daemon to come back before the tests start.
+          timeout 60 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'
+
       - name: Run E2E tests
         working-directory: e2e
         # Run scenarios serially (-parallel 1). The 4-core / ~7.9GB runner is

From 869998e0a8913693b2dc744d4033eddaf6343d45 Mon Sep 17 00:00:00 2001
From: Joshua Temple <joshua.temple@stablekernel.com>
Date: Thu, 11 Jun 2026 22:24:12 -0400
Subject: [PATCH 4/4] fix(e2e): reap act-spawned job containers blocking
 network removal

Signed-off-by: Joshua Temple <joshua.temple@stablekernel.com>
---
 e2e/harness/harness.go | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/e2e/harness/harness.go b/e2e/harness/harness.go
index 71d40c6..098bf41 100644
--- a/e2e/harness/harness.go
+++ b/e2e/harness/harness.go
@@ -884,13 +884,24 @@ func (h *Harness) Cleanup() {
 }
 
 // removeNetwork removes the scenario network, waiting for and checking the
-// result. It retries briefly so a container that is still detaching cannot
-// leave the network behind, and logs (rather than swallows) a terminal
-// failure so a genuine leak is visible in the test output.
+// result, and logs (rather than swallows) a terminal failure so a genuine leak
+// is visible in the test output.
+//
+// act runs each job by spawning a NESTED container over the docker socket and
+// attaching it to this network. Those job containers are act's children, not
+// testcontainers-managed, so terminating the act runner does not reap them;
+// one can outlive the runner and hold the network open, failing Remove with an
+// "active endpoints" error. So before each Remove attempt we force-remove any
+// container still attached to the network, then retry briefly to let the
+// detach settle. This keeps the network count flat across the suite instead of
+// leaking one network per scenario whose job container lingered.
 func (h *Harness) removeNetwork(ctx context.Context) {
 	const attempts = 5
 	var err error
 	for i := 0; i < attempts; i++ {
+		if i > 0 {
+			h.disconnectNetworkContainers(ctx)
+		}
 		if err = h.network.Remove(ctx); err == nil {
 			return
 		}
@@ -901,3 +912,19 @@ func (h *Harness) removeNetwork(ctx context.Context) {
 			h.networkName, attempts, err)
 	}
 }
+
+// disconnectNetworkContainers force-removes every container still attached to
+// the scenario network so it can be removed. It targets act's nested job
+// containers, which are not testcontainers-managed and so survive the act
+// runner's termination. Best effort: a container that is already gone or a
+// docker hiccup must not abort cleanup.
+func (h *Harness) disconnectNetworkContainers(ctx context.Context) {
+	out, err := exec.CommandContext(ctx, "docker", "network", "inspect",
+		"--format", "{{range .Containers}}{{.Name}} {{end}}", h.networkName).Output()
+	if err != nil {
+		return
+	}
+	for _, name := range strings.Fields(string(out)) {
+		_ = exec.CommandContext(ctx, "docker", "rm", "-f", name).Run()
+	}
+}