From cf5a154df0b20aef52f62db1705bae0b33a78ebc Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Thu, 11 Jun 2026 22:14:45 -0400 Subject: [PATCH 1/4] fix(e2e): remove docker network leak with synchronous verified teardown Signed-off-by: Joshua Temple --- e2e/harness/harness.go | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/e2e/harness/harness.go b/e2e/harness/harness.go index dda8032..71d40c6 100644 --- a/e2e/harness/harness.go +++ b/e2e/harness/harness.go @@ -858,7 +858,18 @@ func (h *Harness) getProjectRoot() (string, error) { } } -// Cleanup terminates all containers +// Cleanup terminates all containers and removes the scenario's docker network. +// +// Network removal is synchronous and verified, not fire-and-forget: a single +// scenario allocates a /16 (or /24) from the daemon's address pool, and across +// the serial scenario suite a leaked network drains that pool until a late +// scenario cannot allocate one and dies at setup ("all predefined address +// pools have been fully subnetted"). The retry layer defers Cleanup per +// attempt, so every attempt - including a failed one - releases its network +// here. Because the act and gitea containers detach from the network during +// their own teardown, Remove can briefly observe an "active endpoints" error; +// a short bounded retry lets that detach settle so the count returns to +// baseline rather than growing monotonically. func (h *Harness) Cleanup() { ctx := context.Background() if h.act != nil { @@ -868,6 +879,25 @@ func (h *Harness) Cleanup() { _ = h.gitea.Terminate(ctx) } if h.network != nil { - _ = h.network.Remove(ctx) + h.removeNetwork(ctx) + } +} + +// removeNetwork removes the scenario network, waiting for and checking the +// result. It retries briefly so a container that is still detaching cannot +// leave the network behind, and logs (rather than swallows) a terminal +// failure so a genuine leak is visible in the test output. +func (h *Harness) removeNetwork(ctx context.Context) { + const attempts = 5 + var err error + for i := 0; i < attempts; i++ { + if err = h.network.Remove(ctx); err == nil { + return + } + time.Sleep(500 * time.Millisecond) + } + if h.t != nil { + h.t.Logf("warning: failed to remove docker network %q after %d attempts: %v", + h.networkName, attempts, err) } } From 0091d7b1e02221fae1cae3842dc6ebe12caa4378 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Thu, 11 Jun 2026 22:14:45 -0400 Subject: [PATCH 2/4] fix(e2e): raise scenario retry cap to 5 with backoff and network prune Signed-off-by: Joshua Temple --- e2e/harness/scenario_retry.go | 38 +++++++++++++++++++++++++++++- e2e/harness/scenario_retry_test.go | 33 ++++++++++++++++++++++++-- 2 files changed, 68 insertions(+), 3 deletions(-) diff --git a/e2e/harness/scenario_retry.go b/e2e/harness/scenario_retry.go index f9c7876..0440a15 100644 --- a/e2e/harness/scenario_retry.go +++ b/e2e/harness/scenario_retry.go @@ -3,7 +3,9 @@ package harness import ( "context" "fmt" + "os/exec" "testing" + "time" ) // scenarioRetryAttempts bounds how many times a single multi-step scenario is @@ -11,7 +13,22 @@ import ( // containers, so a retry starts from a clean slate with no partial mutation // carried over. Only transient act/docker execution failures consume an // attempt; real assertion or job-level failures fail on the first attempt. -const scenarioRetryAttempts = 3 +// +// Five attempts gives contention-driven transients a couple more chances under +// heavy CI load: the recovery logs show several scenarios passing on attempt 2 +// or 3, so the mechanism works and the extra headroom covers the slowest tail. +const scenarioRetryAttempts = 5 + +// scenarioRetryBackoff is the pause between scenario attempts. It lets a burst +// of container/docker contention subside before the next clean-slate attempt +// rather than retrying instantly into the same pressure. It is a var, not a +// const, so unit tests can zero it to avoid real sleeps. +var scenarioRetryBackoff = 5 * time.Second + +// pruneBetweenAttempts reclaims orphaned docker networks between attempts. It +// is a var so unit tests can replace it with a no-op (and so the production +// path stays a single, testable seam over the docker CLI). +var pruneBetweenAttempts = pruneDockerNetworks // logger is the minimal logging surface a scenario attempt needs. *testing.T // satisfies it, and unit tests can supply a fake to assert on retry logging. @@ -48,6 +65,17 @@ func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt if n < scenarioRetryAttempts { log.Logf("scenario %q: transient act/docker execution failure on attempt %d/%d, retrying from a clean slate: %v", name, n, scenarioRetryAttempts, err) + // Reclaim any docker networks orphaned by the failed attempt and + // pause so contention can subside before the next clean-slate + // attempt. Both are best effort: the attempt's own Cleanup already + // removes its network, and a prune failure must not mask the + // scenario result. + pruneBetweenAttempts(ctx) + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(scenarioRetryBackoff): + } continue } log.Logf("scenario %q: exhausted %d attempts; last failure was transient: %v", @@ -56,6 +84,14 @@ func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt return fmt.Errorf("scenario %q failed after %d attempts: %w", name, scenarioRetryAttempts, lastErr) } +// pruneDockerNetworks best-effort reclaims unused docker networks between +// scenario attempts so a network orphaned by a failed attempt cannot +// accumulate and exhaust the daemon's address pool. Any error is intentionally +// ignored: this is a defensive reclaim, not a correctness requirement. +func pruneDockerNetworks(ctx context.Context) { + _ = exec.CommandContext(ctx, "docker", "network", "prune", "-f").Run() +} + // RunMultiStepScenario runs a whole multi-step scenario with a bounded // scenario-level retry on transient act/docker execution failures. Each attempt // builds a fresh harness (new docker network, gitea container + repo, act diff --git a/e2e/harness/scenario_retry_test.go b/e2e/harness/scenario_retry_test.go index e146d46..afb5c46 100644 --- a/e2e/harness/scenario_retry_test.go +++ b/e2e/harness/scenario_retry_test.go @@ -36,6 +36,26 @@ func transientErr() error { return fmt.Errorf("promote workflow failed: workflow execution failed: %w", errTransientWorkflow) } +// fastRetries removes the inter-attempt pause and stubs out the docker network +// prune for the duration of a test so the retry loop neither sleeps nor shells +// out to docker in unit tests, and restores both on cleanup. It records how +// many times the prune seam fired so callers can assert prune-between-attempts +// behaviour. These globals are shared, so a test using this must not run in +// parallel. +func fastRetries(t *testing.T) *int { + t.Helper() + prevBackoff := scenarioRetryBackoff + prevPrune := pruneBetweenAttempts + prunes := 0 + scenarioRetryBackoff = 0 + pruneBetweenAttempts = func(context.Context) { prunes++ } + t.Cleanup(func() { + scenarioRetryBackoff = prevBackoff + pruneBetweenAttempts = prevPrune + }) + return &prunes +} + func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) { t.Parallel() @@ -57,7 +77,7 @@ func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) { } func TestRunScenarioWithRetry_RecoversAfterTransient(t *testing.T) { - t.Parallel() + prunes := fastRetries(t) log := &fakeLogger{} calls := 0 @@ -74,6 +94,11 @@ func TestRunScenarioWithRetry_RecoversAfterTransient(t *testing.T) { if calls != 2 { t.Fatalf("attempts = %d, want 2", calls) } + // One failed attempt preceded the recovery, so the prune-between-attempts + // seam must have fired exactly once. + if *prunes != 1 { + t.Fatalf("prune-between-attempts fired %d times, want 1", *prunes) + } if !log.contains("transient act/docker execution failure on attempt 1") { t.Fatalf("expected a transient-retry log line, got: %v", log.lines) } @@ -130,7 +155,7 @@ func TestRunScenarioWithRetry_RealJobFailureNotRetried(t *testing.T) { } func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T) { - t.Parallel() + prunes := fastRetries(t) log := &fakeLogger{} calls := 0 @@ -144,6 +169,10 @@ func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T if calls != scenarioRetryAttempts { t.Fatalf("attempts = %d, want %d", calls, scenarioRetryAttempts) } + // A prune runs between attempts but not after the final one. + if *prunes != scenarioRetryAttempts-1 { + t.Fatalf("prune-between-attempts fired %d times, want %d", *prunes, scenarioRetryAttempts-1) + } if !IsTransientWorkflowError(err) { t.Fatalf("final error should still wrap the transient sentinel: %v", err) } From d7c137ffb91fb5c34786c56565ce48b402d5e5ff Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Thu, 11 Jun 2026 22:14:45 -0400 Subject: [PATCH 3/4] ci: widen docker address pool for e2e network headroom Signed-off-by: Joshua Temple --- .github/workflows/build-cli.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.github/workflows/build-cli.yaml b/.github/workflows/build-cli.yaml index c77c686..b293385 100644 --- a/.github/workflows/build-cli.yaml +++ b/.github/workflows/build-cli.yaml @@ -64,6 +64,21 @@ jobs: cache: true cache-dependency-path: e2e/go.sum + - name: Widen Docker network address pool + # Each scenario creates its own docker network, and the daemon's + # default address pool is small. Even with synchronous per-scenario + # network teardown, brief cleanup lag under serial load can leave the + # pool short and fail a later scenario at setup with "all predefined + # address pools have been fully subnetted". Carving /24 subnets out of + # a 10.99.0.0/16 base yields 256 networks of headroom, well above the + # handful in flight at once, so cleanup lag can never exhaust the pool. + run: | + sudo mkdir -p /etc/docker + echo '{"default-address-pools":[{"base":"10.99.0.0/16","size":24}]}' | sudo tee /etc/docker/daemon.json + sudo systemctl restart docker + # Wait for the daemon to come back before the tests start. + timeout 60 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done' + - name: Run E2E tests working-directory: e2e # Run scenarios serially (-parallel 1). The 4-core / ~7.9GB runner is From 869998e0a8913693b2dc744d4033eddaf6343d45 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Thu, 11 Jun 2026 22:24:12 -0400 Subject: [PATCH 4/4] fix(e2e): reap act-spawned job containers blocking network removal Signed-off-by: Joshua Temple --- e2e/harness/harness.go | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/e2e/harness/harness.go b/e2e/harness/harness.go index 71d40c6..098bf41 100644 --- a/e2e/harness/harness.go +++ b/e2e/harness/harness.go @@ -884,13 +884,24 @@ func (h *Harness) Cleanup() { } // removeNetwork removes the scenario network, waiting for and checking the -// result. It retries briefly so a container that is still detaching cannot -// leave the network behind, and logs (rather than swallows) a terminal -// failure so a genuine leak is visible in the test output. +// result, and logs (rather than swallows) a terminal failure so a genuine leak +// is visible in the test output. +// +// act runs each job by spawning a NESTED container over the docker socket and +// attaching it to this network. Those job containers are act's children, not +// testcontainers-managed, so terminating the act runner does not reap them; +// one can outlive the runner and hold the network open, failing Remove with an +// "active endpoints" error. So before each Remove attempt we force-remove any +// container still attached to the network, then retry briefly to let the +// detach settle. This keeps the network count flat across the suite instead of +// leaking one network per scenario whose job container lingered. func (h *Harness) removeNetwork(ctx context.Context) { const attempts = 5 var err error for i := 0; i < attempts; i++ { + if i > 0 { + h.disconnectNetworkContainers(ctx) + } if err = h.network.Remove(ctx); err == nil { return } @@ -901,3 +912,19 @@ func (h *Harness) removeNetwork(ctx context.Context) { h.networkName, attempts, err) } } + +// disconnectNetworkContainers force-removes every container still attached to +// the scenario network so it can be removed. It targets act's nested job +// containers, which are not testcontainers-managed and so survive the act +// runner's termination. Best effort: a container that is already gone or a +// docker hiccup must not abort cleanup. +func (h *Harness) disconnectNetworkContainers(ctx context.Context) { + out, err := exec.CommandContext(ctx, "docker", "network", "inspect", + "--format", "{{range .Containers}}{{.Name}} {{end}}", h.networkName).Output() + if err != nil { + return + } + for _, name := range strings.Fields(string(out)) { + _ = exec.CommandContext(ctx, "docker", "rm", "-f", name).Run() + } +}