Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/workflows/build-cli.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,21 @@ jobs:
cache: true
cache-dependency-path: e2e/go.sum

- name: Widen Docker network address pool
# Each scenario creates its own docker network, and the daemon's
# default address pool is small. Even with synchronous per-scenario
# network teardown, brief cleanup lag under serial load can leave the
# pool short and fail a later scenario at setup with "all predefined
# address pools have been fully subnetted". Carving /24 subnets out of
# a 10.99.0.0/16 base yields 256 networks of headroom, well above the
# handful in flight at once, so cleanup lag can never exhaust the pool.
run: |
sudo mkdir -p /etc/docker
echo '{"default-address-pools":[{"base":"10.99.0.0/16","size":24}]}' | sudo tee /etc/docker/daemon.json
sudo systemctl restart docker
# Wait for the daemon to come back before the tests start.
timeout 60 sh -c 'until docker info >/dev/null 2>&1; do sleep 1; done'

- name: Run E2E tests
working-directory: e2e
# Run scenarios serially (-parallel 1). The 4-core / ~7.9GB runner is
Expand Down
61 changes: 59 additions & 2 deletions e2e/harness/harness.go
Original file line number Diff line number Diff line change
Expand Up @@ -858,7 +858,18 @@ func (h *Harness) getProjectRoot() (string, error) {
}
}

// Cleanup terminates all containers
// Cleanup terminates all containers and removes the scenario's docker network.
//
// Network removal is synchronous and verified, not fire-and-forget: a single
// scenario allocates a /16 (or /24) from the daemon's address pool, and across
// the serial scenario suite a leaked network drains that pool until a late
// scenario cannot allocate one and dies at setup ("all predefined address
// pools have been fully subnetted"). The retry layer defers Cleanup per
// attempt, so every attempt - including a failed one - releases its network
// here. Because the act and gitea containers detach from the network during
// their own teardown, Remove can briefly observe an "active endpoints" error;
// a short bounded retry lets that detach settle so the count returns to
// baseline rather than growing monotonically.
func (h *Harness) Cleanup() {
ctx := context.Background()
if h.act != nil {
Expand All @@ -868,6 +879,52 @@ func (h *Harness) Cleanup() {
_ = h.gitea.Terminate(ctx)
}
if h.network != nil {
_ = h.network.Remove(ctx)
h.removeNetwork(ctx)
}
}

// removeNetwork removes the scenario network synchronously, checking the
// result of every attempt, and logs (rather than swallows) a terminal failure
// so a genuine leak is visible in the test output.
//
// act runs each job by spawning a NESTED container over the docker socket and
// attaching it to this network. Those job containers are act's children, not
// testcontainers-managed, so terminating the act runner does not reap them;
// one can outlive the runner and hold the network open, failing Remove with an
// "active endpoints" error.
//
// The first attempt is a plain Remove. Each retry first pauses briefly to let
// in-flight detaches settle, then force-removes any container still attached
// to the network, and only then calls Remove again. The pause sits in front of
// a retry rather than after a failure, so a terminal failure is reported
// immediately instead of after one more pointless sleep.
//
// ctx is passed through to the docker calls. Nothing is returned: cleanup is
// best effort, and the only failure signal is the warning logged via h.t when
// h.t is non-nil.
func (h *Harness) removeNetwork(ctx context.Context) {
	const (
		attempts   = 5
		retryDelay = 500 * time.Millisecond
	)

	var err error
	for i := 0; i < attempts; i++ {
		if i > 0 {
			// Only retries pay the pause and the forced eviction; the
			// common case (network already idle) removes on the first try.
			time.Sleep(retryDelay)
			h.disconnectNetworkContainers(ctx)
		}
		if err = h.network.Remove(ctx); err == nil {
			return
		}
	}

	// h.t may be unset for harnesses built outside a test; stay silent then.
	if h.t != nil {
		h.t.Logf("warning: failed to remove docker network %q after %d attempts: %v",
			h.networkName, attempts, err)
	}
}

// disconnectNetworkContainers evicts whatever is still attached to the
// scenario network so that the network itself can be removed afterwards.
//
// Despite the name it does not merely disconnect: every attached container is
// force-removed with `docker rm -f`. The intended targets are act's nested job
// containers, which testcontainers does not manage and which therefore survive
// termination of the act runner.
//
// Everything here is best effort. If the network cannot be inspected the
// function returns without doing anything, and a failed removal of any single
// container is ignored so the remaining ones are still attempted.
func (h *Harness) disconnectNetworkContainers(ctx context.Context) {
	// Ask docker for the names of all containers on the network, rendered
	// as one space-separated line.
	inspect := exec.CommandContext(ctx, "docker", "network", "inspect",
		"--format", "{{range .Containers}}{{.Name}} {{end}}", h.networkName)
	listing, inspectErr := inspect.Output()
	if inspectErr != nil {
		return
	}

	attached := strings.Fields(string(listing))
	for idx := range attached {
		rm := exec.CommandContext(ctx, "docker", "rm", "-f", attached[idx])
		// Already-gone containers and docker hiccups must not abort cleanup.
		_ = rm.Run()
	}
}
38 changes: 37 additions & 1 deletion e2e/harness/scenario_retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,32 @@ package harness
import (
"context"
"fmt"
"os/exec"
"testing"
"time"
)

// scenarioRetryAttempts bounds how many times a single multi-step scenario is
// run end to end. Each attempt runs against a fresh gitea repo and fresh act
// containers, so a retry starts from a clean slate with no partial mutation
// carried over. Only transient act/docker execution failures consume an
// attempt; real assertion or job-level failures fail on the first attempt.
const scenarioRetryAttempts = 3
//
// Five attempts gives contention-driven transients a couple more chances under
// heavy CI load: the recovery logs show several scenarios passing on attempt 2
// or 3, so the mechanism works and the extra headroom covers the slowest tail.
const scenarioRetryAttempts = 5

// scenarioRetryBackoff is how long the retry loop waits after a transient
// failure before starting the next clean-slate attempt. The pause gives a
// burst of container/docker contention time to subside instead of retrying
// straight back into the same pressure. It is a var, not a const, so unit
// tests can set it to zero and avoid real sleeps; because it is shared
// package state, a test that overrides it must not run in parallel.
var scenarioRetryBackoff = 5 * time.Second

// pruneBetweenAttempts is the hook the retry loop calls after a transient
// failure, before the backoff pause, to reclaim orphaned docker networks. It
// defaults to pruneDockerNetworks and is a var so unit tests can swap in a
// no-op or a counting stub, keeping the docker CLI behind a single testable
// seam. Like scenarioRetryBackoff it is shared package state, so a test that
// replaces it must not run in parallel.
var pruneBetweenAttempts = pruneDockerNetworks

// logger is the minimal logging surface a scenario attempt needs. *testing.T
// satisfies it, and unit tests can supply a fake to assert on retry logging.
Expand Down Expand Up @@ -48,6 +65,17 @@ func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt
if n < scenarioRetryAttempts {
log.Logf("scenario %q: transient act/docker execution failure on attempt %d/%d, retrying from a clean slate: %v",
name, n, scenarioRetryAttempts, err)
// Reclaim any docker networks orphaned by the failed attempt and
// pause so contention can subside before the next clean-slate
// attempt. Both are best effort: the attempt's own Cleanup already
// removes its network, and a prune failure must not mask the
// scenario result.
pruneBetweenAttempts(ctx)
select {
case <-ctx.Done():
return ctx.Err()
case <-time.After(scenarioRetryBackoff):
}
continue
}
log.Logf("scenario %q: exhausted %d attempts; last failure was transient: %v",
Expand All @@ -56,6 +84,14 @@ func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt
return fmt.Errorf("scenario %q failed after %d attempts: %w", name, scenarioRetryAttempts, lastErr)
}

// pruneDockerNetworks asks the docker CLI to prune unused networks
// (`docker network prune -f`) so that a network orphaned by a failed scenario
// attempt does not accumulate and eat into the daemon's address pool.
//
// The command is bound to ctx. Its outcome is deliberately discarded: this is
// a defensive reclaim between attempts, not something the scenario result
// depends on.
//
// NOTE(review): the prune is not scoped by any filter, so it acts on every
// unused network the daemon knows about - confirm that is acceptable wherever
// this suite runs outside CI.
func pruneDockerNetworks(ctx context.Context) {
	prune := exec.CommandContext(ctx, "docker", "network", "prune", "-f")
	// Best effort only; a failed prune must never surface as a test failure.
	_ = prune.Run()
}

// RunMultiStepScenario runs a whole multi-step scenario with a bounded
// scenario-level retry on transient act/docker execution failures. Each attempt
// builds a fresh harness (new docker network, gitea container + repo, act
Expand Down
33 changes: 31 additions & 2 deletions e2e/harness/scenario_retry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,26 @@ func transientErr() error {
return fmt.Errorf("promote workflow failed: workflow execution failed: %w", errTransientWorkflow)
}

// fastRetries makes the retry loop cheap and hermetic for one test: it zeroes
// the inter-attempt backoff and swaps the docker network prune for a counting
// stub, so the loop neither sleeps nor shells out to docker. Both package
// variables are put back when the test finishes.
//
// The returned pointer holds the number of times the prune seam has fired so
// far, letting callers assert on prune-between-attempts behaviour.
//
// The overridden variables are package-level and shared, so a test that calls
// this helper must not run in parallel.
func fastRetries(t *testing.T) *int {
	t.Helper()

	// Capture the production values and schedule their restoration before
	// touching anything.
	savedBackoff, savedPrune := scenarioRetryBackoff, pruneBetweenAttempts
	t.Cleanup(func() {
		scenarioRetryBackoff, pruneBetweenAttempts = savedBackoff, savedPrune
	})

	fired := new(int)
	scenarioRetryBackoff = 0
	pruneBetweenAttempts = func(context.Context) { *fired++ }
	return fired
}

func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) {
t.Parallel()

Expand All @@ -57,7 +77,7 @@ func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) {
}

func TestRunScenarioWithRetry_RecoversAfterTransient(t *testing.T) {
t.Parallel()
prunes := fastRetries(t)

log := &fakeLogger{}
calls := 0
Expand All @@ -74,6 +94,11 @@ func TestRunScenarioWithRetry_RecoversAfterTransient(t *testing.T) {
if calls != 2 {
t.Fatalf("attempts = %d, want 2", calls)
}
// One failed attempt preceded the recovery, so the prune-between-attempts
// seam must have fired exactly once.
if *prunes != 1 {
t.Fatalf("prune-between-attempts fired %d times, want 1", *prunes)
}
if !log.contains("transient act/docker execution failure on attempt 1") {
t.Fatalf("expected a transient-retry log line, got: %v", log.lines)
}
Expand Down Expand Up @@ -130,7 +155,7 @@ func TestRunScenarioWithRetry_RealJobFailureNotRetried(t *testing.T) {
}

func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T) {
t.Parallel()
prunes := fastRetries(t)

log := &fakeLogger{}
calls := 0
Expand All @@ -144,6 +169,10 @@ func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T
if calls != scenarioRetryAttempts {
t.Fatalf("attempts = %d, want %d", calls, scenarioRetryAttempts)
}
// A prune runs between attempts but not after the final one.
if *prunes != scenarioRetryAttempts-1 {
t.Fatalf("prune-between-attempts fired %d times, want %d", *prunes, scenarioRetryAttempts-1)
}
if !IsTransientWorkflowError(err) {
t.Fatalf("final error should still wrap the transient sentinel: %v", err)
}
Expand Down
Loading