Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 7 additions & 14 deletions e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,22 +29,15 @@ func TestMultiStepScenarios(t *testing.T) {
t.Run(scenario.Name, func(t *testing.T) {
t.Parallel()

ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
defer cancel()

h := harness.New(t)
defer h.Cleanup()

err := h.SetupInfra(ctx)
require.NoError(t, err, "failed to setup infrastructure")

// Stage initial repo with config
err = h.StageRepoFromConfig(ctx, scenario.Config)
require.NoError(t, err, "failed to stage repo")

// Create runner and execute
runner := harness.NewRunner(t, h)
err = runner.Run(ctx, scenario)
// RunMultiStepScenario runs the whole scenario with a bounded
// scenario-level retry on transient act/docker execution failures.
// Each attempt gets a fresh harness (network, gitea repo, act
// containers), so a retry is a clean slate. Real assertion or
// job-level failures fail deterministically without a retry.
err := harness.RunMultiStepScenario(ctx, t, scenario)
require.NoError(t, err, "scenario failed")
})
}
Expand Down
31 changes: 31 additions & 0 deletions e2e/harness/act.go
Original file line number Diff line number Diff line change
Expand Up @@ -384,8 +384,19 @@ func (a *ActRunner) RunWorkflowFromRepo(ctx context.Context, opts RunOpts) (*Ext
// up as a green-but-empty scenario (#25).
func normalizeWorkflowResult(result *ExtendedWorkflowResult, workflowPath string, exitCode int) {
if exitCode != 0 {
// ExecError means act could NOT run the workflow to a conclusion: a
// genuine act/docker transport or exec hiccup where no job reached a
// conclusion. It must NOT cover the case where act ran the workflow and
// a job genuinely concluded "failure" - that is a real, deterministic
// defect and retrying it would mask a real failure as transient.
//
// So only tag ExecError when the non-zero exit is unaccompanied by any
// parsed job-level failure. If a job concluded "failure" (or the
// reconciled conclusion is already "failure"), this was a real outcome.
execError := !hasJobFailure(result)
result.Conclusion = "failure"
result.Error = "workflow execution failed"
result.ExecError = execError
}

if workflowPath != "" && len(result.Jobs) == 0 && result.Conclusion != "failure" {
Expand All @@ -394,6 +405,26 @@ func normalizeWorkflowResult(result *ExtendedWorkflowResult, workflowPath string
}
}

// hasJobFailure tells the caller whether result already records a failure that
// came from the workflow itself: either the run-level Conclusion is "failure"
// or some non-nil entry in Jobs has a "failure" Conclusion. A nil result
// records nothing and yields false. Callers use a true answer to treat a
// non-zero act exit as a real workflow outcome instead of a transient
// ExecError.
func hasJobFailure(result *ExtendedWorkflowResult) bool {
	const failed = "failure"

	switch {
	case result == nil:
		return false
	case result.Conclusion == failed:
		return true
	}

	// The run-level conclusion is not a failure; fall back to scanning the
	// individual jobs, skipping nil entries.
	for key := range result.Jobs {
		if entry := result.Jobs[key]; entry != nil && entry.Conclusion == failed {
			return true
		}
	}
	return false
}

// buildActArgs builds additional act command arguments. eventPath is the
// in-container event-payload file written by writeEventFile (empty when no
// EventJSON was supplied); when set it is appended as the act -e flag.
Expand Down
4 changes: 2 additions & 2 deletions e2e/harness/hotfix_actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func (r *Runner) executeHotfixPlan(ctx context.Context, step *HotfixPlanStep) er

if result.Conclusion != "success" {
r.t.Logf(" HotfixPlan workflow logs:\n%s", result.Logs)
return fmt.Errorf("hotfix plan workflow failed: %s", result.Error)
return workflowFailureError("hotfix plan", result)
}

r.t.Logf(" HotfixPlan: parsed %d jobs", len(result.Jobs))
Expand Down Expand Up @@ -397,7 +397,7 @@ func (r *Runner) executeHotfixMerged(ctx context.Context, step *HotfixMergedStep

if result.Conclusion != "success" {
r.t.Logf(" HotfixMerged workflow logs:\n%s", result.Logs)
return fmt.Errorf("hotfix merged workflow failed: %s", result.Error)
return workflowFailureError("hotfix merged", result)
}

if err := r.syncStateFromGitea(ctx, config); err != nil {
Expand Down
8 changes: 8 additions & 0 deletions e2e/harness/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ type ExtendedWorkflowResult struct {
Jobs map[string]*JobResultExtended
Logs string
Error string
// ExecError is true when act itself could not run the workflow to a real
// conclusion: the act invocation exited non-zero (a docker-exec or act
// transport hiccup) rather than a workflow job genuinely concluding
// "failure". It distinguishes a transient infrastructure flake (safe to
// retry from a clean slate) from a real job-level failure or an assertion
// mismatch (which must fail deterministically). A run that parsed real job
// events and concluded "failure" leaves ExecError false.
ExecError bool
}

// JobResultExtended contains detailed result of a single job
Expand Down
4 changes: 2 additions & 2 deletions e2e/harness/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ func (r *Runner) executeOrchestrate(ctx context.Context, config Config, expectFa
if result.Conclusion != "success" {
r.t.Logf(" Orchestrate failed with conclusion: %s", result.Conclusion)
r.t.Logf(" Workflow logs:\n%s", result.Logs)
return fmt.Errorf("orchestrate workflow failed: %s", result.Error)
return workflowFailureError("orchestrate", result)
}

// Debug: show what jobs were parsed
Expand Down Expand Up @@ -626,7 +626,7 @@ func (r *Runner) executePromote(ctx context.Context, promote *PromoteStep, confi

if result.Conclusion != "success" {
r.t.Logf(" Promote workflow logs:\n%s", result.Logs)
return fmt.Errorf("promote workflow failed: %s", result.Error)
return workflowFailureError("promote", result)
}

// Sync state from Gitea
Expand Down
88 changes: 88 additions & 0 deletions e2e/harness/scenario_retry.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
package harness

import (
"context"
"fmt"
"testing"
)

// scenarioRetryAttempts is the upper bound on how many times a single
// multi-step scenario is run end to end. Every attempt is expected to run
// against a fresh gitea repo and fresh act containers, so a retry starts from a
// clean slate with no partial mutation carried over. A further attempt is made
// only after a transient act/docker execution failure; a real assertion or
// job-level failure ends the scenario on the attempt where it occurs.
const scenarioRetryAttempts = 3

// logger describes the only logging capability the scenario retry loop relies
// on: printf-style message emission. *testing.T already provides this method,
// and unit tests may pass a recording fake to assert on what was logged.
type logger interface {
	// Logf formats msg with values, fmt.Sprintf style, and records the result.
	Logf(msg string, values ...any)
}

// runScenarioWithRetry calls attempt up to scenarioRetryAttempts times and
// retries ONLY when the attempt returns an error that IsTransientWorkflowError
// classifies as a transient act/docker execution failure. Any other error - a
// real job-level failure, an expect_failure mismatch, or a state/branch/tag
// assertion mismatch - is returned immediately and unwrapped, without a retry.
//
// attempt must perform a full clean-slate run (fresh repo + fresh containers)
// on every call so a retry never inherits partial state from a prior attempt.
// The same ctx is handed to every attempt, so all attempts share one deadline.
//
// Parameters:
//   - ctx: passed to each attempt; once it is cancelled or past its deadline no
//     further attempt is started.
//   - log: receives one line per retry decision (retrying, recovered, aborted,
//     exhausted). Nothing is logged for a first-attempt pass or a
//     non-transient failure.
//   - name: scenario name used in log lines and error messages.
//   - attempt: the clean-slate scenario run.
//
// It returns nil on the first successful attempt. When the attempt budget is
// exhausted, or when ctx is done after a transient failure, it returns an
// error that wraps the last transient error (so IsTransientWorkflowError still
// reports true for it).
func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt func(ctx context.Context) error) error {
	var lastErr error
	for n := 1; n <= scenarioRetryAttempts; n++ {
		err := attempt(ctx)
		if err == nil {
			if n > 1 {
				log.Logf("scenario %q: passed on attempt %d/%d after transient retry", name, n, scenarioRetryAttempts)
			}
			return nil
		}
		// Real assertion failures and genuine job-level failures are
		// deterministic; surface them immediately so they fail the run.
		if !IsTransientWorkflowError(err) {
			return err
		}
		lastErr = err

		if n == scenarioRetryAttempts {
			log.Logf("scenario %q: exhausted %d attempts; last failure was transient: %v",
				name, scenarioRetryAttempts, err)
			break
		}

		// Every attempt shares ctx. Once it is cancelled or expired, a new
		// attempt cannot succeed and would only replace the real cause with an
		// unrelated setup error, so stop here and report what actually failed.
		if ctxErr := ctx.Err(); ctxErr != nil {
			log.Logf("scenario %q: context done after attempt %d/%d (%v); not retrying: %v",
				name, n, scenarioRetryAttempts, ctxErr, err)
			return fmt.Errorf("scenario %q aborted after attempt %d/%d (%v): %w",
				name, n, scenarioRetryAttempts, ctxErr, lastErr)
		}

		log.Logf("scenario %q: transient act/docker execution failure on attempt %d/%d, retrying from a clean slate: %v",
			name, n, scenarioRetryAttempts, err)
	}
	return fmt.Errorf("scenario %q failed after %d attempts: %w", name, scenarioRetryAttempts, lastErr)
}

// RunMultiStepScenario executes scenario end to end under the bounded
// scenario-level retry implemented by runScenarioWithRetry.
//
// A single attempt creates a new harness with New, calls SetupInfra, stages the
// repository from scenario.Config, runs the scenario through a new Runner, and
// calls Cleanup on the harness when the attempt returns. Because nothing is
// reused between attempts, a retry is meant to start from a clean slate; this
// is the layer at which retrying is considered safe, as opposed to re-running
// a partial, state-mutating act run in place.
//
// Only failures that runScenarioWithRetry treats as transient are retried.
// Setup and staging errors are returned wrapped with a short prefix, and any
// non-transient error from the run is returned from the first attempt that
// produces it.
func RunMultiStepScenario(ctx context.Context, t *testing.T, scenario *MultiStepScenario) error {
	t.Helper()

	// runOnce is one complete clean-slate attempt.
	runOnce := func(attemptCtx context.Context) error {
		env := New(t)
		defer env.Cleanup()

		err := env.SetupInfra(attemptCtx)
		if err != nil {
			return fmt.Errorf("failed to setup infrastructure: %w", err)
		}

		err = env.StageRepoFromConfig(attemptCtx, scenario.Config)
		if err != nil {
			return fmt.Errorf("failed to stage repo: %w", err)
		}

		return NewRunner(t, env).Run(attemptCtx, scenario)
	}

	return runScenarioWithRetry(ctx, t, scenario.Name, runOnce)
}
153 changes: 153 additions & 0 deletions e2e/harness/scenario_retry_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
package harness

import (
"context"
"errors"
"fmt"
"strings"
"sync"
"testing"
)

// fakeLogger is an in-memory Logf sink: it stores every formatted message so
// tests can assert on what was logged.
type fakeLogger struct {
	mu    sync.Mutex // guards lines
	lines []string   // formatted messages, in the order they were logged
}

// Logf renders the message with fmt.Sprintf and appends it to the captured
// lines.
func (f *fakeLogger) Logf(format string, args ...any) {
	rendered := fmt.Sprintf(format, args...)

	f.mu.Lock()
	f.lines = append(f.lines, rendered)
	f.mu.Unlock()
}

// contains reports whether at least one captured line has sub as a substring.
func (f *fakeLogger) contains(sub string) bool {
	f.mu.Lock()
	defer f.mu.Unlock()

	for i := range f.lines {
		if strings.Contains(f.lines[i], sub) {
			return true
		}
	}
	return false
}

// transientErr builds a sample workflow failure that wraps
// errTransientWorkflow, for tests that need an error the retry loop will see
// as transient.
func transientErr() error {
	wrapped := fmt.Errorf("promote workflow failed: workflow execution failed: %w", errTransientWorkflow)
	return wrapped
}

// TestRunScenarioWithRetry_PassesFirstAttempt checks that an attempt which
// succeeds straight away is invoked exactly once and leaves no retry log line.
func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) {
	t.Parallel()

	var (
		sink     fakeLogger
		attempts int
	)
	alwaysOK := func(context.Context) error {
		attempts++
		return nil
	}

	if err := runScenarioWithRetry(context.Background(), &sink, "Scenario", alwaysOK); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if attempts != 1 {
		t.Fatalf("attempts = %d, want 1", attempts)
	}
	if sink.contains("retry") {
		t.Fatalf("did not expect a retry log on first-attempt pass: %v", sink.lines)
	}
}

// TestRunScenarioWithRetry_RecoversAfterTransient checks that one transient
// failure followed by a success yields nil after exactly two attempts, and that
// both the retry and the recovery are logged.
func TestRunScenarioWithRetry_RecoversAfterTransient(t *testing.T) {
	t.Parallel()

	rec := new(fakeLogger)
	attempts := 0
	// failOnce fails with a transient error on the first call only.
	failOnce := func(context.Context) error {
		attempts++
		if attempts >= 2 {
			return nil
		}
		return transientErr()
	}

	if err := runScenarioWithRetry(context.Background(), rec, "Hotfix_Stacked", failOnce); err != nil {
		t.Fatalf("expected recovery, got error: %v", err)
	}
	if attempts != 2 {
		t.Fatalf("attempts = %d, want 2", attempts)
	}

	sawRetry := rec.contains("transient act/docker execution failure on attempt 1")
	if !sawRetry {
		t.Fatalf("expected a transient-retry log line, got: %v", rec.lines)
	}
	sawRecovery := rec.contains("passed on attempt 2")
	if !sawRecovery {
		t.Fatalf("expected a recovery log line, got: %v", rec.lines)
	}
}

// TestRunScenarioWithRetry_AssertionFailsImmediately pins the key safety
// property of the retry loop: an error that is a real assertion failure (an
// expect_failure mismatch in this case) is returned from the first attempt,
// with no second attempt and no retry log line.
func TestRunScenarioWithRetry_AssertionFailsImmediately(t *testing.T) {
	t.Parallel()

	var (
		rec      = &fakeLogger{}
		attempts int
		mismatch = errors.New("expected promote to fail but it succeeded")
	)

	got := runScenarioWithRetry(context.Background(), rec, "Promote_rolls_back", func(context.Context) error {
		attempts++
		return mismatch
	})

	if !errors.Is(got, mismatch) {
		t.Fatalf("expected the assertion error to surface, got: %v", got)
	}
	if attempts != 1 {
		t.Fatalf("assertion failure must not retry: attempts = %d, want 1", attempts)
	}
	if rec.contains("retry") {
		t.Fatalf("must not log a retry for an assertion failure: %v", rec.lines)
	}
}

// TestRunScenarioWithRetry_RealJobFailureNotRetried guards that a genuine
// job-level workflow failure (ExecError false) is treated as deterministic.
func TestRunScenarioWithRetry_RealJobFailureNotRetried(t *testing.T) {
t.Parallel()

realFailure := workflowFailureError("orchestrate", &ExtendedWorkflowResult{
Conclusion: "failure",
Error: "build job concluded failure",
})
log := &fakeLogger{}
calls := 0
err := runScenarioWithRetry(context.Background(), log, "Four_Environment_Cascade_Promotion", func(context.Context) error {
calls++
return realFailure
})
if err == nil {
t.Fatal("expected the real failure to surface")
}
if calls != 1 {
t.Fatalf("real job failure must not retry: attempts = %d, want 1", calls)
}
}

// TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient checks that an
// attempt which always fails transiently is run scenarioRetryAttempts times,
// and that the final error still wraps the transient sentinel and reports the
// attempt count.
func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T) {
	t.Parallel()

	rec := &fakeLogger{}
	attempts := 0
	alwaysTransient := func(context.Context) error {
		attempts++
		return transientErr()
	}

	got := runScenarioWithRetry(context.Background(), rec, "Flaky", alwaysTransient)
	if got == nil {
		t.Fatal("expected an error after exhausting attempts")
	}
	if attempts != scenarioRetryAttempts {
		t.Fatalf("attempts = %d, want %d", attempts, scenarioRetryAttempts)
	}
	if !IsTransientWorkflowError(got) {
		t.Fatalf("final error should still wrap the transient sentinel: %v", got)
	}

	wantFragment := fmt.Sprintf("after %d attempts", scenarioRetryAttempts)
	if !strings.Contains(got.Error(), wantFragment) {
		t.Fatalf("final error should report the attempt count: %v", got)
	}
}
Loading
Loading