From 7f2c6ea375ee9df5a14e2ae67c45a53249912009 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Thu, 11 Jun 2026 20:35:38 -0400 Subject: [PATCH 1/3] test: classify transient act-exec workflow failures via typed error Tag a non-zero act exit as an execution-layer hiccup (ExecError) distinct from a real job-level failure conclusion, and return a sentinel-wrapped error from the orchestrate/promote/hotfix failure paths so callers can tell a retryable infrastructure flake from a deterministic outcome. Signed-off-by: Joshua Temple --- e2e/harness/act.go | 5 ++ e2e/harness/hotfix_actions.go | 4 +- e2e/harness/parser.go | 8 ++ e2e/harness/runner.go | 4 +- e2e/harness/transient.go | 40 +++++++++ e2e/harness/transient_test.go | 162 ++++++++++++++++++++++++++++++++++ 6 files changed, 219 insertions(+), 4 deletions(-) create mode 100644 e2e/harness/transient.go create mode 100644 e2e/harness/transient_test.go diff --git a/e2e/harness/act.go b/e2e/harness/act.go index 27c9a68..48d6550 100644 --- a/e2e/harness/act.go +++ b/e2e/harness/act.go @@ -386,6 +386,11 @@ func normalizeWorkflowResult(result *ExtendedWorkflowResult, workflowPath string if exitCode != 0 { result.Conclusion = "failure" result.Error = "workflow execution failed" + // A non-zero act exit is the act/docker execution layer failing to run + // the workflow to a real conclusion (a transport/exec hiccup), as + // opposed to a workflow job genuinely concluding "failure". Mark it + // transient so the scenario runner may retry it from a clean slate. 
+ result.ExecError = true } if workflowPath != "" && len(result.Jobs) == 0 && result.Conclusion != "failure" { diff --git a/e2e/harness/hotfix_actions.go b/e2e/harness/hotfix_actions.go index 5b51fa0..91a8fe2 100644 --- a/e2e/harness/hotfix_actions.go +++ b/e2e/harness/hotfix_actions.go @@ -101,7 +101,7 @@ func (r *Runner) executeHotfixPlan(ctx context.Context, step *HotfixPlanStep) er if result.Conclusion != "success" { r.t.Logf(" HotfixPlan workflow logs:\n%s", result.Logs) - return fmt.Errorf("hotfix plan workflow failed: %s", result.Error) + return workflowFailureError("hotfix plan", result) } r.t.Logf(" HotfixPlan: parsed %d jobs", len(result.Jobs)) @@ -397,7 +397,7 @@ func (r *Runner) executeHotfixMerged(ctx context.Context, step *HotfixMergedStep if result.Conclusion != "success" { r.t.Logf(" HotfixMerged workflow logs:\n%s", result.Logs) - return fmt.Errorf("hotfix merged workflow failed: %s", result.Error) + return workflowFailureError("hotfix merged", result) } if err := r.syncStateFromGitea(ctx, config); err != nil { diff --git a/e2e/harness/parser.go b/e2e/harness/parser.go index 79cbfe2..7553b9e 100644 --- a/e2e/harness/parser.go +++ b/e2e/harness/parser.go @@ -29,6 +29,14 @@ type ExtendedWorkflowResult struct { Jobs map[string]*JobResultExtended Logs string Error string + // ExecError is true when act itself could not run the workflow to a real + // conclusion: the act invocation exited non-zero (a docker-exec or act + // transport hiccup) rather than a workflow job genuinely concluding + // "failure". It distinguishes a transient infrastructure flake (safe to + // retry from a clean slate) from a real job-level failure or an assertion + // mismatch (which must fail deterministically). A run that parsed real job + // events and concluded "failure" leaves ExecError false. 
+ ExecError bool } // JobResultExtended contains detailed result of a single job diff --git a/e2e/harness/runner.go b/e2e/harness/runner.go index 119bb0d..d6c391e 100644 --- a/e2e/harness/runner.go +++ b/e2e/harness/runner.go @@ -485,7 +485,7 @@ func (r *Runner) executeOrchestrate(ctx context.Context, config Config, expectFa if result.Conclusion != "success" { r.t.Logf(" Orchestrate failed with conclusion: %s", result.Conclusion) r.t.Logf(" Workflow logs:\n%s", result.Logs) - return fmt.Errorf("orchestrate workflow failed: %s", result.Error) + return workflowFailureError("orchestrate", result) } // Debug: show what jobs were parsed @@ -626,7 +626,7 @@ func (r *Runner) executePromote(ctx context.Context, promote *PromoteStep, confi if result.Conclusion != "success" { r.t.Logf(" Promote workflow logs:\n%s", result.Logs) - return fmt.Errorf("promote workflow failed: %s", result.Error) + return workflowFailureError("promote", result) } // Sync state from Gitea diff --git a/e2e/harness/transient.go b/e2e/harness/transient.go new file mode 100644 index 0000000..6ab2a98 --- /dev/null +++ b/e2e/harness/transient.go @@ -0,0 +1,40 @@ +package harness + +import ( + "errors" + "fmt" +) + +// errTransientWorkflow is the sentinel that marks a workflow failure as a +// transient act/docker execution hiccup rather than a real outcome. The +// scenario runner retries ONLY on errors that wrap this sentinel; every other +// failure (a real job-level "failure" conclusion, an expect_failure mismatch, a +// state/branch/tag assertion mismatch) is deterministic and must fail +// immediately. +var errTransientWorkflow = errors.New("transient workflow execution failure") + +// IsTransientWorkflowError reports whether err is (or wraps) a transient +// act/docker execution failure that is safe to retry from a clean slate. 
+func IsTransientWorkflowError(err error) bool { + return errors.Is(err, errTransientWorkflow) +} + +// workflowFailureError builds the error returned when a workflow run concluded +// in failure on a non-expect_failure step. When the failure was an act/docker +// execution hiccup (result.ExecError), the error wraps errTransientWorkflow so +// the scenario runner may retry it from a fresh repo and fresh containers. A +// real job-level failure conclusion (ExecError false) yields a plain error that +// is never retried. +// +// This must only be called on a genuine failure path. An expect_failure step +// that legitimately concluded "failure" is the expected outcome and returns nil +// before reaching here, so a transient classification can never mask it. +func workflowFailureError(action string, result *ExtendedWorkflowResult) error { + if result == nil { + return fmt.Errorf("%s workflow failed", action) + } + if result.ExecError { + return fmt.Errorf("%s workflow failed: %s: %w", action, result.Error, errTransientWorkflow) + } + return fmt.Errorf("%s workflow failed: %s", action, result.Error) +} diff --git a/e2e/harness/transient_test.go b/e2e/harness/transient_test.go new file mode 100644 index 0000000..9cd09bf --- /dev/null +++ b/e2e/harness/transient_test.go @@ -0,0 +1,162 @@ +package harness + +import ( + "errors" + "fmt" + "strings" + "testing" +) + +func TestIsTransientWorkflowError(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + err error + want bool + }{ + { + name: "nil error is not transient", + err: nil, + want: false, + }, + { + name: "plain error is not transient", + err: errors.New("expected promote to fail but it succeeded"), + want: false, + }, + { + name: "sentinel directly is transient", + err: errTransientWorkflow, + want: true, + }, + { + name: "wrapped sentinel is transient", + err: fmt.Errorf("promote workflow failed: workflow execution failed: %w", errTransientWorkflow), + want: true, + }, + { + name: "doubly wrapped 
sentinel is transient", + err: fmt.Errorf("step 3 (Promote) failed: %w", + fmt.Errorf("promote workflow failed: %w", errTransientWorkflow)), + want: true, + }, + { + name: "real job failure wrapping a non-sentinel is not transient", + err: fmt.Errorf("promote workflow failed: %s", "build job concluded failure"), + want: false, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + if got := IsTransientWorkflowError(tt.err); got != tt.want { + t.Fatalf("IsTransientWorkflowError(%v) = %v, want %v", tt.err, got, tt.want) + } + }) + } +} + +func TestWorkflowFailureError_TransientClassification(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + result *ExtendedWorkflowResult + wantTransient bool + }{ + { + name: "exec error result is transient", + result: &ExtendedWorkflowResult{Conclusion: "failure", Error: "workflow execution failed", ExecError: true}, + wantTransient: true, + }, + { + name: "real job-level failure is not transient", + result: &ExtendedWorkflowResult{Conclusion: "failure", Error: "build job failed"}, + wantTransient: false, + }, + { + name: "missing-workflow failure is not transient", + result: &ExtendedWorkflowResult{Conclusion: "failure", Error: "act produced no jobs for workflow \"promote.yaml\""}, + wantTransient: false, + }, + { + name: "nil result is not transient", + result: nil, + wantTransient: false, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + err := workflowFailureError("promote", tt.result) + if got := IsTransientWorkflowError(err); got != tt.wantTransient { + t.Fatalf("workflowFailureError transient = %v, want %v (err=%v)", got, tt.wantTransient, err) + } + if !strings.Contains(err.Error(), "promote workflow failed") { + t.Fatalf("error message missing action prefix: %q", err.Error()) + } + }) + } +} + +// TestNormalizeWorkflowResult_ExecErrorTagging verifies that only an act/docker +// exec hiccup (non-zero 
exit) is tagged transient, while a real "no jobs +// parsed" failure (a missing or unloadable workflow) and a successful run are +// not. +func TestNormalizeWorkflowResult_ExecErrorTagging(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + jobs map[string]*JobResultExtended + workflowPath string + exitCode int + wantConclusion string + wantExecError bool + }{ + { + name: "non-zero exit tags transient exec error", + jobs: map[string]*JobResultExtended{}, + workflowPath: ".github/workflows/promote.yaml", + exitCode: 1, + wantConclusion: "failure", + wantExecError: true, + }, + { + name: "zero exit with no jobs is a real failure not transient", + jobs: map[string]*JobResultExtended{}, + workflowPath: ".github/workflows/promote.yaml", + exitCode: 0, + wantConclusion: "failure", + wantExecError: false, + }, + { + name: "zero exit with jobs stays successful and not transient", + jobs: map[string]*JobResultExtended{"build": {Name: "build", Conclusion: "success"}}, + workflowPath: ".github/workflows/promote.yaml", + exitCode: 0, + wantConclusion: "success", + wantExecError: false, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + result := &ExtendedWorkflowResult{Conclusion: "success", Jobs: tt.jobs} + normalizeWorkflowResult(result, tt.workflowPath, tt.exitCode) + if result.Conclusion != tt.wantConclusion { + t.Fatalf("Conclusion = %q, want %q", result.Conclusion, tt.wantConclusion) + } + if result.ExecError != tt.wantExecError { + t.Fatalf("ExecError = %v, want %v", result.ExecError, tt.wantExecError) + } + }) + } +} From cb8573cde0bbbf3cc8d3f2ca793c8efb8096aa6b Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Thu, 11 Jun 2026 20:35:44 -0400 Subject: [PATCH 2/3] fix: retry whole scenarios on transient act/docker execution flake Wrap each multi-step scenario in a bounded retry that re-runs the entire scenario from a fresh gitea repo and fresh act containers when, and only when, a step fails with 
a transient act/docker execution error. Real assertion mismatches, expect_failure mismatches, and genuine job-level failures fail on the first attempt with no retry, so a flake never masks a real regression. Signed-off-by: Joshua Temple --- e2e/e2e_test.go | 21 ++-- e2e/harness/scenario_retry.go | 88 +++++++++++++++++ e2e/harness/scenario_retry_test.go | 153 +++++++++++++++++++++++++++++ 3 files changed, 248 insertions(+), 14 deletions(-) create mode 100644 e2e/harness/scenario_retry.go create mode 100644 e2e/harness/scenario_retry_test.go diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index 09916d7..ce376be 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -29,22 +29,15 @@ func TestMultiStepScenarios(t *testing.T) { t.Run(scenario.Name, func(t *testing.T) { t.Parallel() - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute) + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute) defer cancel() - h := harness.New(t) - defer h.Cleanup() - - err := h.SetupInfra(ctx) - require.NoError(t, err, "failed to setup infrastructure") - - // Stage initial repo with config - err = h.StageRepoFromConfig(ctx, scenario.Config) - require.NoError(t, err, "failed to stage repo") - - // Create runner and execute - runner := harness.NewRunner(t, h) - err = runner.Run(ctx, scenario) + // RunMultiStepScenario runs the whole scenario with a bounded + // scenario-level retry on transient act/docker execution failures. + // Each attempt gets a fresh harness (network, gitea repo, act + // containers), so a retry is a clean slate. Real assertion or + // job-level failures fail deterministically without a retry. 
+ err := harness.RunMultiStepScenario(ctx, t, scenario) require.NoError(t, err, "scenario failed") }) } diff --git a/e2e/harness/scenario_retry.go b/e2e/harness/scenario_retry.go new file mode 100644 index 0000000..f9c7876 --- /dev/null +++ b/e2e/harness/scenario_retry.go @@ -0,0 +1,88 @@ +package harness + +import ( + "context" + "fmt" + "testing" +) + +// scenarioRetryAttempts bounds how many times a single multi-step scenario is +// run end to end. Each attempt runs against a fresh gitea repo and fresh act +// containers, so a retry starts from a clean slate with no partial mutation +// carried over. Only transient act/docker execution failures consume an +// attempt; real assertion or job-level failures fail on the first attempt. +const scenarioRetryAttempts = 3 + +// logger is the minimal logging surface a scenario attempt needs. *testing.T +// satisfies it, and unit tests can supply a fake to assert on retry logging. +type logger interface { + Logf(format string, args ...any) +} + +// runScenarioWithRetry runs attempt up to scenarioRetryAttempts times, retrying +// ONLY when the attempt returns a transient act/docker execution failure +// (errors that wrap errTransientWorkflow). Any non-transient error - a real +// job-level failure, an expect_failure mismatch, or a state/branch/tag +// assertion mismatch - fails immediately without a retry. attempt must perform +// a full clean-slate run (fresh repo + fresh containers) on every call so a +// retry never inherits partial state from a prior attempt. +// +// It returns nil on the first successful attempt, or the last error after the +// attempt budget is exhausted. 
+func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt func(ctx context.Context) error) error {
+	for n := 1; ; n++ {
+		err := attempt(ctx)
+		if err == nil {
+			if n > 1 {
+				log.Logf("scenario %q: passed on attempt %d/%d after transient retry", name, n, scenarioRetryAttempts)
+			}
+			return nil
+		}
+		// Real assertion failures and genuine job-level failures are
+		// deterministic; surface them immediately so they fail the run.
+		if !IsTransientWorkflowError(err) {
+			return err
+		}
+		// A cancelled or expired ctx would doom every further attempt, and the
+		// resulting failure is not a flake; report it without the retryable sentinel.
+		if ctxErr := ctx.Err(); ctxErr != nil {
+			return fmt.Errorf("scenario %q: context done on attempt %d/%d: %w (last failure: %v)", name, n, scenarioRetryAttempts, ctxErr, err)
+		}
+		if n >= scenarioRetryAttempts {
+			log.Logf("scenario %q: exhausted %d attempts; last failure was transient: %v", name, scenarioRetryAttempts, err)
+			return fmt.Errorf("scenario %q failed after %d attempts: %w", name, scenarioRetryAttempts, err)
+		}
+		log.Logf("scenario %q: transient act/docker execution failure on attempt %d/%d, retrying from a clean slate: %v", name, n, scenarioRetryAttempts, err)
+	}
+}
+
+// RunMultiStepScenario runs a whole multi-step scenario with a bounded
+// scenario-level retry on transient act/docker execution failures. Each attempt
+// builds a fresh harness (new docker network, gitea container + repo, act
+// container), stages the repo from the scenario config, runs every step, and
+// tears the harness down - so a retry is a clean slate with no carried-over
+// mutation. This is the safe layer at which to retry: re-running a partial,
+// state-mutating act run in place is not safe, but re-running an entire scenario
+// from scratch is.
+//
+// It retries ONLY transient failures (act/docker could not execute the workflow
+// to a real conclusion). A real job-level failure, an expect_failure mismatch,
+// or any state/branch/tag assertion mismatch fails deterministically on the
+// first attempt.
+func RunMultiStepScenario(ctx context.Context, t *testing.T, scenario *MultiStepScenario) error { + t.Helper() + return runScenarioWithRetry(ctx, t, scenario.Name, func(ctx context.Context) error { + h := New(t) + defer h.Cleanup() + + if err := h.SetupInfra(ctx); err != nil { + return fmt.Errorf("failed to setup infrastructure: %w", err) + } + if err := h.StageRepoFromConfig(ctx, scenario.Config); err != nil { + return fmt.Errorf("failed to stage repo: %w", err) + } + + runner := NewRunner(t, h) + return runner.Run(ctx, scenario) + }) +} diff --git a/e2e/harness/scenario_retry_test.go b/e2e/harness/scenario_retry_test.go new file mode 100644 index 0000000..e146d46 --- /dev/null +++ b/e2e/harness/scenario_retry_test.go @@ -0,0 +1,153 @@ +package harness + +import ( + "context" + "errors" + "fmt" + "strings" + "sync" + "testing" +) + +// fakeLogger captures formatted log lines for assertions. +type fakeLogger struct { + mu sync.Mutex + lines []string +} + +func (f *fakeLogger) Logf(format string, args ...any) { + f.mu.Lock() + defer f.mu.Unlock() + f.lines = append(f.lines, fmt.Sprintf(format, args...)) +} + +func (f *fakeLogger) contains(sub string) bool { + f.mu.Lock() + defer f.mu.Unlock() + for _, l := range f.lines { + if strings.Contains(l, sub) { + return true + } + } + return false +} + +func transientErr() error { + return fmt.Errorf("promote workflow failed: workflow execution failed: %w", errTransientWorkflow) +} + +func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) { + t.Parallel() + + log := &fakeLogger{} + calls := 0 + err := runScenarioWithRetry(context.Background(), log, "Scenario", func(context.Context) error { + calls++ + return nil + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if calls != 1 { + t.Fatalf("attempts = %d, want 1", calls) + } + if log.contains("retry") { + t.Fatalf("did not expect a retry log on first-attempt pass: %v", log.lines) + } +} + +func TestRunScenarioWithRetry_RecoversAfterTransient(t 
*testing.T) { + t.Parallel() + + log := &fakeLogger{} + calls := 0 + err := runScenarioWithRetry(context.Background(), log, "Hotfix_Stacked", func(context.Context) error { + calls++ + if calls < 2 { + return transientErr() + } + return nil + }) + if err != nil { + t.Fatalf("expected recovery, got error: %v", err) + } + if calls != 2 { + t.Fatalf("attempts = %d, want 2", calls) + } + if !log.contains("transient act/docker execution failure on attempt 1") { + t.Fatalf("expected a transient-retry log line, got: %v", log.lines) + } + if !log.contains("passed on attempt 2") { + t.Fatalf("expected a recovery log line, got: %v", log.lines) + } +} + +// TestRunScenarioWithRetry_AssertionFailsImmediately is the critical safety +// guarantee: a real assertion failure (here, an expect_failure mismatch) must +// NOT be retried and must surface on the first attempt. +func TestRunScenarioWithRetry_AssertionFailsImmediately(t *testing.T) { + t.Parallel() + + assertionErr := errors.New("expected promote to fail but it succeeded") + log := &fakeLogger{} + calls := 0 + err := runScenarioWithRetry(context.Background(), log, "Promote_rolls_back", func(context.Context) error { + calls++ + return assertionErr + }) + if !errors.Is(err, assertionErr) { + t.Fatalf("expected the assertion error to surface, got: %v", err) + } + if calls != 1 { + t.Fatalf("assertion failure must not retry: attempts = %d, want 1", calls) + } + if log.contains("retry") { + t.Fatalf("must not log a retry for an assertion failure: %v", log.lines) + } +} + +// TestRunScenarioWithRetry_RealJobFailureNotRetried guards that a genuine +// job-level workflow failure (ExecError false) is treated as deterministic. 
+func TestRunScenarioWithRetry_RealJobFailureNotRetried(t *testing.T) { + t.Parallel() + + realFailure := workflowFailureError("orchestrate", &ExtendedWorkflowResult{ + Conclusion: "failure", + Error: "build job concluded failure", + }) + log := &fakeLogger{} + calls := 0 + err := runScenarioWithRetry(context.Background(), log, "Four_Environment_Cascade_Promotion", func(context.Context) error { + calls++ + return realFailure + }) + if err == nil { + t.Fatal("expected the real failure to surface") + } + if calls != 1 { + t.Fatalf("real job failure must not retry: attempts = %d, want 1", calls) + } +} + +func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T) { + t.Parallel() + + log := &fakeLogger{} + calls := 0 + err := runScenarioWithRetry(context.Background(), log, "Flaky", func(context.Context) error { + calls++ + return transientErr() + }) + if err == nil { + t.Fatal("expected an error after exhausting attempts") + } + if calls != scenarioRetryAttempts { + t.Fatalf("attempts = %d, want %d", calls, scenarioRetryAttempts) + } + if !IsTransientWorkflowError(err) { + t.Fatalf("final error should still wrap the transient sentinel: %v", err) + } + if !strings.Contains(err.Error(), fmt.Sprintf("after %d attempts", scenarioRetryAttempts)) { + t.Fatalf("final error should report the attempt count: %v", err) + } +} From be1f884ed4f0e99d5b2e6c813ead6869a2c294c1 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Thu, 11 Jun 2026 20:48:08 -0400 Subject: [PATCH 3/3] fix: classify act job-level failures as non-transient, not exec errors Signed-off-by: Joshua Temple --- e2e/harness/act.go | 36 ++++++++++++++++++++++++++++++----- e2e/harness/transient_test.go | 16 +++++++++++++++- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/e2e/harness/act.go b/e2e/harness/act.go index 48d6550..6d31c37 100644 --- a/e2e/harness/act.go +++ b/e2e/harness/act.go @@ -384,13 +384,19 @@ func (a *ActRunner) RunWorkflowFromRepo(ctx context.Context, opts 
RunOpts) (*Ext // up as a green-but-empty scenario (#25). func normalizeWorkflowResult(result *ExtendedWorkflowResult, workflowPath string, exitCode int) { if exitCode != 0 { + // ExecError means act could NOT run the workflow to a conclusion: a + // genuine act/docker transport or exec hiccup where no job reached a + // conclusion. It must NOT cover the case where act ran the workflow and + // a job genuinely concluded "failure" - that is a real, deterministic + // defect and retrying it would mask a real failure as transient. + // + // So only tag ExecError when the non-zero exit is unaccompanied by any + // parsed job-level failure. If a job concluded "failure" (or the + // reconciled conclusion is already "failure"), this was a real outcome. + execError := !hasJobFailure(result) result.Conclusion = "failure" result.Error = "workflow execution failed" - // A non-zero act exit is the act/docker execution layer failing to run - // the workflow to a real conclusion (a transport/exec hiccup), as - // opposed to a workflow job genuinely concluding "failure". Mark it - // transient so the scenario runner may retry it from a clean slate. - result.ExecError = true + result.ExecError = execError } if workflowPath != "" && len(result.Jobs) == 0 && result.Conclusion != "failure" { @@ -399,6 +405,26 @@ func normalizeWorkflowResult(result *ExtendedWorkflowResult, workflowPath string } } +// hasJobFailure reports whether act parsed a genuine job-level failure: either +// the reconciled run conclusion is already "failure", or at least one parsed +// job concluded "failure". When true, a non-zero act exit reflects a real +// workflow outcome rather than an act/docker exec hiccup, so it must not be +// classified as a transient ExecError. 
+func hasJobFailure(result *ExtendedWorkflowResult) bool { + if result == nil { + return false + } + if result.Conclusion == "failure" { + return true + } + for _, job := range result.Jobs { + if job != nil && job.Conclusion == "failure" { + return true + } + } + return false +} + // buildActArgs builds additional act command arguments. eventPath is the // in-container event-payload file written by writeEventFile (empty when no // EventJSON was supplied); when set it is appended as the act -e flag. diff --git a/e2e/harness/transient_test.go b/e2e/harness/transient_test.go index 9cd09bf..6484bda 100644 --- a/e2e/harness/transient_test.go +++ b/e2e/harness/transient_test.go @@ -120,13 +120,27 @@ func TestNormalizeWorkflowResult_ExecErrorTagging(t *testing.T) { wantExecError bool }{ { - name: "non-zero exit tags transient exec error", + name: "non-zero exit with no jobs tags transient exec error", jobs: map[string]*JobResultExtended{}, workflowPath: ".github/workflows/promote.yaml", exitCode: 1, wantConclusion: "failure", wantExecError: true, }, + { + // Regression: act exits non-zero when a job genuinely concludes + // "failure". That is a real, deterministic defect, not an + // act/docker transport hiccup, so ExecError must stay false and the + // scenario runner must NOT retry it. + name: "non-zero exit with a failed job is a real failure not transient", + jobs: map[string]*JobResultExtended{ + "build": {Name: "build", Conclusion: "failure"}, + }, + workflowPath: ".github/workflows/promote.yaml", + exitCode: 1, + wantConclusion: "failure", + wantExecError: false, + }, { name: "zero exit with no jobs is a real failure not transient", jobs: map[string]*JobResultExtended{},