stablekernel · joshua-temple · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go
@@ -29,22 +29,15 @@ func TestMultiStepScenarios(t *testing.T) {
 		t.Run(scenario.Name, func(t *testing.T) {
 			t.Parallel()
 
-			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+			ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
 			defer cancel()
 
-			h := harness.New(t)
-			defer h.Cleanup()
-
-			err := h.SetupInfra(ctx)
-			require.NoError(t, err, "failed to setup infrastructure")
-
-			// Stage initial repo with config
-			err = h.StageRepoFromConfig(ctx, scenario.Config)
-			require.NoError(t, err, "failed to stage repo")
-
-			// Create runner and execute
-			runner := harness.NewRunner(t, h)
-			err = runner.Run(ctx, scenario)
+			// RunMultiStepScenario runs the whole scenario with a bounded
+			// scenario-level retry on transient act/docker execution failures.
+			// Each attempt gets a fresh harness (network, gitea repo, act
+			// containers), so a retry is a clean slate. Real assertion or
+			// job-level failures fail deterministically without a retry.
+			err := harness.RunMultiStepScenario(ctx, t, scenario)
 			require.NoError(t, err, "scenario failed")
 		})
 	}

diff --git a/e2e/harness/act.go b/e2e/harness/act.go
@@ -384,8 +384,19 @@ func (a *ActRunner) RunWorkflowFromRepo(ctx context.Context, opts RunOpts) (*Ext
 // up as a green-but-empty scenario (#25).
 func normalizeWorkflowResult(result *ExtendedWorkflowResult, workflowPath string, exitCode int) {
 	if exitCode != 0 {
+		// ExecError means act could NOT run the workflow to a conclusion: a
+		// genuine act/docker transport or exec hiccup where no job reached a
+		// conclusion. It must NOT cover the case where act ran the workflow and
+		// a job genuinely concluded "failure" - that is a real, deterministic
+		// defect and retrying it would mask a real failure as transient.
+		//
+		// So only tag ExecError when the non-zero exit is unaccompanied by any
+		// parsed job-level failure. If a job concluded "failure" (or the
+		// reconciled conclusion is already "failure"), this was a real outcome.
+		execError := !hasJobFailure(result)
 		result.Conclusion = "failure"
 		result.Error = "workflow execution failed"
+		result.ExecError = execError
 	}
 
 	if workflowPath != "" && len(result.Jobs) == 0 && result.Conclusion != "failure" {
@@ -394,6 +405,26 @@ func normalizeWorkflowResult(result *ExtendedWorkflowResult, workflowPath string
 	}
 }
 
+// hasJobFailure reports whether act parsed a genuine job-level failure: either
+// the reconciled run conclusion is already "failure", or at least one parsed
+// job concluded "failure". When true, a non-zero act exit reflects a real
+// workflow outcome rather than an act/docker exec hiccup, so it must not be
+// classified as a transient ExecError.
+func hasJobFailure(result *ExtendedWorkflowResult) bool {
+	if result == nil {
+		return false
+	}
+	if result.Conclusion == "failure" {
+		return true
+	}
+	for _, job := range result.Jobs {
+		if job != nil && job.Conclusion == "failure" {
+			return true
+		}
+	}
+	return false
+}
+
 // buildActArgs builds additional act command arguments. eventPath is the
 // in-container event-payload file written by writeEventFile (empty when no
 // EventJSON was supplied); when set it is appended as the act -e flag.

diff --git a/e2e/harness/hotfix_actions.go b/e2e/harness/hotfix_actions.go
@@ -101,7 +101,7 @@ func (r *Runner) executeHotfixPlan(ctx context.Context, step *HotfixPlanStep) er
 
 	if result.Conclusion != "success" {
 		r.t.Logf("  HotfixPlan workflow logs:\n%s", result.Logs)
-		return fmt.Errorf("hotfix plan workflow failed: %s", result.Error)
+		return workflowFailureError("hotfix plan", result)
 	}
 
 	r.t.Logf("  HotfixPlan: parsed %d jobs", len(result.Jobs))
@@ -397,7 +397,7 @@ func (r *Runner) executeHotfixMerged(ctx context.Context, step *HotfixMergedStep
 
 	if result.Conclusion != "success" {
 		r.t.Logf("  HotfixMerged workflow logs:\n%s", result.Logs)
-		return fmt.Errorf("hotfix merged workflow failed: %s", result.Error)
+		return workflowFailureError("hotfix merged", result)
 	}
 
 	if err := r.syncStateFromGitea(ctx, config); err != nil {

diff --git a/e2e/harness/parser.go b/e2e/harness/parser.go
@@ -29,6 +29,14 @@ type ExtendedWorkflowResult struct {
 	Jobs       map[string]*JobResultExtended
 	Logs       string
 	Error      string
+	// ExecError is true when act itself could not run the workflow to a real
+	// conclusion: the act invocation exited non-zero (a docker-exec or act
+	// transport hiccup) rather than a workflow job genuinely concluding
+	// "failure". It distinguishes a transient infrastructure flake (safe to
+	// retry from a clean slate) from a real job-level failure or an assertion
+	// mismatch (which must fail deterministically). A run that parsed real job
+	// events and concluded "failure" leaves ExecError false.
+	ExecError bool
 }
 
 // JobResultExtended contains detailed result of a single job

diff --git a/e2e/harness/runner.go b/e2e/harness/runner.go
@@ -485,7 +485,7 @@ func (r *Runner) executeOrchestrate(ctx context.Context, config Config, expectFa
 	if result.Conclusion != "success" {
 		r.t.Logf("  Orchestrate failed with conclusion: %s", result.Conclusion)
 		r.t.Logf("  Workflow logs:\n%s", result.Logs)
-		return fmt.Errorf("orchestrate workflow failed: %s", result.Error)
+		return workflowFailureError("orchestrate", result)
 	}
 
 	// Debug: show what jobs were parsed
@@ -626,7 +626,7 @@ func (r *Runner) executePromote(ctx context.Context, promote *PromoteStep, confi
 
 	if result.Conclusion != "success" {
 		r.t.Logf("  Promote workflow logs:\n%s", result.Logs)
-		return fmt.Errorf("promote workflow failed: %s", result.Error)
+		return workflowFailureError("promote", result)
 	}
 
 	// Sync state from Gitea

diff --git a/e2e/harness/scenario_retry.go b/e2e/harness/scenario_retry.go
@@ -0,0 +1,88 @@
+package harness
+
+import (
+	"context"
+	"fmt"
+	"testing"
+)
+
+// scenarioRetryAttempts is the maximum number of end-to-end attempts a single
+// multi-step scenario gets, counting the first run. RunMultiStepScenario builds
+// a fresh harness for every attempt, so a retry carries over no partial
+// mutation. Only a transient act/docker execution failure leads to another
+// attempt; a real assertion or job-level failure ends the run on the spot.
+const scenarioRetryAttempts = 3
+
+// logger is the minimal logging surface the retry loop needs. *testing.T
+// satisfies it, and unit tests can pass a fake that records what was logged.
+type logger interface {
+	Logf(format string, args ...any)
+}
+
+// runScenarioWithRetry runs attempt up to scenarioRetryAttempts times, retrying
+// ONLY when the attempt returns a transient act/docker execution failure (one
+// for which IsTransientWorkflowError reports true). Any other error - a real
+// job-level failure, an expect_failure mismatch, or a state/branch/tag
+// assertion mismatch - is returned as-is without a retry. attempt must perform
+// a full clean-slate run on every call so a retry never inherits partial state.
+// No retry is started once ctx is done, because a new attempt could not finish.
+//
+// It returns nil on the first success, else the last transient error wrapped.
+func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt func(ctx context.Context) error) error {
+	var lastErr error
+	for n := 1; n <= scenarioRetryAttempts; n++ {
+		err := attempt(ctx)
+		if err == nil {
+			if n > 1 {
+				log.Logf("scenario %q: passed on attempt %d/%d after transient retry", name, n, scenarioRetryAttempts)
+			}
+			return nil
+		}
+		// Deterministic failures (assertion or job-level) surface immediately.
+		if !IsTransientWorkflowError(err) {
+			return err
+		}
+		lastErr = err
+		if n == scenarioRetryAttempts {
+			log.Logf("scenario %q: exhausted %d attempts; last failure was transient: %v", name, scenarioRetryAttempts, err)
+			break
+		}
+		// A cancelled or expired ctx dooms every further attempt; stop here.
+		if ctxErr := ctx.Err(); ctxErr != nil {
+			return fmt.Errorf("scenario %q: not retrying after attempt %d/%d, context done (%v): %w", name, n, scenarioRetryAttempts, ctxErr, err)
+		}
+		log.Logf("scenario %q: transient act/docker execution failure on attempt %d/%d, retrying from a clean slate: %v", name, n, scenarioRetryAttempts, err)
+	}
+	return fmt.Errorf("scenario %q failed after %d attempts: %w", name, scenarioRetryAttempts, lastErr)
+}
+
+// RunMultiStepScenario executes an entire multi-step scenario, re-running it
+// from scratch (a bounded number of times) when an attempt fails with a
+// transient act/docker execution failure. Every attempt creates its own
+// harness, sets up the infrastructure, stages the repo from scenario.Config,
+// runs all steps through a new Runner, and cleans the harness up again when the
+// attempt returns - so a retry starts from a clean slate. Retrying at this
+// level is the safe choice: re-running a partial, state-mutating act run in
+// place is not safe, whereas repeating the whole scenario is.
+//
+// Only transient failures are retried. A real job-level failure, an
+// expect_failure mismatch, or any state/branch/tag assertion mismatch fails
+// deterministically on the first attempt.
+func RunMultiStepScenario(ctx context.Context, t *testing.T, scenario *MultiStepScenario) error {
+	t.Helper()
+	oneAttempt := func(attemptCtx context.Context) error {
+		env := New(t)
+		defer env.Cleanup()
+
+		err := env.SetupInfra(attemptCtx)
+		if err != nil {
+			return fmt.Errorf("failed to setup infrastructure: %w", err)
+		}
+		err = env.StageRepoFromConfig(attemptCtx, scenario.Config)
+		if err != nil {
+			return fmt.Errorf("failed to stage repo: %w", err)
+		}
+		return NewRunner(t, env).Run(attemptCtx, scenario)
+	}
+	return runScenarioWithRetry(ctx, t, scenario.Name, oneAttempt)
+}
diff --git a/e2e/harness/scenario_retry_test.go b/e2e/harness/scenario_retry_test.go
@@ -0,0 +1,153 @@
+package harness
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strings"
+	"sync"
+	"testing"
+)
+
+// fakeLogger records every formatted log line so tests can assert on them.
+type fakeLogger struct {
+	mu    sync.Mutex // guards lines
+	lines []string   // one formatted message per Logf call, in call order
+}
+
+func (f *fakeLogger) Logf(format string, args ...any) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.lines = append(f.lines, fmt.Sprintf(format, args...))
+}
+
+func (f *fakeLogger) contains(sub string) bool {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	for _, l := range f.lines {
+		if strings.Contains(l, sub) {
+			return true
+		}
+	}
+	return false
+}
+
+func transientErr() error { // stand-in failure that wraps errTransientWorkflow
+	return fmt.Errorf("promote workflow failed: workflow execution failed: %w", errTransientWorkflow)
+}
+
+// TestRunScenarioWithRetry_PassesFirstAttempt expects one call and no retry log.
+func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) {
+	t.Parallel()
+	log := &fakeLogger{}
+	calls := 0
+	err := runScenarioWithRetry(context.Background(), log, "Scenario", func(context.Context) error {
+		calls++
+		return nil
+	})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if calls != 1 {
+		t.Fatalf("attempts = %d, want 1", calls)
+	}
+	if log.contains("retry") {
+		t.Fatalf("did not expect a retry log on first-attempt pass: %v", log.lines)
+	}
+}
+
+// TestRunScenarioWithRetry_RecoversAfterTransient expects a pass on attempt 2.
+func TestRunScenarioWithRetry_RecoversAfterTransient(t *testing.T) {
+	t.Parallel()
+	log := &fakeLogger{}
+	calls := 0
+	err := runScenarioWithRetry(context.Background(), log, "Hotfix_Stacked", func(context.Context) error {
+		calls++
+		if calls < 2 {
+			return transientErr()
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatalf("expected recovery, got error: %v", err)
+	}
+	if calls != 2 {
+		t.Fatalf("attempts = %d, want 2", calls)
+	}
+	if !log.contains("transient act/docker execution failure on attempt 1") {
+		t.Fatalf("expected a transient-retry log line, got: %v", log.lines)
+	}
+	if !log.contains("passed on attempt 2") {
+		t.Fatalf("expected a recovery log line, got: %v", log.lines)
+	}
+}
+
+// TestRunScenarioWithRetry_AssertionFailsImmediately is the critical safety
+// guarantee: an error that is not transient (here, an expect_failure mismatch)
+// must surface unchanged after a single attempt, with no retry logged.
+func TestRunScenarioWithRetry_AssertionFailsImmediately(t *testing.T) {
+	t.Parallel()
+
+	assertionErr := errors.New("expected promote to fail but it succeeded")
+	log := &fakeLogger{}
+	calls := 0
+	err := runScenarioWithRetry(context.Background(), log, "Promote_rolls_back", func(context.Context) error {
+		calls++
+		return assertionErr
+	})
+	if !errors.Is(err, assertionErr) {
+		t.Fatalf("expected the assertion error to surface, got: %v", err)
+	}
+	if calls != 1 {
+		t.Fatalf("assertion failure must not retry: attempts = %d, want 1", calls)
+	}
+	if log.contains("retry") {
+		t.Fatalf("must not log a retry for an assertion failure: %v", log.lines)
+	}
+}
+
+// TestRunScenarioWithRetry_RealJobFailureNotRetried guards that an error built
+// by workflowFailureError for a result with ExecError unset is never retried.
+func TestRunScenarioWithRetry_RealJobFailureNotRetried(t *testing.T) {
+	t.Parallel()
+
+	realFailure := workflowFailureError("orchestrate", &ExtendedWorkflowResult{
+		Conclusion: "failure",
+		Error:      "build job concluded failure",
+	})
+	log := &fakeLogger{}
+	calls := 0
+	err := runScenarioWithRetry(context.Background(), log, "Four_Environment_Cascade_Promotion", func(context.Context) error {
+		calls++
+		return realFailure
+	})
+	if err == nil {
+		t.Fatal("expected the real failure to surface")
+	}
+	if calls != 1 {
+		t.Fatalf("real job failure must not retry: attempts = %d, want 1", calls)
+	}
+}
+
+// TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient uses every attempt.
+func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T) {
+	t.Parallel()
+	log := &fakeLogger{}
+	calls := 0
+	err := runScenarioWithRetry(context.Background(), log, "Flaky", func(context.Context) error {
+		calls++
+		return transientErr()
+	})
+	if err == nil {
+		t.Fatal("expected an error after exhausting attempts")
+	}
+	if calls != scenarioRetryAttempts {
+		t.Fatalf("attempts = %d, want %d", calls, scenarioRetryAttempts)
+	}
+	if !IsTransientWorkflowError(err) {
+		t.Fatalf("final error should still wrap the transient sentinel: %v", err)
+	}
+	if !strings.Contains(err.Error(), fmt.Sprintf("after %d attempts", scenarioRetryAttempts)) {
+		t.Fatalf("final error should report the attempt count: %v", err)
+	}
+}