From 7f2c6ea375ee9df5a14e2ae67c45a53249912009 Mon Sep 17 00:00:00 2001
From: Joshua Temple <joshua.temple@stablekernel.com>
Date: Thu, 11 Jun 2026 20:35:38 -0400
Subject: [PATCH 1/3] test: classify transient act-exec workflow failures via
 sentinel error

Tag a non-zero act exit as an execution-layer hiccup (ExecError) distinct from a real job-level failure conclusion, and return a sentinel-wrapped error from the orchestrate/promote/hotfix failure paths so callers can tell a retryable infrastructure flake from a deterministic outcome.

Signed-off-by: Joshua Temple <joshua.temple@stablekernel.com>
---
 e2e/harness/act.go            |   5 ++
 e2e/harness/hotfix_actions.go |   4 +-
 e2e/harness/parser.go         |   8 ++
 e2e/harness/runner.go         |   4 +-
 e2e/harness/transient.go      |  40 +++++++++
 e2e/harness/transient_test.go | 162 ++++++++++++++++++++++++++++++++++
 6 files changed, 219 insertions(+), 4 deletions(-)
 create mode 100644 e2e/harness/transient.go
 create mode 100644 e2e/harness/transient_test.go

diff --git a/e2e/harness/act.go b/e2e/harness/act.go
index 27c9a68..48d6550 100644
--- a/e2e/harness/act.go
+++ b/e2e/harness/act.go
@@ -386,6 +386,11 @@ func normalizeWorkflowResult(result *ExtendedWorkflowResult, workflowPath string
 	if exitCode != 0 {
 		result.Conclusion = "failure"
 		result.Error = "workflow execution failed"
+		// A non-zero act exit is the act/docker execution layer failing to run
+		// the workflow to a real conclusion (a transport/exec hiccup), as
+		// opposed to a workflow job genuinely concluding "failure". Mark it
+		// transient so the scenario runner may retry it from a clean slate.
+		result.ExecError = true
 	}
 
 	if workflowPath != "" && len(result.Jobs) == 0 && result.Conclusion != "failure" {
diff --git a/e2e/harness/hotfix_actions.go b/e2e/harness/hotfix_actions.go
index 5b51fa0..91a8fe2 100644
--- a/e2e/harness/hotfix_actions.go
+++ b/e2e/harness/hotfix_actions.go
@@ -101,7 +101,7 @@ func (r *Runner) executeHotfixPlan(ctx context.Context, step *HotfixPlanStep) er
 
 	if result.Conclusion != "success" {
 		r.t.Logf("  HotfixPlan workflow logs:\n%s", result.Logs)
-		return fmt.Errorf("hotfix plan workflow failed: %s", result.Error)
+		return workflowFailureError("hotfix plan", result)
 	}
 
 	r.t.Logf("  HotfixPlan: parsed %d jobs", len(result.Jobs))
@@ -397,7 +397,7 @@ func (r *Runner) executeHotfixMerged(ctx context.Context, step *HotfixMergedStep
 
 	if result.Conclusion != "success" {
 		r.t.Logf("  HotfixMerged workflow logs:\n%s", result.Logs)
-		return fmt.Errorf("hotfix merged workflow failed: %s", result.Error)
+		return workflowFailureError("hotfix merged", result)
 	}
 
 	if err := r.syncStateFromGitea(ctx, config); err != nil {
diff --git a/e2e/harness/parser.go b/e2e/harness/parser.go
index 79cbfe2..7553b9e 100644
--- a/e2e/harness/parser.go
+++ b/e2e/harness/parser.go
@@ -29,6 +29,14 @@ type ExtendedWorkflowResult struct {
 	Jobs       map[string]*JobResultExtended
 	Logs       string
 	Error      string
+	// ExecError is true when act itself could not run the workflow to a real
+	// conclusion: the act invocation exited non-zero (a docker-exec or act
+	// transport hiccup) rather than a workflow job genuinely concluding
+	// "failure". It distinguishes a transient infrastructure flake (safe to
+	// retry from a clean slate) from a real job-level failure or an assertion
+	// mismatch (which must fail deterministically). A run that parsed real job
+	// events and concluded "failure" leaves ExecError false.
+	ExecError bool
 }
 
 // JobResultExtended contains detailed result of a single job
diff --git a/e2e/harness/runner.go b/e2e/harness/runner.go
index 119bb0d..d6c391e 100644
--- a/e2e/harness/runner.go
+++ b/e2e/harness/runner.go
@@ -485,7 +485,7 @@ func (r *Runner) executeOrchestrate(ctx context.Context, config Config, expectFa
 	if result.Conclusion != "success" {
 		r.t.Logf("  Orchestrate failed with conclusion: %s", result.Conclusion)
 		r.t.Logf("  Workflow logs:\n%s", result.Logs)
-		return fmt.Errorf("orchestrate workflow failed: %s", result.Error)
+		return workflowFailureError("orchestrate", result)
 	}
 
 	// Debug: show what jobs were parsed
@@ -626,7 +626,7 @@ func (r *Runner) executePromote(ctx context.Context, promote *PromoteStep, confi
 
 	if result.Conclusion != "success" {
 		r.t.Logf("  Promote workflow logs:\n%s", result.Logs)
-		return fmt.Errorf("promote workflow failed: %s", result.Error)
+		return workflowFailureError("promote", result)
 	}
 
 	// Sync state from Gitea
diff --git a/e2e/harness/transient.go b/e2e/harness/transient.go
new file mode 100644
index 0000000..6ab2a98
--- /dev/null
+++ b/e2e/harness/transient.go
@@ -0,0 +1,40 @@
+package harness
+
+import (
+	"errors"
+	"fmt"
+)
+
+// errTransientWorkflow is the sentinel that marks a workflow failure as a
+// transient act/docker execution hiccup rather than a real outcome. The
+// scenario runner retries ONLY on errors that wrap this sentinel; every other
+// failure (a real job-level "failure" conclusion, an expect_failure mismatch, a
+// state/branch/tag assertion mismatch) is deterministic and must fail
+// immediately.
+var errTransientWorkflow = errors.New("transient workflow execution failure")
+
+// IsTransientWorkflowError reports whether err is (or wraps) a transient
+// act/docker execution failure that is safe to retry from a clean slate.
+func IsTransientWorkflowError(err error) bool {
+	return errors.Is(err, errTransientWorkflow)
+}
+
+// workflowFailureError builds the error returned when a workflow run concluded
+// in failure on a non-expect_failure step. When the failure was an act/docker
+// execution hiccup (result.ExecError), the error wraps errTransientWorkflow so
+// the scenario runner may retry it from a fresh repo and fresh containers. A
+// real job-level failure conclusion (ExecError false) yields a plain error that
+// is never retried.
+//
+// This must only be called on a genuine failure path. An expect_failure step
+// that legitimately concluded "failure" is the expected outcome and returns nil
+// before reaching here, so a transient classification can never mask it.
+func workflowFailureError(action string, result *ExtendedWorkflowResult) error {
+	if result == nil {
+		return fmt.Errorf("%s workflow failed", action)
+	}
+	if result.ExecError {
+		return fmt.Errorf("%s workflow failed: %s: %w", action, result.Error, errTransientWorkflow)
+	}
+	return fmt.Errorf("%s workflow failed: %s", action, result.Error)
+}
diff --git a/e2e/harness/transient_test.go b/e2e/harness/transient_test.go
new file mode 100644
index 0000000..9cd09bf
--- /dev/null
+++ b/e2e/harness/transient_test.go
@@ -0,0 +1,162 @@
+package harness
+
+import (
+	"errors"
+	"fmt"
+	"strings"
+	"testing"
+)
+
+func TestIsTransientWorkflowError(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name string
+		err  error
+		want bool
+	}{
+		{
+			name: "nil error is not transient",
+			err:  nil,
+			want: false,
+		},
+		{
+			name: "plain error is not transient",
+			err:  errors.New("expected promote to fail but it succeeded"),
+			want: false,
+		},
+		{
+			name: "sentinel directly is transient",
+			err:  errTransientWorkflow,
+			want: true,
+		},
+		{
+			name: "wrapped sentinel is transient",
+			err:  fmt.Errorf("promote workflow failed: workflow execution failed: %w", errTransientWorkflow),
+			want: true,
+		},
+		{
+			name: "doubly wrapped sentinel is transient",
+			err: fmt.Errorf("step 3 (Promote) failed: %w",
+				fmt.Errorf("promote workflow failed: %w", errTransientWorkflow)),
+			want: true,
+		},
+		{
+			name: "real job failure wrapping a non-sentinel is not transient",
+			err:  fmt.Errorf("promote workflow failed: %s", "build job concluded failure"),
+			want: false,
+		},
+	}
+
+	for _, tt := range tests {
+		tt := tt
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			if got := IsTransientWorkflowError(tt.err); got != tt.want {
+				t.Fatalf("IsTransientWorkflowError(%v) = %v, want %v", tt.err, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestWorkflowFailureError_TransientClassification(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name          string
+		result        *ExtendedWorkflowResult
+		wantTransient bool
+	}{
+		{
+			name:          "exec error result is transient",
+			result:        &ExtendedWorkflowResult{Conclusion: "failure", Error: "workflow execution failed", ExecError: true},
+			wantTransient: true,
+		},
+		{
+			name:          "real job-level failure is not transient",
+			result:        &ExtendedWorkflowResult{Conclusion: "failure", Error: "build job failed"},
+			wantTransient: false,
+		},
+		{
+			name:          "missing-workflow failure is not transient",
+			result:        &ExtendedWorkflowResult{Conclusion: "failure", Error: "act produced no jobs for workflow \"promote.yaml\""},
+			wantTransient: false,
+		},
+		{
+			name:          "nil result is not transient",
+			result:        nil,
+			wantTransient: false,
+		},
+	}
+
+	for _, tt := range tests {
+		tt := tt
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			err := workflowFailureError("promote", tt.result)
+			if got := IsTransientWorkflowError(err); got != tt.wantTransient {
+				t.Fatalf("workflowFailureError transient = %v, want %v (err=%v)", got, tt.wantTransient, err)
+			}
+			if !strings.Contains(err.Error(), "promote workflow failed") {
+				t.Fatalf("error message missing action prefix: %q", err.Error())
+			}
+		})
+	}
+}
+
+// TestNormalizeWorkflowResult_ExecErrorTagging verifies that only an act/docker
+// exec hiccup (non-zero exit) is tagged transient, while a real "no jobs
+// parsed" failure (a missing or unloadable workflow) and a successful run are
+// not.
+func TestNormalizeWorkflowResult_ExecErrorTagging(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name           string
+		jobs           map[string]*JobResultExtended
+		workflowPath   string
+		exitCode       int
+		wantConclusion string
+		wantExecError  bool
+	}{
+		{
+			name:           "non-zero exit tags transient exec error",
+			jobs:           map[string]*JobResultExtended{},
+			workflowPath:   ".github/workflows/promote.yaml",
+			exitCode:       1,
+			wantConclusion: "failure",
+			wantExecError:  true,
+		},
+		{
+			name:           "zero exit with no jobs is a real failure not transient",
+			jobs:           map[string]*JobResultExtended{},
+			workflowPath:   ".github/workflows/promote.yaml",
+			exitCode:       0,
+			wantConclusion: "failure",
+			wantExecError:  false,
+		},
+		{
+			name:           "zero exit with jobs stays successful and not transient",
+			jobs:           map[string]*JobResultExtended{"build": {Name: "build", Conclusion: "success"}},
+			workflowPath:   ".github/workflows/promote.yaml",
+			exitCode:       0,
+			wantConclusion: "success",
+			wantExecError:  false,
+		},
+	}
+
+	for _, tt := range tests {
+		tt := tt
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			result := &ExtendedWorkflowResult{Conclusion: "success", Jobs: tt.jobs}
+			normalizeWorkflowResult(result, tt.workflowPath, tt.exitCode)
+			if result.Conclusion != tt.wantConclusion {
+				t.Fatalf("Conclusion = %q, want %q", result.Conclusion, tt.wantConclusion)
+			}
+			if result.ExecError != tt.wantExecError {
+				t.Fatalf("ExecError = %v, want %v", result.ExecError, tt.wantExecError)
+			}
+		})
+	}
+}

From cb8573cde0bbbf3cc8d3f2ca793c8efb8096aa6b Mon Sep 17 00:00:00 2001
From: Joshua Temple <joshua.temple@stablekernel.com>
Date: Thu, 11 Jun 2026 20:35:44 -0400
Subject: [PATCH 2/3] fix: retry whole scenarios on transient act/docker
 execution flake

Wrap each multi-step scenario in a bounded retry that re-runs the entire scenario from a fresh gitea repo and fresh act containers when, and only when, a step fails with a transient act/docker execution error. Real assertion mismatches, expect_failure mismatches, and genuine job-level failures fail on the first attempt with no retry, so a flake never masks a real regression.

Signed-off-by: Joshua Temple <joshua.temple@stablekernel.com>
---
 e2e/e2e_test.go                    |  21 ++--
 e2e/harness/scenario_retry.go      |  88 +++++++++++++++++
 e2e/harness/scenario_retry_test.go | 153 +++++++++++++++++++++++++++++
 3 files changed, 248 insertions(+), 14 deletions(-)
 create mode 100644 e2e/harness/scenario_retry.go
 create mode 100644 e2e/harness/scenario_retry_test.go

diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go
index 09916d7..ce376be 100644
--- a/e2e/e2e_test.go
+++ b/e2e/e2e_test.go
@@ -29,22 +29,15 @@ func TestMultiStepScenarios(t *testing.T) {
 		t.Run(scenario.Name, func(t *testing.T) {
 			t.Parallel()
 
-			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
+			ctx, cancel := context.WithTimeout(context.Background(), 30*time.Minute)
 			defer cancel()
 
-			h := harness.New(t)
-			defer h.Cleanup()
-
-			err := h.SetupInfra(ctx)
-			require.NoError(t, err, "failed to setup infrastructure")
-
-			// Stage initial repo with config
-			err = h.StageRepoFromConfig(ctx, scenario.Config)
-			require.NoError(t, err, "failed to stage repo")
-
-			// Create runner and execute
-			runner := harness.NewRunner(t, h)
-			err = runner.Run(ctx, scenario)
+			// RunMultiStepScenario runs the whole scenario with a bounded
+			// scenario-level retry on transient act/docker execution failures.
+			// Each attempt gets a fresh harness (network, gitea repo, act
+			// containers), so a retry is a clean slate. Real assertion or
+			// job-level failures fail deterministically without a retry.
+			err := harness.RunMultiStepScenario(ctx, t, scenario)
 			require.NoError(t, err, "scenario failed")
 		})
 	}
diff --git a/e2e/harness/scenario_retry.go b/e2e/harness/scenario_retry.go
new file mode 100644
index 0000000..f9c7876
--- /dev/null
+++ b/e2e/harness/scenario_retry.go
@@ -0,0 +1,88 @@
+package harness
+
+import (
+	"context"
+	"fmt"
+	"testing"
+)
+
+// scenarioRetryAttempts bounds how many times a single multi-step scenario is
+// run end to end. Each attempt runs against a fresh gitea repo and fresh act
+// containers, so a retry starts from a clean slate with no partial mutation
+// carried over. Only a transient act/docker execution failure triggers another
+// attempt; real assertion or job-level failures fail on the first attempt.
+const scenarioRetryAttempts = 3
+
+// logger is the minimal logging surface a scenario attempt needs. *testing.T
+// satisfies it, and unit tests can supply a fake to assert on retry logging.
+type logger interface {
+	Logf(format string, args ...any)
+}
+
+// runScenarioWithRetry runs attempt up to scenarioRetryAttempts times, retrying
+// ONLY when the attempt returns a transient act/docker execution failure
+// (errors that wrap errTransientWorkflow). Any non-transient error - a real
+// job-level failure, an expect_failure mismatch, or a state/branch/tag
+// assertion mismatch - fails immediately without a retry. attempt must perform
+// a full clean-slate run (fresh repo + fresh containers) on every call so a
+// retry never inherits partial state from a prior attempt.
+//
+// It returns nil on the first successful attempt, or the last transient error
+// (wrapped with the attempt count) after the attempt budget is exhausted.
+func runScenarioWithRetry(ctx context.Context, log logger, name string, attempt func(ctx context.Context) error) error {
+	var lastErr error
+	for n := 1; n <= scenarioRetryAttempts; n++ {
+		err := attempt(ctx)
+		if err == nil {
+			if n > 1 {
+				log.Logf("scenario %q: passed on attempt %d/%d after transient retry", name, n, scenarioRetryAttempts)
+			}
+			return nil
+		}
+		// Real assertion failures and genuine job-level failures are
+		// deterministic; surface them immediately so they fail the run.
+		if !IsTransientWorkflowError(err) {
+			return err
+		}
+		lastErr = err
+		if n < scenarioRetryAttempts {
+			log.Logf("scenario %q: transient act/docker execution failure on attempt %d/%d, retrying from a clean slate: %v",
+				name, n, scenarioRetryAttempts, err)
+			continue
+		}
+		log.Logf("scenario %q: exhausted %d attempts; last failure was transient: %v",
+			name, scenarioRetryAttempts, err)
+	}
+	return fmt.Errorf("scenario %q failed after %d attempts: %w", name, scenarioRetryAttempts, lastErr)
+}
+
+// RunMultiStepScenario runs a whole multi-step scenario with a bounded
+// scenario-level retry on transient act/docker execution failures. Each attempt
+// builds a fresh harness (new docker network, gitea container + repo, act
+// container), stages the repo from the scenario config, runs every step, and
+// tears the harness down - so a retry is a clean slate with no carried-over
+// mutation. This is the safe layer at which to retry: re-running a partial,
+// state-mutating act run in place is not safe, but re-running an entire scenario
+// from scratch is.
+//
+// It retries ONLY transient failures (act/docker could not execute the workflow
+// to a real conclusion). A real job-level failure, an expect_failure mismatch,
+// or any state/branch/tag assertion mismatch fails deterministically on the
+// first attempt.
+func RunMultiStepScenario(ctx context.Context, t *testing.T, scenario *MultiStepScenario) error {
+	t.Helper()
+	return runScenarioWithRetry(ctx, t, scenario.Name, func(ctx context.Context) error {
+		h := New(t)
+		defer h.Cleanup()
+
+		if err := h.SetupInfra(ctx); err != nil {
+			return fmt.Errorf("failed to setup infrastructure: %w", err)
+		}
+		if err := h.StageRepoFromConfig(ctx, scenario.Config); err != nil {
+			return fmt.Errorf("failed to stage repo: %w", err)
+		}
+
+		runner := NewRunner(t, h)
+		return runner.Run(ctx, scenario)
+	})
+}
diff --git a/e2e/harness/scenario_retry_test.go b/e2e/harness/scenario_retry_test.go
new file mode 100644
index 0000000..e146d46
--- /dev/null
+++ b/e2e/harness/scenario_retry_test.go
@@ -0,0 +1,153 @@
+package harness
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strings"
+	"sync"
+	"testing"
+)
+
+// fakeLogger captures formatted log lines for assertions.
+type fakeLogger struct {
+	mu    sync.Mutex
+	lines []string
+}
+
+func (f *fakeLogger) Logf(format string, args ...any) {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	f.lines = append(f.lines, fmt.Sprintf(format, args...))
+}
+
+func (f *fakeLogger) contains(sub string) bool {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	for _, l := range f.lines {
+		if strings.Contains(l, sub) {
+			return true
+		}
+	}
+	return false
+}
+
+func transientErr() error {
+	return fmt.Errorf("promote workflow failed: workflow execution failed: %w", errTransientWorkflow)
+}
+
+func TestRunScenarioWithRetry_PassesFirstAttempt(t *testing.T) {
+	t.Parallel()
+
+	log := &fakeLogger{}
+	calls := 0
+	err := runScenarioWithRetry(context.Background(), log, "Scenario", func(context.Context) error {
+		calls++
+		return nil
+	})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if calls != 1 {
+		t.Fatalf("attempts = %d, want 1", calls)
+	}
+	if log.contains("retry") {
+		t.Fatalf("did not expect a retry log on first-attempt pass: %v", log.lines)
+	}
+}
+
+func TestRunScenarioWithRetry_RecoversAfterTransient(t *testing.T) {
+	t.Parallel()
+
+	log := &fakeLogger{}
+	calls := 0
+	err := runScenarioWithRetry(context.Background(), log, "Hotfix_Stacked", func(context.Context) error {
+		calls++
+		if calls < 2 {
+			return transientErr()
+		}
+		return nil
+	})
+	if err != nil {
+		t.Fatalf("expected recovery, got error: %v", err)
+	}
+	if calls != 2 {
+		t.Fatalf("attempts = %d, want 2", calls)
+	}
+	if !log.contains("transient act/docker execution failure on attempt 1") {
+		t.Fatalf("expected a transient-retry log line, got: %v", log.lines)
+	}
+	if !log.contains("passed on attempt 2") {
+		t.Fatalf("expected a recovery log line, got: %v", log.lines)
+	}
+}
+
+// TestRunScenarioWithRetry_AssertionFailsImmediately is the critical safety
+// guarantee: a real assertion failure (here, an expect_failure mismatch) must
+// NOT be retried and must surface on the first attempt.
+func TestRunScenarioWithRetry_AssertionFailsImmediately(t *testing.T) {
+	t.Parallel()
+
+	assertionErr := errors.New("expected promote to fail but it succeeded")
+	log := &fakeLogger{}
+	calls := 0
+	err := runScenarioWithRetry(context.Background(), log, "Promote_rolls_back", func(context.Context) error {
+		calls++
+		return assertionErr
+	})
+	if !errors.Is(err, assertionErr) {
+		t.Fatalf("expected the assertion error to surface, got: %v", err)
+	}
+	if calls != 1 {
+		t.Fatalf("assertion failure must not retry: attempts = %d, want 1", calls)
+	}
+	if log.contains("retry") {
+		t.Fatalf("must not log a retry for an assertion failure: %v", log.lines)
+	}
+}
+
+// TestRunScenarioWithRetry_RealJobFailureNotRetried guards that a genuine
+// job-level workflow failure (ExecError false) is treated as deterministic.
+func TestRunScenarioWithRetry_RealJobFailureNotRetried(t *testing.T) {
+	t.Parallel()
+
+	realFailure := workflowFailureError("orchestrate", &ExtendedWorkflowResult{
+		Conclusion: "failure",
+		Error:      "build job concluded failure",
+	})
+	log := &fakeLogger{}
+	calls := 0
+	err := runScenarioWithRetry(context.Background(), log, "Four_Environment_Cascade_Promotion", func(context.Context) error {
+		calls++
+		return realFailure
+	})
+	if err == nil {
+		t.Fatal("expected the real failure to surface")
+	}
+	if calls != 1 {
+		t.Fatalf("real job failure must not retry: attempts = %d, want 1", calls)
+	}
+}
+
+func TestRunScenarioWithRetry_ExhaustsAttemptsOnPersistentTransient(t *testing.T) {
+	t.Parallel()
+
+	log := &fakeLogger{}
+	calls := 0
+	err := runScenarioWithRetry(context.Background(), log, "Flaky", func(context.Context) error {
+		calls++
+		return transientErr()
+	})
+	if err == nil {
+		t.Fatal("expected an error after exhausting attempts")
+	}
+	if calls != scenarioRetryAttempts {
+		t.Fatalf("attempts = %d, want %d", calls, scenarioRetryAttempts)
+	}
+	if !IsTransientWorkflowError(err) {
+		t.Fatalf("final error should still wrap the transient sentinel: %v", err)
+	}
+	if !strings.Contains(err.Error(), fmt.Sprintf("after %d attempts", scenarioRetryAttempts)) {
+		t.Fatalf("final error should report the attempt count: %v", err)
+	}
+}

From be1f884ed4f0e99d5b2e6c813ead6869a2c294c1 Mon Sep 17 00:00:00 2001
From: Joshua Temple <joshua.temple@stablekernel.com>
Date: Thu, 11 Jun 2026 20:48:08 -0400
Subject: [PATCH 3/3] fix: classify act job-level failures as non-transient,
 not exec errors

Signed-off-by: Joshua Temple <joshua.temple@stablekernel.com>
---
 e2e/harness/act.go            | 36 ++++++++++++++++++++++++++++++-----
 e2e/harness/transient_test.go | 16 +++++++++++++++-
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/e2e/harness/act.go b/e2e/harness/act.go
index 48d6550..6d31c37 100644
--- a/e2e/harness/act.go
+++ b/e2e/harness/act.go
@@ -384,13 +384,19 @@ func (a *ActRunner) RunWorkflowFromRepo(ctx context.Context, opts RunOpts) (*Ext
 // up as a green-but-empty scenario (#25).
 func normalizeWorkflowResult(result *ExtendedWorkflowResult, workflowPath string, exitCode int) {
 	if exitCode != 0 {
+		// ExecError means act could NOT run the workflow to a conclusion: a
+		// genuine act/docker transport or exec hiccup where no job reached a
+		// conclusion. It must NOT cover the case where act ran the workflow and
+		// a job genuinely concluded "failure" - that is a real, deterministic
+		// defect and retrying it would mask a real failure as transient.
+		//
+		// So only tag ExecError when the non-zero exit is unaccompanied by any
+		// parsed job-level failure. If a job concluded "failure" (or the
+		// reconciled conclusion is already "failure"), this was a real outcome.
+		execError := !hasJobFailure(result)
 		result.Conclusion = "failure"
 		result.Error = "workflow execution failed"
-		// A non-zero act exit is the act/docker execution layer failing to run
-		// the workflow to a real conclusion (a transport/exec hiccup), as
-		// opposed to a workflow job genuinely concluding "failure". Mark it
-		// transient so the scenario runner may retry it from a clean slate.
-		result.ExecError = true
+		result.ExecError = execError
 	}
 
 	if workflowPath != "" && len(result.Jobs) == 0 && result.Conclusion != "failure" {
@@ -399,6 +405,26 @@ func normalizeWorkflowResult(result *ExtendedWorkflowResult, workflowPath string
 	}
 }
 
+// hasJobFailure reports whether act parsed a genuine job-level failure: either
+// the reconciled run conclusion is already "failure", or at least one parsed
+// job concluded "failure". When true, a non-zero act exit reflects a real
+// workflow outcome rather than an act/docker exec hiccup, so it must not be
+// classified as a transient ExecError.
+func hasJobFailure(result *ExtendedWorkflowResult) bool {
+	if result == nil {
+		return false
+	}
+	if result.Conclusion == "failure" {
+		return true
+	}
+	for _, job := range result.Jobs {
+		if job != nil && job.Conclusion == "failure" {
+			return true
+		}
+	}
+	return false
+}
+
 // buildActArgs builds additional act command arguments. eventPath is the
 // in-container event-payload file written by writeEventFile (empty when no
 // EventJSON was supplied); when set it is appended as the act -e flag.
diff --git a/e2e/harness/transient_test.go b/e2e/harness/transient_test.go
index 9cd09bf..6484bda 100644
--- a/e2e/harness/transient_test.go
+++ b/e2e/harness/transient_test.go
@@ -120,13 +120,27 @@ func TestNormalizeWorkflowResult_ExecErrorTagging(t *testing.T) {
 		wantExecError  bool
 	}{
 		{
-			name:           "non-zero exit tags transient exec error",
+			name:           "non-zero exit with no jobs tags transient exec error",
 			jobs:           map[string]*JobResultExtended{},
 			workflowPath:   ".github/workflows/promote.yaml",
 			exitCode:       1,
 			wantConclusion: "failure",
 			wantExecError:  true,
 		},
+		{
+			// Regression: act exits non-zero when a job genuinely concludes
+			// "failure". That is a real, deterministic defect, not an
+			// act/docker transport hiccup, so ExecError must stay false and the
+			// scenario runner must NOT retry it.
+			name: "non-zero exit with a failed job is a real failure not transient",
+			jobs: map[string]*JobResultExtended{
+				"build": {Name: "build", Conclusion: "failure"},
+			},
+			workflowPath:   ".github/workflows/promote.yaml",
+			exitCode:       1,
+			wantConclusion: "failure",
+			wantExecError:  false,
+		},
 		{
 			name:           "zero exit with no jobs is a real failure not transient",
 			jobs:           map[string]*JobResultExtended{},