From b2cb50e6f3f2207e14620db59815dbbf272c2c29 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Thu, 11 Jun 2026 00:30:40 -0400 Subject: [PATCH 1/2] fix(e2e): close gitea lost-commit race in workflow generation The e2e harness pushed generated workflows via raw git push from the act container, then immediately wrote the next commit through gitea's Contents API. Under parallel-suite load gitea's API layer could hold a stale branch head from before the push, parent the API commit on it, and move the branch ref there - silently discarding the workflows commit. The later fetch+reset then produced a tree with no .github/workflows, surfacing as a misattributed generate-workflow failure. Verify the push converged before returning: read the pushed SHA in the container and poll gitea's branch head until it matches (bounded). Add a bounded fetch+reset retry at the sync site with its own distinct error message, and treat docker-exec transport errors in the workflow probe as retryable rather than as a missing file. Cap build-cli e2e parallelism at 2 to match e2e.yaml and shrink the race window. Signed-off-by: Joshua Temple --- .github/workflows/build-cli.yaml | 5 +- e2e/harness/harness.go | 227 ++++++++++++++++++++++++++----- e2e/harness/lost_commit_test.go | 100 ++++++++++++++ 3 files changed, 299 insertions(+), 33 deletions(-) create mode 100644 e2e/harness/lost_commit_test.go diff --git a/.github/workflows/build-cli.yaml b/.github/workflows/build-cli.yaml index 7af7214..2fb8893 100644 --- a/.github/workflows/build-cli.yaml +++ b/.github/workflows/build-cli.yaml @@ -61,4 +61,7 @@ jobs: - name: Run E2E tests working-directory: e2e - run: go test -v -parallel 4 -timeout 30m ./... + # Cap subtest parallelism at 2 to match e2e.yaml's E2E_PARALLEL. The + # 4-core / ~7.9GB runner is OOM-killed at 4 (silent FAIL, see #104), and + # the extra container load widens the gitea push-vs-Contents-API race. + run: go test -v -parallel 2 -timeout 30m ./... diff --git a/e2e/harness/harness.go b/e2e/harness/harness.go index 188479b..a4b73ef 100644 --- a/e2e/harness/harness.go +++ b/e2e/harness/harness.go @@ -467,7 +467,9 @@ func (h *Harness) GenerateWorkflows(ctx context.Context) error { } _, _, _ = h.act.Container().Exec(ctx, namespaceCmd) - // Configure git in the container and commit workflows and CLI binary + // Configure git in the container and commit workflows and CLI binary. + // Echo the pushed HEAD on a stable marker line so we can read it back and + // verify gitea has observed the push before returning (see below). commitCmd := []string{ "bash", "-c", "cd /tmp/repo && " + @@ -475,38 +477,170 @@ func (h *Harness) GenerateWorkflows(ctx context.Context) error { "git config user.name 'Test' && " + "git add .github/workflows/*.yaml .github/bin/ && " + "git commit -m 'chore: generate workflows and add CLI binary' && " + - "git push", + "git push && " + + "echo PUSHED_SHA=$(git rev-parse HEAD)", } exitCode, reader, err = h.act.Container().Exec(ctx, commitCmd) + var output bytes.Buffer + if reader != nil { + _, _ = io.Copy(&output, reader) + } if err != nil || exitCode != 0 { - var output bytes.Buffer - if reader != nil { - _, _ = io.Copy(&output, reader) - } return fmt.Errorf("failed to commit workflows (exit %d): %w\nOutput: %s", exitCode, err, output.String()) } + pushedSHA := parsePushedSHA(output.String()) + if pushedSHA == "" { + return fmt.Errorf("could not determine pushed workflows SHA from push output:\n%s", output.String()) + } + + // Close the lost-commit race. gitea serves its Contents API from a layer + // that can lag a raw `git push` to the same branch: the very next runner + // action (executeCommit) writes through that Contents API and, if it still + // holds the pre-push branch head, parents its commit on the stale head and + // moves the ref there - silently discarding this workflows commit. Block + // until gitea's API reports the pushed SHA as the branch head so every + // subsequent Contents-API write is parented on it. + if err := h.waitForBranchHead(ctx, pushedSHA); err != nil { + return fmt.Errorf("workflows push did not converge in gitea: %w", err) + } + return nil } +// pushedSHAMarker is the prefix GenerateWorkflows echoes after a successful +// push so the pushed HEAD can be recovered from the (otherwise free-form) git +// output. +const pushedSHAMarker = "PUSHED_SHA=" + +// parsePushedSHA extracts the SHA echoed on the `PUSHED_SHA=` marker line. +// It returns "" when no well-formed marker is present. +func parsePushedSHA(output string) string { + for _, line := range strings.Split(output, "\n") { + line = strings.TrimSpace(line) + if !strings.HasPrefix(line, pushedSHAMarker) { + continue + } + sha := strings.TrimSpace(strings.TrimPrefix(line, pushedSHAMarker)) + if isHexSHA(sha) { + return sha + } + } + return "" +} + +// isHexSHA reports whether s is a non-empty hex string of a plausible git +// object length (full 40-char or abbreviated, >= 7). +func isHexSHA(s string) bool { + if len(s) < 7 || len(s) > 64 { + return false + } + for _, r := range s { + switch { + case r >= '0' && r <= '9': + case r >= 'a' && r <= 'f': + case r >= 'A' && r <= 'F': + default: + return false + } + } + return true +} + +// branchHeadPollAttempts and branchHeadPollInterval bound the verify-after-push +// poll. ~10 x 500ms covers the observed gitea API staleness window with margin +// while still failing fast on a genuinely lost push. +const ( + branchHeadPollAttempts = 10 + branchHeadPollInterval = 500 * time.Millisecond +) + +// waitForBranchHead polls gitea's branch API until the main branch head equals +// wantSHA, bounding the wait by branchHeadPollAttempts. It returns a clear +// error (including the last head it observed) if the branch never converges, so +// a genuinely lost push is reported as such rather than masquerading as a +// missing-file failure downstream. +func (h *Harness) waitForBranchHead(ctx context.Context, wantSHA string) error { + if h.gitea == nil || h.repo == nil { + return nil + } + + var lastSHA string + var lastErr error + for attempt := 0; attempt < branchHeadPollAttempts; attempt++ { + if attempt > 0 { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(branchHeadPollInterval): + } + } + head, err := h.gitea.getHeadSHA(ctx, h.repo) + if err != nil { + lastErr = err + continue + } + lastSHA = head + if head == wantSHA { + return nil + } + } + + if lastErr != nil && lastSHA == "" { + return fmt.Errorf("branch head never readable after %d attempts (want %s): %w", + branchHeadPollAttempts, wantSHA, lastErr) + } + return fmt.Errorf("branch head did not reach pushed SHA after %d attempts (want %s, last observed %s)", + branchHeadPollAttempts, wantSHA, lastSHA) +} + // assertOrchestrateGenerated confirms that .github/workflows/orchestrate.yaml // exists and is non-empty in the act container's /tmp/repo immediately after // generate-workflow. On failure it returns the generate output plus a listing // of the workflows directory so the missing-file moment is captured with // context rather than surfacing later as an opaque `cat: ... No such file`. func (h *Harness) assertOrchestrateGenerated(ctx context.Context, genOutput string) error { - const workflowPath = ".github/workflows/orchestrate.yaml" + present, exitCode, err := h.probeOrchestrateWorkflow(ctx) + if present { + return nil + } + // A docker-exec transport error (err != nil) means the probe itself did not + // run, not that the file is absent. Surface it as a distinct, retryable + // condition so a flaky exec is not misreported as a generation failure. + if err != nil { + return fmt.Errorf("workflow probe transport error (exit=%d): %w", exitCode, err) + } + return fmt.Errorf( + "generate-workflow exited 0 but did not produce %s (exit=%d)\ngenerate output:\n%s\nworkflows dir:\n%s", + orchestrateWorkflowPath, exitCode, strings.TrimSpace(genOutput), + strings.TrimSpace(h.workflowsDirListing(ctx)), + ) +} + +// orchestrateWorkflowPath is the generated workflow whose presence gates every +// orchestrate run. +const orchestrateWorkflowPath = ".github/workflows/orchestrate.yaml" + +// probeOrchestrateWorkflow reports whether orchestrate.yaml exists and is +// non-empty in /tmp/repo. The returned error is a docker-exec transport error +// (the probe could not be run), which callers treat as retryable and distinct +// from a clean "file absent" result (present=false, err=nil). +func (h *Harness) probeOrchestrateWorkflow(ctx context.Context) (present bool, exitCode int, err error) { checkCmd := []string{ "bash", "-c", - "cd /tmp/repo && test -s " + workflowPath, + "cd /tmp/repo && test -s " + orchestrateWorkflowPath, } - exitCode, _, err := h.act.Container().Exec(ctx, checkCmd) - if err == nil && exitCode == 0 { - return nil + exitCode, _, err = h.act.Container().Exec(ctx, checkCmd) + if err != nil { + return false, exitCode, err } + return exitCode == 0, exitCode, nil +} - // Gather diagnostics for the failure path only. +// workflowsDirListing returns a best-effort `ls -la` of the workflows dir for +// diagnostics. It never errors; transport failures yield an empty listing. +func (h *Harness) workflowsDirListing(ctx context.Context) string { lsCmd := []string{ "bash", "-c", "cd /tmp/repo && ls -la .github/workflows/ 2>&1 || true", @@ -515,10 +649,7 @@ func (h *Harness) assertOrchestrateGenerated(ctx context.Context, genOutput stri if _, lsReader, lsErr := h.act.Container().Exec(ctx, lsCmd); lsErr == nil && lsReader != nil { _, _ = io.Copy(&listing, lsReader) } - return fmt.Errorf( - "generate-workflow exited 0 but did not produce %s (exit=%d err=%v)\ngenerate output:\n%s\nworkflows dir:\n%s", - workflowPath, exitCode, err, strings.TrimSpace(genOutput), strings.TrimSpace(listing.String()), - ) + return listing.String() } // ensureCLIBinary builds the cascade binary once per test process and @@ -614,31 +745,63 @@ func (h *Harness) SyncRepoToActContainer(ctx context.Context) error { // to origin/main explicitly and chain the reset so a partial sync surfaces // as a non-zero exit instead of silently resetting to a stale tree (which // would drop the just-pushed orchestrate.yaml (#25). + // + // The fetch+reset and the orchestrate.yaml presence check are retried as a + // unit: a transient gitea read or a momentarily stale ref can yield a tree + // without the workflow even though GenerateWorkflows already verified the + // push converged. Re-fetching resolves those; a persistent miss after the + // bound is a real lost-commit and is reported with this call site's own + // message rather than the generation-phase one. syncCmd := []string{ "bash", "-c", "cd /tmp/repo && git fetch origin main && git reset --hard origin/main && (git branch -f master HEAD 2>/dev/null || true)", } - exitCode, reader, err := h.act.Container().Exec(ctx, syncCmd) - if err != nil { - return fmt.Errorf("failed to sync repo: %w", err) - } - if exitCode != 0 { - var output bytes.Buffer - if reader != nil { - _, _ = io.Copy(&output, reader) + const maxAttempts = 5 + const retryDelay = 500 * time.Millisecond + + var lastErr error + for attempt := 1; attempt <= maxAttempts; attempt++ { + if attempt > 1 { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(retryDelay): + } } - return fmt.Errorf("sync repo failed (exit %d): %s", exitCode, output.String()) - } - // After syncing, the orchestrate workflow must be present; if the synced - // tree lacks it the orchestrate run would otherwise misreport as a passing - // 0-job scenario instead of failing here with context. - if err := h.assertOrchestrateGenerated(ctx, ""); err != nil { - return fmt.Errorf("orchestrate workflow missing after repo sync: %w", err) + exitCode, reader, err := h.act.Container().Exec(ctx, syncCmd) + if err != nil { + lastErr = fmt.Errorf("fetch/reset transport error: %w", err) + continue + } + if exitCode != 0 { + var output bytes.Buffer + if reader != nil { + _, _ = io.Copy(&output, reader) + } + lastErr = fmt.Errorf("fetch/reset failed (exit %d): %s", exitCode, strings.TrimSpace(output.String())) + continue + } + + present, probeExit, probeErr := h.probeOrchestrateWorkflow(ctx) + if present { + return nil + } + if probeErr != nil { + lastErr = fmt.Errorf("workflow probe transport error (exit=%d): %w", probeExit, probeErr) + continue + } + lastErr = fmt.Errorf("%s absent after fetch/reset (exit=%d)", orchestrateWorkflowPath, probeExit) } - return nil + // Distinct from the generation-phase message: this is a sync/lost-commit + // failure, not a generate-workflow failure. Misattributing it (the prior + // behavior) sent diagnosis down the wrong path. + return fmt.Errorf( + "orchestrate workflow still missing after %d repo-sync attempts: %w\nworkflows dir:\n%s", + maxAttempts, lastErr, strings.TrimSpace(h.workflowsDirListing(ctx)), + ) } // getProjectRoot finds the project root directory containing cmd/cascade diff --git a/e2e/harness/lost_commit_test.go b/e2e/harness/lost_commit_test.go new file mode 100644 index 0000000..879e2c9 --- /dev/null +++ b/e2e/harness/lost_commit_test.go @@ -0,0 +1,100 @@ +package harness + +import "testing" + +func TestParsePushedSHA(t *testing.T) { + t.Parallel() + + const fullSHA = "0123456789abcdef0123456789abcdef01234567" + + tests := []struct { + name string + output string + want string + }{ + { + name: "marker on its own line", + output: pushedSHAMarker + fullSHA, + want: fullSHA, + }, + { + name: "marker after push chatter", + output: "Enumerating objects: 5, done.\nTo http://gitea/repo.git\n abc..def main -> main\n" + pushedSHAMarker + fullSHA + "\n", + want: fullSHA, + }, + { + name: "trailing whitespace trimmed", + output: pushedSHAMarker + fullSHA + " \n", + want: fullSHA, + }, + { + name: "leading whitespace tolerated", + output: " " + pushedSHAMarker + fullSHA, + want: fullSHA, + }, + { + name: "abbreviated sha accepted", + output: pushedSHAMarker + "abc1234", + want: "abc1234", + }, + { + name: "no marker", + output: "To http://gitea/repo.git\n abc..def main -> main\n", + want: "", + }, + { + name: "marker present but value not hex", + output: pushedSHAMarker + "not-a-sha", + want: "", + }, + { + name: "marker present but value empty", + output: pushedSHAMarker, + want: "", + }, + { + name: "empty output", + output: "", + want: "", + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + if got := parsePushedSHA(tt.output); got != tt.want { + t.Fatalf("parsePushedSHA(%q) = %q, want %q", tt.output, got, tt.want) + } + }) + } +} + +func TestIsHexSHA(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + in string + want bool + }{ + {name: "full lowercase sha", in: "0123456789abcdef0123456789abcdef01234567", want: true}, + {name: "uppercase hex", in: "ABCDEF0", want: true}, + {name: "minimum abbreviated length", in: "abc1234", want: true}, + {name: "too short", in: "abc123", want: false}, + {name: "empty", in: "", want: false}, + {name: "non-hex character", in: "abcdefg", want: false}, + {name: "contains hyphen", in: "abc-123", want: false}, + {name: "too long", in: "0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef01", want: false}, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + if got := isHexSHA(tt.in); got != tt.want { + t.Fatalf("isHexSHA(%q) = %v, want %v", tt.in, got, tt.want) + } + }) + } +} From 614ebf65bd5604c52a786bf7f02090b157e7ebb0 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Thu, 11 Jun 2026 00:38:36 -0400 Subject: [PATCH 2/2] fix(e2e): demux docker exec stream before parsing pushed SHA Signed-off-by: Joshua Temple --- e2e/harness/harness.go | 23 +++++++++++++++++++---- e2e/harness/lost_commit_test.go | 29 ++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 5 deletions(-) diff --git a/e2e/harness/harness.go b/e2e/harness/harness.go index a4b73ef..aff732d 100644 --- a/e2e/harness/harness.go +++ b/e2e/harness/harness.go @@ -14,6 +14,7 @@ import ( "time" "github.com/testcontainers/testcontainers-go" + tcexec "github.com/testcontainers/testcontainers-go/exec" "github.com/testcontainers/testcontainers-go/network" "gopkg.in/yaml.v3" ) @@ -481,7 +482,13 @@ func (h *Harness) GenerateWorkflows(ctx context.Context) error { "echo PUSHED_SHA=$(git rev-parse HEAD)", } - exitCode, reader, err = h.act.Container().Exec(ctx, commitCmd) + // Request a demultiplexed stream. Without tcexec.Multiplexed the reader + // carries Docker's hijacked-attach framing (an 8-byte header per chunk: + // stream type, padding, then a big-endian uint32 length), so the + // PUSHED_SHA marker line arrives prefixed with binary header bytes and + // parsePushedSHA cannot match it. Multiplexed() strips that framing and + // merges stdout/stderr into clean text. + exitCode, reader, err = h.act.Container().Exec(ctx, commitCmd, tcexec.Multiplexed()) var output bytes.Buffer if reader != nil { _, _ = io.Copy(&output, reader) @@ -518,11 +525,19 @@ const pushedSHAMarker = "PUSHED_SHA=" // It returns "" when no well-formed marker is present. func parsePushedSHA(output string) string { for _, line := range strings.Split(output, "\n") { - line = strings.TrimSpace(line) - if !strings.HasPrefix(line, pushedSHAMarker) { + // Locate the marker anywhere in the line rather than only as a + // leading prefix. Docker's multiplexed exec stream frames each chunk + // with an 8-byte binary header (stream type, padding, big-endian + // length); if that framing reaches us the marker is preceded by + // header bytes that may themselves be printable (the length byte can + // land in the ASCII range), so a leading-prefix check fails. Slicing + // from the marker recovers the SHA whether or not the stream was + // demultiplexed upstream. + idx := strings.Index(line, pushedSHAMarker) + if idx < 0 { continue } - sha := strings.TrimSpace(strings.TrimPrefix(line, pushedSHAMarker)) + sha := strings.TrimSpace(line[idx+len(pushedSHAMarker):]) if isHexSHA(sha) { return sha } diff --git a/e2e/harness/lost_commit_test.go b/e2e/harness/lost_commit_test.go index 879e2c9..2460d21 100644 --- a/e2e/harness/lost_commit_test.go +++ b/e2e/harness/lost_commit_test.go @@ -1,6 +1,21 @@ package harness -import "testing" +import ( + "encoding/binary" + "testing" +) + +// dockerFrame builds a single Docker multiplexed-stream frame: an 8-byte +// header (stream type, three padding bytes, big-endian uint32 payload length) +// followed by the payload. It mirrors the framing the hijacked container +// attach protocol emits so tests can exercise parsePushedSHA against raw, +// un-demultiplexed exec output. +func dockerFrame(streamType byte, payload string) string { + header := make([]byte, 8) + header[0] = streamType + binary.BigEndian.PutUint32(header[4:], uint32(len(payload))) + return string(header) + payload +} func TestParsePushedSHA(t *testing.T) { t.Parallel() @@ -12,6 +27,18 @@ func TestParsePushedSHA(t *testing.T) { output string want string }{ + { + name: "docker multiplexed framing", + output: dockerFrame(1, "[main 0508231] chore: generate workflows\n") + + dockerFrame(2, "remote: Processing 1 references\n") + + dockerFrame(1, pushedSHAMarker+fullSHA+"\n"), + want: fullSHA, + }, + { + name: "binary prefix on marker line", + output: "\x01\x00\x00\x00\x00\x00\x00\x4f" + pushedSHAMarker + fullSHA + "\n", + want: fullSHA, + }, { name: "marker on its own line", output: pushedSHAMarker + fullSHA,