From 5c9e7b6204c6f57f7b4dd09b60bd1a1cfefb0002 Mon Sep 17 00:00:00 2001 From: Joshua Temple Date: Thu, 11 Jun 2026 17:46:25 -0400 Subject: [PATCH] fix: harden e2e against gitea throttling and container contention Run the act-heavy e2e scenarios serially (-parallel 1) in CI and raise the go test timeout to 60m. Under concurrent execution the 4-core runner's gitea + act + job containers contend, throttling gitea (405 'try again later') and destabilising act runs, which surfaces as intermittent, product-unrelated failures. Serial execution removes that contention. Wrap the gitea REST calls that have been observed to throttle (merge, create-pr, create-branch, change-files, label create/apply) with a bounded retry on transient 405 'try again later' and 5xx responses. The retry is safe: gitea returns these before applying any state change, so re-issuing cannot double-apply a mutation. Real 4xx client errors are surfaced immediately so expect-failure assertions stay deterministic. Signed-off-by: Joshua Temple --- .github/workflows/build-cli.yaml | 16 ++- .github/workflows/e2e.yaml | 8 +- e2e/harness/gitea.go | 108 +++++++++---------- e2e/harness/gitea_retry.go | 115 ++++++++++++++++++++ e2e/harness/gitea_retry_test.go | 180 +++++++++++++++++++++++++++++++ 5 files changed, 360 insertions(+), 67 deletions(-) create mode 100644 e2e/harness/gitea_retry.go create mode 100644 e2e/harness/gitea_retry_test.go diff --git a/.github/workflows/build-cli.yaml b/.github/workflows/build-cli.yaml index 2fb8893..c77c686 100644 --- a/.github/workflows/build-cli.yaml +++ b/.github/workflows/build-cli.yaml @@ -46,6 +46,11 @@ jobs: name: E2E Tests runs-on: ubuntu-latest needs: [build] + # The act + gitea scenarios run serially (-parallel 1, see the test step), + # which trades wall-clock for reliability, so the job needs more than the + # default headroom. 70m covers the 60m go test budget plus checkout, Go + # install, and container teardown. + timeout-minutes: 70 steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 @@ -61,7 +66,10 @@ jobs: - name: Run E2E tests working-directory: e2e - # Cap subtest parallelism at 2 to match e2e.yaml's E2E_PARALLEL. The - # 4-core / ~7.9GB runner is OOM-killed at 4 (silent FAIL, see #104), and - # the extra container load widens the gitea push-vs-Contents-API race. - run: go test -v -parallel 2 -timeout 30m ./... + # Run scenarios serially (-parallel 1). The 4-core / ~7.9GB runner is + # OOM-killed at 4 (silent FAIL, see #104), and even at 2 the concurrent + # act + gitea container load throttles gitea (405 "try again later") and + # destabilises act runs, producing intermittent failures unrelated to + # the product. Serial execution removes that contention; the longer + # 60m timeout covers the resulting slower wall-clock. + run: go test -v -parallel 1 -timeout 60m ./... diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 4194148..45b3501 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -29,7 +29,7 @@ on: parallel: description: 'Subtest parallelism (lower = slower but more reliable)' required: false - default: '2' + default: '1' permissions: contents: read @@ -76,8 +76,10 @@ jobs: # RAM. Each scenario spins up gitea + act + N job containers; at # the default GOMAXPROCS=4, four scenarios concurrently exhaust # memory and the test process is OOM-killed (silent FAIL with no - # per-test output, see #104). - E2E_PARALLEL: ${{ github.event.inputs.parallel || '2' }} + # per-test output, see #104). Even at 2 the concurrent container + # load throttles gitea and destabilises act runs, so the default is + # serial (1); raise it per-dispatch only when chasing wall-clock. + E2E_PARALLEL: ${{ github.event.inputs.parallel || '1' }} run: | go test -v \ -timeout "$E2E_TIMEOUT" \ diff --git a/e2e/harness/gitea.go b/e2e/harness/gitea.go index 525a670..067342a 100644 --- a/e2e/harness/gitea.go +++ b/e2e/harness/gitea.go @@ -183,6 +183,26 @@ func (g *GiteaContainer) Container() testcontainers.Container { return g.container } +// newJSONRequest builds an authenticated gitea API request carrying the given +// JSON body. It is the request factory passed to doRetry: each call returns a +// fresh *http.Request with an unread body so a throttled call can be replayed. +// A nil body produces a request with no payload (used for GET/DELETE). +func (g *GiteaContainer) newJSONRequest(ctx context.Context, method, url string, body []byte) (*http.Request, error) { + var reader io.Reader + if body != nil { + reader = bytes.NewReader(body) + } + req, err := http.NewRequestWithContext(ctx, method, url, reader) + if err != nil { + return nil, fmt.Errorf("build %s request: %w", method, err) + } + if body != nil { + req.Header.Set("Content-Type", "application/json") + } + req.SetBasicAuth(AdminUsername, AdminPassword) + return req, nil +} + // Terminate stops and removes the container func (g *GiteaContainer) Terminate(ctx context.Context) error { return g.container.Terminate(ctx) @@ -296,16 +316,10 @@ func (g *GiteaContainer) CreateCommitOnBranch(ctx context.Context, repo *Repo, b return "", fmt.Errorf("marshal change-files payload: %w", err) } - req, err := http.NewRequestWithContext(ctx, "POST", - fmt.Sprintf("%s/api/v1/repos/%s/%s/contents", g.url, AdminUsername, repo.Name), - bytes.NewReader(body)) - if err != nil { - return "", fmt.Errorf("build change-files request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - req.SetBasicAuth(AdminUsername, AdminPassword) - - resp, err := http.DefaultClient.Do(req) + url := fmt.Sprintf("%s/api/v1/repos/%s/%s/contents", g.url, AdminUsername, repo.Name) + resp, err := doRetry(ctx, func() (*http.Request, error) { + return g.newJSONRequest(ctx, "POST", url, body) + }) if err != nil { return "", fmt.Errorf("change-files request: %w", err) } @@ -410,16 +424,10 @@ func (g *GiteaContainer) CreateBranch(ctx context.Context, repo *Repo, name, fro return fmt.Errorf("marshal create-branch payload: %w", err) } - req, err := http.NewRequestWithContext(ctx, "POST", - fmt.Sprintf("%s/api/v1/repos/%s/%s/branches", g.url, AdminUsername, repo.Name), - bytes.NewReader(body)) - if err != nil { - return fmt.Errorf("build create-branch request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - req.SetBasicAuth(AdminUsername, AdminPassword) - - resp, err := http.DefaultClient.Do(req) + url := fmt.Sprintf("%s/api/v1/repos/%s/%s/branches", g.url, AdminUsername, repo.Name) + resp, err := doRetry(ctx, func() (*http.Request, error) { + return g.newJSONRequest(ctx, "POST", url, body) + }) if err != nil { return fmt.Errorf("create-branch request: %w", err) } @@ -700,16 +708,10 @@ func (g *GiteaContainer) CreatePR(ctx context.Context, repo *Repo, head, base, t return 0, fmt.Errorf("marshal create-pr payload: %w", err) } - req, err := http.NewRequestWithContext(ctx, "POST", - fmt.Sprintf("%s/api/v1/repos/%s/%s/pulls", g.url, AdminUsername, repo.Name), - bytes.NewReader(reqBody)) - if err != nil { - return 0, fmt.Errorf("build create-pr request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - req.SetBasicAuth(AdminUsername, AdminPassword) - - resp, err := http.DefaultClient.Do(req) + url := fmt.Sprintf("%s/api/v1/repos/%s/%s/pulls", g.url, AdminUsername, repo.Name) + resp, err := doRetry(ctx, func() (*http.Request, error) { + return g.newJSONRequest(ctx, "POST", url, reqBody) + }) if err != nil { return 0, fmt.Errorf("create-pr request: %w", err) } @@ -812,16 +814,10 @@ func (g *GiteaContainer) createLabel(ctx context.Context, repo *Repo, name strin return 0, fmt.Errorf("marshal create-label payload: %w", err) } - req, err := http.NewRequestWithContext(ctx, "POST", - fmt.Sprintf("%s/api/v1/repos/%s/%s/labels", g.url, AdminUsername, repo.Name), - bytes.NewReader(body)) - if err != nil { - return 0, fmt.Errorf("build create-label request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - req.SetBasicAuth(AdminUsername, AdminPassword) - - resp, err := http.DefaultClient.Do(req) + url := fmt.Sprintf("%s/api/v1/repos/%s/%s/labels", g.url, AdminUsername, repo.Name) + resp, err := doRetry(ctx, func() (*http.Request, error) { + return g.newJSONRequest(ctx, "POST", url, body) + }) if err != nil { return 0, fmt.Errorf("create-label request: %w", err) } @@ -852,16 +848,10 @@ func (g *GiteaContainer) applyLabels(ctx context.Context, repo *Repo, index int6 return fmt.Errorf("marshal apply-labels payload: %w", err) } - req, err := http.NewRequestWithContext(ctx, "POST", - fmt.Sprintf("%s/api/v1/repos/%s/%s/issues/%d/labels", g.url, AdminUsername, repo.Name, index), - bytes.NewReader(body)) - if err != nil { - return fmt.Errorf("build apply-labels request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - req.SetBasicAuth(AdminUsername, AdminPassword) - - resp, err := http.DefaultClient.Do(req) + url := fmt.Sprintf("%s/api/v1/repos/%s/%s/issues/%d/labels", g.url, AdminUsername, repo.Name, index) + resp, err := doRetry(ctx, func() (*http.Request, error) { + return g.newJSONRequest(ctx, "POST", url, body) + }) if err != nil { return fmt.Errorf("apply-labels request: %w", err) } @@ -893,16 +883,14 @@ func (g *GiteaContainer) MergePR(ctx context.Context, repo *Repo, index int64, s return fmt.Errorf("marshal merge-pr payload: %w", err) } - req, err := http.NewRequestWithContext(ctx, "POST", - fmt.Sprintf("%s/api/v1/repos/%s/%s/pulls/%d/merge", g.url, AdminUsername, repo.Name, index), - bytes.NewReader(body)) - if err != nil { - return fmt.Errorf("build merge-pr request: %w", err) - } - req.Header.Set("Content-Type", "application/json") - req.SetBasicAuth(AdminUsername, AdminPassword) - - resp, err := http.DefaultClient.Do(req) + // The merge POST is the call most exposed to gitea's "Please try again + // later" throttle under load, so it is wrapped in the bounded transient + // retry. The retry is safe: gitea returns the 405 throttle BEFORE applying + // the merge, so a re-issue cannot double-merge. + url := fmt.Sprintf("%s/api/v1/repos/%s/%s/pulls/%d/merge", g.url, AdminUsername, repo.Name, index) + resp, err := doRetry(ctx, func() (*http.Request, error) { + return g.newJSONRequest(ctx, "POST", url, body) + }) if err != nil { return fmt.Errorf("merge-pr request: %w", err) } diff --git a/e2e/harness/gitea_retry.go b/e2e/harness/gitea_retry.go new file mode 100644 index 0000000..cb64a34 --- /dev/null +++ b/e2e/harness/gitea_retry.go @@ -0,0 +1,115 @@ +package harness + +import ( + "context" + "fmt" + "io" + "net/http" + "strings" + "time" +) + +// giteaRetryAttempts bounds how many times a throttled gitea REST call is +// retried before the error is surfaced. +const giteaRetryAttempts = 5 + +// giteaRetryBackoff is the base delay between retry attempts. Attempt n waits +// giteaRetryBackoff*n, giving a short linear backoff that clears gitea's +// transient "try again later" throttle without materially slowing the suite. +const giteaRetryBackoff = 250 * time.Millisecond + +// transientResponse reports whether an HTTP response from gitea represents a +// transient, safe-to-retry condition rather than a real client error. +// +// Under container and gitea load the suite observes two throttling shapes: +// - 405 Method Not Allowed with a body of {"message":"Please try again +// later"} (gitea's mergeability/lock throttle), and +// - 5xx server errors while gitea is briefly overwhelmed. +// +// A 405 that is NOT the "try again later" throttle is a genuine client error +// (wrong method / disallowed operation) and must not be retried, so the body is +// inspected. Other 4xx codes are real client errors and are never retried, +// which keeps legitimate expect-failure assertions deterministic. +func transientResponse(status int, body string) bool { + if status == http.StatusMethodNotAllowed { + return strings.Contains(strings.ToLower(body), "try again later") + } + return status >= 500 && status <= 599 +} + +// doRetry issues the request built by newReq, retrying on transient gitea +// throttling responses and transient transport errors. newReq must return a +// fresh *http.Request on every call so the request body can be replayed safely. +// +// Retries are bounded and apply ONLY to idempotent-by-effect throttling cases: +// a transient response is one where gitea explicitly asked the caller to try +// again (or returned 5xx) BEFORE applying any state change, so re-issuing the +// request does not double-apply a mutation. Real 4xx client errors and a +// successful response are returned to the caller immediately. +// +// On success the caller owns the returned response body and must close it. On a +// transient response the body is drained and closed between attempts. +func doRetry(ctx context.Context, newReq func() (*http.Request, error)) (*http.Response, error) { + var lastErr error + for attempt := 1; attempt <= giteaRetryAttempts; attempt++ { + req, err := newReq() + if err != nil { + return nil, err + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + // Transport-level errors (connection reset, refused) under load are + // transient; retry within the attempt budget. + lastErr = err + if !sleepBeforeRetry(ctx, attempt) { + return nil, ctx.Err() + } + continue + } + + if attempt < giteaRetryAttempts && transientResponseBody(resp) { + // Drain and close so the connection can be reused, then back off. + drained, _ := io.ReadAll(resp.Body) + _ = resp.Body.Close() + lastErr = fmt.Errorf("gitea transient response: %s - %s", resp.Status, strings.TrimSpace(string(drained))) + if !sleepBeforeRetry(ctx, attempt) { + return nil, ctx.Err() + } + continue + } + + return resp, nil + } + + return nil, fmt.Errorf("gitea request failed after %d attempts: %w", giteaRetryAttempts, lastErr) +} + +// transientResponseBody peeks the response body to classify a transient gitea +// throttle, then rewinds it so the caller still sees the full body. Only used +// when a retry is still possible. +func transientResponseBody(resp *http.Response) bool { + if resp.StatusCode != http.StatusMethodNotAllowed && (resp.StatusCode < 500 || resp.StatusCode > 599) { + return false + } + body, err := io.ReadAll(resp.Body) + _ = resp.Body.Close() + if err != nil { + // Body was consumed; rebuild an empty one so callers do not panic. + resp.Body = io.NopCloser(strings.NewReader("")) + return false + } + resp.Body = io.NopCloser(strings.NewReader(string(body))) + return transientResponse(resp.StatusCode, string(body)) +} + +// sleepBeforeRetry waits the linear backoff for the given attempt, returning +// false if the context is cancelled first. +func sleepBeforeRetry(ctx context.Context, attempt int) bool { + select { + case <-ctx.Done(): + return false + case <-time.After(time.Duration(attempt) * giteaRetryBackoff): + return true + } +} diff --git a/e2e/harness/gitea_retry_test.go b/e2e/harness/gitea_retry_test.go new file mode 100644 index 0000000..342eb71 --- /dev/null +++ b/e2e/harness/gitea_retry_test.go @@ -0,0 +1,180 @@ +package harness + +import ( + "context" + "io" + "net/http" + "net/http/httptest" + "strings" + "sync/atomic" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestTransientResponse(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + status int + body string + want bool + }{ + { + name: "405 try again later is transient", + status: http.StatusMethodNotAllowed, + body: `{"message":"Please try again later"}`, + want: true, + }, + { + name: "405 try again later case-insensitive", + status: http.StatusMethodNotAllowed, + body: `{"message":"please TRY AGAIN LATER"}`, + want: true, + }, + { + name: "405 method not allowed without throttle body is a real client error", + status: http.StatusMethodNotAllowed, + body: `{"message":"merge is not allowed"}`, + want: false, + }, + { + name: "500 server error is transient", + status: http.StatusInternalServerError, + body: "boom", + want: true, + }, + { + name: "503 service unavailable is transient", + status: http.StatusServiceUnavailable, + body: "", + want: true, + }, + { + name: "404 not found is a real client error", + status: http.StatusNotFound, + body: `{"message":"not found"}`, + want: false, + }, + { + name: "409 conflict is a real client error", + status: http.StatusConflict, + body: `{"message":"conflict"}`, + want: false, + }, + { + name: "200 ok is not transient", + status: http.StatusOK, + body: "", + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, transientResponse(tt.status, tt.body)) + }) + } +} + +func TestDoRetry_RetriesTransientThenSucceeds(t *testing.T) { + t.Parallel() + + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + // Body must be replayable across attempts: assert it is intact each call. + got, _ := io.ReadAll(r.Body) + assert.Equal(t, "payload", string(got)) + + n := atomic.AddInt32(&calls, 1) + if n < 3 { + w.WriteHeader(http.StatusMethodNotAllowed) + _, _ = w.Write([]byte(`{"message":"Please try again later"}`)) + return + } + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ok")) + })) + defer srv.Close() + + resp, err := doRetry(context.Background(), func() (*http.Request, error) { + return http.NewRequest(http.MethodPost, srv.URL, strings.NewReader("payload")) + }) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + assert.Equal(t, http.StatusOK, resp.StatusCode) + body, _ := io.ReadAll(resp.Body) + assert.Equal(t, "ok", string(body)) + assert.Equal(t, int32(3), atomic.LoadInt32(&calls)) +} + +func TestDoRetry_DoesNotRetryRealClientError(t *testing.T) { + t.Parallel() + + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + atomic.AddInt32(&calls, 1) + w.WriteHeader(http.StatusConflict) + _, _ = w.Write([]byte(`{"message":"conflict"}`)) + })) + defer srv.Close() + + resp, err := doRetry(context.Background(), func() (*http.Request, error) { + return http.NewRequest(http.MethodPost, srv.URL, nil) + }) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + // 409 is surfaced to the caller without retry so expect-failure assertions + // stay deterministic. + assert.Equal(t, http.StatusConflict, resp.StatusCode) + assert.Equal(t, int32(1), atomic.LoadInt32(&calls)) + + // The caller still sees the full body after classification rewinds it. + body, _ := io.ReadAll(resp.Body) + assert.Contains(t, string(body), "conflict") +} + +func TestDoRetry_ExhaustsAttemptsAndReturnsLastResponse(t *testing.T) { + t.Parallel() + + var calls int32 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + atomic.AddInt32(&calls, 1) + w.WriteHeader(http.StatusMethodNotAllowed) + _, _ = w.Write([]byte(`{"message":"Please try again later"}`)) + })) + defer srv.Close() + + resp, err := doRetry(context.Background(), func() (*http.Request, error) { + return http.NewRequest(http.MethodPost, srv.URL, nil) + }) + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + // After the attempt budget is spent the final transient response is returned + // so the caller's own status check produces a clear error. + assert.Equal(t, http.StatusMethodNotAllowed, resp.StatusCode) + assert.Equal(t, int32(giteaRetryAttempts), atomic.LoadInt32(&calls)) +} + +func TestDoRetry_HonoursContextCancellation(t *testing.T) { + t.Parallel() + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + _, err := doRetry(ctx, func() (*http.Request, error) { + return http.NewRequestWithContext(ctx, http.MethodPost, srv.URL, nil) + }) + require.Error(t, err) +}