From e37db1fad679e174e14b1c6f33254ceeab629de5 Mon Sep 17 00:00:00 2001 From: Stefan Haubold Date: Tue, 12 May 2026 17:10:37 +0200 Subject: [PATCH 1/9] Kill git subprocess group on context cancel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit git push/fetch over HTTPS spawns helpers (git-remote-https, credential helpers) that inherit the parent's stdio pipes. The default exec.CommandContext cancellation only SIGKILLs the direct child, so the helpers keep stdout open and CombinedOutput blocks on Read past the context deadline. tryPushSessionsCommon's 2-minute timeout was therefore unbounded in practice — observed in the wild as a runaway dot stream on [entire] Pushing entire/checkpoints/v1 to origin... that only Ctrl-C could end. Add execx.KillOnCancel(cmd, waitDelay) which sets Setpgid (or CREATE_NEW_PROCESS_GROUP on Windows), a Cancel that SIGKILLs the whole group, and WaitDelay as a pipe-drain backstop. newCommand in checkpoint/remote applies it to every git subprocess routed through the package. Tests: - TestExecCommandContext_HangsOnGrandchildHoldingPipe pins the stdlib behavior that motivates the fix (5s hang vs 200ms deadline). - TestKillOnCancel_TerminatesGrandchildHoldingPipe verifies the helper returns within WaitDelay in the same scenario. - TestNewCommand_AppliesKillOnCancel confirms the wiring. Co-Authored-By: Claude Opus 4.7 (1M context) Entire-Checkpoint: 24d37e721664 --- cmd/entire/cli/checkpoint/remote/git.go | 13 +++ .../remote/git_killoncancel_unix_test.go | 29 +++++++ cmd/entire/cli/execx/killoncancel_unix.go | 31 +++++++ .../cli/execx/killoncancel_unix_test.go | 84 +++++++++++++++++++ cmd/entire/cli/execx/killoncancel_windows.go | 28 +++++++ 5 files changed, 185 insertions(+) create mode 100644 cmd/entire/cli/checkpoint/remote/git_killoncancel_unix_test.go create mode 100644 cmd/entire/cli/execx/killoncancel_unix.go create mode 100644 cmd/entire/cli/execx/killoncancel_unix_test.go create mode 100644 cmd/entire/cli/execx/killoncancel_windows.go diff --git a/cmd/entire/cli/checkpoint/remote/git.go b/cmd/entire/cli/checkpoint/remote/git.go index 1fdd55ccaf..1fab50539e 100644 --- a/cmd/entire/cli/checkpoint/remote/git.go +++ b/cmd/entire/cli/checkpoint/remote/git.go @@ -10,10 +10,18 @@ import ( "strconv" "strings" "sync" + "time" + "github.com/entireio/cli/cmd/entire/cli/execx" "github.com/entireio/cli/cmd/entire/cli/settings" ) +// cancelWaitDelay caps how long Wait blocks reading from inherited pipes after +// the context is cancelled and the process group has been killed. Five seconds +// is generous enough to drain legitimate buffered output but short enough to +// keep a stuck push from holding the hook open indefinitely. +const cancelWaitDelay = 5 * time.Second + // CheckpointTokenEnvVar is the environment variable for providing an access token // used to authenticate git push/fetch operations for checkpoint branches. // The token is injected as an HTTP Basic Authorization header per RFC 7617: @@ -209,6 +217,11 @@ func newCommand(ctx context.Context, args ...string) *exec.Cmd { mkCmd := func(finalArgs []string) *exec.Cmd { c := exec.CommandContext(ctx, "git", finalArgs...) c.Stdin = nil // Disconnect stdin to prevent hanging in hook context + // git push/fetch over HTTPS spawns helpers (git-remote-https, credential + // helpers) that inherit our stdio pipes. Without process-group kill + + // WaitDelay, ctx cancellation only SIGKILLs git itself; the helpers keep + // the pipe open and CombinedOutput blocks indefinitely. + execx.KillOnCancel(c, cancelWaitDelay) return c } diff --git a/cmd/entire/cli/checkpoint/remote/git_killoncancel_unix_test.go b/cmd/entire/cli/checkpoint/remote/git_killoncancel_unix_test.go new file mode 100644 index 0000000000..e64674d948 --- /dev/null +++ b/cmd/entire/cli/checkpoint/remote/git_killoncancel_unix_test.go @@ -0,0 +1,29 @@ +//go:build unix + +package remote + +import ( + "context" + "testing" +) + +// newCommand must wire up KillOnCancel so that git push/fetch subprocesses +// don't strand grandchildren (git-remote-https, credential helpers) holding +// the inherited stdio pipes when their context fires. Without this, +// tryPushSessionsCommon's 2-minute timeout was effectively unbounded — the +// real-world symptom that motivated this guard. +func TestNewCommand_AppliesKillOnCancel(t *testing.T) { + t.Parallel() + + cmd := newCommand(context.Background(), "status") + + if cmd.SysProcAttr == nil || !cmd.SysProcAttr.Setpgid { + t.Fatalf("newCommand should set Setpgid for process-group kill; got %+v", cmd.SysProcAttr) + } + if cmd.Cancel == nil { + t.Fatal("newCommand should set Cancel to kill the process group") + } + if cmd.WaitDelay <= 0 { + t.Fatalf("newCommand should set WaitDelay; got %s", cmd.WaitDelay) + } +} diff --git a/cmd/entire/cli/execx/killoncancel_unix.go b/cmd/entire/cli/execx/killoncancel_unix.go new file mode 100644 index 0000000000..03e5aa15af --- /dev/null +++ b/cmd/entire/cli/execx/killoncancel_unix.go @@ -0,0 +1,31 @@ +//go:build unix + +package execx + +import ( + "os/exec" + "syscall" + "time" +) + +// KillOnCancel configures cmd so that when its context is cancelled the entire +// process group is terminated, and any inherited stdio pipes are forcibly +// closed after waitDelay. This is necessary for subprocesses that fork helpers +// (e.g. `git push` over HTTPS spawns `git-remote-https` and credential +// helpers): SIGKILL on the direct child leaves grandchildren holding the +// inherited pipes, and CombinedOutput stays blocked on Read until they exit on +// their own. Setpgid lets Cancel kill the whole group; WaitDelay is a backstop +// in case a process slips out of the group somehow. +func KillOnCancel(cmd *exec.Cmd, waitDelay time.Duration) { + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.SysProcAttr.Setpgid = true + cmd.Cancel = func() error { + if cmd.Process == nil { + return nil + } + return syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) + } + cmd.WaitDelay = waitDelay +} diff --git a/cmd/entire/cli/execx/killoncancel_unix_test.go b/cmd/entire/cli/execx/killoncancel_unix_test.go new file mode 100644 index 0000000000..034a2f83cc --- /dev/null +++ b/cmd/entire/cli/execx/killoncancel_unix_test.go @@ -0,0 +1,84 @@ +//go:build unix + +package execx + +import ( + "context" + "os/exec" + "testing" + "time" +) + +// grandchildHoldingStdoutCmd returns a shell snippet that backgrounds a sleep +// (the grandchild) before waiting indefinitely in the parent. The grandchild +// inherits the parent's stdout, so any pipe-reading caller blocks on it long +// after the parent has been killed. +const grandchildHoldingStdoutCmd = "sleep 5 & echo started; wait" + +// Regression: exec.CommandContext alone does not unblock CombinedOutput when +// the killed child has grandchildren still holding the inherited stdout pipe. +// This test pins that behavior so reviewers can see why KillOnCancel exists. +// If a future Go release fixes this in the stdlib, this assertion fails and +// KillOnCancel can be reconsidered. +func TestExecCommandContext_HangsOnGrandchildHoldingPipe(t *testing.T) { + t.Parallel() + if _, err := exec.LookPath("sh"); err != nil { + t.Skip("sh not available") + } + + ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) + defer cancel() + + cmd := exec.CommandContext(ctx, "/bin/sh", "-c", grandchildHoldingStdoutCmd) + + start := time.Now() + // We expect CombinedOutput to return with an error (killed by ctx) — the + // point of the test is the duration, so the error is intentionally ignored. + _, _ = cmd.CombinedOutput() //nolint:errcheck // duration is what we assert on + elapsed := time.Since(start) + + if elapsed < 2*time.Second { + t.Fatalf("expected CombinedOutput to hang on grandchild stdout; got elapsed=%s (Go may have fixed this — consider removing KillOnCancel)", elapsed) + } +} + +// KillOnCancel must terminate the whole process group and release the pipes +// shortly after the context fires, even when grandchildren hold stdout. +func TestKillOnCancel_TerminatesGrandchildHoldingPipe(t *testing.T) { + t.Parallel() + if _, err := exec.LookPath("sh"); err != nil { + t.Skip("sh not available") + } + + ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) + defer cancel() + + cmd := exec.CommandContext(ctx, "/bin/sh", "-c", grandchildHoldingStdoutCmd) + KillOnCancel(cmd, 500*time.Millisecond) + + start := time.Now() + _, err := cmd.CombinedOutput() + elapsed := time.Since(start) + + if err == nil { + t.Fatal("expected error from cancelled command") + } + if elapsed > 2*time.Second { + t.Fatalf("KillOnCancel should unblock CombinedOutput well before the grandchild exits; got elapsed=%s", elapsed) + } +} + +func TestKillOnCancel_SetsSetpgid(t *testing.T) { + t.Parallel() + cmd := exec.CommandContext(context.Background(), "/bin/true") + KillOnCancel(cmd, time.Second) + if cmd.SysProcAttr == nil || !cmd.SysProcAttr.Setpgid { + t.Fatalf("KillOnCancel did not set Setpgid: %+v", cmd.SysProcAttr) + } + if cmd.WaitDelay != time.Second { + t.Fatalf("KillOnCancel WaitDelay = %s; want 1s", cmd.WaitDelay) + } + if cmd.Cancel == nil { + t.Fatal("KillOnCancel did not set Cancel") + } +} diff --git a/cmd/entire/cli/execx/killoncancel_windows.go b/cmd/entire/cli/execx/killoncancel_windows.go new file mode 100644 index 0000000000..f99903f249 --- /dev/null +++ b/cmd/entire/cli/execx/killoncancel_windows.go @@ -0,0 +1,28 @@ +//go:build windows + +package execx + +import ( + "os/exec" + "syscall" + "time" +) + +// KillOnCancel configures cmd so that when its context is cancelled the +// process is terminated and any inherited stdio pipes are forcibly closed +// after waitDelay. On Windows, CREATE_NEW_PROCESS_GROUP isolates the child +// from the parent's console signals; cmd.Process.Kill terminates the child, +// and WaitDelay covers stray descendants that may keep pipes open. +func KillOnCancel(cmd *exec.Cmd, waitDelay time.Duration) { + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.SysProcAttr.CreationFlags |= syscall.CREATE_NEW_PROCESS_GROUP + cmd.Cancel = func() error { + if cmd.Process == nil { + return nil + } + return cmd.Process.Kill() + } + cmd.WaitDelay = waitDelay +} From ee4414ebcddfd2040f5b30b86fcfa5c9b9cc3a13 Mon Sep 17 00:00:00 2001 From: Stefan Haubold Date: Tue, 12 May 2026 17:24:17 +0200 Subject: [PATCH 2/9] Improve push UX when checkpoint sync fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A colleague's stuck push surfaced four gaps in how doPushBranch communicates failure. Fix each, plus add tests that pin the new behaviour. 1. fetchAndRebaseSessionsCommon now returns errFetchTimedOut when its inner deadline fires (distinct from server errors) and falls back to the wrapped fetch error when git's output is empty — so users no longer see a bare "Warning: couldn't sync ...: fetch failed:" with nothing after the colon. 2. tryPushSessionsCommon emits errPushTimedOut on its inner deadline. doPushBranch pairs this with " timed out" as the dot-line suffix, so the user sees the failure mode instead of the dots silently trailing off into a "Syncing" line that looks like normal progression. 3. Every push attempt logs at INFO with branch, target (display alias — no URLs/tokens), elapsed, and a coarse class field. Doctor bundles now record push history even at default log level; previously the only push trace was a DEBUG line that nobody had enabled. 4. doPushBranch short-circuits in two cases that previously cascaded confusingly: - outer ctx cancelled (user Ctrl-C, hook timeout) → bail silently instead of running fetch+rebase that will also fail. - inner push timed out → skip the sync→retry cascade entirely (same network condition would just trip the fetch timeout) and print a hint pointing the user at log_level=DEBUG. A pushStderr package writer replaces direct os.Stderr writes from doPushBranch so tests can capture the new output without globally swapping os.Stderr; the legacy captureStderr helper patches both vars for backward compatibility with existing hint tests. Tests in push_messaging_test.go cover each behavior change plus classifyForLog. pushAttemptTimeout is now a var (not const) so tests can shorten it. Co-Authored-By: Claude Opus 4.7 (1M context) Entire-Checkpoint: 6819da9e1737 --- cmd/entire/cli/strategy/push_common.go | 180 +++++++++++++++--- cmd/entire/cli/strategy/push_common_test.go | 7 + .../cli/strategy/push_messaging_test.go | 164 ++++++++++++++++ 3 files changed, 326 insertions(+), 25 deletions(-) create mode 100644 cmd/entire/cli/strategy/push_messaging_test.go diff --git a/cmd/entire/cli/strategy/push_common.go b/cmd/entire/cli/strategy/push_common.go index 2243d346d6..93ee0a4933 100644 --- a/cmd/entire/cli/strategy/push_common.go +++ b/cmd/entire/cli/strategy/push_common.go @@ -22,6 +22,28 @@ import ( "github.com/go-git/go-git/v6/plumbing/object" ) +// pushStderr is the destination for user-facing push messages. Defaults to +// os.Stderr; overridable in tests via withPushStderr. +var pushStderr io.Writer = os.Stderr + +// pushAttemptTimeout caps a single tryPushSessionsCommon invocation. Two +// minutes is generous for a healthy network and short enough that a hung +// helper does not hold the hook open indefinitely. Var (not const) so tests +// can shorten it. +var pushAttemptTimeout = 2 * time.Minute + +// fetchAttemptTimeout is the matching cap for fetchAndRebaseSessionsCommon. +var fetchAttemptTimeout = 2 * time.Minute + +// errPushTimedOut is returned by tryPushSessionsCommon when its deadline fires +// before git push completes. Lets callers distinguish "network/server stuck" +// from "server actively rejected"; doPushBranch uses it to suppress the +// sync-and-retry cascade (which would also time out). +var errPushTimedOut = errors.New("push timed out") + +// errFetchTimedOut is the matching sentinel for fetchAndRebaseSessionsCommon. +var errFetchTimedOut = errors.New("fetch timed out") + // pushBranchIfNeeded pushes a branch to the given target if it has unpushed changes. // The target can be a remote name (e.g., "origin") or a URL for direct push. // When pushing to a URL, the "has unpushed" optimization is skipped since there are @@ -78,51 +100,133 @@ func displayPushTarget(target string) string { func doPushBranch(ctx context.Context, target, branchName string) error { displayTarget := displayPushTarget(target) - fmt.Fprintf(os.Stderr, "[entire] Pushing %s to %s...", branchName, displayTarget) - stop := startProgressDots(os.Stderr) + fmt.Fprintf(pushStderr, "[entire] Pushing %s to %s...", branchName, displayTarget) + stop := startProgressDots(pushStderr) + + pushStart := time.Now() + logging.Info(ctx, "push attempt start", + slog.String("branch", branchName), + slog.String("target", displayTarget), + ) - // Try pushing first result, err := tryPushSessionsCommon(ctx, target, branchName) if err == nil { + logging.Info(ctx, "push attempt completed", + slog.String("branch", branchName), + slog.Duration("elapsed", time.Since(pushStart)), + slog.Bool("up_to_date", result.upToDate), + ) finishPush(ctx, stop, result, target) return nil } + + logging.Info(ctx, "push attempt failed", + slog.String("branch", branchName), + slog.Duration("elapsed", time.Since(pushStart)), + slog.String("class", classifyForLog(err)), + ) + + // Outer context cancelled (user Ctrl-C, hook timeout) — bail quietly + // rather than cascading into a sync that will also fail. doPushBranch + // is best-effort: never fail the user's main push because of checkpoint + // sync, mirroring the existing graceful-degradation pattern below. + if ctx.Err() != nil { + stop("") + return nil //nolint:nilerr // intentional graceful degradation + } + + // Inner attempt timed out. Sync will also time out, so skip the cascade + // and give the user a clear next step instead. + if errors.Is(err, errPushTimedOut) { + stop(" timed out") + fmt.Fprintf(pushStderr, "[entire] Warning: push of %s to %s timed out after %s. Network or proxy issue?\n", + branchName, displayTarget, pushAttemptTimeout) + fmt.Fprintln(pushStderr, `[entire] Hint: set "log_level": "DEBUG" in .entire/settings.local.json to capture git's underlying output next time.`) + printCheckpointRemoteHint(target) + return nil + } + stop("") // Protected refs cannot be fixed by syncing and retrying. var protectedErr *protectedRefError if errors.As(err, &protectedErr) { - printProtectedRefBlock(os.Stderr, branchName, target) + printProtectedRefBlock(pushStderr, branchName, target) return nil } - // Push failed - likely non-fast-forward. Try to fetch and rebase. - fmt.Fprintf(os.Stderr, "[entire] Syncing %s with remote...", branchName) - stop = startProgressDots(os.Stderr) + // Push failed — likely non-fast-forward. Try to fetch and rebase. + fmt.Fprintf(pushStderr, "[entire] Syncing %s with remote...", branchName) + stop = startProgressDots(pushStderr) + syncStart := time.Now() - if err := fetchAndRebaseSessionsCommon(ctx, target, branchName); err != nil { - stop("") - fmt.Fprintf(os.Stderr, "[entire] Warning: couldn't sync %s: %v\n", branchName, err) + if syncErr := fetchAndRebaseSessionsCommon(ctx, target, branchName); syncErr != nil { + suffix := "" + if errors.Is(syncErr, errFetchTimedOut) { + suffix = " timed out" + } + stop(suffix) + logging.Info(ctx, "push sync failed", + slog.String("branch", branchName), + slog.Duration("elapsed", time.Since(syncStart)), + slog.String("class", classifyForLog(syncErr)), + ) + fmt.Fprintf(pushStderr, "[entire] Warning: couldn't sync %s: %v\n", branchName, syncErr) printCheckpointRemoteHint(target) return nil // Don't fail the main push } stop(" done") - // Try pushing again after rebase - fmt.Fprintf(os.Stderr, "[entire] Pushing %s to %s...", branchName, displayTarget) - stop = startProgressDots(os.Stderr) + // Try pushing again after rebase. + fmt.Fprintf(pushStderr, "[entire] Pushing %s to %s...", branchName, displayTarget) + stop = startProgressDots(pushStderr) + retryStart := time.Now() - if result, err := tryPushSessionsCommon(ctx, target, branchName); err != nil { - stop("") - fmt.Fprintf(os.Stderr, "[entire] Warning: failed to push %s after sync: %v\n", branchName, err) + retryResult, retryErr := tryPushSessionsCommon(ctx, target, branchName) + if retryErr != nil { + suffix := "" + if errors.Is(retryErr, errPushTimedOut) { + suffix = " timed out" + } + stop(suffix) + logging.Info(ctx, "push retry failed", + slog.String("branch", branchName), + slog.Duration("elapsed", time.Since(retryStart)), + slog.String("class", classifyForLog(retryErr)), + ) + fmt.Fprintf(pushStderr, "[entire] Warning: failed to push %s after sync: %v\n", branchName, retryErr) printCheckpointRemoteHint(target) - } else { - finishPush(ctx, stop, result, target) + return nil } - + logging.Info(ctx, "push retry completed", + slog.String("branch", branchName), + slog.Duration("elapsed", time.Since(retryStart)), + slog.Bool("up_to_date", retryResult.upToDate), + ) + finishPush(ctx, stop, retryResult, target) return nil } +// classifyForLog maps a push/fetch error to a short string suitable for log +// fields. Avoids leaking the raw error (which may contain URLs or tokens +// embedded in git's output) — only the category is logged. +func classifyForLog(err error) string { + if err == nil { + return "" + } + if errors.Is(err, errPushTimedOut) || errors.Is(err, errFetchTimedOut) { + return "timed_out" + } + var protectedErr *protectedRefError + if errors.As(err, &protectedErr) { + return "protected_ref" + } + if errors.Is(err, errNonFastForward) { + return "non_fast_forward" + } + return "other" +} + // printCheckpointRemoteHint prints a hint when a push to a checkpoint URL fails. // Only prints when the target is a URL (not the user's default remote). func printCheckpointRemoteHint(target string) { @@ -280,13 +384,22 @@ func finishPush(ctx context.Context, stop func(string), result pushResult, targe // tryPushSessionsCommon attempts to push the sessions branch. func tryPushSessionsCommon(ctx context.Context, remoteName, branchName string) (pushResult, error) { - ctx, cancel := context.WithTimeout(ctx, 2*time.Minute) + localCtx, cancel := context.WithTimeout(ctx, pushAttemptTimeout) defer cancel() - result, err := remote.Push(ctx, remoteName, branchName) + result, err := remote.Push(localCtx, remoteName, branchName) outputStr := result.Output if err != nil { - return pushResult{}, classifyPushFailure(ctx, outputStr, err) + // Inner deadline fired: distinct sentinel so doPushBranch can pick + // the right messaging and skip the sync-retry cascade. + if errors.Is(localCtx.Err(), context.DeadlineExceeded) && ctx.Err() == nil { + logging.Debug(ctx, "git push timed out", + slog.String("error", err.Error()), + slog.Duration("after", pushAttemptTimeout), + ) + return pushResult{}, errPushTimedOut + } + return pushResult{}, classifyPushFailure(localCtx, outputStr, err) } return parsePushResult(outputStr), nil @@ -375,8 +488,9 @@ func printProtectedRefBlock(w io.Writer, ref, target string) { // always apply cleanly. // The target can be a remote name or a URL. func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string) error { - ctx, cancel := context.WithTimeout(ctx, 2*time.Minute) + localCtx, cancel := context.WithTimeout(ctx, fetchAttemptTimeout) defer cancel() + ctx = localCtx fetchTarget, err := remote.ResolveFetchTarget(ctx, target) if err != nil { @@ -407,7 +521,23 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string RefSpecs: []string{refSpec}, NoTags: true, }); fetchErr != nil { - return fmt.Errorf("fetch failed: %s", output) + // Inner deadline fired: distinct sentinel so doPushBranch can render + // " timed out" instead of an empty trailing colon. + if errors.Is(localCtx.Err(), context.DeadlineExceeded) { + logging.Debug(ctx, "git fetch timed out", + slog.String("error", fetchErr.Error()), + slog.Duration("after", fetchAttemptTimeout), + ) + return errFetchTimedOut + } + msg := strings.TrimSpace(string(output)) + if msg == "" { + // When git is SIGKILLed (kill-on-cancel from execx.KillOnCancel) the + // child writes nothing to stderr; fall back to the wrapper's error + // text so users don't see a bare "fetch failed:" with no detail. + msg = fetchErr.Error() + } + return fmt.Errorf("fetch failed: %s", msg) } repo, err := OpenRepository(ctx) @@ -421,7 +551,7 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string // this cherry-picks local commits onto remote tip, updating the local ref. // If reconciliation fails, abort — proceeding to rebase on disconnected // branches would silently combine unrelated histories. - if reconcileErr := ReconcileDisconnectedMetadataBranch(ctx, repo, fetchedRefName, os.Stderr); reconcileErr != nil { + if reconcileErr := ReconcileDisconnectedMetadataBranch(ctx, repo, fetchedRefName, pushStderr); reconcileErr != nil { return fmt.Errorf("metadata reconciliation failed: %w", reconcileErr) } diff --git a/cmd/entire/cli/strategy/push_common_test.go b/cmd/entire/cli/strategy/push_common_test.go index 95eec80336..7da69d828b 100644 --- a/cmd/entire/cli/strategy/push_common_test.go +++ b/cmd/entire/cli/strategy/push_common_test.go @@ -1406,15 +1406,21 @@ func TestHasUnmigratedV1Checkpoints(t *testing.T) { func captureStderr(t *testing.T) func() string { t.Helper() old := os.Stderr + oldPushStderr := pushStderr r, w, err := os.Pipe() require.NoError(t, err) os.Stderr = w + // push_common.go writes user-facing output to the package-level pushStderr, + // not directly to os.Stderr. Patch both so legacy tests that only redirect + // os.Stderr still see the same output captured. + pushStderr = w // Safety net: restore stderr and close pipe ends on test failure/panic. // In the normal path the returned function handles cleanup first; // duplicate Close calls return an error that we intentionally ignore. t.Cleanup(func() { os.Stderr = old + pushStderr = oldPushStderr _ = w.Close() _ = r.Close() }) @@ -1426,6 +1432,7 @@ func captureStderr(t *testing.T) func() string { require.NoError(t, readErr) _ = r.Close() os.Stderr = old + pushStderr = oldPushStderr return buf.String() } } diff --git a/cmd/entire/cli/strategy/push_messaging_test.go b/cmd/entire/cli/strategy/push_messaging_test.go new file mode 100644 index 0000000000..aa873bc004 --- /dev/null +++ b/cmd/entire/cli/strategy/push_messaging_test.go @@ -0,0 +1,164 @@ +package strategy + +import ( + "context" + "errors" + "os" + "path/filepath" + "testing" + "time" + + "github.com/entireio/cli/cmd/entire/cli/logging" + "github.com/entireio/cli/cmd/entire/cli/paths" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// withShortPushTimeout replaces pushAttemptTimeout for the test duration and +// restores it via t.Cleanup. Lets us trigger errPushTimedOut without waiting +// the production 2-minute deadline. +func withShortPushTimeout(t *testing.T, d time.Duration) { + t.Helper() + original := pushAttemptTimeout + pushAttemptTimeout = d + t.Cleanup(func() { pushAttemptTimeout = original }) +} + +// (1) When the underlying git fetch reports an error, the user-visible +// "fetch failed: …" message must include some non-empty detail. Previously, +// when git's stdout was empty (e.g. SIGKILLed helper), the message ended +// with a bare colon and the user had no idea what failed. +// +// Not parallel: t.Chdir() for OpenRepository. +func TestFetchAndRebase_FetchFailure_IncludesUnderlyingError(t *testing.T) { + tmpDir := setupRepoWithCheckpointBranch(t) + t.Chdir(tmpDir) + + // Point at a path that does not exist. git fetch will fail; output is + // typically non-empty but the test passes either way because we now + // always include some detail. + nonExistent := filepath.Join(t.TempDir(), "does-not-exist") + err := fetchAndRebaseSessionsCommon(context.Background(), nonExistent, paths.MetadataBranchName) + + require.Error(t, err) + msg := err.Error() + require.Greater(t, len(msg), len("fetch failed: "), "error message must include detail beyond 'fetch failed: '; got %q", msg) +} + +// (2) When tryPushSessionsCommon hits its own 2-minute deadline, doPushBranch +// must end the dot line with " timed out" (not silently). And (4): it must +// also skip the sync→retry cascade, since the same network condition would +// trip the fetch timeout too. +// +// Not parallel: t.Chdir() and stderr capture. +func TestDoPushBranch_TimedOut_PrintsTimedOutAndSkipsSync(t *testing.T) { + workDir, bareDir := setupBareRemoteWithCheckpointBranch(t) + t.Chdir(workDir) + + // Force every push attempt to time out immediately. KillOnCancel + + // WaitDelay ensures the call returns within milliseconds. + withShortPushTimeout(t, 1*time.Millisecond) + + restore := captureStderr(t) + err := doPushBranch(context.Background(), bareDir, paths.MetadataBranchName) + output := restore() + + require.NoError(t, err, "doPushBranch should swallow the error (graceful degradation)") + assert.Contains(t, output, "timed out", "dot line must end with 'timed out'") + assert.NotContains(t, output, "Syncing", "must not cascade into sync on inner-push timeout") + assert.Contains(t, output, "log_level", "should hint at enabling DEBUG logging") +} + +// (4b) If the OUTER context is already cancelled when doPushBranch runs (the +// user hit Ctrl-C, or the hook deadline fired), we must bail without printing +// a misleading sync cascade or warnings. +// +// Not parallel: t.Chdir() and stderr capture. +func TestDoPushBranch_OuterContextCancelled_BailsQuietly(t *testing.T) { + workDir, bareDir := setupBareRemoteWithCheckpointBranch(t) + t.Chdir(workDir) + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + restore := captureStderr(t) + err := doPushBranch(ctx, bareDir, paths.MetadataBranchName) + output := restore() + + require.NoError(t, err) + assert.NotContains(t, output, "Syncing", "must not run sync after outer cancel") + assert.NotContains(t, output, "Warning:", "must not print sync/retry warnings after outer cancel") +} + +// (3) Push attempts must be logged at INFO so doctor bundles capture +// push history even when log_level=INFO (the default). Without this, a +// bundle from a stuck push reveals nothing about what was attempted. +// +// Not parallel: t.Chdir() and global logger state. +func TestDoPushBranch_LogsAttemptsAtInfo(t *testing.T) { + workDir, bareDir := setupBareRemoteWithCheckpointBranch(t) + t.Chdir(workDir) + + require.NoError(t, logging.Init(context.Background(), "2026-05-12-push-log-test")) + t.Cleanup(logging.Close) + + restore := captureStderr(t) + err := doPushBranch(context.Background(), bareDir, paths.MetadataBranchName) + _ = restore() + require.NoError(t, err) + + logging.Close() // flush + + logBytes, err := os.ReadFile(filepath.Join(workDir, ".entire", "logs", "entire.log")) + require.NoError(t, err) + logText := string(logBytes) + + assert.Contains(t, logText, "push attempt start", "log must record push start") + assert.Contains(t, logText, "push attempt completed", "log must record push completion") + assert.Contains(t, logText, `"branch":"`+paths.MetadataBranchName+`"`, "log must include branch") +} + +// (2/4) Companion test for the lower-level helper: tryPushSessionsCommon +// returns errPushTimedOut when its inner deadline fires before the outer +// context. Pinning this so future refactors of the cascade logic can rely +// on the sentinel. +// +// Not parallel: t.Chdir(). +func TestTryPushSessionsCommon_InnerTimeout_ReturnsErrPushTimedOut(t *testing.T) { + workDir, bareDir := setupBareRemoteWithCheckpointBranch(t) + t.Chdir(workDir) + + withShortPushTimeout(t, 1*time.Millisecond) + + _, err := tryPushSessionsCommon(context.Background(), bareDir, paths.MetadataBranchName) + require.Error(t, err) + assert.ErrorIs(t, err, errPushTimedOut, + "err must wrap errPushTimedOut; got %v (class=%q)", err, classifyForLog(err)) +} + +// classifyForLog must keep logs free of raw push output (which can contain +// URLs and tokens). Anything we don't recognize collapses to "other". +func TestClassifyForLog(t *testing.T) { + t.Parallel() + + cases := []struct { + name string + err error + want string + }{ + {"nil", nil, ""}, + {"push timeout", errPushTimedOut, "timed_out"}, + {"fetch timeout", errFetchTimedOut, "timed_out"}, + {"non-fast-forward", errNonFastForward, "non_fast_forward"}, + {"protected", &protectedRefError{output: "GH013"}, "protected_ref"}, + {"other", errors.New("some random error"), "other"}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tc.want, classifyForLog(tc.err)) + }) + } +} From c6dee4532fbf7c4c505098236c314a7889c2c434 Mon Sep 17 00:00:00 2001 From: Stefan Haubold Date: Tue, 12 May 2026 18:00:22 +0200 Subject: [PATCH 3/9] Tighten push UX code after review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Findings from /simplify cleanup pass: - Extract reportPushFailure to dedupe the sync-failure and retry-failure tails of doPushBranch (suffix selection, INFO log, warning, hint). Drops ~15 lines and keeps the two branches in sync by construction. - Replace classifyForLog's bare string literals with named constants (pushClassTimedOut, pushClassProtectedRef, pushClassNonFastForward, pushClassOther) so log consumers and tests share one source of truth. - Fix aliasing smell in fetchAndRebaseSessionsCommon: drop the `localCtx, cancel := …; ctx = localCtx` half-rename in favour of the original `ctx, cancel := context.WithTimeout(…)` pattern. - Rewrite pushStderr doc comment to describe the actual capture path (captureStderr) and document why the standalone hint helpers stay on os.Stderr — the function withPushStderr it referenced never existed. - Strengthen TestFetchAndRebase_FetchFailure_IncludesUnderlyingError: assert on "fetch failed: " prefix + non-empty trimmed detail instead of just len(msg) > prefixLen, which would pass with any single character. - Drop sleep 5 -> sleep 3 in the execx regression-pinning test. Keeps the 2s assertion gap, saves ~2s on every test run (execx package time dropped 5.2s -> 3.4s). - Remove numbered // (1) // (2) // (4b) scaffolding from test docstrings and trim WHAT-comments ("Var (not const)…", "Try pushing again after rebase.", "Push failed — likely non-fast-forward…"). Co-Authored-By: Claude Opus 4.7 (1M context) Entire-Checkpoint: bbc8a4322628 --- .../cli/execx/killoncancel_unix_test.go | 6 +- cmd/entire/cli/strategy/push_common.go | 120 +++++++++++------- .../cli/strategy/push_messaging_test.go | 50 ++++---- 3 files changed, 103 insertions(+), 73 deletions(-) diff --git a/cmd/entire/cli/execx/killoncancel_unix_test.go b/cmd/entire/cli/execx/killoncancel_unix_test.go index 034a2f83cc..a6d3a43937 100644 --- a/cmd/entire/cli/execx/killoncancel_unix_test.go +++ b/cmd/entire/cli/execx/killoncancel_unix_test.go @@ -12,8 +12,10 @@ import ( // grandchildHoldingStdoutCmd returns a shell snippet that backgrounds a sleep // (the grandchild) before waiting indefinitely in the parent. The grandchild // inherits the parent's stdout, so any pipe-reading caller blocks on it long -// after the parent has been killed. -const grandchildHoldingStdoutCmd = "sleep 5 & echo started; wait" +// after the parent has been killed. The sleep is short enough that the +// regression-pinning test costs ~3s rather than 5s on every CI run while +// still leaving a clear gap above the 2s assertion threshold. +const grandchildHoldingStdoutCmd = "sleep 3 & echo started; wait" // Regression: exec.CommandContext alone does not unblock CombinedOutput when // the killed child has grandchildren still holding the inherited stdout pipe. diff --git a/cmd/entire/cli/strategy/push_common.go b/cmd/entire/cli/strategy/push_common.go index 93ee0a4933..861eee4237 100644 --- a/cmd/entire/cli/strategy/push_common.go +++ b/cmd/entire/cli/strategy/push_common.go @@ -22,28 +22,38 @@ import ( "github.com/go-git/go-git/v6/plumbing/object" ) -// pushStderr is the destination for user-facing push messages. Defaults to -// os.Stderr; overridable in tests via withPushStderr. +// pushStderr is the destination for output emitted by doPushBranch itself +// (the dot lines, "Warning:" lines, hints). The standalone hint helpers +// (printCheckpointRemoteHint, printSettingsCommitHint, …) deliberately keep +// writing to os.Stderr because they pre-date this var and have their own +// inline-redirect tests. captureStderr patches both so end-to-end tests of +// doPushBranch still see hint output. var pushStderr io.Writer = os.Stderr // pushAttemptTimeout caps a single tryPushSessionsCommon invocation. Two // minutes is generous for a healthy network and short enough that a hung -// helper does not hold the hook open indefinitely. Var (not const) so tests -// can shorten it. +// helper does not hold the hook open indefinitely. var pushAttemptTimeout = 2 * time.Minute -// fetchAttemptTimeout is the matching cap for fetchAndRebaseSessionsCommon. var fetchAttemptTimeout = 2 * time.Minute -// errPushTimedOut is returned by tryPushSessionsCommon when its deadline fires -// before git push completes. Lets callers distinguish "network/server stuck" -// from "server actively rejected"; doPushBranch uses it to suppress the -// sync-and-retry cascade (which would also time out). +// errPushTimedOut signals that tryPushSessionsCommon's inner deadline fired +// before git push completed. doPushBranch uses it to suppress the +// sync-and-retry cascade — fetch would just time out on the same condition. var errPushTimedOut = errors.New("push timed out") -// errFetchTimedOut is the matching sentinel for fetchAndRebaseSessionsCommon. var errFetchTimedOut = errors.New("fetch timed out") +// Log-field values for the "class" attribute on push-related INFO logs. +// Kept as constants so log consumers and tests share one source of truth and +// the function stays free of bare string literals. +const ( + pushClassTimedOut = "timed_out" + pushClassProtectedRef = "protected_ref" + pushClassNonFastForward = "non_fast_forward" + pushClassOther = "other" +) + // pushBranchIfNeeded pushes a branch to the given target if it has unpushed changes. // The target can be a remote name (e.g., "origin") or a URL for direct push. // When pushing to a URL, the "has unpushed" optimization is skipped since there are @@ -155,47 +165,39 @@ func doPushBranch(ctx context.Context, target, branchName string) error { return nil } - // Push failed — likely non-fast-forward. Try to fetch and rebase. fmt.Fprintf(pushStderr, "[entire] Syncing %s with remote...", branchName) stop = startProgressDots(pushStderr) syncStart := time.Now() if syncErr := fetchAndRebaseSessionsCommon(ctx, target, branchName); syncErr != nil { - suffix := "" - if errors.Is(syncErr, errFetchTimedOut) { - suffix = " timed out" - } - stop(suffix) - logging.Info(ctx, "push sync failed", - slog.String("branch", branchName), - slog.Duration("elapsed", time.Since(syncStart)), - slog.String("class", classifyForLog(syncErr)), - ) - fmt.Fprintf(pushStderr, "[entire] Warning: couldn't sync %s: %v\n", branchName, syncErr) - printCheckpointRemoteHint(target) - return nil // Don't fail the main push + reportPushFailure(ctx, reportPushFailureArgs{ + stop: stop, + err: syncErr, + logMsg: "push sync failed", + warnMsg: fmt.Sprintf("[entire] Warning: couldn't sync %s: %v\n", branchName, syncErr), + branch: branchName, + target: target, + start: syncStart, + }) + return nil } stop(" done") - // Try pushing again after rebase. fmt.Fprintf(pushStderr, "[entire] Pushing %s to %s...", branchName, displayTarget) stop = startProgressDots(pushStderr) retryStart := time.Now() retryResult, retryErr := tryPushSessionsCommon(ctx, target, branchName) if retryErr != nil { - suffix := "" - if errors.Is(retryErr, errPushTimedOut) { - suffix = " timed out" - } - stop(suffix) - logging.Info(ctx, "push retry failed", - slog.String("branch", branchName), - slog.Duration("elapsed", time.Since(retryStart)), - slog.String("class", classifyForLog(retryErr)), - ) - fmt.Fprintf(pushStderr, "[entire] Warning: failed to push %s after sync: %v\n", branchName, retryErr) - printCheckpointRemoteHint(target) + reportPushFailure(ctx, reportPushFailureArgs{ + stop: stop, + err: retryErr, + logMsg: "push retry failed", + warnMsg: fmt.Sprintf("[entire] Warning: failed to push %s after sync: %v\n", branchName, retryErr), + branch: branchName, + target: target, + start: retryStart, + }) return nil } logging.Info(ctx, "push retry completed", @@ -207,6 +209,37 @@ func doPushBranch(ctx context.Context, target, branchName string) error { return nil } +// reportPushFailureArgs groups the inputs to reportPushFailure to keep the +// recoverable-failure paths in doPushBranch readable. +type reportPushFailureArgs struct { + stop func(string) + err error + logMsg string + warnMsg string + branch string + target string + start time.Time +} + +// reportPushFailure handles the common tail of a failed sync or retry-push: +// pick the dot-line suffix (`" timed out"` vs blank), log at INFO with a +// classification label, print the warning, and surface the +// checkpoint-remote hint when the target is a URL. +func reportPushFailure(ctx context.Context, a reportPushFailureArgs) { + suffix := "" + if errors.Is(a.err, errPushTimedOut) || errors.Is(a.err, errFetchTimedOut) { + suffix = " timed out" + } + a.stop(suffix) + logging.Info(ctx, a.logMsg, + slog.String("branch", a.branch), + slog.Duration("elapsed", time.Since(a.start)), + slog.String("class", classifyForLog(a.err)), + ) + fmt.Fprint(pushStderr, a.warnMsg) + printCheckpointRemoteHint(a.target) +} + // classifyForLog maps a push/fetch error to a short string suitable for log // fields. Avoids leaking the raw error (which may contain URLs or tokens // embedded in git's output) — only the category is logged. @@ -215,16 +248,16 @@ func classifyForLog(err error) string { return "" } if errors.Is(err, errPushTimedOut) || errors.Is(err, errFetchTimedOut) { - return "timed_out" + return pushClassTimedOut } var protectedErr *protectedRefError if errors.As(err, &protectedErr) { - return "protected_ref" + return pushClassProtectedRef } if errors.Is(err, errNonFastForward) { - return "non_fast_forward" + return pushClassNonFastForward } - return "other" + return pushClassOther } // printCheckpointRemoteHint prints a hint when a push to a checkpoint URL fails. @@ -488,9 +521,8 @@ func printProtectedRefBlock(w io.Writer, ref, target string) { // always apply cleanly. // The target can be a remote name or a URL. func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string) error { - localCtx, cancel := context.WithTimeout(ctx, fetchAttemptTimeout) + ctx, cancel := context.WithTimeout(ctx, fetchAttemptTimeout) defer cancel() - ctx = localCtx fetchTarget, err := remote.ResolveFetchTarget(ctx, target) if err != nil { @@ -523,7 +555,7 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string }); fetchErr != nil { // Inner deadline fired: distinct sentinel so doPushBranch can render // " timed out" instead of an empty trailing colon. - if errors.Is(localCtx.Err(), context.DeadlineExceeded) { + if errors.Is(ctx.Err(), context.DeadlineExceeded) { logging.Debug(ctx, "git fetch timed out", slog.String("error", fetchErr.Error()), slog.Duration("after", fetchAttemptTimeout), diff --git a/cmd/entire/cli/strategy/push_messaging_test.go b/cmd/entire/cli/strategy/push_messaging_test.go index aa873bc004..010bf0ce2c 100644 --- a/cmd/entire/cli/strategy/push_messaging_test.go +++ b/cmd/entire/cli/strategy/push_messaging_test.go @@ -5,6 +5,7 @@ import ( "errors" "os" "path/filepath" + "strings" "testing" "time" @@ -25,31 +26,29 @@ func withShortPushTimeout(t *testing.T, d time.Duration) { t.Cleanup(func() { pushAttemptTimeout = original }) } -// (1) When the underlying git fetch reports an error, the user-visible -// "fetch failed: …" message must include some non-empty detail. Previously, -// when git's stdout was empty (e.g. SIGKILLed helper), the message ended -// with a bare colon and the user had no idea what failed. +// Regression: a colleague's terminal once showed "Warning: couldn't sync …: +// fetch failed:" with nothing after the colon, because the wrapping discarded +// git's underlying error whenever git produced no stdout. The wrapped error +// must now appear in the message even when output is empty. // // Not parallel: t.Chdir() for OpenRepository. func TestFetchAndRebase_FetchFailure_IncludesUnderlyingError(t *testing.T) { tmpDir := setupRepoWithCheckpointBranch(t) t.Chdir(tmpDir) - // Point at a path that does not exist. git fetch will fail; output is - // typically non-empty but the test passes either way because we now - // always include some detail. nonExistent := filepath.Join(t.TempDir(), "does-not-exist") err := fetchAndRebaseSessionsCommon(context.Background(), nonExistent, paths.MetadataBranchName) require.Error(t, err) msg := err.Error() - require.Greater(t, len(msg), len("fetch failed: "), "error message must include detail beyond 'fetch failed: '; got %q", msg) + require.True(t, strings.HasPrefix(msg, "fetch failed: "), "expected fetch failed prefix, got %q", msg) + detail := strings.TrimPrefix(msg, "fetch failed: ") + require.NotEmpty(t, strings.TrimSpace(detail), "fetch failed must carry a non-empty cause; got %q", msg) } -// (2) When tryPushSessionsCommon hits its own 2-minute deadline, doPushBranch -// must end the dot line with " timed out" (not silently). And (4): it must -// also skip the sync→retry cascade, since the same network condition would -// trip the fetch timeout too. +// On inner-push timeout, doPushBranch must end the dot line with " timed out", +// skip the sync→retry cascade (same network condition would also trip the +// fetch deadline), and surface the DEBUG-logging hint. // // Not parallel: t.Chdir() and stderr capture. func TestDoPushBranch_TimedOut_PrintsTimedOutAndSkipsSync(t *testing.T) { @@ -70,9 +69,8 @@ func TestDoPushBranch_TimedOut_PrintsTimedOutAndSkipsSync(t *testing.T) { assert.Contains(t, output, "log_level", "should hint at enabling DEBUG logging") } -// (4b) If the OUTER context is already cancelled when doPushBranch runs (the -// user hit Ctrl-C, or the hook deadline fired), we must bail without printing -// a misleading sync cascade or warnings. +// When the outer context is already cancelled (user Ctrl-C, hook deadline), +// doPushBranch must bail without a misleading sync cascade or warning. // // Not parallel: t.Chdir() and stderr capture. func TestDoPushBranch_OuterContextCancelled_BailsQuietly(t *testing.T) { @@ -91,9 +89,9 @@ func TestDoPushBranch_OuterContextCancelled_BailsQuietly(t *testing.T) { assert.NotContains(t, output, "Warning:", "must not print sync/retry warnings after outer cancel") } -// (3) Push attempts must be logged at INFO so doctor bundles capture -// push history even when log_level=INFO (the default). Without this, a -// bundle from a stuck push reveals nothing about what was attempted. +// Push attempts must reach INFO so doctor bundles capture push history at the +// default log level. Without this, a bundle from a stuck push reveals nothing +// about what was attempted. // // Not parallel: t.Chdir() and global logger state. func TestDoPushBranch_LogsAttemptsAtInfo(t *testing.T) { @@ -119,10 +117,8 @@ func TestDoPushBranch_LogsAttemptsAtInfo(t *testing.T) { assert.Contains(t, logText, `"branch":"`+paths.MetadataBranchName+`"`, "log must include branch") } -// (2/4) Companion test for the lower-level helper: tryPushSessionsCommon -// returns errPushTimedOut when its inner deadline fires before the outer -// context. Pinning this so future refactors of the cascade logic can rely -// on the sentinel. +// Pins the errPushTimedOut sentinel so future refactors of doPushBranch's +// cascade logic can keep relying on errors.Is(err, errPushTimedOut). // // Not parallel: t.Chdir(). func TestTryPushSessionsCommon_InnerTimeout_ReturnsErrPushTimedOut(t *testing.T) { @@ -148,11 +144,11 @@ func TestClassifyForLog(t *testing.T) { want string }{ {"nil", nil, ""}, - {"push timeout", errPushTimedOut, "timed_out"}, - {"fetch timeout", errFetchTimedOut, "timed_out"}, - {"non-fast-forward", errNonFastForward, "non_fast_forward"}, - {"protected", &protectedRefError{output: "GH013"}, "protected_ref"}, - {"other", errors.New("some random error"), "other"}, + {"push timeout", errPushTimedOut, pushClassTimedOut}, + {"fetch timeout", errFetchTimedOut, pushClassTimedOut}, + {"non-fast-forward", errNonFastForward, pushClassNonFastForward}, + {"protected", &protectedRefError{output: "GH013"}, pushClassProtectedRef}, + {"other", errors.New("some random error"), pushClassOther}, } for _, tc := range cases { From 9e03c8e72896cb9006de1146f411c206ab54fb58 Mon Sep 17 00:00:00 2001 From: Stefan Haubold Date: Tue, 12 May 2026 18:47:30 +0200 Subject: [PATCH 4/9] Bail on outer ctx at every stage; kill subprocess tree on Windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two review findings against the previous two commits: Finding 1: outer-context cancellation was only honored after the first push attempt. doPushBranch checked ctx.Err() once before deciding to sync, but the sync-failure and retry-failure tails unconditionally flowed through reportPushFailure, so a Ctrl-C or hook deadline arriving during fetch+rebase or during the retry push still produced misleading "Warning: couldn't sync …" / "failed to push … after sync" output. Move the ctx.Err() check into reportPushFailure itself so every cancellation point in the cascade collapses to the same silent bail: empty dot suffix, no log, no warning. New TestReportPushFailure_BailsOnOuter- ContextCancel pins the contract. Finding 2: KillOnCancel on Windows only terminated the direct child. CREATE_NEW_PROCESS_GROUP isolates console signals but does not propagate SIGKILL-equivalent semantics to descendants, and Process.Kill walks no tree — git-remote-https and credential helpers survived the cancel and kept the inherited stdio pipes open. Replace the direct kill with taskkill.exe /F /T /PID which walks the descendant chain. Fall back to Process.Kill if taskkill is unavailable; WaitDelay still bounds the user-visible hang in that case. Cross-builds under GOOS=windows; runtime test would require a Windows environment. Co-Authored-By: Claude Opus 4.7 (1M context) Entire-Checkpoint: d14d590faf48 --- cmd/entire/cli/execx/killoncancel_windows.go | 29 +++++++++++++--- cmd/entire/cli/strategy/push_common.go | 9 +++++ .../cli/strategy/push_messaging_test.go | 34 +++++++++++++++++++ 3 files changed, 67 insertions(+), 5 deletions(-) diff --git a/cmd/entire/cli/execx/killoncancel_windows.go b/cmd/entire/cli/execx/killoncancel_windows.go index f99903f249..693855d528 100644 --- a/cmd/entire/cli/execx/killoncancel_windows.go +++ b/cmd/entire/cli/execx/killoncancel_windows.go @@ -3,16 +3,24 @@ package execx import ( + "fmt" "os/exec" + "strconv" "syscall" "time" ) // KillOnCancel configures cmd so that when its context is cancelled the -// process is terminated and any inherited stdio pipes are forcibly closed -// after waitDelay. On Windows, CREATE_NEW_PROCESS_GROUP isolates the child -// from the parent's console signals; cmd.Process.Kill terminates the child, -// and WaitDelay covers stray descendants that may keep pipes open. +// process and any descendants are terminated and any inherited stdio pipes +// are forcibly closed after waitDelay. +// +// Windows specifics: CREATE_NEW_PROCESS_GROUP isolates the child from the +// parent's console signals, but neither it nor Process.Kill terminates +// descendants — git over HTTPS spawns git-remote-https plus credential +// helpers that survive a direct kill and keep the inherited stdio open. We +// shell out to taskkill.exe /F /T (force + tree) so the whole descendant +// chain dies. If taskkill is unavailable, we fall back to the direct kill +// and rely on waitDelay to bound the user-visible hang. func KillOnCancel(cmd *exec.Cmd, waitDelay time.Duration) { if cmd.SysProcAttr == nil { cmd.SysProcAttr = &syscall.SysProcAttr{} @@ -22,7 +30,18 @@ func KillOnCancel(cmd *exec.Cmd, waitDelay time.Duration) { if cmd.Process == nil { return nil } - return cmd.Process.Kill() + kill := exec.Command("taskkill.exe", "/F", "/T", "/PID", strconv.Itoa(cmd.Process.Pid)) + taskkillErr := kill.Run() + if taskkillErr == nil { + return nil + } + // taskkill is missing or refused — fall back to direct kill so at + // least the visible child terminates. WaitDelay still closes the + // pipes within bounded time, so the caller does not hang. + if killErr := cmd.Process.Kill(); killErr != nil { + return fmt.Errorf("taskkill: %w; fallback kill: %w", taskkillErr, killErr) + } + return taskkillErr } cmd.WaitDelay = waitDelay } diff --git a/cmd/entire/cli/strategy/push_common.go b/cmd/entire/cli/strategy/push_common.go index 861eee4237..f2182cda66 100644 --- a/cmd/entire/cli/strategy/push_common.go +++ b/cmd/entire/cli/strategy/push_common.go @@ -225,7 +225,16 @@ type reportPushFailureArgs struct { // pick the dot-line suffix (`" timed out"` vs blank), log at INFO with a // classification label, print the warning, and surface the // checkpoint-remote hint when the target is a URL. +// +// If the outer context is done (user Ctrl-C or hook deadline arrived after +// the first push attempt), this collapses to a silent bail: empty dot suffix, +// no log, no warning. Matches the early bail-out in doPushBranch's +// first-attempt path so that every cancellation point behaves the same. func reportPushFailure(ctx context.Context, a reportPushFailureArgs) { + if ctx.Err() != nil { + a.stop("") + return + } suffix := "" if errors.Is(a.err, errPushTimedOut) || errors.Is(a.err, errFetchTimedOut) { suffix = " timed out" diff --git a/cmd/entire/cli/strategy/push_messaging_test.go b/cmd/entire/cli/strategy/push_messaging_test.go index 010bf0ce2c..693c4782fa 100644 --- a/cmd/entire/cli/strategy/push_messaging_test.go +++ b/cmd/entire/cli/strategy/push_messaging_test.go @@ -117,6 +117,40 @@ func TestDoPushBranch_LogsAttemptsAtInfo(t *testing.T) { assert.Contains(t, logText, `"branch":"`+paths.MetadataBranchName+`"`, "log must include branch") } +// reportPushFailure is the single chokepoint for the sync- and retry-stage +// failure tails of doPushBranch. If the outer context is done by the time +// it runs (Ctrl-C / hook deadline that arrived after the first push attempt +// completed), it must collapse to a silent bail — empty dot suffix, no log, +// no warning, no hint. Without this, late cancellation can still produce +// misleading "Warning: couldn't sync …" output. +func TestReportPushFailure_BailsOnOuterContextCancel(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + var suffixes []string + stop := func(s string) { suffixes = append(suffixes, s) } + + old := pushStderr + var buf strings.Builder + pushStderr = &buf + t.Cleanup(func() { pushStderr = old }) + + reportPushFailure(ctx, reportPushFailureArgs{ + stop: stop, + err: errors.New("simulated sync failure"), + logMsg: "push sync failed", + warnMsg: "[entire] Warning: couldn't sync foo: …\n", + branch: "entire/checkpoints/v1", + target: "origin", + start: time.Now(), + }) + + assert.Equal(t, []string{""}, suffixes, "stop must be called with empty suffix to clear the dot line") + assert.Empty(t, buf.String(), "no warning should be emitted after outer cancel") +} + // Pins the errPushTimedOut sentinel so future refactors of doPushBranch's // cascade logic can keep relying on errors.Is(err, errPushTimedOut). // From 8246e7839028a4411753ca50b21cf49d97482beb Mon Sep 17 00:00:00 2001 From: Stefan Haubold Date: Tue, 12 May 2026 20:03:48 +0200 Subject: [PATCH 5/9] Inject writer into reportPushFailure for race-safe tests TestReportPushFailure_BailsOnOuterContextCancel previously called t.Parallel() and then reassigned the package-global pushStderr, which races with any non-parallel test that uses captureStderr (the shared helper that also swaps pushStderr). Under the race detector this would surface as a data race; in practice the t.Cleanup ordering could leak a closed pipe into another test. Add an `out io.Writer` field to reportPushFailureArgs and have doPushBranch pass pushStderr at the two call sites. The test now injects a local strings.Builder and never touches the global, so t.Parallel() is sound. Verified with `go test -race`. pushStderr itself remains as the writer used directly by doPushBranch (dot lines, timed-out warning, hint). Plumbing it through to every call site is a larger refactor and out of scope. Co-Authored-By: Claude Opus 4.7 (1M context) Entire-Checkpoint: c87f60db628c --- cmd/entire/cli/strategy/push_common.go | 9 +++++++-- cmd/entire/cli/strategy/push_messaging_test.go | 11 +++++------ 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/cmd/entire/cli/strategy/push_common.go b/cmd/entire/cli/strategy/push_common.go index f2182cda66..84a025c84f 100644 --- a/cmd/entire/cli/strategy/push_common.go +++ b/cmd/entire/cli/strategy/push_common.go @@ -171,6 +171,7 @@ func doPushBranch(ctx context.Context, target, branchName string) error { if syncErr := fetchAndRebaseSessionsCommon(ctx, target, branchName); syncErr != nil { reportPushFailure(ctx, reportPushFailureArgs{ + out: pushStderr, stop: stop, err: syncErr, logMsg: "push sync failed", @@ -190,6 +191,7 @@ func doPushBranch(ctx context.Context, target, branchName string) error { retryResult, retryErr := tryPushSessionsCommon(ctx, target, branchName) if retryErr != nil { reportPushFailure(ctx, reportPushFailureArgs{ + out: pushStderr, stop: stop, err: retryErr, logMsg: "push retry failed", @@ -210,8 +212,11 @@ func doPushBranch(ctx context.Context, target, branchName string) error { } // reportPushFailureArgs groups the inputs to reportPushFailure to keep the -// recoverable-failure paths in doPushBranch readable. +// recoverable-failure paths in doPushBranch readable. out is taken as a +// parameter (rather than reaching into the package-global pushStderr) so +// tests can capture without racing on shared state. type reportPushFailureArgs struct { + out io.Writer stop func(string) err error logMsg string @@ -245,7 +250,7 @@ func reportPushFailure(ctx context.Context, a reportPushFailureArgs) { slog.Duration("elapsed", time.Since(a.start)), slog.String("class", classifyForLog(a.err)), ) - fmt.Fprint(pushStderr, a.warnMsg) + fmt.Fprint(a.out, a.warnMsg) printCheckpointRemoteHint(a.target) } diff --git a/cmd/entire/cli/strategy/push_messaging_test.go b/cmd/entire/cli/strategy/push_messaging_test.go index 693c4782fa..06a604e920 100644 --- a/cmd/entire/cli/strategy/push_messaging_test.go +++ b/cmd/entire/cli/strategy/push_messaging_test.go @@ -123,6 +123,9 @@ func TestDoPushBranch_LogsAttemptsAtInfo(t *testing.T) { // completed), it must collapse to a silent bail — empty dot suffix, no log, // no warning, no hint. Without this, late cancellation can still produce // misleading "Warning: couldn't sync …" output. +// +// Safe under t.Parallel because reportPushFailureArgs.out lets us inject a +// per-test writer instead of mutating package-global pushStderr. func TestReportPushFailure_BailsOnOuterContextCancel(t *testing.T) { t.Parallel() @@ -130,15 +133,11 @@ func TestReportPushFailure_BailsOnOuterContextCancel(t *testing.T) { cancel() var suffixes []string - stop := func(s string) { suffixes = append(suffixes, s) } - - old := pushStderr var buf strings.Builder - pushStderr = &buf - t.Cleanup(func() { pushStderr = old }) reportPushFailure(ctx, reportPushFailureArgs{ - stop: stop, + out: &buf, + stop: func(s string) { suffixes = append(suffixes, s) }, err: errors.New("simulated sync failure"), logMsg: "push sync failed", warnMsg: "[entire] Warning: couldn't sync foo: …\n", From cc0307d75db33f5849c111b90664a9f19f646ecc Mon Sep 17 00:00:00 2001 From: Stefan Haubold Date: Wed, 13 May 2026 14:00:06 +0200 Subject: [PATCH 6/9] Distinguish inner fetch timeout from outer context cancellation In fetchAndRebaseSessionsCommon, the timeout-wrapped context shadowed the outer ctx, so a hook-level deadline shorter than fetchAttemptTimeout was misreported as errFetchTimedOut. Mirror tryPushSessionsCommon's pattern: hold the wrapped context as localCtx and gate the sentinel on the outer ctx still being alive. Entire-Checkpoint: cce2e7fcfcd8 --- cmd/entire/cli/strategy/push_common.go | 25 +++++++++++-------- .../cli/strategy/push_messaging_test.go | 22 ++++++++++++++++ 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/cmd/entire/cli/strategy/push_common.go b/cmd/entire/cli/strategy/push_common.go index 84a025c84f..68a1ccda2c 100644 --- a/cmd/entire/cli/strategy/push_common.go +++ b/cmd/entire/cli/strategy/push_common.go @@ -535,10 +535,10 @@ func printProtectedRefBlock(w io.Writer, ref, target string) { // always apply cleanly. // The target can be a remote name or a URL. func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string) error { - ctx, cancel := context.WithTimeout(ctx, fetchAttemptTimeout) + localCtx, cancel := context.WithTimeout(ctx, fetchAttemptTimeout) defer cancel() - fetchTarget, err := remote.ResolveFetchTarget(ctx, target) + fetchTarget, err := remote.ResolveFetchTarget(localCtx, target) if err != nil { return fmt.Errorf("resolve fetch target: %w", err) } @@ -562,14 +562,17 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string // Use --filter=blob:none for a partial fetch that downloads only commits // and trees, skipping blobs. The merge only needs the tree structure to // combine entries; blobs are already local or fetched on demand. - if output, fetchErr := remote.Fetch(ctx, remote.FetchOptions{ + if output, fetchErr := remote.Fetch(localCtx, remote.FetchOptions{ Remote: fetchTarget, RefSpecs: []string{refSpec}, NoTags: true, }); fetchErr != nil { - // Inner deadline fired: distinct sentinel so doPushBranch can render - // " timed out" instead of an empty trailing colon. - if errors.Is(ctx.Err(), context.DeadlineExceeded) { + // Inner deadline fired (but outer is still alive): distinct sentinel + // so doPushBranch can render " timed out" instead of an empty trailing + // colon. When the outer context fired, let the wrapped error propagate + // so the caller sees the real cancellation cause instead of an inner- + // timeout mislabel. + if errors.Is(localCtx.Err(), context.DeadlineExceeded) && ctx.Err() == nil { logging.Debug(ctx, "git fetch timed out", slog.String("error", fetchErr.Error()), slog.Duration("after", fetchAttemptTimeout), @@ -586,7 +589,7 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string return fmt.Errorf("fetch failed: %s", msg) } - repo, err := OpenRepository(ctx) + repo, err := OpenRepository(localCtx) if err != nil { return fmt.Errorf("failed to open git repository: %w", err) } @@ -597,7 +600,7 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string // this cherry-picks local commits onto remote tip, updating the local ref. // If reconciliation fails, abort — proceeding to rebase on disconnected // branches would silently combine unrelated histories. - if reconcileErr := ReconcileDisconnectedMetadataBranch(ctx, repo, fetchedRefName, pushStderr); reconcileErr != nil { + if reconcileErr := ReconcileDisconnectedMetadataBranch(localCtx, repo, fetchedRefName, pushStderr); reconcileErr != nil { return fmt.Errorf("metadata reconciliation failed: %w", reconcileErr) } @@ -623,7 +626,7 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string if err != nil { return fmt.Errorf("failed to get repo path: %w", err) } - mergeBase, err := getMergeBase(ctx, repoPath, localRef.Hash().String(), remoteRef.Hash().String()) + mergeBase, err := getMergeBase(localCtx, repoPath, localRef.Hash().String(), remoteRef.Hash().String()) if err != nil { return fmt.Errorf("failed to find merge base: %w", err) } @@ -644,7 +647,7 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string // them onto the remote tip. This preserves local-only commits even when the // local metadata branch already contains old merge commits, while avoiding // replaying shared ancestors older than the true merge-base. - localCommits, err := collectCommitsSince(ctx, repo, repoPath, localRef.Hash(), remoteRef.Hash()) + localCommits, err := collectCommitsSince(localCtx, repo, repoPath, localRef.Hash(), remoteRef.Hash()) if err != nil { return fmt.Errorf("failed to collect local commits: %w", err) } @@ -661,7 +664,7 @@ func fetchAndRebaseSessionsCommon(ctx context.Context, target, branchName string return nil } - newTip, err := cherryPickOnto(ctx, repo, remoteRef.Hash(), localCommits) + newTip, err := cherryPickOnto(localCtx, repo, remoteRef.Hash(), localCommits) if err != nil { return fmt.Errorf("failed to rebase local commits onto remote: %w", err) } diff --git a/cmd/entire/cli/strategy/push_messaging_test.go b/cmd/entire/cli/strategy/push_messaging_test.go index 06a604e920..52d3df13bf 100644 --- a/cmd/entire/cli/strategy/push_messaging_test.go +++ b/cmd/entire/cli/strategy/push_messaging_test.go @@ -46,6 +46,28 @@ func TestFetchAndRebase_FetchFailure_IncludesUnderlyingError(t *testing.T) { require.NotEmpty(t, strings.TrimSpace(detail), "fetch failed must carry a non-empty cause; got %q", msg) } +// Regression: when the outer context already fired its deadline (e.g. a +// hook-level timeout shorter than fetchAttemptTimeout), the inner localCtx +// also reports DeadlineExceeded because it derives from outer. A naive check +// against the inner ctx alone would misreport this as errFetchTimedOut and +// swallow the real outer cancellation cause. The fix gates the sentinel on +// the outer ctx still being alive. +// +// Not parallel: t.Chdir() for OpenRepository. +func TestFetchAndRebase_OuterDeadlineFired_DoesNotReturnFetchTimedOut(t *testing.T) { + tmpDir := setupRepoWithCheckpointBranch(t) + t.Chdir(tmpDir) + + ctx, cancel := context.WithDeadline(context.Background(), time.Now().Add(-time.Hour)) + defer cancel() + + nonExistent := filepath.Join(t.TempDir(), "does-not-exist") + err := fetchAndRebaseSessionsCommon(ctx, nonExistent, paths.MetadataBranchName) + require.Error(t, err) + assert.NotErrorIs(t, err, errFetchTimedOut, + "outer-context deadline must not be misreported as inner fetch timeout; got %v", err) +} + // On inner-push timeout, doPushBranch must end the dot line with " timed out", // skip the sync→retry cascade (same network condition would also trip the // fetch deadline), and surface the DEBUG-logging hint. From b5032e315ed767687236e910db394a058a4c9865 Mon Sep 17 00:00:00 2001 From: Stefan Haubold Date: Wed, 13 May 2026 14:00:36 +0200 Subject: [PATCH 7/9] Invoke sh by name in killoncancel tests The tests guard with exec.LookPath("sh") but were invoking /bin/sh directly, which would skip the guard's intent on systems where sh lives outside /bin (some non-FHS layouts). Use the PATH-resolved form so the guard's outcome actually governs the test. Entire-Checkpoint: bd7c6a83418a --- cmd/entire/cli/execx/killoncancel_unix_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/entire/cli/execx/killoncancel_unix_test.go b/cmd/entire/cli/execx/killoncancel_unix_test.go index a6d3a43937..c97b912bd6 100644 --- a/cmd/entire/cli/execx/killoncancel_unix_test.go +++ b/cmd/entire/cli/execx/killoncancel_unix_test.go @@ -31,7 +31,7 @@ func TestExecCommandContext_HangsOnGrandchildHoldingPipe(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) defer cancel() - cmd := exec.CommandContext(ctx, "/bin/sh", "-c", grandchildHoldingStdoutCmd) + cmd := exec.CommandContext(ctx, "sh", "-c", grandchildHoldingStdoutCmd) start := time.Now() // We expect CombinedOutput to return with an error (killed by ctx) — the @@ -55,7 +55,7 @@ func TestKillOnCancel_TerminatesGrandchildHoldingPipe(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) defer cancel() - cmd := exec.CommandContext(ctx, "/bin/sh", "-c", grandchildHoldingStdoutCmd) + cmd := exec.CommandContext(ctx, "sh", "-c", grandchildHoldingStdoutCmd) KillOnCancel(cmd, 500*time.Millisecond) start := time.Now() From f060ba3cdf22b378395f7b20b08827859c2407e5 Mon Sep 17 00:00:00 2001 From: Stefan Haubold Date: Wed, 13 May 2026 14:01:07 +0200 Subject: [PATCH 8/9] Use zero timeout for push-timeout tests Forcing pushAttemptTimeout to 1ms still raced against a fast local-bare-repo push that could complete before the deadline fired. Setting it to 0 makes localCtx fire DeadlineExceeded synchronously, so the test always reaches the errPushTimedOut path. Entire-Checkpoint: faf57c42d164 --- cmd/entire/cli/strategy/push_messaging_test.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cmd/entire/cli/strategy/push_messaging_test.go b/cmd/entire/cli/strategy/push_messaging_test.go index 52d3df13bf..54a0ceb3cd 100644 --- a/cmd/entire/cli/strategy/push_messaging_test.go +++ b/cmd/entire/cli/strategy/push_messaging_test.go @@ -79,7 +79,10 @@ func TestDoPushBranch_TimedOut_PrintsTimedOutAndSkipsSync(t *testing.T) { // Force every push attempt to time out immediately. KillOnCancel + // WaitDelay ensures the call returns within milliseconds. - withShortPushTimeout(t, 1*time.Millisecond) + // Zero timeout makes localCtx fire DeadlineExceeded synchronously, so the + // remote.Push call sees an already-cancelled context and we exercise the + // errPushTimedOut path without racing the network or local push. + withShortPushTimeout(t, 0) restore := captureStderr(t) err := doPushBranch(context.Background(), bareDir, paths.MetadataBranchName) @@ -180,7 +183,10 @@ func TestTryPushSessionsCommon_InnerTimeout_ReturnsErrPushTimedOut(t *testing.T) workDir, bareDir := setupBareRemoteWithCheckpointBranch(t) t.Chdir(workDir) - withShortPushTimeout(t, 1*time.Millisecond) + // Zero timeout makes localCtx fire DeadlineExceeded synchronously, so the + // remote.Push call sees an already-cancelled context and we exercise the + // errPushTimedOut path without racing the network or local push. + withShortPushTimeout(t, 0) _, err := tryPushSessionsCommon(context.Background(), bareDir, paths.MetadataBranchName) require.Error(t, err) From 7d7276baa1a891b34dd074305f7bf9e1677214e2 Mon Sep 17 00:00:00 2001 From: Stefan Haubold Date: Wed, 13 May 2026 14:02:23 +0200 Subject: [PATCH 9/9] Include target alias in push INFO logs Only "push attempt start" carried the target field; completion, failure, retry, sync-failure, and retry-failure entries omitted it. Doctor bundles collecting these had to infer the target from surrounding context, which breaks when multiple remotes (origin + checkpoint remote) push in the same hook invocation. Add target (redacted via displayPushTarget for URL targets) to every push-related log line. Entire-Checkpoint: 2c5cd80e2128 --- cmd/entire/cli/strategy/push_common.go | 4 ++++ cmd/entire/cli/strategy/push_messaging_test.go | 3 +++ 2 files changed, 7 insertions(+) diff --git a/cmd/entire/cli/strategy/push_common.go b/cmd/entire/cli/strategy/push_common.go index 68a1ccda2c..dc1c39a01e 100644 --- a/cmd/entire/cli/strategy/push_common.go +++ b/cmd/entire/cli/strategy/push_common.go @@ -123,6 +123,7 @@ func doPushBranch(ctx context.Context, target, branchName string) error { if err == nil { logging.Info(ctx, "push attempt completed", slog.String("branch", branchName), + slog.String("target", displayTarget), slog.Duration("elapsed", time.Since(pushStart)), slog.Bool("up_to_date", result.upToDate), ) @@ -132,6 +133,7 @@ func doPushBranch(ctx context.Context, target, branchName string) error { logging.Info(ctx, "push attempt failed", slog.String("branch", branchName), + slog.String("target", displayTarget), slog.Duration("elapsed", time.Since(pushStart)), slog.String("class", classifyForLog(err)), ) @@ -204,6 +206,7 @@ func doPushBranch(ctx context.Context, target, branchName string) error { } logging.Info(ctx, "push retry completed", slog.String("branch", branchName), + slog.String("target", displayTarget), slog.Duration("elapsed", time.Since(retryStart)), slog.Bool("up_to_date", retryResult.upToDate), ) @@ -247,6 +250,7 @@ func reportPushFailure(ctx context.Context, a reportPushFailureArgs) { a.stop(suffix) logging.Info(ctx, a.logMsg, slog.String("branch", a.branch), + slog.String("target", displayPushTarget(a.target)), slog.Duration("elapsed", time.Since(a.start)), slog.String("class", classifyForLog(a.err)), ) diff --git a/cmd/entire/cli/strategy/push_messaging_test.go b/cmd/entire/cli/strategy/push_messaging_test.go index 54a0ceb3cd..7ee44ed8f5 100644 --- a/cmd/entire/cli/strategy/push_messaging_test.go +++ b/cmd/entire/cli/strategy/push_messaging_test.go @@ -140,6 +140,9 @@ func TestDoPushBranch_LogsAttemptsAtInfo(t *testing.T) { assert.Contains(t, logText, "push attempt start", "log must record push start") assert.Contains(t, logText, "push attempt completed", "log must record push completion") assert.Contains(t, logText, `"branch":"`+paths.MetadataBranchName+`"`, "log must include branch") + // Doctor bundles need the target on completion entries too, not just on + // the start record, so a stuck or retried push can be correlated. + assert.Contains(t, logText, `"target":"`+bareDir+`"`, "completion log must include target") } // reportPushFailure is the single chokepoint for the sync- and retry-stage