From ad4398dcded85af8b80d6cb172e8261027570232 Mon Sep 17 00:00:00 2001 From: CMGS Date: Fri, 15 May 2026 17:51:39 +0800 Subject: [PATCH] fix: make CH pause/resume idempotent on cocoon side MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CH's Vm::pause / Vm::resume at vmm/src/vm.rs:412-440 reject same-state transitions (Paused→Paused, Running→Running) as InvalidStateTransition, returning 500. A cocoon process SIGKILLed between pause and resume during snapshot save leaves the VM stuck Paused, after which every subsequent snapshot save fails — observed on GKE prod and journal-confirmed. pauseVM / resumeVM now swallow CH's exact "Invalid transition: InvalidStateTransition(, )" 500 body, treating it as "already in the desired state". Matched via APIError code + substring; transport errors, other 500 bodies, and different transitions (Created→Paused etc.) still propagate. FC research (upstream src/vmm/src/vstate/vcpu.rs): FC's vCPU event loop acks Pause from paused state and Resume from running state without error — FC API is inherently idempotent. Updated the misleading "Non-idempotent" comment on FC's pauseVM/resumeVM; no behavior change. --- hypervisor/cloudhypervisor/helper.go | 18 +++++++++++ hypervisor/cloudhypervisor/helper_test.go | 39 +++++++++++++++++++++++ hypervisor/firecracker/api.go | 6 ++-- 3 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 hypervisor/cloudhypervisor/helper_test.go diff --git a/hypervisor/cloudhypervisor/helper.go b/hypervisor/cloudhypervisor/helper.go index 207f65f..9270387 100644 --- a/hypervisor/cloudhypervisor/helper.go +++ b/hypervisor/cloudhypervisor/helper.go @@ -3,6 +3,7 @@ package cloudhypervisor import ( "context" "encoding/json" + "errors" "fmt" "net/http" "os" @@ -112,16 +113,33 @@ func shutdownVM(ctx context.Context, hc *http.Client) error { return err } +// pauseVM is idempotent — swallows CH's Paused→Paused 500 so a stuck-paused VM recovers. func pauseVM(ctx context.Context, hc *http.Client) error { _, err := vmAPIOnce(ctx, hc, "vm.pause", nil) + if err != nil && isAlreadyInStateError(err, "Paused") { + return nil + } return err } +// resumeVM is idempotent — swallows CH's Running→Running 500. func resumeVM(ctx context.Context, hc *http.Client) error { _, err := vmAPIOnce(ctx, hc, "vm.resume", nil) + if err != nil && isAlreadyInStateError(err, "Running") { + return nil + } return err } +// isAlreadyInStateError matches CH's exact `Invalid transition: InvalidStateTransition(, )` in a 500 body. +func isAlreadyInStateError(err error, state string) bool { + var ae *utils.APIError + if !errors.As(err, &ae) || ae.Code != http.StatusInternalServerError { + return false + } + return strings.Contains(ae.Message, fmt.Sprintf("Invalid transition: InvalidStateTransition(%s, %s)", state, state)) +} + // snapshotVM and restoreVM temporarily extend the client timeout for // long-running memory transfers, then restore it for subsequent calls. func snapshotVM(ctx context.Context, hc *http.Client, destDir string) error { diff --git a/hypervisor/cloudhypervisor/helper_test.go b/hypervisor/cloudhypervisor/helper_test.go new file mode 100644 index 0000000..f9195db --- /dev/null +++ b/hypervisor/cloudhypervisor/helper_test.go @@ -0,0 +1,39 @@ +package cloudhypervisor + +import ( + "errors" + "fmt" + "net/http" + "testing" + + "github.com/cocoonstack/cocoon/utils" +) + +func TestIsAlreadyInStateError(t *testing.T) { + chPaused := `PUT http://localhost/api/v1/vm.pause → 500: ["Error from API","The VM could not be paused","Cannot pause VM","Failed to pause migratable component","Invalid transition: InvalidStateTransition(Paused, Paused)"]` + chRunning := `PUT http://localhost/api/v1/vm.resume → 500: ["Error from API","Cannot resume VM","Failed","Invalid transition: InvalidStateTransition(Running, Running)"]` + + tests := []struct { + name string + err error + state string + want bool + }{ + {name: "paused paused match", err: &utils.APIError{Code: http.StatusInternalServerError, Message: chPaused}, state: "Paused", want: true}, + {name: "running running match", err: &utils.APIError{Code: http.StatusInternalServerError, Message: chRunning}, state: "Running", want: true}, + {name: "wrong state in match", err: &utils.APIError{Code: http.StatusInternalServerError, Message: chPaused}, state: "Running", want: false}, + {name: "non-500 code", err: &utils.APIError{Code: http.StatusBadRequest, Message: chPaused}, state: "Paused", want: false}, + {name: "different transition (Created→Paused)", err: &utils.APIError{Code: http.StatusInternalServerError, Message: "InvalidStateTransition(Created, Paused)"}, state: "Paused", want: false}, + {name: "non-APIError", err: errors.New("dial unix: connection refused"), state: "Paused", want: false}, + {name: "nil error", err: nil, state: "Paused", want: false}, + {name: "wrapped APIError", err: fmt.Errorf("snapshot save: %w", &utils.APIError{Code: http.StatusInternalServerError, Message: chPaused}), state: "Paused", want: true}, + {name: "empty state", err: &utils.APIError{Code: http.StatusInternalServerError, Message: chPaused}, state: "", want: false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := isAlreadyInStateError(tt.err, tt.state); got != tt.want { + t.Errorf("isAlreadyInStateError(%v, %q) = %v, want %v", tt.err, tt.state, got, tt.want) + } + }) + } +} diff --git a/hypervisor/firecracker/api.go b/hypervisor/firecracker/api.go index 27ed966..73a1a81 100644 --- a/hypervisor/firecracker/api.go +++ b/hypervisor/firecracker/api.go @@ -169,8 +169,7 @@ func sendCtrlAltDel(ctx context.Context, hc *http.Client) error { return fcAPIOnce(ctx, hc, http.MethodPut, "/actions", body) } -// pauseVM pauses a running FC instance via PATCH /vm. Non-idempotent: a -// retry after a lost response would hit "already paused" and mask success. +// pauseVM pauses a running FC instance via PATCH /vm. Idempotent: FC's vCPU event loop acks Pause from the paused state without error (vstate/vcpu.rs). func pauseVM(ctx context.Context, hc *http.Client) error { body, err := json.Marshal(map[string]string{"state": vmStatePaused}) if err != nil { @@ -179,8 +178,7 @@ func pauseVM(ctx context.Context, hc *http.Client) error { return fcAPIOnce(ctx, hc, http.MethodPatch, "/vm", body) } -// resumeVM resumes a paused FC instance via PATCH /vm. Same non-idempotent -// shape as pauseVM. +// resumeVM resumes a paused FC instance via PATCH /vm. Idempotent like pauseVM. func resumeVM(ctx context.Context, hc *http.Client) error { body, err := json.Marshal(map[string]string{"state": vmStateResumed}) if err != nil {