From f057f92c993438eda4bbe556ff8435e19ae7c1a6 Mon Sep 17 00:00:00 2001 From: ashishch432 Date: Sun, 26 Apr 2026 11:03:54 +0000 Subject: [PATCH] fix: cache version probe result to prevent reconciliation deadlock --- api/v1alpha1/clickhousecluster_types.go | 5 + api/v1alpha1/keepercluster_types.go | 5 + .../clickhouse.com_clickhouseclusters.yaml | 5 + .../bases/clickhouse.com_keeperclusters.yaml | 5 + .../clickhouseclusters.clickhouse.com.yaml | 5 + .../crd/keeperclusters.clickhouse.com.yaml | 5 + docs/api_reference.md | 2 + .../controller/clickhouse/controller_test.go | 6 + internal/controller/clickhouse/sync.go | 3 + internal/controller/keeper/sync.go | 7 +- internal/controller/versionprobe.go | 63 +++++--- internal/controller/versionprobe_test.go | 141 ++++++++++++++++++ 12 files changed, 233 insertions(+), 19 deletions(-) diff --git a/api/v1alpha1/clickhousecluster_types.go b/api/v1alpha1/clickhousecluster_types.go index ab38a7f0..9cabf663 100644 --- a/api/v1alpha1/clickhousecluster_types.go +++ b/api/v1alpha1/clickhousecluster_types.go @@ -199,6 +199,11 @@ type ClickHouseClusterStatus struct { // +optional // +operator-sdk:csv:customresourcedefinitions:type=status Version string `json:"version,omitempty"` + // VersionProbeRevision is the image hash of the last successful version probe. + // When this matches the current image hash, the cached Version is used directly. + // +optional + // +operator-sdk:csv:customresourcedefinitions:type=status + VersionProbeRevision string `json:"versionProbeRevision,omitempty"` } // ClickHouseCluster is the Schema for the `clickhouseclusters` API. diff --git a/api/v1alpha1/keepercluster_types.go b/api/v1alpha1/keepercluster_types.go index bfa74fed..0ec022a7 100644 --- a/api/v1alpha1/keepercluster_types.go +++ b/api/v1alpha1/keepercluster_types.go @@ -160,6 +160,11 @@ type KeeperClusterStatus struct { // +optional // +operator-sdk:csv:customresourcedefinitions:type=status Version string `json:"version,omitempty"` + // VersionProbeRevision is the image hash of the last successful version probe. + // When this matches the current image hash, the cached Version is used directly. + // +optional + // +operator-sdk:csv:customresourcedefinitions:type=status + VersionProbeRevision string `json:"versionProbeRevision,omitempty"` } // KeeperCluster is the Schema for the `keeperclusters` API. diff --git a/config/crd/bases/clickhouse.com_clickhouseclusters.yaml b/config/crd/bases/clickhouse.com_clickhouseclusters.yaml index 1b46c3b1..befc2e87 100644 --- a/config/crd/bases/clickhouse.com_clickhouseclusters.yaml +++ b/config/crd/bases/clickhouse.com_clickhouseclusters.yaml @@ -6963,6 +6963,11 @@ spec: description: Version indicates the version reported by the container image. type: string + versionProbeRevision: + description: |- + VersionProbeRevision is the image hash of the last successful version probe. + When this matches the current image hash, the cached Version is used directly. + type: string type: object type: object served: true diff --git a/config/crd/bases/clickhouse.com_keeperclusters.yaml b/config/crd/bases/clickhouse.com_keeperclusters.yaml index de7907b7..66e7383d 100644 --- a/config/crd/bases/clickhouse.com_keeperclusters.yaml +++ b/config/crd/bases/clickhouse.com_keeperclusters.yaml @@ -6875,6 +6875,11 @@ spec: description: Version indicates the version reported by the container image. type: string + versionProbeRevision: + description: |- + VersionProbeRevision is the image hash of the last successful version probe. + When this matches the current image hash, the cached Version is used directly. + type: string type: object type: object served: true diff --git a/dist/chart/templates/crd/clickhouseclusters.clickhouse.com.yaml b/dist/chart/templates/crd/clickhouseclusters.clickhouse.com.yaml index 0a1f1a65..3e92b3cf 100644 --- a/dist/chart/templates/crd/clickhouseclusters.clickhouse.com.yaml +++ b/dist/chart/templates/crd/clickhouseclusters.clickhouse.com.yaml @@ -6966,6 +6966,11 @@ spec: description: Version indicates the version reported by the container image. type: string + versionProbeRevision: + description: |- + VersionProbeRevision is the image hash of the last successful version probe. + When this matches the current image hash, the cached Version is used directly. + type: string type: object type: object served: true diff --git a/dist/chart/templates/crd/keeperclusters.clickhouse.com.yaml b/dist/chart/templates/crd/keeperclusters.clickhouse.com.yaml index e20b0c8c..71b32265 100644 --- a/dist/chart/templates/crd/keeperclusters.clickhouse.com.yaml +++ b/dist/chart/templates/crd/keeperclusters.clickhouse.com.yaml @@ -6878,6 +6878,11 @@ spec: description: Version indicates the version reported by the container image. type: string + versionProbeRevision: + description: |- + VersionProbeRevision is the image hash of the last successful version probe. + When this matches the current image hash, the cached Version is used directly. + type: string type: object type: object served: true diff --git a/docs/api_reference.md b/docs/api_reference.md index fa6f5a90..ae28df20 100644 --- a/docs/api_reference.md +++ b/docs/api_reference.md @@ -78,6 +78,7 @@ ClickHouseClusterStatus defines the observed state of ClickHouseCluster. | `updateRevision` | string | UpdateRevision indicates latest requested ClickHouseCluster spec revision. | true | | | `observedGeneration` | integer | ObservedGeneration indicates latest generation observed by controller. | true | | | `version` | string | Version indicates the version reported by the container image. | false | | +| `versionProbeRevision` | string | VersionProbeRevision is the image hash of the last successful version probe.
When this matches the current image hash, the cached Version is used directly. | false | | Appears in: - [ClickHouseCluster](#clickhousecluster) @@ -278,6 +279,7 @@ KeeperClusterStatus defines the observed state of KeeperCluster. | `updateRevision` | string | CurrentRevision indicates latest requested KeeperCluster spec revision. | true | | | `observedGeneration` | integer | ObservedGeneration indicates latest generation observed by controller. | true | | | `version` | string | Version indicates the version reported by the container image. | false | | +| `versionProbeRevision` | string | VersionProbeRevision is the image hash of the last successful version probe.
When this matches the current image hash, the cached Version is used directly. | false | | Appears in: - [KeeperCluster](#keepercluster) diff --git a/internal/controller/clickhouse/controller_test.go b/internal/controller/clickhouse/controller_test.go index 6b9fe73e..4b3a19b6 100644 --- a/internal/controller/clickhouse/controller_test.go +++ b/internal/controller/clickhouse/controller_test.go @@ -182,6 +182,12 @@ var _ = When("reconciling ClickHouseCluster", Ordered, func() { } Expect(suite.Client.Update(ctx, updatedCR)).To(Succeed()) + // Clear the cached version probe revision to force the probe to re-run, + // since the image didn't change but the overrides did. + Expect(suite.Client.Get(ctx, cr.NamespacedName(), updatedCR)).To(Succeed()) + updatedCR.Status.VersionProbeRevision = "" + Expect(suite.Client.Status().Update(ctx, updatedCR)).To(Succeed()) + // Delete old job so new one is created with overrides. for _, j := range jobs.Items { Expect(suite.Client.Delete(ctx, &j, client.PropagationPolicy(metav1.DeletePropagationBackground))).To(Succeed()) diff --git a/internal/controller/clickhouse/sync.go b/internal/controller/clickhouse/sync.go index 9323ec69..1ff56f0d 100644 --- a/internal/controller/clickhouse/sync.go +++ b/internal/controller/clickhouse/sync.go @@ -414,6 +414,8 @@ func (r *clickhouseReconciler) reconcileVersionProbe(ctx context.Context, log ct PodTemplate: r.Cluster.Spec.PodTemplate, ContainerTemplate: r.Cluster.Spec.ContainerTemplate, VersionProbe: r.Cluster.Spec.VersionProbeTemplate, + CachedVersion: r.Cluster.Status.Version, + CachedRevision: r.Cluster.Status.VersionProbeRevision, }) if err != nil { return chctrl.StepResult{}, fmt.Errorf("run version probe: %w", err) @@ -422,6 +424,7 @@ func (r *clickhouseReconciler) reconcileVersionProbe(ctx context.Context, log ct r.versionProbe = probeResult if probeResult.Completed() { r.Cluster.Status.Version = probeResult.Version + r.Cluster.Status.VersionProbeRevision = probeResult.Revision } return chctrl.StepContinue(), nil diff --git a/internal/controller/keeper/sync.go b/internal/controller/keeper/sync.go index 72d69f90..a4385d2a 100644 --- a/internal/controller/keeper/sync.go +++ b/internal/controller/keeper/sync.go @@ -216,13 +216,18 @@ func (r *keeperReconciler) reconcileClusterRevisions(ctx context.Context, log ct PodTemplate: r.Cluster.Spec.PodTemplate, ContainerTemplate: r.Cluster.Spec.ContainerTemplate, VersionProbe: r.Cluster.Spec.VersionProbeTemplate, + CachedVersion: r.Cluster.Status.Version, + CachedRevision: r.Cluster.Status.VersionProbeRevision, }) if err != nil { return chctrl.StepResult{}, fmt.Errorf("run version probe: %w", err) } r.versionProbe = probeResult - r.Cluster.Status.Version = r.versionProbe.Version + if probeResult.Completed() { + r.Cluster.Status.Version = probeResult.Version + r.Cluster.Status.VersionProbeRevision = probeResult.Revision + } if r.Checker != nil { cond, event := chctrl.GetUpgradeCondition(*r.Checker, r.versionProbe, r.Cluster.Spec.UpgradeChannel) diff --git a/internal/controller/versionprobe.go b/internal/controller/versionprobe.go index 379fb502..f6e71163 100644 --- a/internal/controller/versionprobe.go +++ b/internal/controller/versionprobe.go @@ -42,6 +42,10 @@ type VersionProbeConfig struct { ContainerTemplate v1.ContainerTemplateSpec // VersionProbe is the user-provided override for the version probe Job. VersionProbe *v1.VersionProbeTemplate + // CachedVersion is the previously detected version stored in CR Status. + CachedVersion string + // CachedRevision is the image hash that produced CachedVersion. + CachedRevision string } // VersionProbeResult holds the outcome of a version probe reconciliation. @@ -52,6 +56,8 @@ type VersionProbeResult struct { Pending bool // Err if version probe failed it contains the error. Err error + // Revision is the image hash of the probe, set on successful completion. + Revision string } // Completed returns true if probe completed successfully with a detected version, false otherwise. @@ -61,12 +67,27 @@ func (r *VersionProbeResult) Completed() bool { // VersionProbe manages a one-time Job to detect the version from a container image. // Returns the version string when available, or empty string if the Job is pending/running. +// When CachedRevision matches the current image hash and CachedVersion is non-empty, +// returns the cached version immediately without creating or reading a Job. func (rm *ResourceManager) VersionProbe( ctx context.Context, log controllerutil.Logger, cfg VersionProbeConfig, ) (VersionProbeResult, error) { - job, err := rm.buildVersionProbeJob(cfg) + // Compute revision once — used for cache check, Job name, and result. + revision, err := imageRevision(cfg) + if err != nil { + return VersionProbeResult{}, fmt.Errorf("compute image revision: %w", err) + } + + // Fast path: return cached version when image hasn't changed. + // Intentionally skips cleanupVersionProbeJobs and Job/Pod API calls. + // Orphaned Jobs from previous images are cleaned on next cache miss. + if cfg.CachedRevision == revision && cfg.CachedVersion != "" { + return VersionProbeResult{Version: cfg.CachedVersion, Revision: revision}, nil + } + + job, err := rm.buildVersionProbeJob(cfg, revision) if err != nil { return VersionProbeResult{}, fmt.Errorf("build version probe job: %w", err) } @@ -124,7 +145,7 @@ func (rm *ResourceManager) VersionProbe( return VersionProbeResult{Err: err}, nil } - return VersionProbeResult{Version: version}, nil + return VersionProbeResult{Version: version, Revision: revision}, nil } // GetVersionSyncCondition evaluates the VersionInSync condition based on the probe result and replica versions. @@ -209,7 +230,27 @@ func (rm *ResourceManager) cleanupVersionProbeJobs(ctx context.Context, log cont } } -func (rm *ResourceManager) buildVersionProbeJob(cfg VersionProbeConfig) (batchv1.Job, error) { +// imageRevision computes a hash of the image and pull policy to use as the cache key +// and Job name suffix. Extracted from buildVersionProbeJob to allow VersionProbe to +// compute it once and pass it through. +func imageRevision(cfg VersionProbeConfig) (string, error) { + type imageKey struct { + Image string + PullPolicy corev1.PullPolicy + } + + hash, err := controllerutil.DeepHashObject(imageKey{ + Image: cfg.ContainerTemplate.Image.String(), + PullPolicy: cfg.ContainerTemplate.ImagePullPolicy, + }) + if err != nil { + return "", fmt.Errorf("hash image key: %w", err) + } + + return hash, nil +} + +func (rm *ResourceManager) buildVersionProbeJob(cfg VersionProbeConfig, revision string) (batchv1.Job, error) { job := batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Namespace: rm.owner.GetNamespace(), @@ -269,21 +310,7 @@ func (rm *ResourceManager) buildVersionProbeJob(cfg VersionProbeConfig) (batchv1 return batchv1.Job{}, fmt.Errorf("set version probe job controller reference: %w", err) } - // Recreate successful probe only on Image or PullPolicy changes - type imageKey struct { - Image string - PullPolicy corev1.PullPolicy - } - - imageHash, err := controllerutil.DeepHashObject(imageKey{ - Image: cfg.ContainerTemplate.Image.String(), - PullPolicy: cfg.ContainerTemplate.ImagePullPolicy, - }) - if err != nil { - return batchv1.Job{}, fmt.Errorf("hash version probe job image: %w", err) - } - - job.Name = fmt.Sprintf("%s-version-probe-%s", rm.specificName, imageHash[:8]) + job.Name = fmt.Sprintf("%s-version-probe-%s", rm.specificName, revision[:8]) // Set reserved labels after overrides to ensure they are not modified by user overrides. job.Labels = controllerutil.MergeMaps(job.Labels, map[string]string{ diff --git a/internal/controller/versionprobe_test.go b/internal/controller/versionprobe_test.go index 73ffc440..2c6e73f4 100644 --- a/internal/controller/versionprobe_test.go +++ b/internal/controller/versionprobe_test.go @@ -1,6 +1,8 @@ package controller import ( + "context" + "github.com/google/go-cmp/cmp" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -8,8 +10,15 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/events" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/log/zap" v1 "github.com/ClickHouse/clickhouse-operator/api/v1alpha1" + "github.com/ClickHouse/clickhouse-operator/internal/controllerutil" ) // baseJob builds a minimal operator-generated Job for testing overrides. @@ -261,3 +270,135 @@ var _ = Describe("patchResource with jobSchema (version probe overrides)", func( })) }) }) + +// setupProbeTest creates a fake Controller, owner CR, and ResourceManager +// for testing VersionProbe() with a fake Kubernetes client. +func setupProbeTest() (ResourceManager, controllerutil.Logger) { + scheme := runtime.NewScheme() + Expect(clientgoscheme.AddToScheme(scheme)).To(Succeed()) + Expect(v1.AddToScheme(scheme)).To(Succeed()) + + builder := fake.NewClientBuilder().WithScheme(scheme) + + fakeClient := builder.Build() + recorder := events.NewFakeRecorder(32) + log := controllerutil.NewLogger(zap.NewRaw(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + owner := &v1.ClickHouseCluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "default", + Name: "test", + UID: "test-uid", + }, + } + + cc := &fakeController{client: fakeClient, scheme: scheme, recorder: recorder} + + return NewResourceManager(cc, owner), log +} + +// fakeController implements the Controller interface for unit tests. +type fakeController struct { + client client.Client + scheme *runtime.Scheme + recorder events.EventRecorder +} + +func (f *fakeController) GetClient() client.Client { return f.client } +func (f *fakeController) GetScheme() *runtime.Scheme { return f.scheme } +func (f *fakeController) GetRecorder() events.EventRecorder { return f.recorder } + +// probeCfg returns a VersionProbeConfig with the given image and cache values. +func probeCfg(image, cachedVersion, cachedRevision string) VersionProbeConfig { + return VersionProbeConfig{ + Binary: "clickhouse-server", + ContainerTemplate: v1.ContainerTemplateSpec{ + Image: v1.ContainerImage{Repository: image, Tag: "latest"}, + }, + CachedVersion: cachedVersion, + CachedRevision: cachedRevision, + } +} + +var _ = Describe("VersionProbe caching", func() { + It("should return cached version on cache hit without creating a Job", func(ctx context.Context) { + rm, log := setupProbeTest() + + cfg := probeCfg("clickhouse/clickhouse-server", "", "") + + By("running the first probe to get the revision") + + revision, err := imageRevision(cfg) + Expect(err).NotTo(HaveOccurred()) + + By("setting up cache fields as if a previous probe succeeded") + + cfg.CachedVersion = "25.3.1.1" + cfg.CachedRevision = revision + + result, err := rm.VersionProbe(ctx, log, cfg) + Expect(err).NotTo(HaveOccurred()) + + By("verifying it returned the cached version without creating a Job") + Expect(result.Version).To(Equal("25.3.1.1")) + Expect(result.Revision).To(Equal(revision)) + Expect(result.Pending).To(BeFalse()) + Expect(result.Completed()).To(BeTrue()) + + By("verifying no Jobs were created") + + var jobs batchv1.JobList + Expect(rm.ctrl.GetClient().List(ctx, &jobs, client.InNamespace("default"))).To(Succeed()) + Expect(jobs.Items).To(BeEmpty()) + }) + + It("should miss cache when CachedVersion is empty even if revision matches", func(ctx context.Context) { + rm, log := setupProbeTest() + + cfg := probeCfg("clickhouse/clickhouse-server", "", "") + revision, err := imageRevision(cfg) + Expect(err).NotTo(HaveOccurred()) + + By("setting revision but leaving version empty (fresh CR)") + + cfg.CachedRevision = revision + cfg.CachedVersion = "" + + result, err := rm.VersionProbe(ctx, log, cfg) + Expect(err).NotTo(HaveOccurred()) + + By("verifying probe was created (cache miss)") + Expect(result.Pending).To(BeTrue()) + Expect(result.Version).To(BeEmpty()) + + By("verifying a Job was created") + + var jobs batchv1.JobList + Expect(rm.ctrl.GetClient().List(ctx, &jobs, client.InNamespace("default"))).To(Succeed()) + Expect(jobs.Items).To(HaveLen(1)) + }) + + It("should miss cache when image changes", func(ctx context.Context) { + rm, log := setupProbeTest() + + By("computing revision for the original image") + + originalCfg := probeCfg("clickhouse/clickhouse-server", "", "") + originalRevision, err := imageRevision(originalCfg) + Expect(err).NotTo(HaveOccurred()) + + By("creating config with a different image but the old revision cached") + + cfg := probeCfg("custom-registry/clickhouse-server", "25.3.1.1", originalRevision) + + result, err := rm.VersionProbe(ctx, log, cfg) + Expect(err).NotTo(HaveOccurred()) + + By("verifying a new probe Job was created (cache miss due to image change)") + Expect(result.Pending).To(BeTrue()) + + var jobs batchv1.JobList + Expect(rm.ctrl.GetClient().List(ctx, &jobs, client.InNamespace("default"))).To(Succeed()) + Expect(jobs.Items).To(HaveLen(1)) + }) +})