Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 7 additions & 23 deletions internal/redis/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,36 +27,15 @@ func StatefulSet(
}
ls := labels.GetLabels(r, "redis", matchls)

livenessProbe := &corev1.Probe{
// TODO might need tuning
TimeoutSeconds: 5,
PeriodSeconds: 3,
InitialDelaySeconds: 3,
}
readinessProbe := &corev1.Probe{
// TODO might need tuning
TimeoutSeconds: 5,
PeriodSeconds: 5,
InitialDelaySeconds: 5,
}
sentinelLivenessProbe := &corev1.Probe{
// TODO might need tuning
TimeoutSeconds: 5,
PeriodSeconds: 3,
InitialDelaySeconds: 3,
InitialDelaySeconds: 40,
}
sentinelReadinessProbe := &corev1.Probe{
// TODO might need tuning
TimeoutSeconds: 5,
PeriodSeconds: 5,
InitialDelaySeconds: 5,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we want to keep the probes for the main container, if only for the specific TCP ports 6379?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am just removing these unused variables, the actual probe is left intact (it is using the inline exec-based probes calling redis_probe.sh)

}

livenessProbe.TCPSocket = &corev1.TCPSocketAction{
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(6379)},
}
readinessProbe.TCPSocket = &corev1.TCPSocketAction{
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(6379)},
InitialDelaySeconds: 40,
}
sentinelLivenessProbe.TCPSocket = &corev1.TCPSocketAction{
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(26379)},
Expand All @@ -78,6 +57,9 @@ func StatefulSet(
}, {
Name: "CONFIG_HASH",
Value: configHash,
}, {
Name: "REPLICAS",
Value: strconv.Itoa(int(*r.Spec.Replicas)),
}}

sts := &appsv1.StatefulSet{
Expand Down Expand Up @@ -115,13 +97,15 @@ func StatefulSet(
Command: []string{"/var/lib/operator-scripts/redis_probe.sh", "liveness"},
},
},
InitialDelaySeconds: 40,
},
ReadinessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
Exec: &corev1.ExecAction{
Command: []string{"/var/lib/operator-scripts/redis_probe.sh", "readiness"},
},
},
InitialDelaySeconds: 40,
},
}, {
Image: r.Spec.ContainerImage,
Expand Down
73 changes: 71 additions & 2 deletions templates/redis/bin/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,77 @@ function remove_pod_label() {
local pod="$1"
local label="$2"
local patch="[{\"op\": \"remove\", \"path\": \"/metadata/labels/${label}\"}]"
# 200: OK, 422: not found
configure_pod_label $pod "$patch" "(200|422)"
# 200: OK, 404: pod not found, 422: label not found
configure_pod_label $pod "$patch" "(200|404|422)"
}

# Wait for a peer sentinel to report a valid master for the cluster.
# Contacts each peer pod individually by FQDN (skipping self) to avoid
# the headless service DNS resolving to our own uninitialized sentinel.
# If a peer still reports US as master (stale info before
# down-after-milliseconds triggers failover), keeps retrying until
# failover completes and a different master is elected.
# Falls back to querying peer redis directly via the ROLE command
# when sentinel output cannot be parsed.
#
# Env (read): SENTINEL_RETRIES, SENTINEL_RETRY_DELAY, POD_NAME, POD_IP,
#             REPLICAS, SVC_FQDN, TIMEOUT, REDIS_CLI_CMD
# Outputs:    prints the master address (FQDN or IP) on stdout on success
# Returns:    0 when a valid master was found, 1 after all retries fail
function wait_for_master() {
    local retries=${SENTINEL_RETRIES:-10}
    local delay=${SENTINEL_RETRY_DELAY:-3}
    local pod_ordinal=${POD_NAME##*-}
    local pod_base=${POD_NAME%-*}
    local max_ordinal=$(( ${REPLICAS:-3} - 1 ))
    local i ordinal peer output master role

    for (( i = 1; i <= retries; i++ )); do
        for (( ordinal = 0; ordinal <= max_ordinal; ordinal++ )); do
            # Never query ourselves; our own sentinel may be uninitialized.
            [ "$ordinal" = "$pod_ordinal" ] && continue
            peer="${pod_base}-${ordinal}.${SVC_FQDN}"
            # Test the command substitution directly instead of inspecting $?
            # afterwards. REDIS_CLI_CMD stays unquoted on purpose: it may
            # carry extra arguments (e.g. TLS flags).
            # shellcheck disable=SC2086
            if output=$(timeout "${TIMEOUT}" $REDIS_CLI_CMD --raw -h "${peer}" -p 26379 sentinel master redis 2>/dev/null) \
                && [ -n "$output" ]; then
                # "sentinel master redis" emits key/value pairs on alternating
                # lines; grab the line following the "ip" key.
                master=$(printf '%s\n' "$output" | tr -d '\r' | awk '/^ip$/{getline; print; exit}')
                # Reject a master that is actually us (by IP or by FQDN
                # prefix): that is stale pre-failover information.
                if [ -n "$master" ] && [ "$master" != "$POD_IP" ] && ! printf '%s\n' "$master" | grep -q "^${POD_NAME}\."; then
                    printf '%s\n' "$master"
                    return 0
                fi
                log "Peer ${peer} sentinel reports master=${master} (stale, skipping)" >&2
            else
                # Sentinel unreachable; try redis ROLE as fallback
                role=$(timeout "${TIMEOUT}" $REDIS_CLI_CMD --raw -h "${peer}" -p 6379 role 2>/dev/null | head -1 | tr -d '\r')
                if [ "$role" = "master" ]; then
                    printf '%s\n' "$peer"
                    return 0
                fi
            fi
        done
        log "Attempt $i/$retries: no valid master found, retrying in ${delay}s..." >&2
        sleep "$delay"
    done
    return 1
}

# Return 0 if at least one peer redis instance answers PING on port 6379.
# Safety net before bootstrapping: distinguishes a fresh deployment
# (no peers reachable) from a pod restart (peers still alive).
#
# Env (read): POD_NAME, REPLICAS, SVC_FQDN, TIMEOUT, REDIS_CLI_CMD
# Returns:    0 when any peer replied PONG, 1 otherwise
function has_alive_peers() {
    local self_ordinal=${POD_NAME##*-}
    local base_name=${POD_NAME%-*}
    local last_ordinal=$(( ${REPLICAS:-3} - 1 ))
    local idx host
    for idx in $(seq 0 $last_ordinal); do
        # Skip our own ordinal; only peers count.
        if [ "$idx" != "$self_ordinal" ]; then
            host="${base_name}-${idx}.${SVC_FQDN}"
            if timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${host} -p 6379 ping 2>/dev/null | grep -q PONG; then
                return 0
            fi
        fi
    done
    return 1
}

function set_pod_label() {
Expand Down
12 changes: 7 additions & 5 deletions templates/redis/bin/start_redis_replication.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@
generate_configs
sudo -E kolla_set_configs

# 1. check if a redis cluster is already running by contacting sentinel
output=$(timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${SVC_FQDN} -p 26379 sentinel master redis)
# 1. check if a redis cluster is already running by contacting peer sentinels
master=$(wait_for_master)
if [ $? -eq 0 ]; then
master=$(echo "$output" | awk '/^ip$/ {getline; print $0; exit}')
# TODO skip if no master was found
log "Connecting to the existing Redis cluster (master: ${master})"
exec redis-server $REDIS_CONFIG --protected-mode no --replicaof "$master" 6379
fi

# 2. else bootstrap a new cluster (assume we should be the first redis pod)
# 2. else bootstrap a new cluster if no peers are alive (fresh deployment)
if is_bootstrap_pod $POD_NAME; then
if has_alive_peers; then
log_error "Peers are alive but no master found. Refusing to bootstrap to avoid split-brain."
exit 1
fi
log "Bootstrapping a new Redis cluster from ${POD_NAME}"
set_pod_label $POD_NAME redis~1master
exec redis-server $REDIS_CONFIG --protected-mode no
Expand Down
12 changes: 7 additions & 5 deletions templates/redis/bin/start_sentinel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,21 @@
generate_configs
sudo -E kolla_set_configs

# 1. check if a redis cluster is already running by contacting sentinel
output=$(timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${SVC_FQDN} -p 26379 sentinel master redis)
# 1. check if a redis cluster is already running by contacting peer sentinels
master=$(wait_for_master)
if [ $? -eq 0 ]; then
master=$(echo "$output" | awk '/^ip$/ {getline; print $0; exit}')
# TODO skip if no master was found
log "Connecting to the existing sentinel cluster (master: $master)"
echo "sentinel monitor redis ${master} 6379 ${SENTINEL_QUORUM}" >> $SENTINEL_CONFIG
exec redis-sentinel $SENTINEL_CONFIG
fi

# 2. else let the pod's redis server bootstrap a new cluster and monitor it
# (assume we should be the first redis pod)
# (only if no peers are alive, meaning this is a fresh deployment)
if is_bootstrap_pod $POD_NAME; then
if has_alive_peers; then
log_error "Peers are alive but no master found. Refusing to bootstrap sentinel to avoid split-brain."
exit 1
fi
log "Bootstrapping a new sentinel cluster"
echo "sentinel monitor redis ${POD_FQDN} 6379 ${SENTINEL_QUORUM}" >> $SENTINEL_CONFIG
exec redis-sentinel $SENTINEL_CONFIG
Expand Down
Loading