Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 7 additions & 23 deletions internal/redis/statefulset.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,36 +27,15 @@ func StatefulSet(
}
ls := labels.GetLabels(r, "redis", matchls)

livenessProbe := &corev1.Probe{
// TODO might need tuning
TimeoutSeconds: 5,
PeriodSeconds: 3,
InitialDelaySeconds: 3,
}
readinessProbe := &corev1.Probe{
// TODO might need tuning
TimeoutSeconds: 5,
PeriodSeconds: 5,
InitialDelaySeconds: 5,
}
sentinelLivenessProbe := &corev1.Probe{
// TODO might need tuning
TimeoutSeconds: 5,
PeriodSeconds: 3,
InitialDelaySeconds: 3,
InitialDelaySeconds: 40,
}
sentinelReadinessProbe := &corev1.Probe{
// TODO might need tuning
TimeoutSeconds: 5,
PeriodSeconds: 5,
InitialDelaySeconds: 5,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we want to keep the probes for the main container, if only for the specific TCP ports 6379?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am just removing these unused variables, the actual probe is left intact (it is using the inline exec-based probes calling redis_probe.sh)

}

livenessProbe.TCPSocket = &corev1.TCPSocketAction{
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(6379)},
}
readinessProbe.TCPSocket = &corev1.TCPSocketAction{
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(6379)},
InitialDelaySeconds: 40,
}
sentinelLivenessProbe.TCPSocket = &corev1.TCPSocketAction{
Port: intstr.IntOrString{Type: intstr.Int, IntVal: int32(26379)},
Expand All @@ -78,6 +57,9 @@ func StatefulSet(
}, {
Name: "CONFIG_HASH",
Value: configHash,
}, {
Name: "REPLICAS",
Value: strconv.Itoa(int(*r.Spec.Replicas)),
}}

sts := &appsv1.StatefulSet{
Expand Down Expand Up @@ -115,13 +97,15 @@ func StatefulSet(
Command: []string{"/var/lib/operator-scripts/redis_probe.sh", "liveness"},
},
},
InitialDelaySeconds: 40,
},
ReadinessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
Exec: &corev1.ExecAction{
Command: []string{"/var/lib/operator-scripts/redis_probe.sh", "readiness"},
},
},
InitialDelaySeconds: 40,
},
}, {
Image: r.Spec.ContainerImage,
Expand Down
73 changes: 71 additions & 2 deletions templates/redis/bin/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,77 @@ function remove_pod_label() {
local pod="$1"
local label="$2"
local patch="[{\"op\": \"remove\", \"path\": \"/metadata/labels/${label}\"}]"
# 200: OK, 422: not found
configure_pod_label $pod "$patch" "(200|422)"
# 200: OK, 404: pod not found, 422: label not found
configure_pod_label $pod "$patch" "(200|404|422)"
}

# Wait for a peer sentinel to report a valid master for the cluster.
# Contacts each peer pod individually by FQDN (skipping self) to avoid
# the headless service DNS resolving to our own uninitialized sentinel.
# If a peer still reports US as master (stale info before
# down-after-milliseconds triggers failover), keeps retrying until
# failover completes and a different master is elected.
# Falls back to querying peer redis directly via the ROLE command
# when sentinel output cannot be parsed.
#
# Env (read): SENTINEL_RETRIES, SENTINEL_RETRY_DELAY, POD_NAME, POD_IP,
#             REPLICAS, SVC_FQDN, TIMEOUT, REDIS_CLI_CMD
# Outputs:    prints the master address (FQDN or IP) on stdout on success
# Returns:    0 when a valid master was found, 1 after all retries fail
function wait_for_master() {
    local retries=${SENTINEL_RETRIES:-10}
    local delay=${SENTINEL_RETRY_DELAY:-3}
    local pod_ordinal=${POD_NAME##*-}
    local pod_base=${POD_NAME%-*}
    local max_ordinal=$(( ${REPLICAS:-3} - 1 ))
    local i ordinal peer output master role

    for (( i = 1; i <= retries; i++ )); do
        for (( ordinal = 0; ordinal <= max_ordinal; ordinal++ )); do
            # Never query ourselves; our own sentinel may be uninitialized.
            [ "$ordinal" = "$pod_ordinal" ] && continue
            peer="${pod_base}-${ordinal}.${SVC_FQDN}"
            # Test the command substitution directly instead of inspecting $?
            # afterwards. REDIS_CLI_CMD stays unquoted on purpose: it may
            # carry extra arguments (e.g. TLS flags).
            # shellcheck disable=SC2086
            if output=$(timeout "${TIMEOUT}" $REDIS_CLI_CMD --raw -h "${peer}" -p 26379 sentinel master redis 2>/dev/null) \
                && [ -n "$output" ]; then
                # "sentinel master redis" emits key/value pairs on alternating
                # lines; grab the line following the "ip" key.
                master=$(printf '%s\n' "$output" | tr -d '\r' | awk '/^ip$/{getline; print; exit}')
                # Reject a master that is actually us (by IP or by FQDN
                # prefix): that is stale pre-failover information.
                if [ -n "$master" ] && [ "$master" != "$POD_IP" ] && ! printf '%s\n' "$master" | grep -q "^${POD_NAME}\."; then
                    printf '%s\n' "$master"
                    return 0
                fi
                log "Peer ${peer} sentinel reports master=${master} (stale, skipping)" >&2
            else
                # Sentinel unreachable; try redis ROLE as fallback
                role=$(timeout "${TIMEOUT}" $REDIS_CLI_CMD --raw -h "${peer}" -p 6379 role 2>/dev/null | head -1 | tr -d '\r')
                if [ "$role" = "master" ]; then
                    printf '%s\n' "$peer"
                    return 0
                fi
            fi
        done
        log "Attempt $i/$retries: no valid master found, retrying in ${delay}s..." >&2
        sleep "$delay"
    done
    return 1
}

# Return 0 if at least one peer redis instance answers PING on port 6379.
# Safety net before bootstrapping: distinguishes a fresh deployment
# (no peers reachable) from a pod restart (peers still alive).
#
# Env (read): POD_NAME, REPLICAS, SVC_FQDN, TIMEOUT, REDIS_CLI_CMD
# Returns:    0 when any peer replied PONG, 1 otherwise
function has_alive_peers() {
    local self_ordinal=${POD_NAME##*-}
    local base_name=${POD_NAME%-*}
    local last_ordinal=$(( ${REPLICAS:-3} - 1 ))
    local idx host
    for idx in $(seq 0 $last_ordinal); do
        # Skip our own ordinal; only peers count.
        if [ "$idx" != "$self_ordinal" ]; then
            host="${base_name}-${idx}.${SVC_FQDN}"
            if timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${host} -p 6379 ping 2>/dev/null | grep -q PONG; then
                return 0
            fi
        fi
    done
    return 1
}

function set_pod_label() {
Expand Down
12 changes: 7 additions & 5 deletions templates/redis/bin/start_redis_replication.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,19 @@
generate_configs
sudo -E kolla_set_configs

# 1. check if a redis cluster is already running by contacting sentinel
output=$(timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${SVC_FQDN} -p 26379 sentinel master redis)
# 1. check if a redis cluster is already running by contacting peer sentinels
master=$(wait_for_master)
if [ $? -eq 0 ]; then
master=$(echo "$output" | awk '/^ip$/ {getline; print $0; exit}')
# TODO skip if no master was found
log "Connecting to the existing Redis cluster (master: ${master})"
exec redis-server $REDIS_CONFIG --protected-mode no --replicaof "$master" 6379
fi

# 2. else bootstrap a new cluster (assume we should be the first redis pod)
# 2. else bootstrap a new cluster if no peers are alive (fresh deployment)
if is_bootstrap_pod $POD_NAME; then
if has_alive_peers; then
log_error "Peers are alive but no master found. Refusing to bootstrap to avoid split-brain."
exit 1
fi
log "Bootstrapping a new Redis cluster from ${POD_NAME}"
set_pod_label $POD_NAME redis~1master
exec redis-server $REDIS_CONFIG --protected-mode no
Expand Down
12 changes: 7 additions & 5 deletions templates/redis/bin/start_sentinel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,21 @@
generate_configs
sudo -E kolla_set_configs

# 1. check if a redis cluster is already running by contacting sentinel
output=$(timeout ${TIMEOUT} $REDIS_CLI_CMD -h ${SVC_FQDN} -p 26379 sentinel master redis)
# 1. check if a redis cluster is already running by contacting peer sentinels
master=$(wait_for_master)
if [ $? -eq 0 ]; then
master=$(echo "$output" | awk '/^ip$/ {getline; print $0; exit}')
# TODO skip if no master was found
log "Connecting to the existing sentinel cluster (master: $master)"
echo "sentinel monitor redis ${master} 6379 ${SENTINEL_QUORUM}" >> $SENTINEL_CONFIG
exec redis-sentinel $SENTINEL_CONFIG
fi

# 2. else let the pod's redis server bootstrap a new cluster and monitor it
# (assume we should be the first redis pod)
# (only if no peers are alive, meaning this is a fresh deployment)
if is_bootstrap_pod $POD_NAME; then
if has_alive_peers; then
log_error "Peers are alive but no master found. Refusing to bootstrap sentinel to avoid split-brain."
exit 1
fi
log "Bootstrapping a new sentinel cluster"
echo "sentinel monitor redis ${POD_FQDN} 6379 ${SENTINEL_QUORUM}" >> $SENTINEL_CONFIG
exec redis-sentinel $SENTINEL_CONFIG
Expand Down
Loading