From c4563bae20b4d8d2b7e5c593f067d6df12ee60bf Mon Sep 17 00:00:00 2001 From: Marcelo Zani Date: Wed, 4 Feb 2026 18:41:47 -0300 Subject: [PATCH] feat: use FQDN instead of pod IP for Patroni connect_address This change fixes Citus metadata sync issues where nodes register in pg_dist_node with ephemeral pod IPs. When pods restart and get new IPs, the stale IPs in pg_dist_node cause metadata synchronization to fail. Changes: - PatroniConfigMap.java: Added PATRONI_CONFIG_SERVICE_NAME and PATRONI_POD_NAMESPACE env vars, changed PATRONI_POSTGRESQL_CONNECT_ADDRESS to use FQDN pattern - PatroniEnvironmentVariables.java: Changed PATRONI_RESTAPI_CONNECT_ADDRESS to use FQDN pattern - start-patroni.sh: Changed connect_address in restapi and postgresql sections to use the FQDN-based env vars instead of PATRONI_KUBERNETES_POD_IP The FQDN pattern used is: ${POD_NAME}.${PATRONI_CONFIG_SERVICE_NAME}.${PATRONI_POD_NAMESPACE}.svc.cluster.local This ensures stable node registration that survives pod restarts. --- .../factory/cluster/patroni/PatroniConfigMap.java | 6 +++++- .../cluster/patroni/PatroniEnvironmentVariables.java | 3 ++- .../operator/src/main/resources/templates/start-patroni.sh | 4 ++-- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/stackgres-k8s/src/operator/src/main/java/io/stackgres/operator/conciliation/factory/cluster/patroni/PatroniConfigMap.java b/stackgres-k8s/src/operator/src/main/java/io/stackgres/operator/conciliation/factory/cluster/patroni/PatroniConfigMap.java index 8508a8961..05d10d117 100644 --- a/stackgres-k8s/src/operator/src/main/java/io/stackgres/operator/conciliation/factory/cluster/patroni/PatroniConfigMap.java +++ b/stackgres-k8s/src/operator/src/main/java/io/stackgres/operator/conciliation/factory/cluster/patroni/PatroniConfigMap.java @@ -117,8 +117,12 @@ public static String name(ClusterContext clusterContext) { .map(Object::toString) .orElse("60")); data.put("PATRONI_POSTGRESQL_LISTEN", (isEnvoyDisabled ? "0.0.0.0,[::]:" : "127.0.0.1,[::1]:") + EnvoyUtil.PG_PORT); + // Use FQDN instead of POD_IP for stable node registration in Citus pg_dist_node + data.put("PATRONI_CONFIG_SERVICE_NAME", PatroniUtil.configName(cluster)); + data.put("PATRONI_POD_NAMESPACE", cluster.getMetadata().getNamespace()); data.put("PATRONI_POSTGRESQL_CONNECT_ADDRESS", - "${POD_IP}:" + (isEnvoyDisabled ? EnvoyUtil.PG_PORT : EnvoyUtil.PG_REPL_ENTRY_PORT)); + "${POD_NAME}.${PATRONI_CONFIG_SERVICE_NAME}.${PATRONI_POD_NAMESPACE}.svc.cluster.local:" + + (isEnvoyDisabled ? EnvoyUtil.PG_PORT : EnvoyUtil.PG_REPL_ENTRY_PORT)); data.put("PATRONI_RESTAPI_LISTEN", "*:" + EnvoyUtil.PATRONI_PORT); data.put("PATRONI_POSTGRESQL_DATA_DIR", ClusterPath.PG_DATA_PATH.path()); diff --git a/stackgres-k8s/src/operator/src/main/java/io/stackgres/operator/conciliation/factory/cluster/patroni/PatroniEnvironmentVariables.java b/stackgres-k8s/src/operator/src/main/java/io/stackgres/operator/conciliation/factory/cluster/patroni/PatroniEnvironmentVariables.java index 183564ce9..15126be5e 100644 --- a/stackgres-k8s/src/operator/src/main/java/io/stackgres/operator/conciliation/factory/cluster/patroni/PatroniEnvironmentVariables.java +++ b/stackgres-k8s/src/operator/src/main/java/io/stackgres/operator/conciliation/factory/cluster/patroni/PatroniEnvironmentVariables.java @@ -202,7 +202,8 @@ && getPostgresFlavorComponent(cluster) == StackGresComponent.BABELFISH) { .build(), new EnvVarBuilder() .withName("PATRONI_RESTAPI_CONNECT_ADDRESS") - .withValue("${POD_IP}:" + EnvoyUtil.PATRONI_PORT) + // Use FQDN instead of POD_IP for stable node registration in Citus pg_dist_node + .withValue("${POD_NAME}.${PATRONI_CONFIG_SERVICE_NAME}.${PATRONI_POD_NAMESPACE}.svc.cluster.local:" + EnvoyUtil.PATRONI_PORT) .build()); return ImmutableList.builder() diff --git a/stackgres-k8s/src/operator/src/main/resources/templates/start-patroni.sh b/stackgres-k8s/src/operator/src/main/resources/templates/start-patroni.sh index 100232c09..6b711e0fa 100644 --- a/stackgres-k8s/src/operator/src/main/resources/templates/start-patroni.sh +++ b/stackgres-k8s/src/operator/src/main/resources/templates/start-patroni.sh @@ -254,7 +254,7 @@ fi - 'host replication ${PATRONI_REPLICATION_USERNAME} 0.0.0.0/0 md5' - 'host replication ${PATRONI_REPLICATION_USERNAME} ::/0 md5' restapi: - connect_address: '${PATRONI_KUBERNETES_POD_IP}:8008' + connect_address: '${PATRONI_RESTAPI_CONNECT_ADDRESS}' listen: "*:8008" postgresql: use_slots: true @@ -262,7 +262,7 @@ postgresql: remove_data_directory_on_rewind_failure: true use_unix_socket: true use_unix_socket_repl: true - connect_address: '${PATRONI_KUBERNETES_POD_IP}:5432' + connect_address: '${PATRONI_POSTGRESQL_CONNECT_ADDRESS}' listen: 0.0.0.0,[::]:5432 pg_ctl_timeout: $PATRONI_PG_CTL_TIMEOUT authentication: