From 719105d30629561ee5aa38e67e25cde15332f38e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= <janhoy@apache.org>
Date: Tue, 21 Apr 2026 13:43:03 +0200
Subject: [PATCH 1/2] SOLR-18203: CloudSolrClient waits for state refresh
 before retrying 503 (no leader)

---
 ...18203-chaos-monkey-503-wait-for-leader.yml | 11 +++++++++
 .../client/solrj/impl/CloudSolrClient.java    | 23 +++++++++++--------
 2 files changed, 25 insertions(+), 9 deletions(-)
 create mode 100644 changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml
diff --git a/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml b/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml
new file mode 100644
index 000000000000..339dc9f86fdb
--- /dev/null
+++ b/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml
@@ -0,0 +1,11 @@
+title: >
+  CloudSolrClient now waits for cluster state refresh before retrying updates that fail with
+  503 (no leader). Previously all retries fired immediately, exhausting the retry budget before
+  leader election completed and causing spurious update failures during node restarts.
+type: fixed
+authors:
+  - name: Jan Høydahl
+    url: https://home.apache.org/phonebook.html?uid=janhoy
+links:
+  - name: SOLR-18203
+    url: https://issues.apache.org/jira/browse/SOLR-18203
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
index 3f79b11801cb..c8e2ca01fc74 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
@@ -1285,15 +1285,10 @@ protected NamedList<Object> requestWithRetryOnStaleState(
             String name = ext.getName();
             ExpiringCachedDocCollection cacheEntry = collectionStateCache.peek(name);
             if (cacheEntry != null) {
-              if (wasCommError) {
-                cacheEntry.maybeStale = true;
-              } else {
-                boolean markedStale =
-                    cacheEntry.markMaybeStaleIfOutsideBackoff(retryExpiryTimeNano);
-                if (markedStale && cacheEntry.shouldRetry()) {
-                  triggerCollectionRefresh(name);
-                }
-              }
+              // For both comm errors and 503 (no leader), mark state as stale immediately.
+              // For 503 we bypass the backoff since we will wait for the refresh below
+              // before retrying, which naturally throttles the retry rate.
+              cacheEntry.maybeStale = true;
             } else {
               triggerCollectionRefresh(name);
             }
@@ -1313,6 +1308,16 @@ protected NamedList<Object> requestWithRetryOnStaleState(
               MAX_STALE_RETRIES,
               wasCommError,
               errorCode);
+          // For 503 "no leader" errors (not plain comm errors where the node is fully dead),
+          // wait for the cluster state to refresh from ZooKeeper before retrying.
+          // Without this wait, all MAX_STALE_RETRIES retries fire within milliseconds of each
+          // other using the same stale routes, hitting the same dead leader repeatedly and
+          // exhausting all retries before leader election can complete.
+          if (!wasCommError && requestedCollections != null && !requestedCollections.isEmpty()) {
+            for (DocCollection ext : requestedCollections) {
+              waitForCollectionRefresh(ext.getName(), triggerCollectionRefresh(ext.getName()));
+            }
+          }
           return requestWithRetryOnStaleState(
               request,
               retryCount + 1,

From 89655d173bdcdf2e4fd7408bb7a6f8512a95514d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= <janhoy@apache.org>
Date: Tue, 21 Apr 2026 14:49:24 +0200
Subject: [PATCH 2/2] SOLR-18203: Add 10s timeout to ZK state refresh wait
 before 503 retry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without a timeout, waitForCollectionRefresh() would block indefinitely
if ZooKeeper is slow or leader election gets stuck. The retry path now
calls a bounded 10s get() on the CompletableFuture returned by
triggerCollectionRefresh(); on TimeoutException it logs a warning and
proceeds with the retry using the current cached state. This is no worse
than the pre-fix behavior and removes any possibility of hanging forever.
---
 .../client/solrj/impl/CloudSolrClient.java    | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
index c8e2ca01fc74..ea198d1396d9 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
@@ -1313,9 +1313,26 @@ protected NamedList<Object> requestWithRetryOnStaleState(
           // Without this wait, all MAX_STALE_RETRIES retries fire within milliseconds of each
           // other using the same stale routes, hitting the same dead leader repeatedly and
           // exhausting all retries before leader election can complete.
+          // We use a bounded wait (10 s per collection) so a stuck ZK / stalled election cannot
+          // block the caller thread indefinitely; on timeout we log and continue with the retry
+          // using whatever state is currently available.
           if (!wasCommError && requestedCollections != null && !requestedCollections.isEmpty()) {
             for (DocCollection ext : requestedCollections) {
-              waitForCollectionRefresh(ext.getName(), triggerCollectionRefresh(ext.getName()));
+              try {
+                triggerCollectionRefresh(ext.getName()).get(10, TimeUnit.SECONDS);
+              } catch (TimeoutException te) {
+                log.warn(
+                    "Timed out waiting for cluster state refresh for collection {} before retry; "
+                        + "proceeding with retry using current state",
+                    ext.getName());
+              } catch (InterruptedException ie) {
+                Thread.currentThread().interrupt();
+              } catch (ExecutionException ee) {
+                log.warn(
+                    "Error refreshing cluster state for collection {} before retry",
+                    ext.getName(),
+                    ee.getCause());
+              }
             }
           }
           return requestWithRetryOnStaleState(