@@ -0,0 +1,11 @@
title: >
  CloudSolrClient now waits for cluster state refresh before retrying updates that fail with
  503 (no leader). Previously all retries fired immediately, exhausting the retry budget before
  leader election completed and causing spurious update failures during node restarts.
type: fixed
authors:
  - name: Jan Høydahl
    url: https://home.apache.org/phonebook.html?uid=janhoy
links:
  - name: SOLR-18203
    url: https://issues.apache.org/jira/browse/SOLR-18203
@@ -1285,15 +1285,10 @@ protected NamedList<Object> requestWithRetryOnStaleState(
String name = ext.getName();
ExpiringCachedDocCollection cacheEntry = collectionStateCache.peek(name);
if (cacheEntry != null) {
if (wasCommError) {
cacheEntry.maybeStale = true;
} else {
boolean markedStale =
cacheEntry.markMaybeStaleIfOutsideBackoff(retryExpiryTimeNano);
if (markedStale && cacheEntry.shouldRetry()) {
triggerCollectionRefresh(name);
}
}
// For both comm errors and 503 (no leader), mark state as stale immediately.
// For 503 we bypass the backoff since we will wait for the refresh below
// before retrying, which naturally throttles the retry rate.
cacheEntry.maybeStale = true;
Comment on lines +1288 to +1291 (Copilot AI, Apr 21, 2026):
This change removes the existing retryExpiryTimeNano backoff for 503 errors by unconditionally setting maybeStale=true (instead of using markMaybeStaleIfOutsideBackoff). That means a burst of 503s can now trigger much more frequent state refreshes (and refresh-waits) across many concurrent client threads. If the goal is specifically to handle leader-election windows, consider keeping some throttling/jitter (or only bypassing backoff for the first 503 in a retry chain) to avoid ZK/cluster-state refresh storms under sustained 503 conditions.

Suggested change:
- // For both comm errors and 503 (no leader), mark state as stale immediately.
- // For 503 we bypass the backoff since we will wait for the refresh below
- // before retrying, which naturally throttles the retry rate.
- cacheEntry.maybeStale = true;
+ // Throttle stale marking so repeated comm errors or 503s do not cause
+ // excessive concurrent cluster-state refresh attempts.
+ markMaybeStaleIfOutsideBackoff(cacheEntry);

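The throttling this comment asks for can be sketched with a compare-and-set timestamp gate. This is a hedged illustration only, not Solr's actual ExpiringCachedDocCollection: the class name, field names, and the one-day initial offset below are hypothetical stand-ins.

```java
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

// Hypothetical stand-in for the cache entry's backoff gate (not Solr's actual
// ExpiringCachedDocCollection): only the first caller in each backoff window
// marks the entry stale, so a burst of 503s triggers at most one refresh.
class BackoffGate {
    // Start far in the past so the very first call is outside the window.
    private final AtomicLong lastMarkedNano =
        new AtomicLong(System.nanoTime() - TimeUnit.DAYS.toNanos(1));
    volatile boolean maybeStale = false;

    /** Marks stale and returns true only outside the backoff window. */
    boolean markMaybeStaleIfOutsideBackoff(long backoffNanos) {
        long now = System.nanoTime();
        long last = lastMarkedNano.get();
        if (now - last < backoffNanos) {
            return false; // inside backoff: swallow this error, no refresh storm
        }
        if (lastMarkedNano.compareAndSet(last, now)) {
            maybeStale = true; // we won the race; caller may trigger a refresh
            return true;
        }
        return false; // a concurrent thread marked it first
    }
}

public class BackoffGateDemo {
    public static void main(String[] args) {
        BackoffGate gate = new BackoffGate();
        long window = TimeUnit.SECONDS.toNanos(5);
        System.out.println(gate.markMaybeStaleIfOutsideBackoff(window)); // true
        System.out.println(gate.markMaybeStaleIfOutsideBackoff(window)); // false
    }
}
```

The compare-and-set makes the gate safe under the many concurrent client threads the comment worries about: in any backoff window, at most one thread wins and triggers a refresh.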
} else {
triggerCollectionRefresh(name);
}
@@ -1313,6 +1308,33 @@ protected NamedList<Object> requestWithRetryOnStaleState(
MAX_STALE_RETRIES,
wasCommError,
errorCode);
// For 503 "no leader" errors (not plain comm errors where the node is fully dead),
// wait for the cluster state to refresh from ZooKeeper before retrying.
// Without this wait, all MAX_STALE_RETRIES retries fire within milliseconds of each
// other using the same stale routes, hitting the same dead leader repeatedly and
// exhausting all retries before leader election can complete.
// We use a bounded wait (10 s) so a stuck ZK / stalled election cannot block
// the caller thread indefinitely; on timeout we log and continue with the retry
Comment on lines +1311 to +1317 (Copilot AI, Apr 21, 2026):
There are existing SolrJ retry-related tests (e.g., solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudHttp2SolrClientRetryTest.java), but this new 503-specific refresh-wait behavior isn’t covered. Adding a focused test that simulates a RouteException(503) and asserts that a collection refresh is triggered and awaited (or that retries are delayed until state changes) would help prevent regressions and confirm the intended behavior.

// using whatever state is currently available.
if (!wasCommError && requestedCollections != null && !requestedCollections.isEmpty()) {
for (DocCollection ext : requestedCollections) {
try {
triggerCollectionRefresh(ext.getName()).get(10, TimeUnit.SECONDS);
} catch (TimeoutException te) {
log.warn(
"Timed out waiting for cluster state refresh for collection {} before retry; "
+ "proceeding with retry using current state",
ext.getName());
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
Copilot AI, Apr 21, 2026:
On InterruptedException during the refresh wait you re-interrupt the thread but then continue into the recursive retry. This effectively ignores cancellation and can lead to surprising behavior (and is inconsistent with waitForCollectionRefresh(), which throws on interrupt). Consider aborting retries and propagating an exception once interrupted, or returning/throwing after restoring the interrupt flag.

Suggested change:
- Thread.currentThread().interrupt();
+ Thread.currentThread().interrupt();
+ throw new SolrServerException(
+ "Interrupted while waiting for cluster state refresh before retry for collection "
+ + ext.getName(),
+ ie);

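The abort-on-interrupt pattern this comment describes can be sketched with plain java.util.concurrent types; no Solr APIs are used, and IllegalStateException is a stand-in for SolrServerException so the example stays self-contained.

```java
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

// Sketch of abort-on-interrupt: restore the flag, then propagate instead of
// falling through into another retry. IllegalStateException here is only a
// stand-in for SolrServerException.
public class InterruptAbortDemo {
    static void awaitRefresh(Future<?> refresh) {
        try {
            refresh.get(10, TimeUnit.SECONDS);
        } catch (InterruptedException ie) {
            Thread.currentThread().interrupt(); // preserve the flag for callers
            throw new IllegalStateException("interrupted awaiting refresh", ie);
        } catch (Exception other) {
            // timeout / execution failure: fall through and retry with current state
        }
    }

    public static void main(String[] args) {
        CompletableFuture<Void> pending = new CompletableFuture<>(); // never completes
        Thread.currentThread().interrupt(); // simulate cancellation mid-wait
        try {
            awaitRefresh(pending);
        } catch (IllegalStateException e) {
            // retries aborted; the interrupt status is still set for the caller
            System.out.println("aborted, interrupted=" + Thread.interrupted());
        }
    }
}
```

Throwing after restoring the flag keeps the behavior consistent with blocking helpers that propagate interruption, rather than silently continuing into the recursive retry.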
} catch (ExecutionException ee) {
log.warn(
"Error refreshing cluster state for collection {} before retry",
ext.getName(),
ee.getCause());
}
}
Comment on lines +1316 to +1336 (Copilot AI, Apr 21, 2026):
The 503 retry path blocks up to 10s per collection per retry and does the waits sequentially. If an alias resolves to multiple collections, this can multiply into a long stall (N collections × 10s × MAX_STALE_RETRIES) on a single caller thread. Consider using a single overall deadline (or CompletableFuture.allOf over the refresh futures) and waiting in parallel with remaining-time budgeting so the total wait is bounded.

Suggested change:
- // We use a bounded wait (10 s) so a stuck ZK / stalled election cannot block
- // the caller thread indefinitely; on timeout we log and continue with the retry
- // using whatever state is currently available.
- if (!wasCommError && requestedCollections != null && !requestedCollections.isEmpty()) {
- for (DocCollection ext : requestedCollections) {
- try {
- triggerCollectionRefresh(ext.getName()).get(10, TimeUnit.SECONDS);
- } catch (TimeoutException te) {
- log.warn(
- "Timed out waiting for cluster state refresh for collection {} before retry; "
- + "proceeding with retry using current state",
- ext.getName());
- } catch (InterruptedException ie) {
- Thread.currentThread().interrupt();
- } catch (ExecutionException ee) {
- log.warn(
- "Error refreshing cluster state for collection {} before retry",
- ext.getName(),
- ee.getCause());
- }
- }
+ // We use a bounded wait (10 s total) so a stuck ZK / stalled election cannot block
+ // the caller thread indefinitely; on timeout we log and continue with the retry
+ // using whatever state is currently available.
+ if (!wasCommError && requestedCollections != null && !requestedCollections.isEmpty()) {
+ List<CompletableFuture<?>> refreshFutures = new ArrayList<>(requestedCollections.size());
+ for (DocCollection ext : requestedCollections) {
+ final String collectionName = ext.getName();
+ refreshFutures.add(
+ triggerCollectionRefresh(collectionName)
+ .handle(
+ (ignored, throwable) -> {
+ if (throwable != null) {
+ Throwable cause =
+ throwable.getCause() != null ? throwable.getCause() : throwable;
+ log.warn(
+ "Error refreshing cluster state for collection {} before retry",
+ collectionName,
+ cause);
+ }
+ return null;
+ }));
+ }
+ try {
+ CompletableFuture
+ .allOf(refreshFutures.toArray(new CompletableFuture<?>[0]))
+ .get(10, TimeUnit.SECONDS);
+ } catch (TimeoutException te) {
+ log.warn(
+ "Timed out waiting for cluster state refresh for collections {} before retry; "
+ + "proceeding with retry using current state",
+ requestedCollections.stream()
+ .map(DocCollection::getName)
+ .collect(Collectors.toList()));
+ } catch (InterruptedException ie) {
+ Thread.currentThread().interrupt();
+ }

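The single-deadline idea in this suggestion can be demonstrated with plain CompletableFutures. This is a generic sketch, not Solr code: the sleeps are arbitrary stand-ins for per-collection refreshes.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

// Sketch: three "refreshes" awaited in parallel under one shared deadline.
// The total wait tracks the slowest future, not the sum of all of them.
public class ParallelWaitDemo {
    public static void main(String[] args) throws Exception {
        List<CompletableFuture<Void>> refreshes = new ArrayList<>();
        for (int i = 1; i <= 3; i++) {
            long delayMs = 100L * i; // stand-in for a cluster-state refresh
            refreshes.add(CompletableFuture.runAsync(() -> sleepQuietly(delayMs)));
        }
        long start = System.nanoTime();
        try {
            CompletableFuture.allOf(refreshes.toArray(new CompletableFuture<?>[0]))
                .get(10, TimeUnit.SECONDS); // one overall budget for all three
        } catch (TimeoutException te) {
            // deadline hit: proceed with whatever state is currently available
        }
        long elapsedMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
        // Parallel wait: roughly max(100, 200, 300) ms, not 100 + 200 + 300 ms.
        System.out.println("elapsed < 600 ms: " + (elapsedMs < 600));
    }

    static void sleepQuietly(long ms) {
        try {
            Thread.sleep(ms);
        } catch (InterruptedException ie) {
            Thread.currentThread().interrupt();
        }
    }
}
```

With sequential per-future 10 s waits, the worst case grows with the number of collections an alias resolves to; a single allOf deadline keeps the caller-thread stall bounded regardless of collection count.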
}
return requestWithRetryOnStaleState(
request,
retryCount + 1,