diff --git a/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml b/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml
new file mode 100644
index 000000000000..339dc9f86fdb
--- /dev/null
+++ b/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml
@@ -0,0 +1,11 @@
+title: >
+  CloudSolrClient now waits (up to 10 s per collection) for a cluster state refresh before retrying
+  updates that fail with 503 (no leader). Previously all retries fired immediately, exhausting the
+  retry budget before leader election completed and causing spurious update failures during node restarts.
+type: fixed
+authors:
+  - name: Jan Høydahl
+    url: https://home.apache.org/phonebook.html?uid=janhoy
+links:
+  - name: SOLR-18203
+    url: https://issues.apache.org/jira/browse/SOLR-18203
diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
index 3f79b11801cb..ea198d1396d9 100644
--- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
+++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java
@@ -1285,15 +1285,10 @@ protected NamedList<Object> requestWithRetryOnStaleState(
             String name = ext.getName();
             ExpiringCachedDocCollection cacheEntry = collectionStateCache.peek(name);
             if (cacheEntry != null) {
-              if (wasCommError) {
-                cacheEntry.maybeStale = true;
-              } else {
-                boolean markedStale =
-                    cacheEntry.markMaybeStaleIfOutsideBackoff(retryExpiryTimeNano);
-                if (markedStale && cacheEntry.shouldRetry()) {
-                  triggerCollectionRefresh(name);
-                }
-              }
+              // For both comm errors and 503 (no leader), mark state as stale immediately.
+              // For 503 we bypass the backoff since we will wait for the refresh below
+              // before retrying, which naturally throttles the retry rate.
+              cacheEntry.maybeStale = true;
             } else {
               triggerCollectionRefresh(name);
             }
@@ -1313,6 +1308,33 @@ protected NamedList<Object> requestWithRetryOnStaleState(
               MAX_STALE_RETRIES,
               wasCommError,
               errorCode);
+          // For 503 "no leader" errors (not plain comm errors where the node is fully dead),
+          // wait for the cluster state to refresh from ZooKeeper before retrying.
+          // Without this wait, all MAX_STALE_RETRIES retries fire within milliseconds of each
+          // other using the same stale routes, hitting the same dead leader repeatedly and
+          // exhausting all retries before leader election can complete.
+          // Each refresh wait is bounded (10 s per requested collection) so a stuck ZK / stalled
+          // election cannot block the caller thread indefinitely; on timeout or refresh error we
+          // log and continue with the retry using whatever state is currently available.
+          if (!wasCommError && requestedCollections != null && !requestedCollections.isEmpty()) {
+            for (DocCollection ext : requestedCollections) {
+              try {
+                triggerCollectionRefresh(ext.getName()).get(10, TimeUnit.SECONDS);
+              } catch (TimeoutException te) {
+                log.warn(
+                    "Timed out waiting for cluster state refresh for collection {} before retry; "
+                        + "proceeding with retry using current state",
+                    ext.getName());
+              } catch (InterruptedException ie) {
+                Thread.currentThread().interrupt();
+              } catch (ExecutionException ee) {
+                log.warn(
+                    "Error refreshing cluster state for collection {} before retry",
+                    ext.getName(),
+                    ee.getCause());
+              }
+            }
+          }
           return requestWithRetryOnStaleState(
               request,
               retryCount + 1,