From 719105d30629561ee5aa38e67e25cde15332f38e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Tue, 21 Apr 2026 13:43:03 +0200 Subject: [PATCH 1/2] SOLR-18203: CloudSolrClient waits for state refresh before retrying 503 (no leader) --- ...18203-chaos-monkey-503-wait-for-leader.yml | 11 +++++++++ .../client/solrj/impl/CloudSolrClient.java | 23 +++++++++++-------- 2 files changed, 25 insertions(+), 9 deletions(-) create mode 100644 changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml diff --git a/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml b/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml new file mode 100644 index 000000000000..339dc9f86fdb --- /dev/null +++ b/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml @@ -0,0 +1,11 @@ +title: > + CloudSolrClient now waits for cluster state refresh before retrying updates that fail with + 503 (no leader). Previously all retries fired immediately, exhausting the retry budget before + leader election completed and causing spurious update failures during node restarts. +type: fixed +authors: + - name: Jan Høydahl + url: https://home.apache.org/phonebook.html?uid=janhoy +links: + - name: SOLR-18203 + url: https://issues.apache.org/jira/browse/SOLR-18203 diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java index 3f79b11801cb..c8e2ca01fc74 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java @@ -1285,15 +1285,10 @@ protected NamedList requestWithRetryOnStaleState( String name = ext.getName(); ExpiringCachedDocCollection cacheEntry = collectionStateCache.peek(name); if (cacheEntry != null) { - if (wasCommError) { - cacheEntry.maybeStale = true; - } else { - boolean markedStale = - cacheEntry.markMaybeStaleIfOutsideBackoff(retryExpiryTimeNano); - if (markedStale && cacheEntry.shouldRetry()) { - triggerCollectionRefresh(name); - } - } + // For both comm errors and 503 (no leader), mark state as stale immediately. + // For 503 we bypass the backoff since we will wait for the refresh below + // before retrying, which naturally throttles the retry rate. + cacheEntry.maybeStale = true; } else { triggerCollectionRefresh(name); } @@ -1313,6 +1308,16 @@ protected NamedList requestWithRetryOnStaleState( MAX_STALE_RETRIES, wasCommError, errorCode); + // For 503 "no leader" errors (not plain comm errors where the node is fully dead), + // wait for the cluster state to refresh from ZooKeeper before retrying. + // Without this wait, all MAX_STALE_RETRIES retries fire within milliseconds of each + // other using the same stale routes, hitting the same dead leader repeatedly and + // exhausting all retries before leader election can complete. + if (!wasCommError && requestedCollections != null && !requestedCollections.isEmpty()) { + for (DocCollection ext : requestedCollections) { + waitForCollectionRefresh(ext.getName(), triggerCollectionRefresh(ext.getName())); + } + } return requestWithRetryOnStaleState( request, retryCount + 1, From 89655d173bdcdf2e4fd7408bb7a6f8512a95514d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Tue, 21 Apr 2026 14:49:24 +0200 Subject: [PATCH 2/2] SOLR-18203: Add 10s timeout to ZK state refresh wait before 503 retry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without a timeout, waitForCollectionRefresh() would block indefinitely if ZooKeeper is slow or leader election gets stuck. Now uses a bounded 10s get() on the CompletableFuture; on TimeoutException, logs a warning and proceeds with the retry using current cached state — no worse than the pre-fix behavior and avoids any possibility of hanging forever. --- .../client/solrj/impl/CloudSolrClient.java | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java index c8e2ca01fc74..ea198d1396d9 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java @@ -1313,9 +1313,26 @@ protected NamedList requestWithRetryOnStaleState( // Without this wait, all MAX_STALE_RETRIES retries fire within milliseconds of each // other using the same stale routes, hitting the same dead leader repeatedly and // exhausting all retries before leader election can complete. + // We use a bounded wait (10 s) so a stuck ZK / stalled election cannot block + // the caller thread indefinitely; on timeout we log and continue with the retry + // using whatever state is currently available. if (!wasCommError && requestedCollections != null && !requestedCollections.isEmpty()) { for (DocCollection ext : requestedCollections) { - waitForCollectionRefresh(ext.getName(), triggerCollectionRefresh(ext.getName())); + try { + triggerCollectionRefresh(ext.getName()).get(10, TimeUnit.SECONDS); + } catch (TimeoutException te) { + log.warn( + "Timed out waiting for cluster state refresh for collection {} before retry; " + + "proceeding with retry using current state", + ext.getName()); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + } catch (ExecutionException ee) { + log.warn( + "Error refreshing cluster state for collection {} before retry", + ext.getName(), + ee.getCause()); + } } } return requestWithRetryOnStaleState(