diff --git a/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml b/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml new file mode 100644 index 000000000000..339dc9f86fdb --- /dev/null +++ b/changelog/unreleased/SOLR-18203-chaos-monkey-503-wait-for-leader.yml @@ -0,0 +1,11 @@ +title: > + CloudSolrClient now waits for cluster state refresh before retrying updates that fail with + 503 (no leader). Previously all retries fired immediately, exhausting the retry budget before + leader election completed and causing spurious update failures during node restarts. +type: fixed +authors: + - name: Jan Høydahl + url: https://home.apache.org/phonebook.html?uid=janhoy +links: + - name: SOLR-18203 + url: https://issues.apache.org/jira/browse/SOLR-18203 diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java index 3f79b11801cb..ea198d1396d9 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java @@ -1285,15 +1285,10 @@ protected NamedList requestWithRetryOnStaleState( String name = ext.getName(); ExpiringCachedDocCollection cacheEntry = collectionStateCache.peek(name); if (cacheEntry != null) { - if (wasCommError) { - cacheEntry.maybeStale = true; - } else { - boolean markedStale = - cacheEntry.markMaybeStaleIfOutsideBackoff(retryExpiryTimeNano); - if (markedStale && cacheEntry.shouldRetry()) { - triggerCollectionRefresh(name); - } - } + // For both comm errors and 503 (no leader), mark state as stale immediately. + // For 503 we bypass the backoff since we will wait for the refresh below + // before retrying, which naturally throttles the retry rate. + cacheEntry.maybeStale = true; } else { triggerCollectionRefresh(name); } @@ -1313,6 +1308,33 @@ protected NamedList requestWithRetryOnStaleState( MAX_STALE_RETRIES, wasCommError, errorCode); + // For 503 "no leader" errors (not plain comm errors where the node is fully dead), + // wait for the cluster state to refresh from ZooKeeper before retrying. + // Without this wait, all MAX_STALE_RETRIES retries fire within milliseconds of each + // other using the same stale routes, hitting the same dead leader repeatedly and + // exhausting all retries before leader election can complete. + // We use a bounded wait (10 s) so a stuck ZK / stalled election cannot block + // the caller thread indefinitely; on timeout we log and continue with the retry + // using whatever state is currently available. + if (!wasCommError && requestedCollections != null && !requestedCollections.isEmpty()) { + for (DocCollection ext : requestedCollections) { + try { + triggerCollectionRefresh(ext.getName()).get(10, TimeUnit.SECONDS); + } catch (TimeoutException te) { + log.warn( + "Timed out waiting for cluster state refresh for collection {} before retry; " + + "proceeding with retry using current state", + ext.getName()); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + } catch (ExecutionException ee) { + log.warn( + "Error refreshing cluster state for collection {} before retry", + ext.getName(), + ee.getCause()); + } + } + } return requestWithRetryOnStaleState( request, retryCount + 1,