diff --git a/changelog/unreleased/solr-17821-fix-restore-error-scenario.yml b/changelog/unreleased/solr-17821-fix-restore-error-scenario.yml new file mode 100644 index 000000000000..dbadb3d6fe38 --- /dev/null +++ b/changelog/unreleased/solr-17821-fix-restore-error-scenario.yml @@ -0,0 +1,9 @@ +# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc +title: Fix error scenario in InstallShardData and Restore +type: fixed # added, changed, fixed, deprecated, removed, dependency_update, security, other +authors: + - name: Houston Putman + nick: HoustonPutman +links: + - name: SOLR-17821 + url: https://issues.apache.org/jira/browse/SOLR-17821 diff --git a/solr/api/src/java/org/apache/solr/client/api/model/InstallShardDataRequestBody.java b/solr/api/src/java/org/apache/solr/client/api/model/InstallShardDataRequestBody.java index 31bec8eb4346..05b27f1dcab3 100644 --- a/solr/api/src/java/org/apache/solr/client/api/model/InstallShardDataRequestBody.java +++ b/solr/api/src/java/org/apache/solr/client/api/model/InstallShardDataRequestBody.java @@ -24,5 +24,9 @@ public class InstallShardDataRequestBody { @JsonProperty public String repository; + @JsonProperty public String name; + + @JsonProperty public String shardBackupId; + @JsonProperty public String async; } diff --git a/solr/core/src/java/org/apache/solr/cloud/SyncStrategy.java b/solr/core/src/java/org/apache/solr/cloud/SyncStrategy.java index cfbad7cd7e16..d400e90d14d4 100644 --- a/solr/core/src/java/org/apache/solr/cloud/SyncStrategy.java +++ b/solr/core/src/java/org/apache/solr/cloud/SyncStrategy.java @@ -74,11 +74,6 @@ public SyncStrategy(CoreContainer cc) { updateExecutor = updateShardHandler.getUpdateExecutor(); } - private static class ShardCoreRequest extends ShardRequest { - String coreName; - public String baseUrl; - } - public PeerSync.PeerSyncResult sync( ZkController zkController, SolrCore core, ZkNodeProps leaderProps) { return sync(zkController, core, leaderProps, false, false); @@ -322,8 
+317,8 @@ private void syncToMe( } else { RecoveryRequest rr = new RecoveryRequest(); rr.leaderProps = leaderProps; - rr.baseUrl = ((ShardCoreRequest) srsp.getShardRequest()).baseUrl; - rr.coreName = ((ShardCoreRequest) srsp.getShardRequest()).coreName; + rr.baseUrl = srsp.getShardRequest().nodeName; + rr.coreName = srsp.getShardRequest().coreName; recoveryRequests.add(rr); } } else { @@ -355,9 +350,9 @@ private boolean handleResponse(ShardResponse srsp) { private void requestSync( String baseUrl, String replica, String leaderUrl, String coreName, int nUpdates) { // TODO should we use peerSyncWithLeader instead? - ShardCoreRequest sreq = new ShardCoreRequest(); + ShardRequest sreq = new ShardRequest(); sreq.coreName = coreName; - sreq.baseUrl = baseUrl; + sreq.nodeName = baseUrl; sreq.purpose = ShardRequest.PURPOSE_PRIVATE; sreq.shards = new String[] {replica}; sreq.actualShards = sreq.shards; diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/InstallShardDataCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/InstallShardDataCmd.java index ca654a150aed..cc58912dc2a7 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/InstallShardDataCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/InstallShardDataCmd.java @@ -17,20 +17,27 @@ package org.apache.solr.cloud.api.collections; -import static org.apache.solr.cloud.Overseer.QUEUE_OPERATION; -import static org.apache.solr.common.params.CommonAdminParams.ASYNC; - +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.ObjectMapper; import java.lang.invoke.MethodHandles; -import java.util.HashMap; +import java.util.Collection; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; 
+import org.apache.solr.cloud.ZkShardTerms; +import org.apache.solr.common.SolrErrorWrappingException; import org.apache.solr.common.SolrException; import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.DocCollection; +import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.ZkNodeProps; -import org.apache.solr.common.params.CollectionParams; import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.util.NamedList; @@ -80,27 +87,114 @@ public void call(AdminCmdContext adminCmdContext, ZkNodeProps message, NamedList // Build the core-admin request final ModifiableSolrParams coreApiParams = new ModifiableSolrParams(); coreApiParams.set( - CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.INSTALLCOREDATA.toString()); - typedMessage.toMap(new HashMap<>()).forEach((k, v) -> coreApiParams.set(k, v.toString())); + CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.RESTORECORE.toString()); + coreApiParams.set(CoreAdminParams.BACKUP_LOCATION, typedMessage.location); + coreApiParams.set(CoreAdminParams.BACKUP_REPOSITORY, typedMessage.repository); + coreApiParams.set(CoreAdminParams.NAME, typedMessage.name); + coreApiParams.set(CoreAdminParams.SHARD_BACKUP_ID, typedMessage.shardBackupId); // Send the core-admin request to each replica in the slice final ShardHandler shardHandler = ccc.newShardHandler(); - shardRequestTracker.sliceCmd(clusterState, coreApiParams, null, installSlice, shardHandler); + List notLiveReplicas = + shardRequestTracker.sliceCmd(clusterState, coreApiParams, null, installSlice, shardHandler); final String errorMessage = String.format( Locale.ROOT, - "Could not install data to collection [%s] and shard [%s]", + "Could not install data to collection [%s] and shard [%s] on any leader-eligible replicas", typedMessage.collection, typedMessage.shard); - 
shardRequestTracker.processResponses(results, shardHandler, true, errorMessage); + shardRequestTracker.processResponses(results, shardHandler, false, errorMessage); + Collection allReplicas = + clusterState + .getCollection(typedMessage.collection) + .getSlice(typedMessage.shard) + .getReplicas(); + + // Ensure that terms are correct for this shard after the execution is done + // We only care about leader eligible replicas, all others will eventually get updated. + List leaderEligibleReplicas = + allReplicas.stream().filter(r -> r.getType().leaderEligible).collect(Collectors.toList()); + + NamedList failures = (NamedList) results.get("failure"); + Set successfulReplicas = + leaderEligibleReplicas.stream() + .filter(replica -> !notLiveReplicas.contains(replica)) + .filter( + replica -> + failures == null + || failures.get(CollectionHandlingUtils.requestKey(replica)) == null) + .collect(Collectors.toSet()); + + if (successfulReplicas.isEmpty()) { + // No leader-eligible replicas succeeded, return failure + if (failures == null) { + throw new SolrException( + SolrException.ErrorCode.SERVER_ERROR, + errorMessage + ". No leader-eligible replicas are live."); + } else { + throw new SolrErrorWrappingException( + SolrException.ErrorCode.SERVER_ERROR, + errorMessage, + Collections.singletonList(failures.asMap(1))); + } + } else if (successfulReplicas.size() < leaderEligibleReplicas.size()) { + // Some, but not all, leader-eligible replicas succeeded. 
+ // Ensure the shard terms are correct so that the non-successful replicas go into recovery + ZkShardTerms shardTerms = + ccc.getCoreContainer() + .getZkController() + .getShardTerms(typedMessage.collection, typedMessage.shard); + final Set replicasToStartRecovery = new HashSet<>(); + leaderEligibleReplicas.stream() + .filter(r -> !successfulReplicas.contains(r)) + .map(Replica::getName) + .forEach(replicasToStartRecovery::add); + log.info("Putting the unsuccessful replicas into recovery: {}", replicasToStartRecovery); + shardTerms.ensureHighestTerms( + installCollection, + successfulReplicas.stream().map(Replica::getName).collect(Collectors.toSet())); + ccc.getZkStateReader() + .waitForState( + typedMessage.collection, + 30, + TimeUnit.SECONDS, + (liveNodes, collectionState) -> { + collectionState.getSlice(typedMessage.shard).getReplicas().stream() + .filter(r -> Replica.State.RECOVERING.equals(r.getState())) + .map(Replica::getName) + .forEach(replicasToStartRecovery::remove); + return replicasToStartRecovery.isEmpty(); + }); + + // In order for the async request to succeed, we need to ensure that there is no failure + // message + NamedList successes = (NamedList) results.get("success"); + failures.forEach( + (replicaKey, value) -> { + successes.add( + replicaKey, + new NamedList<>( + Map.of( + "explanation", + "Core install failed, but is now recovering from the leader", + "failure", + value))); + }); + results.remove("failure"); + } else { + // other replicas to-be-created will know that they are out of date by + // looking at their term : 0 compare to term of this core : 1 + ccc.getCoreContainer() + .getZkController() + .getShardTerms(typedMessage.collection, typedMessage.shard) + .ensureHighestTermsAreNotZero(); + } } /** A value-type representing the message received by {@link InstallShardDataCmd} */ + @JsonIgnoreProperties(ignoreUnknown = true) public static class RemoteMessage implements JacksonReflectMapWriter { - @JsonProperty(QUEUE_OPERATION) - public 
String operation = CollectionParams.CollectionAction.INSTALLSHARDDATA.toLower(); - @JsonProperty public String collection; @JsonProperty public String shard; @@ -109,8 +203,9 @@ public static class RemoteMessage implements JacksonReflectMapWriter { @JsonProperty public String location; - @JsonProperty(ASYNC) - public String asyncId; + @JsonProperty public String name = ""; + + @JsonProperty public String shardBackupId; public void validate() { if (StrUtils.isBlank(collection)) { diff --git a/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java b/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java index c4ef360fa581..dabaf64420e4 100644 --- a/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/api/collections/RestoreCmd.java @@ -24,6 +24,7 @@ import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA; import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATE; import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESHARD; +import static org.apache.solr.common.params.CollectionParams.CollectionAction.INSTALLSHARDDATA; import static org.apache.solr.common.params.CollectionParams.CollectionAction.MODIFYCOLLECTION; import static org.apache.solr.common.params.CommonParams.NAME; @@ -108,7 +109,7 @@ public void call(AdminCmdContext adminCmdContext, ZkNodeProps message, NamedList } } - private void requestReplicasToRestore( + private void requestShardsToRestore( NamedList results, DocCollection restoreCollection, AdminCmdContext adminCmdContext, @@ -117,11 +118,13 @@ private void requestReplicasToRestore( String repo, ShardHandler shardHandler) { ShardRequestTracker shardRequestTracker = - CollectionHandlingUtils.asyncRequestTracker(adminCmdContext, ccc); + CollectionHandlingUtils.asyncRequestTracker(adminCmdContext, "/admin/collections", ccc); // Copy data from backed up index to 
each replica for (Slice slice : restoreCollection.getSlices()) { ModifiableSolrParams params = new ModifiableSolrParams(); - params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.RESTORECORE.toString()); + params.set(CollectionAdminParams.COLLECTION, slice.getCollection()); + params.set(CollectionAdminParams.SHARD, slice.getName()); + params.set(CoreAdminParams.ACTION, INSTALLSHARDDATA.toString()); Optional shardBackupId = backupProperties.getShardBackupIdFor(slice.getName()); if (shardBackupId.isPresent()) { params.set(CoreAdminParams.SHARD_BACKUP_ID, shardBackupId.get().getIdAsString()); @@ -130,11 +133,24 @@ private void requestReplicasToRestore( } params.set(CoreAdminParams.BACKUP_LOCATION, backupPath.toASCIIString()); params.set(CoreAdminParams.BACKUP_REPOSITORY, repo); - shardRequestTracker.sliceCmd( - adminCmdContext.getClusterState(), params, null, slice, shardHandler); + Replica replica = slice.getLeader(); + if (replica == null) { + replica = + slice.getReplicas().stream() + .findFirst() + .orElseThrow( + () -> + new SolrException( + ErrorCode.INVALID_STATE, + String.format( + Locale.ROOT, + "No replicas for shard %s in collection %s. 
Cannot restore to a shard with no replicas", + slice.getName(), + slice.getCollection()))); + } + shardRequestTracker.sendShardRequest(replica, params, shardHandler); } - shardRequestTracker.processResponses( - new NamedList<>(), shardHandler, true, "Could not restore core"); + shardRequestTracker.processResponses(results, shardHandler, true, "Could not restore shard"); } /** Encapsulates the parsing and access for common parameters restore parameters and values */ @@ -273,7 +289,7 @@ public void process(NamedList results, RestoreContext rc) throws Excepti // refresh the location copy of collection state restoreCollection = rc.zkStateReader.getClusterState().getCollection(rc.restoreCollectionName); - requestReplicasToRestore( + requestShardsToRestore( results, restoreCollection, rc.adminCmdContext.withClusterState(rc.zkStateReader.getClusterState()), @@ -625,7 +641,7 @@ public void process(RestoreContext rc, NamedList results) throws Excepti rc.adminCmdContext.withClusterState(rc.zkStateReader.getClusterState()), restoreCollection); try { - requestReplicasToRestore( + requestShardsToRestore( results, restoreCollection, rc.adminCmdContext.withClusterState(rc.zkStateReader.getClusterState()), @@ -649,8 +665,7 @@ private void disableReadOnly(AdminCmdContext adminCmdContext, DocCollection rest ZkStateReader.COLLECTION_PROP, restoreCollection.getName(), ZkStateReader.READ_ONLY, null); new CollApiCmds.ModifyCollectionCmd(ccc) - .call( - adminCmdContext.subRequestContext(MODIFYCOLLECTION, null), params, new NamedList<>()); + .call(adminCmdContext.subRequestContext(MODIFYCOLLECTION), params, new NamedList<>()); } private void enableReadOnly(AdminCmdContext adminCmdContext, DocCollection restoreCollection) @@ -662,8 +677,7 @@ private void enableReadOnly(AdminCmdContext adminCmdContext, DocCollection resto ZkStateReader.COLLECTION_PROP, restoreCollection.getName(), ZkStateReader.READ_ONLY, "true"); new CollApiCmds.ModifyCollectionCmd(ccc) - .call( - 
adminCmdContext.subRequestContext(MODIFYCOLLECTION, null), params, new NamedList<>()); + .call(adminCmdContext.subRequestContext(MODIFYCOLLECTION), params, new NamedList<>()); } } } diff --git a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java index be9f394d26c5..6e3c9e453d78 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/CollectionsHandler.java @@ -101,6 +101,7 @@ import static org.apache.solr.common.params.CommonParams.VALUE_LONG; import static org.apache.solr.common.params.CoreAdminParams.BACKUP_LOCATION; import static org.apache.solr.common.params.CoreAdminParams.BACKUP_REPOSITORY; +import static org.apache.solr.common.params.CoreAdminParams.SHARD_BACKUP_ID; import static org.apache.solr.common.util.StrUtils.formatString; import java.lang.invoke.MethodHandles; @@ -1068,6 +1069,8 @@ public Map execute( reqBody.async = req.getParams().get(ASYNC); reqBody.repository = req.getParams().get(BACKUP_REPOSITORY); reqBody.location = req.getParams().get(BACKUP_LOCATION); + reqBody.name = req.getParams().get(NAME); + reqBody.shardBackupId = req.getParams().get(SHARD_BACKUP_ID); final InstallShardData installApi = new InstallShardData(h.coreContainer, req, rsp); final SolrJerseyResponse installResponse = diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/InstallCoreData.java b/solr/core/src/java/org/apache/solr/handler/admin/api/InstallCoreData.java index a91a0688e0a0..5d6290e14975 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/api/InstallCoreData.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/api/InstallCoreData.java @@ -95,12 +95,6 @@ public SolrJerseyResponse installCoreData(String coreName, InstallCoreDataReques SolrException.ErrorCode.SERVER_ERROR, "Failed to install data to core=" + core.getName()); } - - // other replicas to-be-created 
will know that they are out of date by - // looking at their term : 0 compare to term of this core : 1 - zkController - .getShardTerms(cd.getCollectionName(), cd.getShardId()) - .ensureHighestTermsAreNotZero(); } return response; diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/InstallShardData.java b/solr/core/src/java/org/apache/solr/handler/admin/api/InstallShardData.java index 840bbeab4a52..4033fa6f0975 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/api/InstallShardData.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/api/InstallShardData.java @@ -78,10 +78,10 @@ public SubResponseAccumulatingJerseyResponse installShardData( // Only install data to shards which belong to a collection in read-only mode final DocCollection dc = coreContainer.getZkController().getZkStateReader().getCollection(collName); - if (!dc.isReadOnly()) { + if (dc.getSlice(shardName).getReplicas().size() > 1 && !dc.isReadOnly()) { throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, - "Collection must be in readOnly mode before installing data to shard"); + "Collection must be in readOnly mode before installing data to shard with more than 1 replica"); } submitRemoteMessageAndHandleResponse( @@ -112,6 +112,8 @@ public static ZkNodeProps createRemoteMessage( if (requestBody != null) { messageTyped.location = requestBody.location; messageTyped.repository = requestBody.repository; + messageTyped.name = requestBody.name; + messageTyped.shardBackupId = requestBody.shardBackupId; } messageTyped.validate(); diff --git a/solr/core/src/java/org/apache/solr/handler/admin/api/RestoreCore.java b/solr/core/src/java/org/apache/solr/handler/admin/api/RestoreCore.java index 3997b1971b4e..dcf1cfe85c1b 100644 --- a/solr/core/src/java/org/apache/solr/handler/admin/api/RestoreCore.java +++ b/solr/core/src/java/org/apache/solr/handler/admin/api/RestoreCore.java @@ -132,12 +132,6 @@ private void doRestore(String coreName, RestoreCoreRequestBody requestBody) 
thro throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Failed to restore core=" + core.getName()); } - // other replicas to-be-created will know that they are out of date by - // looking at their term : 0 compare to term of this core : 1 - coreContainer - .getZkController() - .getShardTerms(cd.getCollectionName(), cd.getShardId()) - .ensureHighestTermsAreNotZero(); // transitions state of update log to ACTIVE UpdateLog updateLog = core.getUpdateHandler().getUpdateLog(); diff --git a/solr/core/src/java/org/apache/solr/handler/component/ShardRequest.java b/solr/core/src/java/org/apache/solr/handler/component/ShardRequest.java index 5222b38abee3..ecaee01c7fa8 100644 --- a/solr/core/src/java/org/apache/solr/handler/component/ShardRequest.java +++ b/solr/core/src/java/org/apache/solr/handler/component/ShardRequest.java @@ -60,6 +60,9 @@ public class ShardRequest { /** may be null */ public String coreNodeName; + /** may be null */ + public String coreName; + /** may be null */ public Map headers; diff --git a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java index 60636d30c922..bc071699788e 100644 --- a/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/CollectionsAPISolrJTest.java @@ -113,8 +113,7 @@ public void testCreateWithDefaultConfigSet() throws Exception { assertEquals(0, (int) status.get("status")); assertTrue(status.get("QTime") > 0); } - // Sometimes multiple cores land on the same node so it's less than 4 - int nodesCreated = response.getCollectionNodesStatus().size(); + // Use of _default configset should generate a warning for data-driven functionality in // production use assertTrue( @@ -126,7 +125,7 @@ public void testCreateWithDefaultConfigSet() throws Exception { assertEquals(0, response.getStatus()); assertTrue(response.isSuccess()); Map> nodesStatus = 
response.getCollectionNodesStatus(); - assertEquals(nodesStatus.toString(), nodesCreated, nodesStatus.size()); + assertEquals(nodesStatus.toString(), 4, nodesStatus.size()); waitForState( "Expected " + collectionName + " to disappear from cluster state", @@ -246,36 +245,43 @@ public void testCreateAndDeleteCollection() throws Exception { assertTrue(status.get("QTime") > 0); } - // Sometimes multiple cores land on the same node so it's less than 4 - // int nodesCreated = response.getCollectionNodesStatus().size(); - // response = - // - // CollectionAdminRequest.deleteCollection(collectionName).process(cluster.getSolrClient()); - // - // assertEquals(0, response.getStatus()); - // assertTrue(response.isSuccess()); - // Map> nodesStatus = response.getCollectionNodesStatus(); - // // Delete could have been sent before the collection was finished coming online - // assertEquals(nodesStatus.toString(), nodesCreated, nodesStatus.size()); - // - // waitForState( - // "Expected " + collectionName + " to disappear from cluster state", - // collectionName, - // Objects::isNull); - // - // // Test Creating a new collection. 
- // collectionName = "solrj_test2"; - // - // response = - // CollectionAdminRequest.createCollection(collectionName, "conf", 2, 2) - // .process(cluster.getSolrClient()); - // assertEquals(0, response.getStatus()); - // assertTrue(response.isSuccess()); - // - // waitForState( - // "Expected " + collectionName + " to appear in cluster state", - // collectionName, - // Objects::nonNull); + waitForState( + "Expected " + collectionName + " to disappear from cluster state", + collectionName, + ((liveNodes, collectionState) -> + collectionState.getSlices().stream() + .flatMap( + s -> s.getReplicas(r -> !r.getState().equals(Replica.State.ACTIVE)).stream()) + .findAny() + .isEmpty())); + + response = + CollectionAdminRequest.deleteCollection(collectionName).process(cluster.getSolrClient()); + + assertEquals(0, response.getStatus()); + assertTrue(response.isSuccess()); + Map> nodesStatus = response.getCollectionNodesStatus(); + // Delete could have been sent before the collection was finished coming online + assertEquals(nodesStatus.toString(), 4, nodesStatus.size()); + + waitForState( + "Expected " + collectionName + " to disappear from cluster state", + collectionName, + Objects::isNull); + + // Test Creating a new collection. 
+ collectionName = "solrj_test2"; + + response = + CollectionAdminRequest.createCollection(collectionName, "conf", 2, 2) + .process(cluster.getSolrClient()); + assertEquals(0, response.getStatus()); + assertTrue(response.isSuccess()); + + waitForState( + "Expected " + collectionName + " to appear in cluster state", + collectionName, + Objects::nonNull); } @Test diff --git a/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java b/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java index f52b0fce8db1..1a8982e2c4f7 100644 --- a/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/ZkShardTermsRecoveryTest.java @@ -53,6 +53,13 @@ public static void setupCluster() throws Exception { .process(cluster.getSolrClient()) .getStatus()); cluster.waitForActiveCollection(COLLECTION, 10, TimeUnit.SECONDS, 2, NUM_SHARDS * NUM_REPLICAS); + UpdateRequest up = new UpdateRequest(); + + for (int i = 0; i < 200; i++) { + up.add("id", "id-" + i); + } + up.commit(cluster.getSolrClient(), COLLECTION); + NUM_DOCS += 200; } @Before diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/LocalFSCloudIncrementalBackupTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/LocalFSCloudIncrementalBackupTest.java index eabd81014806..6cf4e994d684 100644 --- a/solr/core/src/test/org/apache/solr/cloud/api/collections/LocalFSCloudIncrementalBackupTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/LocalFSCloudIncrementalBackupTest.java @@ -58,6 +58,12 @@ public class LocalFSCloudIncrementalBackupTest extends AbstractIncrementalBackup + " \n" + " \n" + " \n" + + " \n" + + " localfs\n" + + " ${hostPort:8983}\n" + + " \n" + " \n" + " localfs\n" + " \n" diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/LocalFSInstallShardTest.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/LocalFSInstallShardTest.java index 
690ff4471946..989c894b9783 100644 --- a/solr/core/src/test/org/apache/solr/cloud/api/collections/LocalFSInstallShardTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/LocalFSInstallShardTest.java @@ -30,6 +30,12 @@ public class LocalFSInstallShardTest extends AbstractInstallShardTest { + " \n" + " localfs\n" + " \n" + + " \n" + + " localfs\n" + + " ${hostPort:8983}\n" + + " \n" + " \n" + " \n" + " \n"; @@ -43,7 +49,7 @@ public static void setupClass() throws Exception { final String tmpDirPrefix = whitespacesInPath ? "my install" : "myinstall"; final String backupLocation = createTempDir(tmpDirPrefix).toAbsolutePath().toString(); - configureCluster(1) // nodes + configureCluster(2) // nodes .addConfig( "conf1", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf")) .withSolrXml(SOLR_XML.replace("ALLOWPATHS_TEMPLATE_VAL", backupLocation)) diff --git a/solr/modules/gcs-repository/src/test/org/apache/solr/gcs/GCSIncrementalBackupTest.java b/solr/modules/gcs-repository/src/test/org/apache/solr/gcs/GCSIncrementalBackupTest.java index 846563b929f5..d955da11e1eb 100644 --- a/solr/modules/gcs-repository/src/test/org/apache/solr/gcs/GCSIncrementalBackupTest.java +++ b/solr/modules/gcs-repository/src/test/org/apache/solr/gcs/GCSIncrementalBackupTest.java @@ -55,6 +55,12 @@ public class GCSIncrementalBackupTest extends AbstractIncrementalBackupTest { + " \n" + " \n" + " \n" + + " \n" + + " localfs\n" + + " ${hostPort:8983}\n" + + " \n" + " \n" + " localfs\n" + " \n" diff --git a/solr/modules/gcs-repository/src/test/org/apache/solr/gcs/GCSInstallShardTest.java b/solr/modules/gcs-repository/src/test/org/apache/solr/gcs/GCSInstallShardTest.java index ecb08fa01926..4b78c0cc8053 100644 --- a/solr/modules/gcs-repository/src/test/org/apache/solr/gcs/GCSInstallShardTest.java +++ b/solr/modules/gcs-repository/src/test/org/apache/solr/gcs/GCSInstallShardTest.java @@ -19,6 +19,7 @@ import 
com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.solr.cloud.api.collections.AbstractIncrementalBackupTest; import org.apache.solr.cloud.api.collections.AbstractInstallShardTest; import org.apache.solr.handler.admin.api.InstallShardData; import org.junit.AfterClass; @@ -40,6 +41,12 @@ public class GCSInstallShardTest extends AbstractInstallShardTest { + " \n" + " localfs\n" + " \n" + + " \n" + + " localfs\n" + + " ${hostPort:8983}\n" + + " \n" + " \n" + " someBucketName\n" + " backup1\n" @@ -51,7 +58,7 @@ public class GCSInstallShardTest extends AbstractInstallShardTest { @BeforeClass public static void setupClass() throws Exception { - configureCluster(1) // nodes + configureCluster(2) // nodes .addConfig("conf1", getFile("conf/solrconfig.xml").getParent()) .withSolrXml(SOLR_XML) .configure(); diff --git a/solr/modules/s3-repository/src/test/org/apache/solr/s3/S3IncrementalBackupTest.java b/solr/modules/s3-repository/src/test/org/apache/solr/s3/S3IncrementalBackupTest.java index 80c5207505b1..c35dbc17ab0e 100644 --- a/solr/modules/s3-repository/src/test/org/apache/solr/s3/S3IncrementalBackupTest.java +++ b/solr/modules/s3-repository/src/test/org/apache/solr/s3/S3IncrementalBackupTest.java @@ -22,6 +22,7 @@ import java.lang.invoke.MethodHandles; import org.apache.lucene.tests.util.LuceneTestCase; import org.apache.solr.cloud.api.collections.AbstractIncrementalBackupTest; +import org.apache.solr.util.LogLevel; import org.junit.BeforeClass; import org.junit.ClassRule; import org.slf4j.Logger; @@ -31,6 +32,9 @@ // Backups do checksum validation against a footer value not present in 'SimpleText' @LuceneTestCase.SuppressCodecs({"SimpleText"}) @ThreadLeakLingering(linger = 10) +@LogLevel( + value = + "org.apache.solr.cloud=DEBUG;org.apache.solr.cloud.api.collections=DEBUG;org.apache.solr.cloud.overseer=DEBUG") public class S3IncrementalBackupTest extends AbstractIncrementalBackupTest 
{ private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -64,6 +68,12 @@ public class S3IncrementalBackupTest extends AbstractIncrementalBackupTest { + " \n" + " \n" + " \n" + + " \n" + + " s3\n" + + " ${hostPort:8983}\n" + + " \n" + " \n" + " s3\n" + " \n" @@ -107,6 +117,11 @@ public static void setupClass() throws Exception { .addConfig("conf1", getFile("conf/solrconfig.xml").getParent()) .withSolrXml( SOLR_XML + // Only a single node will have a bad bucket name, all else should succeed. + // The bad node will be added later + .replace("BAD_BUCKET_ALL_BUT_ONE", "non-existent") + .replace("BAD_BUCKET_ONE", BUCKET_NAME) + .replace("BAD_BUCKET", BUCKET_NAME) .replace("BUCKET", BUCKET_NAME) .replace("REGION", Region.US_EAST_1.id()) .replace("ENDPOINT", "http://localhost:" + S3_MOCK_RULE.getHttpPort())) diff --git a/solr/modules/s3-repository/src/test/org/apache/solr/s3/S3InstallShardTest.java b/solr/modules/s3-repository/src/test/org/apache/solr/s3/S3InstallShardTest.java index 194b2ffddc6c..c44e2170a39e 100644 --- a/solr/modules/s3-repository/src/test/org/apache/solr/s3/S3InstallShardTest.java +++ b/solr/modules/s3-repository/src/test/org/apache/solr/s3/S3InstallShardTest.java @@ -20,6 +20,7 @@ import com.adobe.testing.s3mock.junit4.S3MockRule; import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering; import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.solr.cloud.api.collections.AbstractIncrementalBackupTest; import org.apache.solr.cloud.api.collections.AbstractInstallShardTest; import org.apache.solr.handler.admin.api.InstallShardData; import org.junit.BeforeClass; @@ -44,6 +45,12 @@ public class S3InstallShardTest extends AbstractInstallShardTest { + " \n" + " s3\n" + " \n" + + " \n" + + " s3\n" + + " ${hostPort:8983}\n" + + " \n" + " \n" + " BUCKET\n" + " REGION\n" @@ -65,7 +72,7 @@ public static void setupClass() throws Exception { AbstractS3ClientTest.setS3ConfFile(); - 
configureCluster(1) // nodes + configureCluster(2) // nodes .addConfig("conf1", getFile("conf/solrconfig.xml").getParent()) .withSolrXml( SOLR_XML diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java index f2bbfb87f9ba..858172994c80 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/MiniSolrCloudCluster.java @@ -511,6 +511,15 @@ public JettySolrRunner startJettySolrRunner() throws Exception { return startJettySolrRunner(newNodeName(), jettyConfig, null); } + /** + * Start a new Solr instance, using the default config but with a custom Solr xml + * + * @return a JettySolrRunner + */ + public JettySolrRunner startJettySolrRunner(String solrXml) throws Exception { + return startJettySolrRunner(newNodeName(), jettyConfig, solrXml); + } + /** * Add a previously stopped node back to the cluster on a different port * diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/api/collections/AbstractIncrementalBackupTest.java b/solr/test-framework/src/java/org/apache/solr/cloud/api/collections/AbstractIncrementalBackupTest.java index 5f62d6669f5b..6e2fa183af1c 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/api/collections/AbstractIncrementalBackupTest.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/api/collections/AbstractIncrementalBackupTest.java @@ -61,6 +61,7 @@ import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.Slice; import org.apache.solr.common.cloud.ZkStateReader; +import org.apache.solr.common.util.NamedList; import org.apache.solr.core.DirectoryFactory; import org.apache.solr.core.SolrCore; import org.apache.solr.core.TrackingBackupRepository; @@ -89,12 +90,13 @@ public abstract class AbstractIncrementalBackupTest extends SolrCloudTestCase { private static final Logger log = 
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private static long docsSeed; // see indexDocs() - protected static final int NUM_NODES = 2; + protected static final int NUM_NODES = 3; protected static final int NUM_SHARDS = 2; // granted we sometimes shard split to get more protected static final int LARGE_NUM_SHARDS = 11; // Periodically chosen via randomization protected static final int REPL_FACTOR = 2; protected static final String BACKUPNAME_PREFIX = "mytestbackup"; protected static final String BACKUP_REPO_NAME = "trackingBackupRepository"; + protected static final String ERROR_BACKUP_REPO_NAME = "errorBackupRepository"; protected String testSuffix = "test1"; @@ -235,7 +237,7 @@ public void testRestoreToOriginalCollection() throws Exception { @SuppressWarnings("unchecked") @Test @Nightly public void testBackupIncremental() throws Exception { setTestSuffix("testbackupinc"); CloudSolrClient solrClient = cluster.getSolrClient(); @@ -491,6 +493,125 @@ public void testBackupProperties() throws IOException { } } + @Test + public void testRestoreToOriginalSucceedsWithErrors() throws Exception { + setTestSuffix("testRestoreToOriginalSucceedsOnASingleError"); + final String backupCollectionName = getCollectionName(); + final String backupName = BACKUPNAME_PREFIX + testSuffix; + + // Bootstrap the backup collection with seed docs + CollectionAdminRequest.createCollection(backupCollectionName, "conf1", NUM_SHARDS, NUM_NODES) + .process(cluster.getSolrClient()); + int backupDocs = indexDocs(backupCollectionName, true); + + // Backup and immediately add more docs to the collection + try (BackupRepository repository = + cluster + .getJettySolrRunner(0) + .getCoreContainer() + .newBackupRepository(ERROR_BACKUP_REPO_NAME)) { + final String backupLocation = repository.getBackupLocation(getBackupLocation()); + final RequestStatusState result = + CollectionAdminRequest.backupCollection(backupCollectionName, backupName) + 
.setBackupConfigset(false) + .setLocation(backupLocation) + .setRepositoryName(ERROR_BACKUP_REPO_NAME) + .processAndWait(cluster.getSolrClient(), 20); + assertEquals(RequestStatusState.COMPLETED, result); + } + assertEquals(backupDocs, getNumDocsInCollection(backupCollectionName)); + clearDocs(backupCollectionName); + assertEquals(0, getNumDocsInCollection(backupCollectionName)); + + /* + Restore original docs and validate that doc count is correct + */ + // Test a single bad node + try (BackupRepository repository = + cluster + .getJettySolrRunner(0) + .getCoreContainer() + .newBackupRepository(ERROR_BACKUP_REPO_NAME)) { + // Only the first jetty will fail + ErrorThrowingTrackingBackupRepository.portsToFailOn = + Set.of(cluster.getJettySolrRunner(0).getLocalPort()); + final String backupLocation = repository.getBackupLocation(getBackupLocation()); + final RequestStatusState result = + CollectionAdminRequest.restoreCollection(backupCollectionName, backupName) + .setLocation(backupLocation) + .setRepositoryName(ERROR_BACKUP_REPO_NAME) + .processAndWait(cluster.getSolrClient(), 30); + assertEquals(RequestStatusState.COMPLETED, result); + waitForState( + "The failed core-install should recover and become healthy", + backupCollectionName, + 30, + TimeUnit.SECONDS, + SolrCloudTestCase.activeClusterShape(NUM_SHARDS, NUM_SHARDS * NUM_NODES)); + } + assertEquals(backupDocs, getNumDocsInCollection(backupCollectionName)); + clearDocs(backupCollectionName); + assertEquals(0, getNumDocsInCollection(backupCollectionName)); + + // Test a single good node + try (BackupRepository repository = + cluster + .getJettySolrRunner(0) + .getCoreContainer() + .newBackupRepository(ERROR_BACKUP_REPO_NAME)) { + final String backupLocation = repository.getBackupLocation(getBackupLocation()); + // All but the first jetty will fail + ErrorThrowingTrackingBackupRepository.portsToFailOn = + cluster.getJettySolrRunners().subList(1, NUM_NODES).stream() + .map(JettySolrRunner::getLocalPort) + 
.collect(Collectors.toSet()); + final RequestStatusState result = + CollectionAdminRequest.restoreCollection(backupCollectionName, backupName) + .setLocation(backupLocation) + .setRepositoryName(ERROR_BACKUP_REPO_NAME) + .processAndWait(cluster.getSolrClient(), 30); + assertEquals(RequestStatusState.COMPLETED, result); + waitForState( + "The failed core-install should recover and become healthy", + backupCollectionName, + 30, + TimeUnit.SECONDS, + SolrCloudTestCase.activeClusterShape(NUM_SHARDS, NUM_SHARDS * NUM_NODES)); + } + assertEquals(backupDocs, getNumDocsInCollection(backupCollectionName)); + } + + public static class ErrorThrowingTrackingBackupRepository extends TrackingBackupRepository { + + public static Set<Integer> portsToFailOn = new HashSet<>(); + + private int port; + + @Override + public void init(NamedList<?> args) { + super.init(args); + port = Integer.parseInt((String) args.get("hostPort")); + } + + @Override + public void copyFileTo(URI sourceRepo, String fileName, Directory dest) throws IOException { + if (portsToFailOn.contains(port)) { + throw new UnsupportedOperationException(); + } + super.copyFileTo(sourceRepo, fileName, dest); + } + + @Override + public void copyIndexFileTo( + URI sourceRepo, String sourceFileName, Directory dest, String destFileName) + throws IOException { + if (portsToFailOn.contains(port)) { + throw new UnsupportedOperationException(); + } + super.copyIndexFileTo(sourceRepo, sourceFileName, dest, destFileName); + } + } + protected void corruptIndexFiles() throws IOException { List<Slice> slices = new ArrayList<>(getCollectionState(getCollectionName()).getSlices()); Replica leader = slices.get(random().nextInt(slices.size())).getLeader(); @@ -567,6 +688,14 @@ private void simpleRestoreAndCheckDocCount( CollectionAdminRequest.deleteCollection(restoreCollectionName).process(solrClient); }
CollectionAdminRequest.deleteCollection(collectionName).process(cluster.getSolrClient()); + CollectionAdminRequest.createCollection(collectionName, "conf1", NUM_SHARDS, NUM_NODES) + .process(cluster.getSolrClient()); + + log.info("Cleared all docs in collection: {}", collectionName); + } + private void indexDocs(String collectionName, int numDocs, boolean useUUID) throws Exception { Random random = new Random(docsSeed); @@ -605,7 +734,7 @@ private void randomlyPrecreateRestoreCollection( } } - private long getNumDocsInCollection(String collectionName) throws Exception { + protected long getNumDocsInCollection(String collectionName) throws Exception { return new QueryRequest(new SolrQuery("*:*")) .process(cluster.getSolrClient(), collectionName) .getResults() diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/api/collections/AbstractInstallShardTest.java b/solr/test-framework/src/java/org/apache/solr/cloud/api/collections/AbstractInstallShardTest.java index ac1cc7b2b449..86d12f4a7bdf 100644 --- a/solr/test-framework/src/java/org/apache/solr/cloud/api/collections/AbstractInstallShardTest.java +++ b/solr/test-framework/src/java/org/apache/solr/cloud/api/collections/AbstractInstallShardTest.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.Map; import java.util.Random; +import java.util.Set; import java.util.UUID; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; @@ -72,6 +73,7 @@ public abstract class AbstractInstallShardTest extends SolrCloudTestCase { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); protected static final String BACKUP_REPO_NAME = "trackingBackupRepository"; + protected static final String ERROR_BACKUP_REPO_NAME = "errorBackupRepository"; private static long docsSeed; // see indexDocs() @@ -93,20 +95,20 @@ public void deleteTestCollections() throws Exception { } } - private String deleteAfterTest(String collName) { + protected String 
deleteAfterTest(String collName) { collectionsToDelete.add(collName); return collName; } // Populated by 'bootstrapBackupRepositoryData' - private static int singleShardNumDocs = -1; - private static int replicasPerShard = -1; - private static int multiShardNumDocs = -1; - private static URI singleShard1Uri = null; - private static URI nonExistentLocationUri = null; - private static URI[] multiShardUris = null; + protected static int singleShardNumDocs = -1; + protected static int replicasPerShard = -1; + protected static int multiShardNumDocs = -1; + protected static URI singleShard1Uri = null; + protected static URI nonExistentLocationUri = null; + protected static URI[] multiShardUris = null; - private List collectionsToDelete; + protected List collectionsToDelete; public static void bootstrapBackupRepositoryData(String baseRepositoryLocation) throws Exception { final int numShards = /*random().nextInt(3) + 2*/ 4; @@ -175,6 +177,12 @@ public void testInstallToSingleShardCollection() throws Exception { CollectionAdminRequest.installDataToShard( collectionName, "shard1", singleShardLocation, BACKUP_REPO_NAME) .process(cluster.getSolrClient()); + waitForState( + "The failed core-install (previous leader) should recover and become healthy", + collectionName, + 30, + TimeUnit.SECONDS, + SolrCloudTestCase.activeClusterShape(1, replicasPerShard)); assertCollectionHasNumDocs(collectionName, singleShardNumDocs); } @@ -238,6 +246,45 @@ public void testParallelInstallToMultiShardCollection() throws Exception { assertCollectionHasNumDocs(collectionName, multiShardNumDocs); } + @Test + public void testInstallSucceedsOnASingleError() throws Exception { + final String collectionName = createAndAwaitEmptyCollection(1, 2); + deleteAfterTest(collectionName); + enableReadOnly(collectionName); + + AbstractIncrementalBackupTest.ErrorThrowingTrackingBackupRepository.portsToFailOn = + Set.of(cluster.getJettySolrRunner(0).getLocalPort()); + final String singleShardLocation = 
singleShard1Uri.toString(); + { // Test synchronous request error reporting + CollectionAdminRequest.installDataToShard( + collectionName, "shard1", singleShardLocation, ERROR_BACKUP_REPO_NAME) + .process(cluster.getSolrClient()); + waitForState( + "The failed core-install should recover and become healthy", + collectionName, + 30, + TimeUnit.SECONDS, + SolrCloudTestCase.activeClusterShape(1, 2)); + assertCollectionHasNumDocs(collectionName, singleShardNumDocs); + } + + { // Test asynchronous request error reporting + final var requestStatusState = + CollectionAdminRequest.installDataToShard( + collectionName, "shard1", singleShardLocation, ERROR_BACKUP_REPO_NAME) + .processAndWait(cluster.getSolrClient(), 15); + + assertEquals(RequestStatusState.COMPLETED, requestStatusState); + waitForState( + "The failed core-install should recover and become healthy", + collectionName, + 30, + TimeUnit.SECONDS, + SolrCloudTestCase.activeClusterShape(1, 2)); + assertCollectionHasNumDocs(collectionName, singleShardNumDocs); + } + } + /** * Builds a string representation of a valid solr.xml configuration, with the provided * backup-repository configuration inserted @@ -272,7 +319,7 @@ public static String defaultSolrXmlTextWithBackupRepository(String backupReposit + "\n"; } - private static void assertCollectionHasNumDocs(String collection, int expectedNumDocs) + protected static void assertCollectionHasNumDocs(String collection, int expectedNumDocs) throws Exception { final SolrClient solrClient = cluster.getSolrClient(); assertEquals( @@ -364,7 +411,7 @@ private static void indexDocs(String collectionName, int numDocs, boolean useUUI log.info("Indexed {} docs to collection: {}", numDocs, collectionName); } - private static String createAndAwaitEmptyCollection(int numShards, int replicasPerShard) + protected static String createAndAwaitEmptyCollection(int numShards, int replicasPerShard) throws Exception { final SolrClient solrClient = cluster.getSolrClient(); @@ -377,7 +424,7 @@ 
private static String createAndAwaitEmptyCollection(int numShards, int replicasP return collectionName; } - private static void enableReadOnly(String collectionName) throws Exception { + protected static void enableReadOnly(String collectionName) throws Exception { CollectionAdminRequest.modifyCollection(collectionName, Map.of("readOnly", true)) .process(cluster.getSolrClient()); }