diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index f2ae5fb3dd1..de1d93d9754 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -75,6 +75,9 @@ Bug Fixes more than the given term's frequency in overridden FloatDocValues.floatVal(). (Michael Kosten, Erik Hatcher, Steve Rowe) +* SOLR-11484: CloudSolrClient does not invalidate cache or retry for RouteException (noble, hossman) + + Optimizations ---------------------- * SOLR-11285: Refactor autoscaling framework to avoid direct references to Zookeeper and Solr diff --git a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java index f63eedd7426..772c74139c2 100644 --- a/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java +++ b/solr/solrj/src/java/org/apache/solr/client/solrj/impl/CloudSolrClient.java @@ -607,6 +607,14 @@ public class CloudSolrClient extends SolrClient { String name = slice.getName(); List urls = new ArrayList<>(); Replica leader = slice.getLeader(); + if (directUpdatesToLeadersOnly && leader == null) { + for (Replica replica : slice.getReplicas( + replica -> replica.isActive(getClusterStateProvider().getLiveNodes()) + && replica.getType() == Replica.Type.NRT)) { + leader = replica; + break; + } + } if (leader == null) { if (directUpdatesToLeadersOnly) { continue; @@ -908,7 +916,7 @@ public class CloudSolrClient extends SolrClient { rootCause instanceof NoHttpResponseException || rootCause instanceof SocketException); - if (wasCommError) { + if (wasCommError || (exc instanceof RouteException)) { // it was a communication error. it is likely that // the node to which the request to be sent is down . So , expire the state // so that the next attempt would fetch the fresh state diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientTest.java index 2a441b90ab3..8779a7108db 100644 --- a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientTest.java +++ b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientTest.java @@ -787,6 +787,65 @@ public class CloudSolrClientTest extends SolrCloudTestCase { } } + public void testRetryUpdatesWhenClusterStateIsStale() throws Exception { + final String COL = "stale_state_test_col"; + assert cluster.getJettySolrRunners().size() >= 2; + + final JettySolrRunner old_leader_node = cluster.getJettySolrRunners().get(0); + final JettySolrRunner new_leader_node = cluster.getJettySolrRunners().get(1); + + // start with exactly 1 shard/replica... + assertEquals("Couldn't create collection", 0, + CollectionAdminRequest.createCollection(COL, "conf", 1, 1) + .setCreateNodeSet(old_leader_node.getNodeName()) + .process(cluster.getSolrClient()).getStatus()); + AbstractDistribZkTestBase.waitForRecoveriesToFinish + (COL, cluster.getSolrClient().getZkStateReader(), true, true, 330); + + // determine the coreNodeName of only current replica + Collection slices = cluster.getSolrClient().getZkStateReader().getClusterState().getCollection(COL).getSlices(); + assertEquals(1, slices.size()); // sanity check + Slice slice = slices.iterator().next(); + assertEquals(1, slice.getReplicas().size()); // sanity check + final String old_leader_core_node_name = slice.getLeader().getName(); + + // NOTE: creating our own CloudSolrClient whose settings we can muck with... + try (CloudSolrClient stale_client = getCloudSolrClient(cluster.getZkServer().getZkAddress())) { + // don't let collection cache entries get expired, even on a slow machine... + stale_client.setCollectionCacheTTl(Integer.MAX_VALUE); + stale_client.setDefaultCollection(COL); + + // do a query to populate stale_client's cache... + assertEquals(0, stale_client.query(new SolrQuery("*:*")).getResults().getNumFound()); + + // add 1 replica on a diff node... + assertEquals("Couldn't create collection", 0, + CollectionAdminRequest.addReplicaToShard(COL, "shard1") + .setNode(new_leader_node.getNodeName()) + // NOTE: don't use our stale_client for this -- don't tip it off of a collection change + .process(cluster.getSolrClient()).getStatus()); + AbstractDistribZkTestBase.waitForRecoveriesToFinish + (COL, cluster.getSolrClient().getZkStateReader(), true, true, 330); + + // ...and delete our original leader. + assertEquals("Couldn't create collection", 0, + CollectionAdminRequest.deleteReplica(COL, "shard1", old_leader_core_node_name) + // NOTE: don't use our stale_client for this -- don't tip it off of a collection change + .process(cluster.getSolrClient()).getStatus()); + AbstractDistribZkTestBase.waitForRecoveriesToFinish + (COL, cluster.getSolrClient().getZkStateReader(), true, true, 330); + + // stale_client's collection state cache should now only point at a leader that no longer exists. + + // attempt a (direct) update that should succeed in spite of cached cluster state + // pointing solely to a node that's no longer part of our collection... + assertEquals(0, (new UpdateRequest().add("id", "1").commit(stale_client, COL)).getStatus()); + assertEquals(1, stale_client.query(new SolrQuery("*:*")).getResults().getNumFound()); + + } + } + + private static void checkSingleServer(NamedList response) { final CloudSolrClient.RouteResponse rr = (CloudSolrClient.RouteResponse) response; final Map routes = rr.getRoutes();