SOLR-11484: CloudSolrClient does not invalidate cache or retry for RouteException

This commit is contained in:
Noble Paul 2017-10-27 15:28:24 +10:30
parent 161be0a4ae
commit 0d29f7a1a2
3 changed files with 71 additions and 1 deletions

View File

@ -75,6 +75,9 @@ Bug Fixes
more than the given term's frequency in overridden FloatDocValues.floatVal(). more than the given term's frequency in overridden FloatDocValues.floatVal().
(Michael Kosten, Erik Hatcher, Steve Rowe) (Michael Kosten, Erik Hatcher, Steve Rowe)
* SOLR-11484: CloudSolrClient does not invalidate cache or retry for RouteException (noble, hossman)
Optimizations Optimizations
---------------------- ----------------------
* SOLR-11285: Refactor autoscaling framework to avoid direct references to Zookeeper and Solr * SOLR-11285: Refactor autoscaling framework to avoid direct references to Zookeeper and Solr

View File

@ -607,6 +607,14 @@ public class CloudSolrClient extends SolrClient {
String name = slice.getName(); String name = slice.getName();
List<String> urls = new ArrayList<>(); List<String> urls = new ArrayList<>();
Replica leader = slice.getLeader(); Replica leader = slice.getLeader();
if (directUpdatesToLeadersOnly && leader == null) {
for (Replica replica : slice.getReplicas(
replica -> replica.isActive(getClusterStateProvider().getLiveNodes())
&& replica.getType() == Replica.Type.NRT)) {
leader = replica;
break;
}
}
if (leader == null) { if (leader == null) {
if (directUpdatesToLeadersOnly) { if (directUpdatesToLeadersOnly) {
continue; continue;
@ -908,7 +916,7 @@ public class CloudSolrClient extends SolrClient {
rootCause instanceof NoHttpResponseException || rootCause instanceof NoHttpResponseException ||
rootCause instanceof SocketException); rootCause instanceof SocketException);
if (wasCommError) { if (wasCommError || (exc instanceof RouteException)) {
// it was a communication error. it is likely that // it was a communication error. it is likely that
// the node to which the request was to be sent is down. So, expire the state // the node to which the request was to be sent is down. So, expire the state
// so that the next attempt would fetch the fresh state // so that the next attempt would fetch the fresh state

View File

@ -787,6 +787,65 @@ public class CloudSolrClientTest extends SolrCloudTestCase {
} }
} }
/**
 * Checks that an update sent through a {@link CloudSolrClient} succeeds even though the
 * client's cached collection state only knows about a replica that has since been deleted.
 *
 * <p>Steps: create a 1 shard / 1 replica collection on the first node, warm a dedicated
 * client's collection cache with a query, add a replica on the second node, delete the
 * original replica, and finally send an update plus a query through the dedicated client.
 * All collection changes are issued through the cluster's own client so that the dedicated
 * client is never told that the collection changed.
 *
 * @throws Exception if any of the admin, update or query requests fails
 */
public void testRetryUpdatesWhenClusterStateIsStale() throws Exception {
  final String COL = "stale_state_test_col";
  // the scenario needs two distinct nodes: one for the original replica, one for its replacement
  assert cluster.getJettySolrRunners().size() >= 2;

  final JettySolrRunner old_leader_node = cluster.getJettySolrRunners().get(0);
  final JettySolrRunner new_leader_node = cluster.getJettySolrRunners().get(1);

  // start with exactly 1 shard/replica...
  assertEquals("Couldn't create collection", 0,
               CollectionAdminRequest.createCollection(COL, "conf", 1, 1)
               .setCreateNodeSet(old_leader_node.getNodeName())
               .process(cluster.getSolrClient()).getStatus());
  AbstractDistribZkTestBase.waitForRecoveriesToFinish
    (COL, cluster.getSolrClient().getZkStateReader(), true, true, 330);

  // determine the coreNodeName of the only current replica
  Collection<Slice> slices = cluster.getSolrClient().getZkStateReader().getClusterState().getCollection(COL).getSlices();
  assertEquals(1, slices.size()); // sanity check
  Slice slice = slices.iterator().next();
  assertEquals(1, slice.getReplicas().size()); // sanity check
  final String old_leader_core_node_name = slice.getLeader().getName();

  // NOTE: creating our own CloudSolrClient whose settings we can muck with...
  try (CloudSolrClient stale_client = getCloudSolrClient(cluster.getZkServer().getZkAddress())) {
    // don't let collection cache entries get expired, even on a slow machine...
    stale_client.setCollectionCacheTTl(Integer.MAX_VALUE);
    stale_client.setDefaultCollection(COL);

    // do a query to populate stale_client's cache...
    assertEquals(0, stale_client.query(new SolrQuery("*:*")).getResults().getNumFound());

    // add 1 replica on a diff node...
    assertEquals("Couldn't add replica", 0,
                 CollectionAdminRequest.addReplicaToShard(COL, "shard1")
                 .setNode(new_leader_node.getNodeName())
                 // NOTE: don't use our stale_client for this -- don't tip it off of a collection change
                 .process(cluster.getSolrClient()).getStatus());
    AbstractDistribZkTestBase.waitForRecoveriesToFinish
      (COL, cluster.getSolrClient().getZkStateReader(), true, true, 330);

    // ...and delete our original leader.
    assertEquals("Couldn't delete replica", 0,
                 CollectionAdminRequest.deleteReplica(COL, "shard1", old_leader_core_node_name)
                 // NOTE: don't use our stale_client for this -- don't tip it off of a collection change
                 .process(cluster.getSolrClient()).getStatus());
    AbstractDistribZkTestBase.waitForRecoveriesToFinish
      (COL, cluster.getSolrClient().getZkStateReader(), true, true, 330);

    // stale_client's collection state cache should now only point at a leader that no longer exists.

    // attempt a (direct) update that should succeed in spite of cached cluster state
    // pointing solely to a node that's no longer part of our collection...
    assertEquals(0, (new UpdateRequest().add("id", "1").commit(stale_client, COL)).getStatus());
    // the document just added must be visible through the same client
    assertEquals(1, stale_client.query(new SolrQuery("*:*")).getResults().getNumFound());
  }
}
private static void checkSingleServer(NamedList<Object> response) { private static void checkSingleServer(NamedList<Object> response) {
final CloudSolrClient.RouteResponse rr = (CloudSolrClient.RouteResponse) response; final CloudSolrClient.RouteResponse rr = (CloudSolrClient.RouteResponse) response;
final Map<String,LBHttpSolrClient.Req> routes = rr.getRoutes(); final Map<String,LBHttpSolrClient.Req> routes = rr.getRoutes();