SOLR-11484: CloudSolrClient does not invalidate cache or retry for RouteException

This commit is contained in:
Noble Paul 2017-10-27 15:28:24 +10:30
parent 161be0a4ae
commit 0d29f7a1a2
3 changed files with 71 additions and 1 deletions

View File

@ -75,6 +75,9 @@ Bug Fixes
more than the given term's frequency in overridden FloatDocValues.floatVal().
(Michael Kosten, Erik Hatcher, Steve Rowe)
* SOLR-11484: CloudSolrClient does not invalidate cache or retry for RouteException (noble, hossman)
Optimizations
----------------------
* SOLR-11285: Refactor autoscaling framework to avoid direct references to Zookeeper and Solr

View File

@ -607,6 +607,14 @@ public class CloudSolrClient extends SolrClient {
String name = slice.getName();
List<String> urls = new ArrayList<>();
Replica leader = slice.getLeader();
if (directUpdatesToLeadersOnly && leader == null) {
for (Replica replica : slice.getReplicas(
replica -> replica.isActive(getClusterStateProvider().getLiveNodes())
&& replica.getType() == Replica.Type.NRT)) {
leader = replica;
break;
}
}
if (leader == null) {
if (directUpdatesToLeadersOnly) {
continue;
@ -908,7 +916,7 @@ public class CloudSolrClient extends SolrClient {
rootCause instanceof NoHttpResponseException ||
rootCause instanceof SocketException);
if (wasCommError) {
if (wasCommError || (exc instanceof RouteException)) {
// it was a communication error. it is likely that
// the node to which the request to be sent is down . So , expire the state
// so that the next attempt would fetch the fresh state

View File

@ -787,6 +787,65 @@ public class CloudSolrClientTest extends SolrCloudTestCase {
}
}
public void testRetryUpdatesWhenClusterStateIsStale() throws Exception {
final String COL = "stale_state_test_col";
assert cluster.getJettySolrRunners().size() >= 2;
final JettySolrRunner old_leader_node = cluster.getJettySolrRunners().get(0);
final JettySolrRunner new_leader_node = cluster.getJettySolrRunners().get(1);
// start with exactly 1 shard/replica...
assertEquals("Couldn't create collection", 0,
CollectionAdminRequest.createCollection(COL, "conf", 1, 1)
.setCreateNodeSet(old_leader_node.getNodeName())
.process(cluster.getSolrClient()).getStatus());
AbstractDistribZkTestBase.waitForRecoveriesToFinish
(COL, cluster.getSolrClient().getZkStateReader(), true, true, 330);
// determine the coreNodeName of only current replica
Collection<Slice> slices = cluster.getSolrClient().getZkStateReader().getClusterState().getCollection(COL).getSlices();
assertEquals(1, slices.size()); // sanity check
Slice slice = slices.iterator().next();
assertEquals(1, slice.getReplicas().size()); // sanity check
final String old_leader_core_node_name = slice.getLeader().getName();
// NOTE: creating our own CloudSolrClient whose settings we can muck with...
try (CloudSolrClient stale_client = getCloudSolrClient(cluster.getZkServer().getZkAddress())) {
// don't let collection cache entries get expired, even on a slow machine...
stale_client.setCollectionCacheTTl(Integer.MAX_VALUE);
stale_client.setDefaultCollection(COL);
// do a query to populate stale_client's cache...
assertEquals(0, stale_client.query(new SolrQuery("*:*")).getResults().getNumFound());
// add 1 replica on a diff node...
assertEquals("Couldn't create collection", 0,
CollectionAdminRequest.addReplicaToShard(COL, "shard1")
.setNode(new_leader_node.getNodeName())
// NOTE: don't use our stale_client for this -- don't tip it off of a collection change
.process(cluster.getSolrClient()).getStatus());
AbstractDistribZkTestBase.waitForRecoveriesToFinish
(COL, cluster.getSolrClient().getZkStateReader(), true, true, 330);
// ...and delete our original leader.
assertEquals("Couldn't create collection", 0,
CollectionAdminRequest.deleteReplica(COL, "shard1", old_leader_core_node_name)
// NOTE: don't use our stale_client for this -- don't tip it off of a collection change
.process(cluster.getSolrClient()).getStatus());
AbstractDistribZkTestBase.waitForRecoveriesToFinish
(COL, cluster.getSolrClient().getZkStateReader(), true, true, 330);
// stale_client's collection state cache should now only point at a leader that no longer exists.
// attempt a (direct) update that should succeed in spite of cached cluster state
// pointing solely to a node that's no longer part of our collection...
assertEquals(0, (new UpdateRequest().add("id", "1").commit(stale_client, COL)).getStatus());
assertEquals(1, stale_client.query(new SolrQuery("*:*")).getResults().getNumFound());
}
}
private static void checkSingleServer(NamedList<Object> response) {
final CloudSolrClient.RouteResponse rr = (CloudSolrClient.RouteResponse) response;
final Map<String,LBHttpSolrClient.Req> routes = rr.getRoutes();