SOLR-12716: NodeLostTrigger should support deleting replicas from lost nodes by setting preferredOperation=deletenode

This commit adds support for the preferredOperation configuration for NodeLostTrigger. The ComputePlanAction now creates a DeleteNodeSuggester for each lost node serially (one node per computed plan) when preferredOperation=deletenode. A new section for the node lost trigger, with examples, is added to the ref guide.
Shalin Shekhar Mangar 2018-09-05 15:40:10 +05:30
parent 6be01e2ade
commit b6ee0ed5d5
9 changed files with 164 additions and 19 deletions
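
Before the file-by-file diffs, here is a minimal SolrJ sketch of installing the trigger this commit enables. The v2 endpoint path, the `V2Request` builder calls, and the ZooKeeper address are assumptions based on the 7.x client API rather than anything in this commit; the ref guide hunk at the end of the diff shows the authoritative payload.

[source,java]
----
import java.util.Collections;
import java.util.Optional;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.V2Request;

public class SetNodeLostTriggerExample {
  public static void main(String[] args) throws Exception {
    try (SolrClient client = new CloudSolrClient.Builder(
        Collections.singletonList("localhost:2181"), Optional.empty()).build()) {
      // Solr's command parser is lenient about single-quoted JSON; the tests
      // in this commit use the same style.
      String setTrigger = "{" +
          "'set-trigger': {" +
          "'name': 'node_lost_trigger'," +
          "'event': 'nodeLost'," +
          "'waitFor': '120s'," +
          "'preferredOperation': 'DELETENODE'" + // the option added by SOLR-12716
          "}}";
      new V2Request.Builder("/cluster/autoscaling")
          .withMethod(SolrRequest.METHOD.POST)
          .withPayload(setTrigger)
          .build()
          .process(client);
    }
  }
}
----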

solr/CHANGES.txt

@@ -201,6 +201,9 @@ New Features
* SOLR-11861: When creating a configSet via the API, the "baseConfigSet" parameter now defaults to "_default".
(Amrit Sarkar, David Smiley)
+* SOLR-12716: NodeLostTrigger should support deleting replicas from lost nodes by setting preferredOperation=deletenode.
+  (shalin)
Bug Fixes
----------------------

DeleteNodeCmd.java

@@ -25,7 +25,6 @@ import java.util.Locale;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
-import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
@@ -55,9 +54,6 @@ public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd {
  public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
    ocmh.checkRequired(message, "node");
    String node = message.getStr("node");
-    if (!state.liveNodesContain(node)) {
-      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + node + " is not live");
-    }
    List<ZkNodeProps> sourceReplicas = ReplaceNodeCmd.getReplicasOfNode(node, state);
    List<String> singleReplicas = verifyReplicaAvailability(sourceReplicas, state);
    if (!singleReplicas.isEmpty()) {
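
The check removed above is what previously made DELETENODE fail fast with "Source Node: ... is not live"; dropping it lets the command target a node that has already left the cluster, which is exactly what a `nodeLost` event reports. A hedged SolrJ sketch of the call this unblocks; the node name literal is illustrative, not from this commit:

[source,java]
----
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;

public class DeleteLostNodeExample {
  // Deletes all replica state for a node that is no longer live, e.g. one
  // named by a nodeLost event, such as "192.168.1.10:8983_solr".
  static void deleteLostNode(SolrClient client, String lostNodeName) throws Exception {
    CollectionAdminRequest.deleteNode(lostNodeName).process(client);
  }
}
----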

ComputePlanAction.java

@@ -48,6 +48,8 @@ import org.apache.solr.core.SolrResourceLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static org.apache.solr.cloud.autoscaling.TriggerEvent.NODE_NAMES;
/**
* This class is responsible for using the configured policy and preferences
* with the hints provided by the trigger event to compute the required cluster operations.
@@ -206,8 +208,29 @@ public class ComputePlanAction extends TriggerActionBase {
        suggester = getNodeAddedSuggester(cloudManager, session, event);
        break;
      case NODELOST:
-        suggester = session.getSuggester(CollectionParams.CollectionAction.MOVEREPLICA)
-            .hint(Suggester.Hint.SRC_NODE, event.getProperty(TriggerEvent.NODE_NAMES));
+        String preferredOp = (String) event.getProperty(AutoScalingParams.PREFERRED_OP, CollectionParams.CollectionAction.MOVEREPLICA.toLower());
+        CollectionParams.CollectionAction action = CollectionParams.CollectionAction.get(preferredOp);
+        switch (action) {
+          case MOVEREPLICA:
+            suggester = session.getSuggester(action)
+                .hint(Suggester.Hint.SRC_NODE, event.getProperty(NODE_NAMES));
+            break;
+          case DELETENODE:
+            int start = (Integer) event.getProperty(START, 0);
+            List<String> srcNodes = (List<String>) event.getProperty(NODE_NAMES);
+            if (srcNodes.isEmpty() || start >= srcNodes.size()) {
+              return NoneSuggester.get(session);
+            }
+            String sourceNode = srcNodes.get(start);
+            suggester = session.getSuggester(action)
+                .hint(Suggester.Hint.SRC_NODE, Collections.singletonList(sourceNode));
+            event.getProperties().put(START, ++start);
+            break;
+          case NONE:
+            return NoneSuggester.get(session);
+          default:
+            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unsupported preferredOperation: " + action.toLower() + " specified for node lost trigger");
+        }
        break;
      case SEARCHRATE:
      case METRIC:
@@ -227,16 +250,15 @@ public class ComputePlanAction extends TriggerActionBase {
          suggester = suggester.hint(e.getKey(), e.getValue());
        }
        suggester = suggester.forceOperation(true);
-        start++;
-        event.getProperties().put(START, start);
+        event.getProperties().put(START, ++start);
        break;
      case SCHEDULED:
-        String preferredOp = (String) event.getProperty(AutoScalingParams.PREFERRED_OP, CollectionParams.CollectionAction.MOVEREPLICA.toLower());
-        CollectionParams.CollectionAction action = CollectionParams.CollectionAction.get(preferredOp);
+        preferredOp = (String) event.getProperty(AutoScalingParams.PREFERRED_OP, CollectionParams.CollectionAction.MOVEREPLICA.toLower());
+        action = CollectionParams.CollectionAction.get(preferredOp);
        suggester = session.getSuggester(action);
        break;
      default:
-        throw new UnsupportedOperationException("No support for events other than nodeAdded, nodeLost, searchRate, metric and indexSize. Received: " + event.getEventType());
+        throw new UnsupportedOperationException("No support for events other than nodeAdded, nodeLost, searchRate, metric, scheduled and indexSize. Received: " + event.getEventType());
    }
    return suggester;
  }
@@ -246,7 +268,7 @@ public class ComputePlanAction extends TriggerActionBase {
    CollectionParams.CollectionAction action = CollectionParams.CollectionAction.get(preferredOp);
    Suggester suggester = session.getSuggester(action)
-        .hint(Suggester.Hint.TARGET_NODE, event.getProperty(TriggerEvent.NODE_NAMES));
+        .hint(Suggester.Hint.TARGET_NODE, event.getProperty(NODE_NAMES));
    switch (action) {
      case ADDREPLICA:
        // add all collection/shard pairs and let policy engine figure out which one
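
Why the START bookkeeping in the DELETENODE branch: the command addresses a single node, so when one `nodeLost` event carries several nodes the action computes one suggestion per invocation and writes a cursor back into the event so the next pass picks up the next node. Below is a standalone toy model of that loop; the `"__start__"` key is an assumption about what the `START` constant holds, and everything else is deliberately simplified:

[source,java]
----
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SerialDeleteNodeSketch {
  public static void main(String[] args) {
    // Stand-in for the TriggerEvent properties: the lost nodes plus a cursor.
    Map<String, Object> eventProps = new HashMap<>();
    eventProps.put("nodeNames", Arrays.asList("node1:8983_solr", "node2:8983_solr"));
    eventProps.put("__start__", 0);

    while (true) {
      int start = (int) eventProps.get("__start__");
      @SuppressWarnings("unchecked")
      List<String> lostNodes = (List<String>) eventProps.get("nodeNames");
      if (start >= lostNodes.size()) {
        break; // the real code returns a NoneSuggester at this point
      }
      System.out.println("suggest DELETENODE for " + lostNodes.get(start));
      eventProps.put("__start__", start + 1); // advance the cursor for the next pass
    }
  }
}
----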

NodeLostTrigger.java

@@ -24,17 +24,23 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.TimeUnit;
+import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.params.CollectionParams;
+import org.apache.solr.core.SolrResourceLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import static org.apache.solr.common.params.AutoScalingParams.PREFERRED_OP;
/**
* Trigger for the {@link TriggerEventType#NODELOST} event
*/
@@ -45,8 +51,11 @@ public class NodeLostTrigger extends TriggerBase {
  private Map<String, Long> nodeNameVsTimeRemoved = new HashMap<>();
+  private String preferredOp;

  public NodeLostTrigger(String name) {
    super(TriggerEventType.NODELOST, name);
+    TriggerUtils.validProperties(validProperties, PREFERRED_OP);
  }
@Override
@@ -72,6 +81,23 @@ public class NodeLostTrigger extends TriggerBase {
    }
  }

+  @Override
+  public void configure(SolrResourceLoader loader, SolrCloudManager cloudManager, Map<String, Object> properties) throws TriggerValidationException {
+    super.configure(loader, cloudManager, properties);
+    preferredOp = (String) properties.getOrDefault(PREFERRED_OP, CollectionParams.CollectionAction.MOVEREPLICA.toLower());
+    preferredOp = preferredOp.toLowerCase(Locale.ROOT);
+    // verify
+    CollectionParams.CollectionAction action = CollectionParams.CollectionAction.get(preferredOp);
+    switch (action) {
+      case MOVEREPLICA:
+      case DELETENODE:
+      case NONE:
+        break;
+      default:
+        throw new TriggerValidationException("Unsupported preferredOperation=" + preferredOp + " specified for node lost trigger");
+    }
+  }

@Override
public void restoreState(AutoScaling.Trigger old) {
assert old.isClosed();
@@ -154,7 +180,7 @@
    if (!nodeNames.isEmpty()) {
      if (processor != null) {
        log.debug("NodeLostTrigger firing registered processor for lost nodes: {}", nodeNames);
-        if (processor.process(new NodeLostEvent(getEventType(), getName(), times, nodeNames))) {
+        if (processor.process(new NodeLostEvent(getEventType(), getName(), times, nodeNames, preferredOp))) {
          // remove from tracking set only if the fire was accepted
          nodeNames.forEach(n -> {
            nodeNameVsTimeRemoved.remove(n);
@@ -191,11 +217,12 @@
  public static class NodeLostEvent extends TriggerEvent {

-    public NodeLostEvent(TriggerEventType eventType, String source, List<Long> times, List<String> nodeNames) {
+    public NodeLostEvent(TriggerEventType eventType, String source, List<Long> times, List<String> nodeNames, String preferredOp) {
      // use the oldest time as the time of the event
      super(eventType, source, times.get(0), null);
      properties.put(NODE_NAMES, nodeNames);
      properties.put(EVENT_TIMES, times);
+      properties.put(PREFERRED_OP, preferredOp);
    }
  }
}
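
One consequence of the `configure` override added above is that a bad `preferredOperation` is rejected when the trigger is set, not later when a node is actually lost. A toy model of that check, with `IllegalArgumentException` standing in for Solr's `TriggerValidationException` and the allowed set copied from the switch:

[source,java]
----
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

public class PreferredOpValidationSketch {
  private static final Set<String> ALLOWED =
      new HashSet<>(Arrays.asList("movereplica", "deletenode", "none"));

  static void validate(String preferredOp) {
    // Matching is case-insensitive, mirroring the trigger's toLowerCase(Locale.ROOT).
    if (!ALLOWED.contains(preferredOp.toLowerCase(Locale.ROOT))) {
      throw new IllegalArgumentException(
          "Unsupported preferredOperation=" + preferredOp + " specified for node lost trigger");
    }
  }

  public static void main(String[] args) {
    validate("DELETENODE"); // accepted
    try {
      validate("addreplica"); // not meaningful for a lost node
    } catch (IllegalArgumentException expected) {
      System.out.println(expected.getMessage());
    }
  }
}
----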

AutoAddReplicasPlanActionTest.java

@@ -154,7 +154,7 @@ public class AutoAddReplicasPlanActionTest extends SolrCloudTestCase {
  @SuppressForbidden(reason = "Needs currentTimeMillis to create unique id")
  private List<SolrRequest> getOperations(JettySolrRunner actionJetty, String lostNodeName) throws Exception {
    try (AutoAddReplicasPlanAction action = new AutoAddReplicasPlanAction()) {
-      TriggerEvent lostNode = new NodeLostTrigger.NodeLostEvent(TriggerEventType.NODELOST, ".auto_add_replicas", Collections.singletonList(System.currentTimeMillis()), Collections.singletonList(lostNodeName));
+      TriggerEvent lostNode = new NodeLostTrigger.NodeLostEvent(TriggerEventType.NODELOST, ".auto_add_replicas", Collections.singletonList(System.currentTimeMillis()), Collections.singletonList(lostNodeName), CollectionParams.CollectionAction.MOVEREPLICA.toLower());
      ActionContext context = new ActionContext(actionJetty.getCoreContainer().getZkController().getSolrCloudManager(), null, new HashMap<>());
      action.process(lostNode, context);
      List<SolrRequest> operations = (List) context.getProperty("operations");

ComputePlanActionTest.java

@@ -540,7 +540,6 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
    NamedList<Object> response = solrClient.request(req);
    assertEquals(response.get("result").toString(), "success");

-    // the default policy limits 1 replica per node, we need more right now
    String setClusterPolicyCommand = "{" +
        " 'set-cluster-policy': [" +
        " {'cores':'<" + (1 + numCollections * numShards) + "', 'node':'#ANY'}," +
@@ -612,4 +611,66 @@
    assertEquals(numShards, affectedShards.size());
    assertEquals(numCollections * numShards, affectedCollShards.size());
  }
+  @Test
+  public void testNodeLostTriggerWithDeleteNodePreferredOp() throws Exception {
+    String collectionNamePrefix = "testNodeLostTriggerWithDeleteNodePreferredOp";
+    int numCollections = 1 + random().nextInt(3), numShards = 1 + random().nextInt(3);
+
+    CloudSolrClient solrClient = cluster.getSolrClient();
+    String setTriggerCommand = "{" +
+        "'set-trigger' : {" +
+        "'name' : 'node_lost_trigger'," +
+        "'event' : 'nodeLost'," +
+        "'waitFor' : '1s'," +
+        "'enabled' : true," +
+        "'" + AutoScalingParams.PREFERRED_OP + "':'deletenode'," +
+        "'actions' : [{'name':'compute_plan', 'class' : 'solr.ComputePlanAction'}," +
+        "{'name':'execute_plan','class':'solr.ExecutePlanAction'}," +
+        "{'name':'test','class':'" + AssertingTriggerAction.class.getName() + "'}]" +
+        "}}";
+    SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand);
+    NamedList<Object> response = solrClient.request(req);
+    assertEquals(response.get("result").toString(), "success");
+
+    String setClusterPolicyCommand = "{" +
+        " 'set-cluster-policy': [" +
+        " {'cores':'<" + (1 + numCollections * numShards) + "', 'node':'#ANY'}," +
+        " {'replica':'<2', 'shard': '#EACH', 'node': '#ANY'}," +
+        " {'nodeRole':'overseer', 'replica':0}" +
+        " ]" +
+        "}";
+    req = createAutoScalingRequest(SolrRequest.METHOD.POST, setClusterPolicyCommand);
+    response = solrClient.request(req);
+    assertEquals(response.get("result").toString(), "success");
+
+    JettySolrRunner newNode = cluster.startJettySolrRunner();
+    // cache the node name because it won't be available once the node is shutdown
+    String newNodeName = newNode.getNodeName();
+    CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionNamePrefix + "_0",
+        "conf", numShards, 2);
+    create.process(solrClient);
+
+    waitForState("Timed out waiting for replicas of new collection to be active",
+        collectionNamePrefix + "_0", (liveNodes, collectionState) ->
+            collectionState.getReplicas().stream().allMatch(replica -> replica.isActive(liveNodes)));
+
+    cluster.stopJettySolrRunner(newNode);
+    assertTrue(triggerFiredLatch.await(30, TimeUnit.SECONDS));
+    assertTrue(fired.get());
+
+    Map actionContext = actionContextPropsRef.get();
+    List operations = (List) actionContext.get("operations");
+    assertNotNull(operations);
+    assertEquals(1, operations.size());
+    for (Object operation : operations) {
+      assertTrue(operation instanceof CollectionAdminRequest.DeleteNode);
+      CollectionAdminRequest.DeleteNode deleteNode = (CollectionAdminRequest.DeleteNode) operation;
+      SolrParams deleteNodeParams = deleteNode.getParams();
+      assertEquals(newNodeName, deleteNodeParams.get("node"));
+    }
+
+    waitForState("Timed out waiting for all shards to have only 1 replica",
+        collectionNamePrefix + "_0", clusterShape(numShards, 1));
+  }
}

ExecutePlanActionTest.java

@@ -148,7 +148,7 @@ public class ExecutePlanActionTest extends SolrCloudTestCase {
    List<CollectionAdminRequest.AsyncCollectionAdminRequest> operations = Lists.asList(moveReplica, new CollectionAdminRequest.AsyncCollectionAdminRequest[]{mockRequest});
    NodeLostTrigger.NodeLostEvent nodeLostEvent = new NodeLostTrigger.NodeLostEvent(TriggerEventType.NODELOST,
        "mock_trigger_name", Collections.singletonList(TimeSource.CURRENT_TIME.getTimeNs()),
-        Collections.singletonList(sourceNodeName));
+        Collections.singletonList(sourceNodeName), CollectionParams.CollectionAction.MOVEREPLICA.toLower());
    ActionContext actionContext = new ActionContext(survivor.getCoreContainer().getZkController().getSolrCloudManager(), null,
        new HashMap<>(Collections.singletonMap("operations", operations)));
    action.process(nodeLostEvent, actionContext);

TestSimExecutePlanAction.java

@@ -137,7 +137,7 @@ public class TestSimExecutePlanAction extends SimSolrCloudTestCase {
    List<CollectionAdminRequest.AsyncCollectionAdminRequest> operations = Lists.asList(moveReplica, new CollectionAdminRequest.AsyncCollectionAdminRequest[]{mockRequest});
    NodeLostTrigger.NodeLostEvent nodeLostEvent = new NodeLostTrigger.NodeLostEvent(TriggerEventType.NODELOST,
        "mock_trigger_name", Collections.singletonList(TimeSource.CURRENT_TIME.getTimeNs()),
-        Collections.singletonList(sourceNodeName));
+        Collections.singletonList(sourceNodeName), CollectionParams.CollectionAction.MOVEREPLICA.toLower());
    ActionContext actionContext = new ActionContext(cluster, null,
        new HashMap<>(Collections.singletonMap("operations", operations)));
    action.process(nodeLostEvent, actionContext);

solrcloud-autoscaling-triggers.adoc

@@ -86,11 +86,47 @@ Apart from the parameters described at <<#trigger-configuration, Trigger Configuration>>, this trigger supports the following configuration:
"name": "node_added_trigger",
"event": "nodeAdded",
"waitFor": "5s",
"preferredOperation": "addreplica"
"preferredOperation": "ADDREPLICA"
}
}
----

+== Node Lost Trigger
+
+The `NodeLostTrigger` generates `nodeLost` events when a node leaves the cluster. It can be used either to move replicas
+that were hosted by the lost node to other nodes or to delete them from the cluster.
+
+Apart from the parameters described at <<#trigger-configuration, Trigger Configuration>>, this trigger supports the following configuration:
+
+`preferredOperation`:: (string, optional, defaults to `MOVEREPLICA`) The operation to be performed in response to an event generated by this trigger. By default, replicas will be moved from the lost nodes to the other nodes in the cluster. The only other supported value is `DELETENODE`, which deletes all information about replicas that were hosted by the lost node.
+
+.Example: Node Lost Trigger to move replicas to new node
+[source,json]
+----
+{
+  "set-trigger": {
+    "name": "node_lost_trigger",
+    "event": "nodeLost",
+    "waitFor": "120s"
+  }
+}
+----
+
+.Example: Node Lost Trigger to delete replicas
+[source,json]
+----
+{
+  "set-trigger": {
+    "name": "node_lost_trigger",
+    "event": "nodeLost",
+    "waitFor": "120s",
+    "preferredOperation": "DELETENODE"
+  }
+}
+----
+
+TIP: It is recommended that the `waitFor` value for a node lost trigger be larger than a minute so that long full garbage collection pauses do not cause this trigger to generate events and needlessly move or delete replicas in the cluster.

== Auto Add Replicas Trigger

When a collection has the parameter `autoAddReplicas` set to true then a trigger configuration named `.auto_add_replicas` is automatically created to watch for nodes going away. This trigger produces `nodeLost` events,