mirror of https://github.com/apache/lucene.git
SOLR-12716: NodeLostTrigger should support deleting replicas from lost nodes by setting preferredOperation=deletenode
This commit adds support for preferredOperation configuration for NodeLostTrigger. The ComputePlanAction now creates DeleteNodeSuggester for each lost node serially when preferredOperation=deletenode. A new section for the node lost trigger with examples is added to the ref guide.
This commit is contained in:
parent
6be01e2ade
commit
b6ee0ed5d5
|
@ -201,6 +201,9 @@ New Features
|
|||
* SOLR-11861: When creating a configSet via the API, the "baseConfigSet" parameter now defaults to "_default".
|
||||
(Amrit Sarkar, David Smiley)
|
||||
|
||||
* SOLR-12716: NodeLostTrigger should support deleting replicas from lost nodes by setting preferredOperation=deletenode.
|
||||
(shalin)
|
||||
|
||||
Bug Fixes
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -25,7 +25,6 @@ import java.util.Locale;
|
|||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.cloud.ClusterState;
|
||||
import org.apache.solr.common.cloud.DocCollection;
|
||||
import org.apache.solr.common.cloud.Replica;
|
||||
|
@ -55,9 +54,6 @@ public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
|||
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
|
||||
ocmh.checkRequired(message, "node");
|
||||
String node = message.getStr("node");
|
||||
if (!state.liveNodesContain(node)) {
|
||||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + node + " is not live");
|
||||
}
|
||||
List<ZkNodeProps> sourceReplicas = ReplaceNodeCmd.getReplicasOfNode(node, state);
|
||||
List<String> singleReplicas = verifyReplicaAvailability(sourceReplicas, state);
|
||||
if (!singleReplicas.isEmpty()) {
|
||||
|
|
|
@ -48,6 +48,8 @@ import org.apache.solr.core.SolrResourceLoader;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import static org.apache.solr.cloud.autoscaling.TriggerEvent.NODE_NAMES;
|
||||
|
||||
/**
|
||||
* This class is responsible for using the configured policy and preferences
|
||||
* with the hints provided by the trigger event to compute the required cluster operations.
|
||||
|
@ -206,8 +208,29 @@ public class ComputePlanAction extends TriggerActionBase {
|
|||
suggester = getNodeAddedSuggester(cloudManager, session, event);
|
||||
break;
|
||||
case NODELOST:
|
||||
suggester = session.getSuggester(CollectionParams.CollectionAction.MOVEREPLICA)
|
||||
.hint(Suggester.Hint.SRC_NODE, event.getProperty(TriggerEvent.NODE_NAMES));
|
||||
String preferredOp = (String) event.getProperty(AutoScalingParams.PREFERRED_OP, CollectionParams.CollectionAction.MOVEREPLICA.toLower());
|
||||
CollectionParams.CollectionAction action = CollectionParams.CollectionAction.get(preferredOp);
|
||||
switch (action) {
|
||||
case MOVEREPLICA:
|
||||
suggester = session.getSuggester(action)
|
||||
.hint(Suggester.Hint.SRC_NODE, event.getProperty(NODE_NAMES));
|
||||
break;
|
||||
case DELETENODE:
|
||||
int start = (Integer)event.getProperty(START, 0);
|
||||
List<String> srcNodes = (List<String>) event.getProperty(NODE_NAMES);
|
||||
if (srcNodes.isEmpty() || start >= srcNodes.size()) {
|
||||
return NoneSuggester.get(session);
|
||||
}
|
||||
String sourceNode = srcNodes.get(start);
|
||||
suggester = session.getSuggester(action)
|
||||
.hint(Suggester.Hint.SRC_NODE, Collections.singletonList(sourceNode));
|
||||
event.getProperties().put(START, ++start);
|
||||
break;
|
||||
case NONE:
|
||||
return NoneSuggester.get(session);
|
||||
default:
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Unsupported preferredOperation: " + action.toLower() + " specified for node lost trigger");
|
||||
}
|
||||
break;
|
||||
case SEARCHRATE:
|
||||
case METRIC:
|
||||
|
@ -227,16 +250,15 @@ public class ComputePlanAction extends TriggerActionBase {
|
|||
suggester = suggester.hint(e.getKey(), e.getValue());
|
||||
}
|
||||
suggester = suggester.forceOperation(true);
|
||||
start++;
|
||||
event.getProperties().put(START, start);
|
||||
event.getProperties().put(START, ++start);
|
||||
break;
|
||||
case SCHEDULED:
|
||||
String preferredOp = (String) event.getProperty(AutoScalingParams.PREFERRED_OP, CollectionParams.CollectionAction.MOVEREPLICA.toLower());
|
||||
CollectionParams.CollectionAction action = CollectionParams.CollectionAction.get(preferredOp);
|
||||
preferredOp = (String) event.getProperty(AutoScalingParams.PREFERRED_OP, CollectionParams.CollectionAction.MOVEREPLICA.toLower());
|
||||
action = CollectionParams.CollectionAction.get(preferredOp);
|
||||
suggester = session.getSuggester(action);
|
||||
break;
|
||||
default:
|
||||
throw new UnsupportedOperationException("No support for events other than nodeAdded, nodeLost, searchRate, metric and indexSize. Received: " + event.getEventType());
|
||||
throw new UnsupportedOperationException("No support for events other than nodeAdded, nodeLost, searchRate, metric, scheduled and indexSize. Received: " + event.getEventType());
|
||||
}
|
||||
return suggester;
|
||||
}
|
||||
|
@ -246,7 +268,7 @@ public class ComputePlanAction extends TriggerActionBase {
|
|||
CollectionParams.CollectionAction action = CollectionParams.CollectionAction.get(preferredOp);
|
||||
|
||||
Suggester suggester = session.getSuggester(action)
|
||||
.hint(Suggester.Hint.TARGET_NODE, event.getProperty(TriggerEvent.NODE_NAMES));
|
||||
.hint(Suggester.Hint.TARGET_NODE, event.getProperty(NODE_NAMES));
|
||||
switch (action) {
|
||||
case ADDREPLICA:
|
||||
// add all collection/shard pairs and let policy engine figure out which one
|
||||
|
|
|
@ -24,17 +24,23 @@ import java.util.HashMap;
|
|||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
|
||||
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.cloud.ZkStateReader;
|
||||
import org.apache.solr.common.params.CollectionParams;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import static org.apache.solr.common.params.AutoScalingParams.PREFERRED_OP;
|
||||
|
||||
/**
|
||||
* Trigger for the {@link TriggerEventType#NODELOST} event
|
||||
*/
|
||||
|
@ -45,8 +51,11 @@ public class NodeLostTrigger extends TriggerBase {
|
|||
|
||||
private Map<String, Long> nodeNameVsTimeRemoved = new HashMap<>();
|
||||
|
||||
private String preferredOp;
|
||||
|
||||
public NodeLostTrigger(String name) {
|
||||
super(TriggerEventType.NODELOST, name);
|
||||
TriggerUtils.validProperties(validProperties, PREFERRED_OP);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -72,6 +81,23 @@ public class NodeLostTrigger extends TriggerBase {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void configure(SolrResourceLoader loader, SolrCloudManager cloudManager, Map<String, Object> properties) throws TriggerValidationException {
|
||||
super.configure(loader, cloudManager, properties);
|
||||
preferredOp = (String) properties.getOrDefault(PREFERRED_OP, CollectionParams.CollectionAction.MOVEREPLICA.toLower());
|
||||
preferredOp = preferredOp.toLowerCase(Locale.ROOT);
|
||||
// verify
|
||||
CollectionParams.CollectionAction action = CollectionParams.CollectionAction.get(preferredOp);
|
||||
switch (action) {
|
||||
case MOVEREPLICA:
|
||||
case DELETENODE:
|
||||
case NONE:
|
||||
break;
|
||||
default:
|
||||
throw new TriggerValidationException("Unsupported preferredOperation=" + preferredOp + " specified for node lost trigger");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void restoreState(AutoScaling.Trigger old) {
|
||||
assert old.isClosed();
|
||||
|
@ -154,7 +180,7 @@ public class NodeLostTrigger extends TriggerBase {
|
|||
if (!nodeNames.isEmpty()) {
|
||||
if (processor != null) {
|
||||
log.debug("NodeLostTrigger firing registered processor for lost nodes: {}", nodeNames);
|
||||
if (processor.process(new NodeLostEvent(getEventType(), getName(), times, nodeNames))) {
|
||||
if (processor.process(new NodeLostEvent(getEventType(), getName(), times, nodeNames, preferredOp))) {
|
||||
// remove from tracking set only if the fire was accepted
|
||||
nodeNames.forEach(n -> {
|
||||
nodeNameVsTimeRemoved.remove(n);
|
||||
|
@ -191,11 +217,12 @@ public class NodeLostTrigger extends TriggerBase {
|
|||
|
||||
public static class NodeLostEvent extends TriggerEvent {
|
||||
|
||||
public NodeLostEvent(TriggerEventType eventType, String source, List<Long> times, List<String> nodeNames) {
|
||||
public NodeLostEvent(TriggerEventType eventType, String source, List<Long> times, List<String> nodeNames, String preferredOp) {
|
||||
// use the oldest time as the time of the event
|
||||
super(eventType, source, times.get(0), null);
|
||||
properties.put(NODE_NAMES, nodeNames);
|
||||
properties.put(EVENT_TIMES, times);
|
||||
properties.put(PREFERRED_OP, preferredOp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -154,7 +154,7 @@ public class AutoAddReplicasPlanActionTest extends SolrCloudTestCase{
|
|||
@SuppressForbidden(reason = "Needs currentTimeMillis to create unique id")
|
||||
private List<SolrRequest> getOperations(JettySolrRunner actionJetty, String lostNodeName) throws Exception {
|
||||
try (AutoAddReplicasPlanAction action = new AutoAddReplicasPlanAction()) {
|
||||
TriggerEvent lostNode = new NodeLostTrigger.NodeLostEvent(TriggerEventType.NODELOST, ".auto_add_replicas", Collections.singletonList(System.currentTimeMillis()), Collections.singletonList(lostNodeName));
|
||||
TriggerEvent lostNode = new NodeLostTrigger.NodeLostEvent(TriggerEventType.NODELOST, ".auto_add_replicas", Collections.singletonList(System.currentTimeMillis()), Collections.singletonList(lostNodeName), CollectionParams.CollectionAction.MOVEREPLICA.toLower());
|
||||
ActionContext context = new ActionContext(actionJetty.getCoreContainer().getZkController().getSolrCloudManager(), null, new HashMap<>());
|
||||
action.process(lostNode, context);
|
||||
List<SolrRequest> operations = (List) context.getProperty("operations");
|
||||
|
|
|
@ -540,7 +540,6 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
|
|||
NamedList<Object> response = solrClient.request(req);
|
||||
assertEquals(response.get("result").toString(), "success");
|
||||
|
||||
// the default policy limits 1 replica per node, we need more right now
|
||||
String setClusterPolicyCommand = "{" +
|
||||
" 'set-cluster-policy': [" +
|
||||
" {'cores':'<" + (1 + numCollections * numShards) + "', 'node':'#ANY'}," +
|
||||
|
@ -612,4 +611,66 @@ public class ComputePlanActionTest extends SolrCloudTestCase {
|
|||
assertEquals(numShards, affectedShards.size());
|
||||
assertEquals(numCollections * numShards, affectedCollShards.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNodeLostTriggerWithDeleteNodePreferredOp() throws Exception {
|
||||
String collectionNamePrefix = "testNodeLostTriggerWithDeleteNodePreferredOp";
|
||||
int numCollections = 1 + random().nextInt(3), numShards = 1 + random().nextInt(3);
|
||||
|
||||
CloudSolrClient solrClient = cluster.getSolrClient();
|
||||
String setTriggerCommand = "{" +
|
||||
"'set-trigger' : {" +
|
||||
"'name' : 'node_lost_trigger'," +
|
||||
"'event' : 'nodeLost'," +
|
||||
"'waitFor' : '1s'," +
|
||||
"'enabled' : true," +
|
||||
"'" + AutoScalingParams.PREFERRED_OP + "':'deletenode'," +
|
||||
"'actions' : [{'name':'compute_plan', 'class' : 'solr.ComputePlanAction'}," +
|
||||
"{'name':'execute_plan','class':'solr.ExecutePlanAction'}" +
|
||||
"{'name':'test','class':'" + AssertingTriggerAction.class.getName() + "'}]" +
|
||||
"}}";
|
||||
SolrRequest req = createAutoScalingRequest(SolrRequest.METHOD.POST, setTriggerCommand);
|
||||
NamedList<Object> response = solrClient.request(req);
|
||||
assertEquals(response.get("result").toString(), "success");
|
||||
|
||||
String setClusterPolicyCommand = "{" +
|
||||
" 'set-cluster-policy': [" +
|
||||
" {'cores':'<" + (1 + numCollections * numShards) + "', 'node':'#ANY'}," +
|
||||
" {'replica':'<2', 'shard': '#EACH', 'node': '#ANY'}," +
|
||||
" {'nodeRole':'overseer', 'replica':0}" +
|
||||
" ]" +
|
||||
"}";
|
||||
req = createAutoScalingRequest(SolrRequest.METHOD.POST, setClusterPolicyCommand);
|
||||
response = solrClient.request(req);
|
||||
assertEquals(response.get("result").toString(), "success");
|
||||
|
||||
JettySolrRunner newNode = cluster.startJettySolrRunner();
|
||||
// cache the node name because it won't be available once the node is shutdown
|
||||
String newNodeName = newNode.getNodeName();
|
||||
|
||||
CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collectionNamePrefix + "_0",
|
||||
"conf", numShards, 2);
|
||||
create.process(solrClient);
|
||||
|
||||
waitForState("Timed out waiting for replicas of new collection to be active",
|
||||
collectionNamePrefix + "_0", (liveNodes, collectionState) ->
|
||||
collectionState.getReplicas().stream().allMatch(replica -> replica.isActive(liveNodes)));
|
||||
|
||||
cluster.stopJettySolrRunner(newNode);
|
||||
assertTrue(triggerFiredLatch.await(30, TimeUnit.SECONDS));
|
||||
assertTrue(fired.get());
|
||||
Map actionContext = actionContextPropsRef.get();
|
||||
List operations = (List) actionContext.get("operations");
|
||||
assertNotNull(operations);
|
||||
assertEquals(1, operations.size());
|
||||
for (Object operation : operations) {
|
||||
assertTrue(operation instanceof CollectionAdminRequest.DeleteNode);
|
||||
CollectionAdminRequest.DeleteNode deleteNode = (CollectionAdminRequest.DeleteNode) operation;
|
||||
SolrParams deleteNodeParams = deleteNode.getParams();
|
||||
assertEquals(newNodeName, deleteNodeParams.get("node"));
|
||||
}
|
||||
|
||||
waitForState("Timed out waiting for all shards to have only 1 replica",
|
||||
collectionNamePrefix + "_0", clusterShape(numShards, 1));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -148,7 +148,7 @@ public class ExecutePlanActionTest extends SolrCloudTestCase {
|
|||
List<CollectionAdminRequest.AsyncCollectionAdminRequest> operations = Lists.asList(moveReplica, new CollectionAdminRequest.AsyncCollectionAdminRequest[]{mockRequest});
|
||||
NodeLostTrigger.NodeLostEvent nodeLostEvent = new NodeLostTrigger.NodeLostEvent(TriggerEventType.NODELOST,
|
||||
"mock_trigger_name", Collections.singletonList(TimeSource.CURRENT_TIME.getTimeNs()),
|
||||
Collections.singletonList(sourceNodeName));
|
||||
Collections.singletonList(sourceNodeName), CollectionParams.CollectionAction.MOVEREPLICA.toLower());
|
||||
ActionContext actionContext = new ActionContext(survivor.getCoreContainer().getZkController().getSolrCloudManager(), null,
|
||||
new HashMap<>(Collections.singletonMap("operations", operations)));
|
||||
action.process(nodeLostEvent, actionContext);
|
||||
|
|
|
@ -137,7 +137,7 @@ public class TestSimExecutePlanAction extends SimSolrCloudTestCase {
|
|||
List<CollectionAdminRequest.AsyncCollectionAdminRequest> operations = Lists.asList(moveReplica, new CollectionAdminRequest.AsyncCollectionAdminRequest[]{mockRequest});
|
||||
NodeLostTrigger.NodeLostEvent nodeLostEvent = new NodeLostTrigger.NodeLostEvent(TriggerEventType.NODELOST,
|
||||
"mock_trigger_name", Collections.singletonList(TimeSource.CURRENT_TIME.getTimeNs()),
|
||||
Collections.singletonList(sourceNodeName));
|
||||
Collections.singletonList(sourceNodeName), CollectionParams.CollectionAction.MOVEREPLICA.toLower());
|
||||
ActionContext actionContext = new ActionContext(cluster, null,
|
||||
new HashMap<>(Collections.singletonMap("operations", operations)));
|
||||
action.process(nodeLostEvent, actionContext);
|
||||
|
|
|
@ -86,11 +86,47 @@ Apart from the parameters described at <<#trigger-configuration, Trigger Configu
|
|||
"name": "node_added_trigger",
|
||||
"event": "nodeAdded",
|
||||
"waitFor": "5s",
|
||||
"preferredOperation": "addreplica"
|
||||
"preferredOperation": "ADDREPLICA"
|
||||
}
|
||||
}
|
||||
----
|
||||
|
||||
== Node Lost Trigger
|
||||
|
||||
The `NodeLostTrigger` generates `nodeLost` events when a node leaves the cluster. It can be used to either move replicas
|
||||
that were hosted by the lost node to other nodes or to delete them from the cluster.
|
||||
|
||||
Apart from the parameters described at <<#trigger-configuration, Trigger Configuration>>, this trigger supports the following configuration:
|
||||
|
||||
`preferredOperation`:: (string, optional, defaults to `MOVEREPLICA`) The operation to be performed in response to an event generated by this trigger. By default, replicas will be moved from the lost nodes to the other nodes in the cluster. The only other supported value is `DELETENODE` which deletes all information about replicas that were hosted by the lost node.
|
||||
|
||||
.Example: Node Lost Trigger to move replicas to new node
|
||||
[source,json]
|
||||
----
|
||||
{
|
||||
"set-trigger": {
|
||||
"name": "node_lost_trigger",
|
||||
"event": "nodeLost",
|
||||
"waitFor": "120s"
|
||||
}
|
||||
}
|
||||
----
|
||||
|
||||
.Example: Node Lost Trigger to delete replicas
|
||||
[source,json]
|
||||
----
|
||||
{
|
||||
"set-trigger": {
|
||||
"name": "node_lost_trigger",
|
||||
"event": "nodeLost",
|
||||
"waitFor": "120s",
|
||||
"preferredOperation": "DELETENODE"
|
||||
}
|
||||
}
|
||||
----
|
||||
|
||||
TIP: It is recommended that the value of `waitFor` configuration for node lost trigger be larger than a minute so that large full garbage collection pauses do not cause this trigger to generate events and needlessly move or delete replicas in the cluster.
|
||||
|
||||
== Auto Add Replicas Trigger
|
||||
|
||||
When a collection has the parameter `autoAddReplicas` set to true then a trigger configuration named `.auto_add_replicas` is automatically created to watch for nodes going away. This trigger produces `nodeLost` events,
|
||||
|
|
Loading…
Reference in New Issue