mirror of https://github.com/apache/lucene.git
SOLR-9320: Improve logging
This commit is contained in:
parent
8f9c04e4a1
commit
70d27aec83
|
@ -20,6 +20,7 @@ package org.apache.solr.cloud;
|
|||
|
||||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
|
@ -27,11 +28,13 @@ import org.apache.solr.common.SolrException;
|
|||
import org.apache.solr.common.cloud.ClusterState;
|
||||
import org.apache.solr.common.cloud.ZkNodeProps;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.Utils;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
|
||||
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
|
||||
|
||||
public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
||||
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
||||
|
||||
|
@ -42,43 +45,44 @@ public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
|
||||
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
|
||||
ocmh.checkRequired(message, "node");
|
||||
String node = message.getStr("node");
|
||||
if (!state.liveNodesContain(node)) {
|
||||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + node + " is not live");
|
||||
}
|
||||
List<ZkNodeProps> sourceReplicas = ReplaceNodeCmd.getReplicasOfNode(node, state);
|
||||
cleanupReplicas(results, state, sourceReplicas, ocmh);
|
||||
return null;
|
||||
cleanupReplicas(results, state, sourceReplicas, ocmh, node);
|
||||
}
|
||||
|
||||
static void cleanupReplicas(NamedList results,
|
||||
ClusterState clusterState,
|
||||
List<ZkNodeProps> sourceReplicas,
|
||||
OverseerCollectionMessageHandler ocmh) throws InterruptedException {
|
||||
OverseerCollectionMessageHandler ocmh, String node) throws InterruptedException {
|
||||
CountDownLatch cleanupLatch = new CountDownLatch(sourceReplicas.size());
|
||||
for (ZkNodeProps sourceReplica : sourceReplicas) {
|
||||
log.info("deleting replica from from node {} ", Utils.toJSONString(sourceReplica));
|
||||
log.info("Deleting replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), node);
|
||||
NamedList deleteResult = new NamedList();
|
||||
try {
|
||||
ocmh.deleteReplica(clusterState, sourceReplica.plus("parallel", "true"), deleteResult, () -> {
|
||||
cleanupLatch.countDown();
|
||||
if (deleteResult.get("failure") != null) {
|
||||
synchronized (results) {
|
||||
results.add("failure", "could not delete because " + deleteResult.get("failure") + " " + Utils.toJSONString(sourceReplica));
|
||||
results.add("failure", String.format(Locale.ROOT, "Failed to delete replica for collection=%s shard=%s" +
|
||||
" on node=%s", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), node));
|
||||
}
|
||||
}
|
||||
});
|
||||
} catch (KeeperException e) {
|
||||
log.info("Error deleting ", e);
|
||||
log.warn("Error deleting ", e);
|
||||
cleanupLatch.countDown();
|
||||
} catch (Exception e) {
|
||||
log.warn("Error deleting ", e);
|
||||
cleanupLatch.countDown();
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
log.info("Waiting for deletes to complete");
|
||||
log.debug("Waiting for delete node action to complete");
|
||||
cleanupLatch.await(5, TimeUnit.MINUTES);
|
||||
}
|
||||
|
||||
|
|
|
@ -316,7 +316,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
|
|||
default: {
|
||||
Cmd command = commandMap.get(action);
|
||||
if (command != null) {
|
||||
command.call(zkStateReader.getClusterState(),message, results);
|
||||
command.call(zkStateReader.getClusterState(), message, results);
|
||||
} else {
|
||||
throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown operation:"
|
||||
+ operation);
|
||||
|
@ -617,7 +617,6 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
|
|||
@SuppressWarnings("unchecked")
|
||||
void deleteReplica(ClusterState clusterState, ZkNodeProps message, NamedList results, Runnable onComplete)
|
||||
throws KeeperException, InterruptedException {
|
||||
log.info("deleteReplica() : {}", Utils.toJSONString(message));
|
||||
checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP);
|
||||
String collectionName = message.getStr(COLLECTION_PROP);
|
||||
String shard = message.getStr(SHARD_ID_PROP);
|
||||
|
@ -664,7 +663,6 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
|
|||
params.set(CoreAdminParams.DELETE_DATA_DIR, message.getBool(CoreAdminParams.DELETE_DATA_DIR, true));
|
||||
|
||||
sendShardRequest(replica.getNodeName(), params, shardHandler, asyncId, requestMap.get());
|
||||
AtomicReference<Exception> exp = new AtomicReference<>();
|
||||
|
||||
Callable<Boolean> callable = () -> {
|
||||
try {
|
||||
|
@ -2809,7 +2807,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
|
|||
|
||||
interface Cmd {
|
||||
|
||||
Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception;
|
||||
void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception;
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ package org.apache.solr.cloud;
|
|||
import java.lang.invoke.MethodHandles;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
@ -35,7 +36,6 @@ import org.apache.solr.common.cloud.ZkNodeProps;
|
|||
import org.apache.solr.common.cloud.ZkStateReader;
|
||||
import org.apache.solr.common.params.CoreAdminParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.Utils;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -53,7 +53,7 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
|
||||
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
|
||||
ZkStateReader zkStateReader = ocmh.zkStateReader;
|
||||
ocmh.checkRequired(message, "source", "target");
|
||||
String source = message.getStr("source");
|
||||
|
@ -76,23 +76,25 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
|||
|
||||
for (ZkNodeProps sourceReplica : sourceReplicas) {
|
||||
NamedList nl = new NamedList();
|
||||
log.info("going to create replica {}", Utils.toJSONString(sourceReplica));
|
||||
log.info("Going to create replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
|
||||
ZkNodeProps msg = sourceReplica.plus("parallel", String.valueOf(parallel)).plus(CoreAdminParams.NODE, target);
|
||||
final ZkNodeProps addedReplica = ocmh.addReplica(clusterState,
|
||||
msg, nl, () -> {
|
||||
countDownLatch.countDown();
|
||||
if (nl.get("failure") != null) {
|
||||
log.warn("failed to create : " + Utils.toJSONString(msg));
|
||||
String errorString = String.format(Locale.ROOT, "Failed to create replica for collection=%s shard=%s" +
|
||||
" on node=%s", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
|
||||
log.warn(errorString);
|
||||
// one replica creation failed. Make the best attempt to
|
||||
// delete all the replicas created so far in the target
|
||||
// and exit
|
||||
synchronized (results) {
|
||||
results.add("failure", "Could not create copy of replica " + Utils.toJSONString(sourceReplica));
|
||||
results.add("failure", errorString);
|
||||
anyOneFailed.set(true);
|
||||
}
|
||||
} else {
|
||||
log.info("successfully created : " + Utils.toJSONString(msg));
|
||||
|
||||
log.debug("Successfully created replica for collection={} shard={} on node={}",
|
||||
sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -101,12 +103,12 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
|||
}
|
||||
}
|
||||
|
||||
log.info("Waiting for creates to complete ");
|
||||
log.debug("Waiting for replace node action to complete");
|
||||
countDownLatch.await(5, TimeUnit.MINUTES);
|
||||
log.info("Waiting over for creates to complete ");
|
||||
log.debug("Finished waiting for replace node action to complete");
|
||||
|
||||
if (anyOneFailed.get()) {
|
||||
log.info("failed to create some cores delete all " + Utils.toJSONString(createdReplicas));
|
||||
log.info("Failed to create some replicas. Cleaning up all replicas on target node");
|
||||
CountDownLatch cleanupLatch = new CountDownLatch(createdReplicas.size());
|
||||
for (ZkNodeProps createdReplica : createdReplicas) {
|
||||
NamedList deleteResult = new NamedList();
|
||||
|
@ -115,29 +117,27 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
|||
cleanupLatch.countDown();
|
||||
if (deleteResult.get("failure") != null) {
|
||||
synchronized (results) {
|
||||
results.add("failure", "could not cleanup, because : " + deleteResult.get("failure") + " " + Utils.toJSONString(createdReplica));
|
||||
results.add("failure", "Could not cleanup, because of : " + deleteResult.get("failure"));
|
||||
}
|
||||
}
|
||||
});
|
||||
} catch (KeeperException e) {
|
||||
cleanupLatch.countDown();
|
||||
log.info("Error deleting ", e);
|
||||
log.warn("Error deleting replica ", e);
|
||||
} catch (Exception e) {
|
||||
log.error("Unknown Error deleteing", e);
|
||||
log.warn("Error deleting replica ", e);
|
||||
cleanupLatch.countDown();
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
cleanupLatch.await(5, TimeUnit.MINUTES);
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
// we have reached this far means all replicas could be recreated
|
||||
//now cleanup the replicas in the source node
|
||||
DeleteNodeCmd.cleanupReplicas(results, state, sourceReplicas, ocmh);
|
||||
results.add("success", "REPLACENODE completed successfully from : " + source + " to : " + target);
|
||||
return null;
|
||||
DeleteNodeCmd.cleanupReplicas(results, state, sourceReplicas, ocmh, source);
|
||||
results.add("success", "REPLACENODE action completed successfully from : " + source + " to : " + target);
|
||||
}
|
||||
|
||||
static List<ZkNodeProps> getReplicasOfNode(String source, ClusterState state) {
|
||||
|
@ -152,7 +152,6 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
|||
ZkStateReader.CORE_NAME_PROP, replica.getCoreName(),
|
||||
ZkStateReader.REPLICA_PROP, replica.getName(),
|
||||
CoreAdminParams.NODE, source);
|
||||
log.info("src_core : {}", Utils.toJSONString(props));
|
||||
sourceReplicas.add(props
|
||||
);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue