SOLR-9320: Improve logging

This commit is contained in:
Varun Thacker 2016-08-13 16:20:51 +05:30
parent 99785c0e09
commit 5fc35a65c3
3 changed files with 32 additions and 31 deletions

View File

@ -20,6 +20,7 @@ package org.apache.solr.cloud;
import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodHandles;
import java.util.List; import java.util.List;
import java.util.Locale;
import java.util.concurrent.CountDownLatch; import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@ -27,11 +28,13 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.Utils;
import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd { public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@ -42,43 +45,44 @@ public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd {
} }
@Override @Override
public Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception { public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
ocmh.checkRequired(message, "node"); ocmh.checkRequired(message, "node");
String node = message.getStr("node"); String node = message.getStr("node");
if (!state.liveNodesContain(node)) { if (!state.liveNodesContain(node)) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + node + " is not live"); throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + node + " is not live");
} }
List<ZkNodeProps> sourceReplicas = ReplaceNodeCmd.getReplicasOfNode(node, state); List<ZkNodeProps> sourceReplicas = ReplaceNodeCmd.getReplicasOfNode(node, state);
cleanupReplicas(results, state, sourceReplicas, ocmh); cleanupReplicas(results, state, sourceReplicas, ocmh, node);
return null;
} }
static void cleanupReplicas(NamedList results, static void cleanupReplicas(NamedList results,
ClusterState clusterState, ClusterState clusterState,
List<ZkNodeProps> sourceReplicas, List<ZkNodeProps> sourceReplicas,
OverseerCollectionMessageHandler ocmh) throws InterruptedException { OverseerCollectionMessageHandler ocmh, String node) throws InterruptedException {
CountDownLatch cleanupLatch = new CountDownLatch(sourceReplicas.size()); CountDownLatch cleanupLatch = new CountDownLatch(sourceReplicas.size());
for (ZkNodeProps sourceReplica : sourceReplicas) { for (ZkNodeProps sourceReplica : sourceReplicas) {
log.info("deleting replica from from node {} ", Utils.toJSONString(sourceReplica)); log.info("Deleting replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), node);
NamedList deleteResult = new NamedList(); NamedList deleteResult = new NamedList();
try { try {
ocmh.deleteReplica(clusterState, sourceReplica.plus("parallel", "true"), deleteResult, () -> { ocmh.deleteReplica(clusterState, sourceReplica.plus("parallel", "true"), deleteResult, () -> {
cleanupLatch.countDown(); cleanupLatch.countDown();
if (deleteResult.get("failure") != null) { if (deleteResult.get("failure") != null) {
synchronized (results) { synchronized (results) {
results.add("failure", "could not delete because " + deleteResult.get("failure") + " " + Utils.toJSONString(sourceReplica)); results.add("failure", String.format(Locale.ROOT, "Failed to delete replica for collection=%s shard=%s" +
" on node=%s", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), node));
} }
} }
}); });
} catch (KeeperException e) { } catch (KeeperException e) {
log.info("Error deleting ", e); log.warn("Error deleting ", e);
cleanupLatch.countDown(); cleanupLatch.countDown();
} catch (Exception e) { } catch (Exception e) {
log.warn("Error deleting ", e);
cleanupLatch.countDown(); cleanupLatch.countDown();
throw e; throw e;
} }
} }
log.info("Waiting for deletes to complete"); log.debug("Waiting for delete node action to complete");
cleanupLatch.await(5, TimeUnit.MINUTES); cleanupLatch.await(5, TimeUnit.MINUTES);
} }

View File

@ -316,7 +316,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
default: { default: {
Cmd command = commandMap.get(action); Cmd command = commandMap.get(action);
if (command != null) { if (command != null) {
command.call(zkStateReader.getClusterState(),message, results); command.call(zkStateReader.getClusterState(), message, results);
} else { } else {
throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown operation:" throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown operation:"
+ operation); + operation);
@ -617,7 +617,6 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
void deleteReplica(ClusterState clusterState, ZkNodeProps message, NamedList results, Runnable onComplete) void deleteReplica(ClusterState clusterState, ZkNodeProps message, NamedList results, Runnable onComplete)
throws KeeperException, InterruptedException { throws KeeperException, InterruptedException {
log.info("deleteReplica() : {}", Utils.toJSONString(message));
checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP); checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP);
String collectionName = message.getStr(COLLECTION_PROP); String collectionName = message.getStr(COLLECTION_PROP);
String shard = message.getStr(SHARD_ID_PROP); String shard = message.getStr(SHARD_ID_PROP);
@ -664,7 +663,6 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
params.set(CoreAdminParams.DELETE_DATA_DIR, message.getBool(CoreAdminParams.DELETE_DATA_DIR, true)); params.set(CoreAdminParams.DELETE_DATA_DIR, message.getBool(CoreAdminParams.DELETE_DATA_DIR, true));
sendShardRequest(replica.getNodeName(), params, shardHandler, asyncId, requestMap.get()); sendShardRequest(replica.getNodeName(), params, shardHandler, asyncId, requestMap.get());
AtomicReference<Exception> exp = new AtomicReference<>();
Callable<Boolean> callable = () -> { Callable<Boolean> callable = () -> {
try { try {
@ -2809,7 +2807,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
interface Cmd { interface Cmd {
Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception; void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception;
} }

View File

@ -21,6 +21,7 @@ package org.apache.solr.cloud;
import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodHandles;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Locale;
import java.util.Map; import java.util.Map;
import java.util.concurrent.CountDownLatch; import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@ -35,7 +36,6 @@ import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.Utils;
import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -53,7 +53,7 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
} }
@Override @Override
public Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception { public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
ZkStateReader zkStateReader = ocmh.zkStateReader; ZkStateReader zkStateReader = ocmh.zkStateReader;
ocmh.checkRequired(message, "source", "target"); ocmh.checkRequired(message, "source", "target");
String source = message.getStr("source"); String source = message.getStr("source");
@ -76,23 +76,25 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
for (ZkNodeProps sourceReplica : sourceReplicas) { for (ZkNodeProps sourceReplica : sourceReplicas) {
NamedList nl = new NamedList(); NamedList nl = new NamedList();
log.info("going to create replica {}", Utils.toJSONString(sourceReplica)); log.info("Going to create replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
ZkNodeProps msg = sourceReplica.plus("parallel", String.valueOf(parallel)).plus(CoreAdminParams.NODE, target); ZkNodeProps msg = sourceReplica.plus("parallel", String.valueOf(parallel)).plus(CoreAdminParams.NODE, target);
final ZkNodeProps addedReplica = ocmh.addReplica(clusterState, final ZkNodeProps addedReplica = ocmh.addReplica(clusterState,
msg, nl, () -> { msg, nl, () -> {
countDownLatch.countDown(); countDownLatch.countDown();
if (nl.get("failure") != null) { if (nl.get("failure") != null) {
log.warn("failed to create : " + Utils.toJSONString(msg)); String errorString = String.format(Locale.ROOT, "Failed to create replica for collection=%s shard=%s" +
" on node=%s", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
log.warn(errorString);
// one replica creation failed. Make the best attempt to // one replica creation failed. Make the best attempt to
// delete all the replicas created so far in the target // delete all the replicas created so far in the target
// and exit // and exit
synchronized (results) { synchronized (results) {
results.add("failure", "Could not create copy of replica " + Utils.toJSONString(sourceReplica)); results.add("failure", errorString);
anyOneFailed.set(true); anyOneFailed.set(true);
} }
} else { } else {
log.info("successfully created : " + Utils.toJSONString(msg)); log.debug("Successfully created replica for collection={} shard={} on node={}",
sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
} }
}); });
@ -101,12 +103,12 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
} }
} }
log.info("Waiting for creates to complete "); log.debug("Waiting for replace node action to complete");
countDownLatch.await(5, TimeUnit.MINUTES); countDownLatch.await(5, TimeUnit.MINUTES);
log.info("Waiting over for creates to complete "); log.debug("Finished waiting for replace node action to complete");
if (anyOneFailed.get()) { if (anyOneFailed.get()) {
log.info("failed to create some cores delete all " + Utils.toJSONString(createdReplicas)); log.info("Failed to create some replicas. Cleaning up all replicas on target node");
CountDownLatch cleanupLatch = new CountDownLatch(createdReplicas.size()); CountDownLatch cleanupLatch = new CountDownLatch(createdReplicas.size());
for (ZkNodeProps createdReplica : createdReplicas) { for (ZkNodeProps createdReplica : createdReplicas) {
NamedList deleteResult = new NamedList(); NamedList deleteResult = new NamedList();
@ -115,29 +117,27 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
cleanupLatch.countDown(); cleanupLatch.countDown();
if (deleteResult.get("failure") != null) { if (deleteResult.get("failure") != null) {
synchronized (results) { synchronized (results) {
results.add("failure", "could not cleanup, because : " + deleteResult.get("failure") + " " + Utils.toJSONString(createdReplica)); results.add("failure", "Could not cleanup, because of : " + deleteResult.get("failure"));
} }
} }
}); });
} catch (KeeperException e) { } catch (KeeperException e) {
cleanupLatch.countDown(); cleanupLatch.countDown();
log.info("Error deleting ", e); log.warn("Error deleting replica ", e);
} catch (Exception e) { } catch (Exception e) {
log.error("Unknown Error deleteing", e); log.warn("Error deleting replica ", e);
cleanupLatch.countDown(); cleanupLatch.countDown();
throw e; throw e;
} }
} }
cleanupLatch.await(5, TimeUnit.MINUTES); cleanupLatch.await(5, TimeUnit.MINUTES);
return null;
} }
// we have reached this far means all replicas could be recreated // we have reached this far means all replicas could be recreated
//now cleanup the replicas in the source node //now cleanup the replicas in the source node
DeleteNodeCmd.cleanupReplicas(results, state, sourceReplicas, ocmh); DeleteNodeCmd.cleanupReplicas(results, state, sourceReplicas, ocmh, source);
results.add("success", "REPLACENODE completed successfully from : " + source + " to : " + target); results.add("success", "REPLACENODE action completed successfully from : " + source + " to : " + target);
return null;
} }
static List<ZkNodeProps> getReplicasOfNode(String source, ClusterState state) { static List<ZkNodeProps> getReplicasOfNode(String source, ClusterState state) {
@ -152,7 +152,6 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
ZkStateReader.CORE_NAME_PROP, replica.getCoreName(), ZkStateReader.CORE_NAME_PROP, replica.getCoreName(),
ZkStateReader.REPLICA_PROP, replica.getName(), ZkStateReader.REPLICA_PROP, replica.getName(),
CoreAdminParams.NODE, source); CoreAdminParams.NODE, source);
log.info("src_core : {}", Utils.toJSONString(props));
sourceReplicas.add(props sourceReplicas.add(props
); );
} }