From 5fc35a65c39b7fb4ca49675d3ef9cd7f9d6c0fa8 Mon Sep 17 00:00:00 2001 From: Varun Thacker Date: Sat, 13 Aug 2016 16:20:51 +0530 Subject: [PATCH] SOLR-9320: Improve logging --- .../org/apache/solr/cloud/DeleteNodeCmd.java | 22 +++++++----- .../OverseerCollectionMessageHandler.java | 6 ++-- .../org/apache/solr/cloud/ReplaceNodeCmd.java | 35 +++++++++---------- 3 files changed, 32 insertions(+), 31 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/cloud/DeleteNodeCmd.java b/solr/core/src/java/org/apache/solr/cloud/DeleteNodeCmd.java index cbcfa8847df..3e600908cd6 100644 --- a/solr/core/src/java/org/apache/solr/cloud/DeleteNodeCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/DeleteNodeCmd.java @@ -20,6 +20,7 @@ package org.apache.solr.cloud; import java.lang.invoke.MethodHandles; import java.util.List; +import java.util.Locale; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; @@ -27,11 +28,13 @@ import org.apache.solr.common.SolrException; import org.apache.solr.common.cloud.ClusterState; import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.util.NamedList; -import org.apache.solr.common.util.Utils; import org.apache.zookeeper.KeeperException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP; +import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP; + public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd { private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); @@ -42,43 +45,44 @@ public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd { } @Override - public Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception { + public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception { ocmh.checkRequired(message, "node"); String node = message.getStr("node"); if (!state.liveNodesContain(node)) { throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + node + " is not live"); } List sourceReplicas = ReplaceNodeCmd.getReplicasOfNode(node, state); - cleanupReplicas(results, state, sourceReplicas, ocmh); - return null; + cleanupReplicas(results, state, sourceReplicas, ocmh, node); } static void cleanupReplicas(NamedList results, ClusterState clusterState, List sourceReplicas, - OverseerCollectionMessageHandler ocmh) throws InterruptedException { + OverseerCollectionMessageHandler ocmh, String node) throws InterruptedException { CountDownLatch cleanupLatch = new CountDownLatch(sourceReplicas.size()); for (ZkNodeProps sourceReplica : sourceReplicas) { - log.info("deleting replica from from node {} ", Utils.toJSONString(sourceReplica)); + log.info("Deleting replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), node); NamedList deleteResult = new NamedList(); try { ocmh.deleteReplica(clusterState, sourceReplica.plus("parallel", "true"), deleteResult, () -> { cleanupLatch.countDown(); if (deleteResult.get("failure") != null) { synchronized (results) { - results.add("failure", "could not delete because " + deleteResult.get("failure") + " " + Utils.toJSONString(sourceReplica)); + results.add("failure", String.format(Locale.ROOT, "Failed to delete replica for collection=%s shard=%s" + + " on node=%s", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), node)); } } }); } catch (KeeperException e) { - log.info("Error deleting ", e); + log.warn("Error deleting ", e); cleanupLatch.countDown(); } catch (Exception e) { + log.warn("Error deleting ", e); cleanupLatch.countDown(); throw e; } } - log.info("Waiting for deletes to complete"); + log.debug("Waiting for delete node action to complete"); cleanupLatch.await(5, TimeUnit.MINUTES); } diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java index 908d35c3e6b..0588446c653 100644 --- a/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java +++ b/solr/core/src/java/org/apache/solr/cloud/OverseerCollectionMessageHandler.java @@ -316,7 +316,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler default: { Cmd command = commandMap.get(action); if (command != null) { - command.call(zkStateReader.getClusterState(),message, results); + command.call(zkStateReader.getClusterState(), message, results); } else { throw new SolrException(ErrorCode.BAD_REQUEST, "Unknown operation:" + operation); @@ -617,7 +617,6 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler @SuppressWarnings("unchecked") void deleteReplica(ClusterState clusterState, ZkNodeProps message, NamedList results, Runnable onComplete) throws KeeperException, InterruptedException { - log.info("deleteReplica() : {}", Utils.toJSONString(message)); checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP); String collectionName = message.getStr(COLLECTION_PROP); String shard = message.getStr(SHARD_ID_PROP); @@ -664,7 +663,6 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler params.set(CoreAdminParams.DELETE_DATA_DIR, message.getBool(CoreAdminParams.DELETE_DATA_DIR, true)); sendShardRequest(replica.getNodeName(), params, shardHandler, asyncId, requestMap.get()); - AtomicReference exp = new AtomicReference<>(); Callable callable = () -> { try { @@ -2809,7 +2807,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler interface Cmd { - Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception; + void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception; } diff --git a/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java b/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java index 0cfd0894ffe..aad9cc721fb 100644 --- a/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java +++ b/solr/core/src/java/org/apache/solr/cloud/ReplaceNodeCmd.java @@ -21,6 +21,7 @@ package org.apache.solr.cloud; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; @@ -35,7 +36,6 @@ import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.util.NamedList; -import org.apache.solr.common.util.Utils; import org.apache.zookeeper.KeeperException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -53,7 +53,7 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd { } @Override - public Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception { + public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception { ZkStateReader zkStateReader = ocmh.zkStateReader; ocmh.checkRequired(message, "source", "target"); String source = message.getStr("source"); @@ -76,23 +76,25 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd { for (ZkNodeProps sourceReplica : sourceReplicas) { NamedList nl = new NamedList(); - log.info("going to create replica {}", Utils.toJSONString(sourceReplica)); + log.info("Going to create replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target); ZkNodeProps msg = sourceReplica.plus("parallel", String.valueOf(parallel)).plus(CoreAdminParams.NODE, target); final ZkNodeProps addedReplica = ocmh.addReplica(clusterState, msg, nl, () -> { countDownLatch.countDown(); if (nl.get("failure") != null) { - log.warn("failed to create : " + Utils.toJSONString(msg)); + String errorString = String.format(Locale.ROOT, "Failed to create replica for collection=%s shard=%s" + + " on node=%s", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target); + log.warn(errorString); // one replica creation failed. Make the best attempt to // delete all the replicas created so far in the target // and exit synchronized (results) { - results.add("failure", "Could not create copy of replica " + Utils.toJSONString(sourceReplica)); + results.add("failure", errorString); anyOneFailed.set(true); } } else { - log.info("successfully created : " + Utils.toJSONString(msg)); - + log.debug("Successfully created replica for collection={} shard={} on node={}", + sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target); } }); @@ -101,12 +103,12 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd { } } - log.info("Waiting for creates to complete "); + log.debug("Waiting for replace node action to complete"); countDownLatch.await(5, TimeUnit.MINUTES); - log.info("Waiting over for creates to complete "); + log.debug("Finished waiting for replace node action to complete"); if (anyOneFailed.get()) { - log.info("failed to create some cores delete all " + Utils.toJSONString(createdReplicas)); + log.info("Failed to create some replicas. Cleaning up all replicas on target node"); CountDownLatch cleanupLatch = new CountDownLatch(createdReplicas.size()); for (ZkNodeProps createdReplica : createdReplicas) { NamedList deleteResult = new NamedList(); @@ -115,29 +117,27 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd { cleanupLatch.countDown(); if (deleteResult.get("failure") != null) { synchronized (results) { - results.add("failure", "could not cleanup, because : " + deleteResult.get("failure") + " " + Utils.toJSONString(createdReplica)); + results.add("failure", "Could not cleanup, because of : " + deleteResult.get("failure")); } } }); } catch (KeeperException e) { cleanupLatch.countDown(); - log.info("Error deleting ", e); + log.warn("Error deleting replica ", e); } catch (Exception e) { - log.error("Unknown Error deleteing", e); + log.warn("Error deleting replica ", e); cleanupLatch.countDown(); throw e; } } cleanupLatch.await(5, TimeUnit.MINUTES); - return null; } // we have reached this far means all replicas could be recreated //now cleanup the replicas in the source node - DeleteNodeCmd.cleanupReplicas(results, state, sourceReplicas, ocmh); - results.add("success", "REPLACENODE completed successfully from : " + source + " to : " + target); - return null; + DeleteNodeCmd.cleanupReplicas(results, state, sourceReplicas, ocmh, source); + results.add("success", "REPLACENODE action completed successfully from : " + source + " to : " + target); } static List getReplicasOfNode(String source, ClusterState state) { @@ -152,7 +152,6 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd { ZkStateReader.CORE_NAME_PROP, replica.getCoreName(), ZkStateReader.REPLICA_PROP, replica.getName(), CoreAdminParams.NODE, source); - log.info("src_core : {}", Utils.toJSONString(props)); sourceReplicas.add(props ); }