mirror of https://github.com/apache/lucene.git
SOLR-9320: Improve logging
This commit is contained in:
parent
8f9c04e4a1
commit
70d27aec83
|
@ -20,6 +20,7 @@ package org.apache.solr.cloud;
|
||||||
|
|
||||||
import java.lang.invoke.MethodHandles;
|
import java.lang.invoke.MethodHandles;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.concurrent.CountDownLatch;
|
import java.util.concurrent.CountDownLatch;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
@ -27,11 +28,13 @@ import org.apache.solr.common.SolrException;
|
||||||
import org.apache.solr.common.cloud.ClusterState;
|
import org.apache.solr.common.cloud.ClusterState;
|
||||||
import org.apache.solr.common.cloud.ZkNodeProps;
|
import org.apache.solr.common.cloud.ZkNodeProps;
|
||||||
import org.apache.solr.common.util.NamedList;
|
import org.apache.solr.common.util.NamedList;
|
||||||
import org.apache.solr.common.util.Utils;
|
|
||||||
import org.apache.zookeeper.KeeperException;
|
import org.apache.zookeeper.KeeperException;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
|
||||||
|
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
|
||||||
|
|
||||||
public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
||||||
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
|
||||||
|
|
||||||
|
@ -42,43 +45,44 @@ public class DeleteNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
|
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
|
||||||
ocmh.checkRequired(message, "node");
|
ocmh.checkRequired(message, "node");
|
||||||
String node = message.getStr("node");
|
String node = message.getStr("node");
|
||||||
if (!state.liveNodesContain(node)) {
|
if (!state.liveNodesContain(node)) {
|
||||||
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + node + " is not live");
|
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + node + " is not live");
|
||||||
}
|
}
|
||||||
List<ZkNodeProps> sourceReplicas = ReplaceNodeCmd.getReplicasOfNode(node, state);
|
List<ZkNodeProps> sourceReplicas = ReplaceNodeCmd.getReplicasOfNode(node, state);
|
||||||
cleanupReplicas(results, state, sourceReplicas, ocmh);
|
cleanupReplicas(results, state, sourceReplicas, ocmh, node);
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void cleanupReplicas(NamedList results,
|
static void cleanupReplicas(NamedList results,
|
||||||
ClusterState clusterState,
|
ClusterState clusterState,
|
||||||
List<ZkNodeProps> sourceReplicas,
|
List<ZkNodeProps> sourceReplicas,
|
||||||
OverseerCollectionMessageHandler ocmh) throws InterruptedException {
|
OverseerCollectionMessageHandler ocmh, String node) throws InterruptedException {
|
||||||
CountDownLatch cleanupLatch = new CountDownLatch(sourceReplicas.size());
|
CountDownLatch cleanupLatch = new CountDownLatch(sourceReplicas.size());
|
||||||
for (ZkNodeProps sourceReplica : sourceReplicas) {
|
for (ZkNodeProps sourceReplica : sourceReplicas) {
|
||||||
log.info("deleting replica from from node {} ", Utils.toJSONString(sourceReplica));
|
log.info("Deleting replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), node);
|
||||||
NamedList deleteResult = new NamedList();
|
NamedList deleteResult = new NamedList();
|
||||||
try {
|
try {
|
||||||
ocmh.deleteReplica(clusterState, sourceReplica.plus("parallel", "true"), deleteResult, () -> {
|
ocmh.deleteReplica(clusterState, sourceReplica.plus("parallel", "true"), deleteResult, () -> {
|
||||||
cleanupLatch.countDown();
|
cleanupLatch.countDown();
|
||||||
if (deleteResult.get("failure") != null) {
|
if (deleteResult.get("failure") != null) {
|
||||||
synchronized (results) {
|
synchronized (results) {
|
||||||
results.add("failure", "could not delete because " + deleteResult.get("failure") + " " + Utils.toJSONString(sourceReplica));
|
results.add("failure", String.format(Locale.ROOT, "Failed to delete replica for collection=%s shard=%s" +
|
||||||
|
" on node=%s", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), node));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
} catch (KeeperException e) {
|
} catch (KeeperException e) {
|
||||||
log.info("Error deleting ", e);
|
log.warn("Error deleting ", e);
|
||||||
cleanupLatch.countDown();
|
cleanupLatch.countDown();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
log.warn("Error deleting ", e);
|
||||||
cleanupLatch.countDown();
|
cleanupLatch.countDown();
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
log.info("Waiting for deletes to complete");
|
log.debug("Waiting for delete node action to complete");
|
||||||
cleanupLatch.await(5, TimeUnit.MINUTES);
|
cleanupLatch.await(5, TimeUnit.MINUTES);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -617,7 +617,6 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
void deleteReplica(ClusterState clusterState, ZkNodeProps message, NamedList results, Runnable onComplete)
|
void deleteReplica(ClusterState clusterState, ZkNodeProps message, NamedList results, Runnable onComplete)
|
||||||
throws KeeperException, InterruptedException {
|
throws KeeperException, InterruptedException {
|
||||||
log.info("deleteReplica() : {}", Utils.toJSONString(message));
|
|
||||||
checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP);
|
checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP);
|
||||||
String collectionName = message.getStr(COLLECTION_PROP);
|
String collectionName = message.getStr(COLLECTION_PROP);
|
||||||
String shard = message.getStr(SHARD_ID_PROP);
|
String shard = message.getStr(SHARD_ID_PROP);
|
||||||
|
@ -664,7 +663,6 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
|
||||||
params.set(CoreAdminParams.DELETE_DATA_DIR, message.getBool(CoreAdminParams.DELETE_DATA_DIR, true));
|
params.set(CoreAdminParams.DELETE_DATA_DIR, message.getBool(CoreAdminParams.DELETE_DATA_DIR, true));
|
||||||
|
|
||||||
sendShardRequest(replica.getNodeName(), params, shardHandler, asyncId, requestMap.get());
|
sendShardRequest(replica.getNodeName(), params, shardHandler, asyncId, requestMap.get());
|
||||||
AtomicReference<Exception> exp = new AtomicReference<>();
|
|
||||||
|
|
||||||
Callable<Boolean> callable = () -> {
|
Callable<Boolean> callable = () -> {
|
||||||
try {
|
try {
|
||||||
|
@ -2809,7 +2807,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler
|
||||||
|
|
||||||
interface Cmd {
|
interface Cmd {
|
||||||
|
|
||||||
Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception;
|
void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -21,6 +21,7 @@ package org.apache.solr.cloud;
|
||||||
import java.lang.invoke.MethodHandles;
|
import java.lang.invoke.MethodHandles;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.concurrent.CountDownLatch;
|
import java.util.concurrent.CountDownLatch;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
@ -35,7 +36,6 @@ import org.apache.solr.common.cloud.ZkNodeProps;
|
||||||
import org.apache.solr.common.cloud.ZkStateReader;
|
import org.apache.solr.common.cloud.ZkStateReader;
|
||||||
import org.apache.solr.common.params.CoreAdminParams;
|
import org.apache.solr.common.params.CoreAdminParams;
|
||||||
import org.apache.solr.common.util.NamedList;
|
import org.apache.solr.common.util.NamedList;
|
||||||
import org.apache.solr.common.util.Utils;
|
|
||||||
import org.apache.zookeeper.KeeperException;
|
import org.apache.zookeeper.KeeperException;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
@ -53,7 +53,7 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Object call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
|
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
|
||||||
ZkStateReader zkStateReader = ocmh.zkStateReader;
|
ZkStateReader zkStateReader = ocmh.zkStateReader;
|
||||||
ocmh.checkRequired(message, "source", "target");
|
ocmh.checkRequired(message, "source", "target");
|
||||||
String source = message.getStr("source");
|
String source = message.getStr("source");
|
||||||
|
@ -76,23 +76,25 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
||||||
|
|
||||||
for (ZkNodeProps sourceReplica : sourceReplicas) {
|
for (ZkNodeProps sourceReplica : sourceReplicas) {
|
||||||
NamedList nl = new NamedList();
|
NamedList nl = new NamedList();
|
||||||
log.info("going to create replica {}", Utils.toJSONString(sourceReplica));
|
log.info("Going to create replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
|
||||||
ZkNodeProps msg = sourceReplica.plus("parallel", String.valueOf(parallel)).plus(CoreAdminParams.NODE, target);
|
ZkNodeProps msg = sourceReplica.plus("parallel", String.valueOf(parallel)).plus(CoreAdminParams.NODE, target);
|
||||||
final ZkNodeProps addedReplica = ocmh.addReplica(clusterState,
|
final ZkNodeProps addedReplica = ocmh.addReplica(clusterState,
|
||||||
msg, nl, () -> {
|
msg, nl, () -> {
|
||||||
countDownLatch.countDown();
|
countDownLatch.countDown();
|
||||||
if (nl.get("failure") != null) {
|
if (nl.get("failure") != null) {
|
||||||
log.warn("failed to create : " + Utils.toJSONString(msg));
|
String errorString = String.format(Locale.ROOT, "Failed to create replica for collection=%s shard=%s" +
|
||||||
|
" on node=%s", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
|
||||||
|
log.warn(errorString);
|
||||||
// one replica creation failed. Make the best attempt to
|
// one replica creation failed. Make the best attempt to
|
||||||
// delete all the replicas created so far in the target
|
// delete all the replicas created so far in the target
|
||||||
// and exit
|
// and exit
|
||||||
synchronized (results) {
|
synchronized (results) {
|
||||||
results.add("failure", "Could not create copy of replica " + Utils.toJSONString(sourceReplica));
|
results.add("failure", errorString);
|
||||||
anyOneFailed.set(true);
|
anyOneFailed.set(true);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
log.info("successfully created : " + Utils.toJSONString(msg));
|
log.debug("Successfully created replica for collection={} shard={} on node={}",
|
||||||
|
sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -101,12 +103,12 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info("Waiting for creates to complete ");
|
log.debug("Waiting for replace node action to complete");
|
||||||
countDownLatch.await(5, TimeUnit.MINUTES);
|
countDownLatch.await(5, TimeUnit.MINUTES);
|
||||||
log.info("Waiting over for creates to complete ");
|
log.debug("Finished waiting for replace node action to complete");
|
||||||
|
|
||||||
if (anyOneFailed.get()) {
|
if (anyOneFailed.get()) {
|
||||||
log.info("failed to create some cores delete all " + Utils.toJSONString(createdReplicas));
|
log.info("Failed to create some replicas. Cleaning up all replicas on target node");
|
||||||
CountDownLatch cleanupLatch = new CountDownLatch(createdReplicas.size());
|
CountDownLatch cleanupLatch = new CountDownLatch(createdReplicas.size());
|
||||||
for (ZkNodeProps createdReplica : createdReplicas) {
|
for (ZkNodeProps createdReplica : createdReplicas) {
|
||||||
NamedList deleteResult = new NamedList();
|
NamedList deleteResult = new NamedList();
|
||||||
|
@ -115,29 +117,27 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
||||||
cleanupLatch.countDown();
|
cleanupLatch.countDown();
|
||||||
if (deleteResult.get("failure") != null) {
|
if (deleteResult.get("failure") != null) {
|
||||||
synchronized (results) {
|
synchronized (results) {
|
||||||
results.add("failure", "could not cleanup, because : " + deleteResult.get("failure") + " " + Utils.toJSONString(createdReplica));
|
results.add("failure", "Could not cleanup, because of : " + deleteResult.get("failure"));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
} catch (KeeperException e) {
|
} catch (KeeperException e) {
|
||||||
cleanupLatch.countDown();
|
cleanupLatch.countDown();
|
||||||
log.info("Error deleting ", e);
|
log.warn("Error deleting replica ", e);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("Unknown Error deleteing", e);
|
log.warn("Error deleting replica ", e);
|
||||||
cleanupLatch.countDown();
|
cleanupLatch.countDown();
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cleanupLatch.await(5, TimeUnit.MINUTES);
|
cleanupLatch.await(5, TimeUnit.MINUTES);
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// we have reached this far means all replicas could be recreated
|
// we have reached this far means all replicas could be recreated
|
||||||
//now cleanup the replicas in the source node
|
//now cleanup the replicas in the source node
|
||||||
DeleteNodeCmd.cleanupReplicas(results, state, sourceReplicas, ocmh);
|
DeleteNodeCmd.cleanupReplicas(results, state, sourceReplicas, ocmh, source);
|
||||||
results.add("success", "REPLACENODE completed successfully from : " + source + " to : " + target);
|
results.add("success", "REPLACENODE action completed successfully from : " + source + " to : " + target);
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static List<ZkNodeProps> getReplicasOfNode(String source, ClusterState state) {
|
static List<ZkNodeProps> getReplicasOfNode(String source, ClusterState state) {
|
||||||
|
@ -152,7 +152,6 @@ public class ReplaceNodeCmd implements OverseerCollectionMessageHandler.Cmd {
|
||||||
ZkStateReader.CORE_NAME_PROP, replica.getCoreName(),
|
ZkStateReader.CORE_NAME_PROP, replica.getCoreName(),
|
||||||
ZkStateReader.REPLICA_PROP, replica.getName(),
|
ZkStateReader.REPLICA_PROP, replica.getName(),
|
||||||
CoreAdminParams.NODE, source);
|
CoreAdminParams.NODE, source);
|
||||||
log.info("src_core : {}", Utils.toJSONString(props));
|
|
||||||
sourceReplicas.add(props
|
sourceReplicas.add(props
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue