Gateway: add logging around gateway shard allocation
This commit adds more logging around gateway shard allocation. Any errors encountered while reaching out to nodes to list their local shards are logged at `WARN`. Shard info loading time is logged at `DEBUG`. Also, we log a `WARN` message if an exception forces a full checksum check while reading the store metadata. Closes #9562
This commit is contained in:
parent
95f46f1212
commit
9362ba200d
|
@ -26,6 +26,7 @@ import com.carrotsearch.hppc.predicates.ObjectPredicate;
|
|||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Sets;
|
||||
import org.elasticsearch.ExceptionsHelper;
|
||||
import org.elasticsearch.action.FailedNodeException;
|
||||
import org.elasticsearch.cluster.metadata.IndexMetaData;
|
||||
import org.elasticsearch.cluster.node.DiscoveryNode;
|
||||
import org.elasticsearch.cluster.node.DiscoveryNodes;
|
||||
|
@ -48,7 +49,9 @@ import org.elasticsearch.index.store.StoreFileMetaData;
|
|||
import org.elasticsearch.indices.store.TransportNodesListShardStoreMetaData;
|
||||
import org.elasticsearch.transport.ConnectTransportException;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
|
||||
/**
|
||||
|
@ -397,19 +400,7 @@ public class GatewayAllocator extends AbstractComponent {
|
|||
|
||||
String[] nodesIdsArray = nodeIds.toArray(String.class);
|
||||
TransportNodesListGatewayStartedShards.NodesGatewayStartedShards response = listGatewayStartedShards.list(shard.shardId(), nodesIdsArray, listTimeout).actionGet();
|
||||
if (logger.isDebugEnabled()) {
|
||||
if (response.failures().length > 0) {
|
||||
StringBuilder sb = new StringBuilder(shard + ": failures when trying to list shards on nodes:");
|
||||
for (int i = 0; i < response.failures().length; i++) {
|
||||
Throwable cause = ExceptionsHelper.unwrapCause(response.failures()[i]);
|
||||
if (cause instanceof ConnectTransportException) {
|
||||
continue;
|
||||
}
|
||||
sb.append("\n -> ").append(response.failures()[i].getDetailedMessage());
|
||||
}
|
||||
logger.debug(sb.toString());
|
||||
}
|
||||
}
|
||||
logListActionFailures(shard, "state", response.failures());
|
||||
|
||||
for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState : response) {
|
||||
// -1 version means it does not exists, which is what the API returns, and what we expect to
|
||||
|
@ -420,6 +411,17 @@ public class GatewayAllocator extends AbstractComponent {
|
|||
return shardStates;
|
||||
}
|
||||
|
||||
/**
 * Logs a WARN message for each node-level failure returned by a shard list action.
 * Failures whose unwrapped cause is a {@link ConnectTransportException} are skipped
 * and produce no log output.
 *
 * @param shard      the shard being listed; its shard id is included in the log message
 * @param actionType short label inserted into the message after "failed to list shard"
 *                   (callers in this class pass "state" or "stores")
 * @param failures   the per-node failures reported by the list action response
 */
private void logListActionFailures(MutableShardRouting shard, String actionType, FailedNodeException[] failures) {
|
||||
for (final FailedNodeException failure : failures) {
|
||||
Throwable cause = ExceptionsHelper.unwrapCause(failure);
|
||||
if (cause instanceof ConnectTransportException) {
|
||||
continue;
|
||||
}
|
||||
// we log warn here. debug logs with full stack traces will be logged if debug logging is turned on for TransportNodesListGatewayStartedShards
|
||||
logger.warn("{}: failed to list shard {} on node [{}]", failure, shard.shardId(), actionType, failure.nodeId());
|
||||
}
|
||||
}
|
||||
|
||||
private Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> buildShardStores(DiscoveryNodes nodes, MutableShardRouting shard) {
|
||||
Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores = cachedStores.get(shard.shardId());
|
||||
ObjectOpenHashSet<String> nodesIds;
|
||||
|
@ -448,19 +450,7 @@ public class GatewayAllocator extends AbstractComponent {
|
|||
if (!nodesIds.isEmpty()) {
|
||||
String[] nodesIdsArray = nodesIds.toArray(String.class);
|
||||
TransportNodesListShardStoreMetaData.NodesStoreFilesMetaData nodesStoreFilesMetaData = listShardStoreMetaData.list(shard.shardId(), false, nodesIdsArray, listTimeout).actionGet();
|
||||
if (logger.isTraceEnabled()) {
|
||||
if (nodesStoreFilesMetaData.failures().length > 0) {
|
||||
StringBuilder sb = new StringBuilder(shard + ": failures when trying to list stores on nodes:");
|
||||
for (int i = 0; i < nodesStoreFilesMetaData.failures().length; i++) {
|
||||
Throwable cause = ExceptionsHelper.unwrapCause(nodesStoreFilesMetaData.failures()[i]);
|
||||
if (cause instanceof ConnectTransportException) {
|
||||
continue;
|
||||
}
|
||||
sb.append("\n -> ").append(nodesStoreFilesMetaData.failures()[i].getDetailedMessage());
|
||||
}
|
||||
logger.trace(sb.toString());
|
||||
}
|
||||
}
|
||||
logListActionFailures(shard, "stores", nodesStoreFilesMetaData.failures());
|
||||
|
||||
for (TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStoreFilesMetaData : nodesStoreFilesMetaData) {
|
||||
if (nodeStoreFilesMetaData.storeFilesMetaData() != null) {
|
||||
|
|
|
@ -58,9 +58,6 @@ public final class ShardStateInfo {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ShardStateInfo{" +
|
||||
"version=" + version +
|
||||
", primary=" + primary +
|
||||
'}';
|
||||
return "version [" + version + "], primary [" + primary + "]";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -117,10 +117,13 @@ public class TransportNodesListGatewayStartedShards extends TransportNodesOperat
|
|||
@Override
|
||||
protected NodeGatewayStartedShards nodeOperation(NodeRequest request) throws ElasticsearchException {
|
||||
try {
|
||||
logger.trace("loading shard state info for {}", request.shardId);
|
||||
ShardStateInfo shardStateInfo = shardsState.loadShardInfo(request.shardId);
|
||||
if (shardStateInfo != null) {
|
||||
logger.debug("{} shard state info found: [{}]", request.shardId, shardStateInfo);
|
||||
return new NodeGatewayStartedShards(clusterService.localNode(), shardStateInfo.version);
|
||||
}
|
||||
logger.trace("no shard info found for {}", request.shardId);
|
||||
return new NodeGatewayStartedShards(clusterService.localNode(), -1);
|
||||
} catch (Exception e) {
|
||||
throw new ElasticsearchException("failed to load started shards", e);
|
||||
|
|
|
@ -22,7 +22,6 @@ package org.elasticsearch.index.store;
|
|||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.Iterables;
|
||||
|
||||
import org.apache.lucene.codecs.CodecUtil;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.store.*;
|
||||
|
@ -717,13 +716,16 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
|
|||
} else {
|
||||
builder.put(segmentsFile, new StoreFileMetaData(segmentsFile, directory.fileLength(segmentsFile), legacyChecksum, maxVersion, hashFile(directory, segmentsFile)));
|
||||
}
|
||||
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
|
||||
} catch (CorruptIndexException | IndexNotFoundException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
|
||||
// we either know the index is corrupted or it's just not there
|
||||
throw ex;
|
||||
} catch (Throwable ex) {
|
||||
try {
|
||||
// Lucene checks the checksum after it tries to lookup the codec etc.
|
||||
// in that case we might get only IAE or similar exceptions while we are really corrupt...
|
||||
// TODO we should check the checksum in lucene if we hit an exception
|
||||
logger.warn("failed to build store metadata. checking segment info integrity (with commit [{}])",
|
||||
ex, commit == null ? "no" : "yes");
|
||||
Lucene.checkSegmentInfoIntegrity(directory);
|
||||
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException cex) {
|
||||
cex.addSuppressed(ex);
|
||||
|
|
|
@ -41,8 +41,8 @@ import org.elasticsearch.common.settings.Settings;
|
|||
import org.elasticsearch.common.unit.TimeValue;
|
||||
import org.elasticsearch.env.NodeEnvironment;
|
||||
import org.elasticsearch.index.IndexService;
|
||||
import org.elasticsearch.index.shard.ShardId;
|
||||
import org.elasticsearch.index.shard.IndexShard;
|
||||
import org.elasticsearch.index.shard.ShardId;
|
||||
import org.elasticsearch.index.store.Store;
|
||||
import org.elasticsearch.index.store.StoreFileMetaData;
|
||||
import org.elasticsearch.indices.IndicesService;
|
||||
|
@ -51,7 +51,10 @@ import org.elasticsearch.transport.TransportService;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.*;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicReferenceArray;
|
||||
|
||||
/**
|
||||
|
@ -143,38 +146,51 @@ public class TransportNodesListShardStoreMetaData extends TransportNodesOperatio
|
|||
}
|
||||
|
||||
private StoreFilesMetaData listStoreMetaData(ShardId shardId) throws IOException {
|
||||
IndexService indexService = indicesService.indexService(shardId.index().name());
|
||||
if (indexService != null) {
|
||||
IndexShard indexShard = indexService.shard(shardId.id());
|
||||
if (indexShard != null) {
|
||||
final Store store = indexShard.store();
|
||||
store.incRef();
|
||||
try {
|
||||
return new StoreFilesMetaData(true, shardId, store.getMetadataOrEmpty().asMap());
|
||||
} finally {
|
||||
store.decRef();
|
||||
logger.trace("listing store meta data for {}", shardId);
|
||||
long startTime = System.currentTimeMillis();
|
||||
boolean exists = false;
|
||||
try {
|
||||
IndexService indexService = indicesService.indexService(shardId.index().name());
|
||||
if (indexService != null) {
|
||||
IndexShard indexShard = indexService.shard(shardId.id());
|
||||
if (indexShard != null) {
|
||||
final Store store = indexShard.store();
|
||||
store.incRef();
|
||||
try {
|
||||
exists = true;
|
||||
return new StoreFilesMetaData(true, shardId, store.getMetadataOrEmpty().asMap());
|
||||
} finally {
|
||||
store.decRef();
|
||||
}
|
||||
}
|
||||
}
|
||||
// try and see if we can list unallocated
|
||||
IndexMetaData metaData = clusterService.state().metaData().index(shardId.index().name());
|
||||
if (metaData == null) {
|
||||
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
|
||||
}
|
||||
String storeType = metaData.settings().get("index.store.type", "fs");
|
||||
if (!storeType.contains("fs")) {
|
||||
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
|
||||
}
|
||||
Path[] shardLocations = nodeEnv.shardDataPaths(shardId, metaData.settings());
|
||||
Path[] shardIndexLocations = new Path[shardLocations.length];
|
||||
for (int i = 0; i < shardLocations.length; i++) {
|
||||
shardIndexLocations[i] = shardLocations[i].resolve("index");
|
||||
}
|
||||
exists = FileSystemUtils.exists(shardIndexLocations);
|
||||
if (!exists) {
|
||||
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
|
||||
}
|
||||
return new StoreFilesMetaData(false, shardId, Store.readMetadataSnapshot(shardIndexLocations, logger).asMap());
|
||||
} finally {
|
||||
TimeValue took = new TimeValue(System.currentTimeMillis() - startTime);
|
||||
if (exists) {
|
||||
logger.debug("loaded store meta data for {} (took [{}])", shardId, took);
|
||||
} else {
|
||||
logger.trace("loaded store meta data for {} (took [{}])", shardId, took);
|
||||
}
|
||||
}
|
||||
// try and see if we can list unallocated
|
||||
IndexMetaData metaData = clusterService.state().metaData().index(shardId.index().name());
|
||||
if (metaData == null) {
|
||||
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
|
||||
}
|
||||
String storeType = metaData.settings().get("index.store.type", "fs");
|
||||
if (!storeType.contains("fs")) {
|
||||
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
|
||||
}
|
||||
Path[] shardLocations = nodeEnv.shardDataPaths(shardId, metaData.settings());
|
||||
Path[] shardIndexLocations = new Path[shardLocations.length];
|
||||
for (int i = 0; i < shardLocations.length; i++) {
|
||||
shardIndexLocations[i] = shardLocations[i].resolve("index");
|
||||
}
|
||||
final boolean exists = FileSystemUtils.exists(shardIndexLocations);
|
||||
if (!exists) {
|
||||
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
|
||||
}
|
||||
return new StoreFilesMetaData(false, shardId, Store.readMetadataSnapshot(shardIndexLocations, logger).asMap());
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
Loading…
Reference in New Issue