Gateway: add logging around gateway shard allocation

This commit adds more logging around gateway shard allocation. Errors hit while reaching out to nodes to list their local shards are logged at `WARN`. Shard info loading time is logged at `DEBUG`. We also log a `WARN` message if an exception forces a full checksum check while reading the store metadata. A short standalone sketch of these logging patterns follows the changed-files summary below.

Closes #9562
Boaz Leskes 2015-02-04 16:04:07 +01:00
parent 95f46f1212
commit 9362ba200d
5 changed files with 72 additions and 64 deletions
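For orientation, here is a standalone sketch of the two logging patterns the diff below introduces: node-level failures returned by the list actions are logged at WARN unless the node was merely unreachable, and the time spent loading shard/store info is logged at DEBUG. This is plain Java for illustration only, not the Elasticsearch code itself; NodeFailure, ConnectFailure and the timed() helper are hypothetical stand-ins for FailedNodeException, ConnectTransportException and the inline timing added in TransportNodesListShardStoreMetaData.

// Standalone sketch (plain Java stand-ins, not the actual Elasticsearch classes) of the
// logging patterns this commit introduces: node-level listing failures are logged at WARN
// unless the node was simply unreachable, and loading time is logged at DEBUG-level.
import java.util.List;
import java.util.function.Supplier;
import java.util.logging.Level;
import java.util.logging.Logger;

public class GatewayLoggingSketch {

    private static final Logger LOGGER = Logger.getLogger(GatewayLoggingSketch.class.getName());

    /** Minimal stand-in for FailedNodeException: which node failed and why. */
    record NodeFailure(String nodeId, Throwable cause) {}

    /** Marker for "could not connect" failures, analogous to ConnectTransportException. */
    static class ConnectFailure extends RuntimeException {
        ConnectFailure(String message) { super(message); }
    }

    /** Log every non-connection failure at WARN; unreachable nodes are expected and skipped. */
    static void logListActionFailures(String shardId, String actionType, List<NodeFailure> failures) {
        for (NodeFailure failure : failures) {
            if (failure.cause() instanceof ConnectFailure) {
                continue; // node is simply not reachable right now; not worth a WARN
            }
            LOGGER.log(Level.WARNING,
                    String.format("%s: failed to list shard %s on node [%s]", shardId, actionType, failure.nodeId()),
                    failure.cause());
        }
    }

    /** Run a loading step and log how long it took (Level.FINE plays the role of DEBUG here). */
    static <T> T timed(String description, Supplier<T> step) {
        long startTime = System.currentTimeMillis();
        try {
            return step.get();
        } finally {
            LOGGER.log(Level.FINE,
                    String.format("%s took [%dms]", description, System.currentTimeMillis() - startTime));
        }
    }

    public static void main(String[] args) {
        logListActionFailures("[index][0]", "state", List.of(
                new NodeFailure("node-1", new ConnectFailure("disconnected")),        // skipped
                new NodeFailure("node-2", new RuntimeException("broken state file"))  // logged at WARN
        ));
        timed("loading store meta data for [index][0]", () -> "store metadata placeholder");
    }
}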

GatewayAllocator.java

@@ -26,6 +26,7 @@ import com.carrotsearch.hppc.predicates.ObjectPredicate;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.FailedNodeException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
@@ -48,7 +49,9 @@ import org.elasticsearch.index.store.StoreFileMetaData;
import org.elasticsearch.indices.store.TransportNodesListShardStoreMetaData;
import org.elasticsearch.transport.ConnectTransportException;
import java.util.*;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;
/**
@@ -397,19 +400,7 @@ public class GatewayAllocator extends AbstractComponent {
String[] nodesIdsArray = nodeIds.toArray(String.class);
TransportNodesListGatewayStartedShards.NodesGatewayStartedShards response = listGatewayStartedShards.list(shard.shardId(), nodesIdsArray, listTimeout).actionGet();
if (logger.isDebugEnabled()) {
if (response.failures().length > 0) {
StringBuilder sb = new StringBuilder(shard + ": failures when trying to list shards on nodes:");
for (int i = 0; i < response.failures().length; i++) {
Throwable cause = ExceptionsHelper.unwrapCause(response.failures()[i]);
if (cause instanceof ConnectTransportException) {
continue;
}
sb.append("\n -> ").append(response.failures()[i].getDetailedMessage());
}
logger.debug(sb.toString());
}
}
logListActionFailures(shard, "state", response.failures());
for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState : response) {
// -1 version means it does not exists, which is what the API returns, and what we expect to
@@ -420,6 +411,17 @@ public class GatewayAllocator extends AbstractComponent {
return shardStates;
}
private void logListActionFailures(MutableShardRouting shard, String actionType, FailedNodeException[] failures) {
for (final FailedNodeException failure : failures) {
Throwable cause = ExceptionsHelper.unwrapCause(failure);
if (cause instanceof ConnectTransportException) {
continue;
}
// we log warn here. debug logs with full stack traces will be logged if debug logging is turned on for TransportNodesListGatewayStartedShards
logger.warn("{}: failed to list shard {} on node [{}]", failure, shard.shardId(), actionType, failure.nodeId());
}
}
private Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> buildShardStores(DiscoveryNodes nodes, MutableShardRouting shard) {
Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores = cachedStores.get(shard.shardId());
ObjectOpenHashSet<String> nodesIds;
@@ -448,19 +450,7 @@ public class GatewayAllocator extends AbstractComponent {
if (!nodesIds.isEmpty()) {
String[] nodesIdsArray = nodesIds.toArray(String.class);
TransportNodesListShardStoreMetaData.NodesStoreFilesMetaData nodesStoreFilesMetaData = listShardStoreMetaData.list(shard.shardId(), false, nodesIdsArray, listTimeout).actionGet();
if (logger.isTraceEnabled()) {
if (nodesStoreFilesMetaData.failures().length > 0) {
StringBuilder sb = new StringBuilder(shard + ": failures when trying to list stores on nodes:");
for (int i = 0; i < nodesStoreFilesMetaData.failures().length; i++) {
Throwable cause = ExceptionsHelper.unwrapCause(nodesStoreFilesMetaData.failures()[i]);
if (cause instanceof ConnectTransportException) {
continue;
}
sb.append("\n -> ").append(nodesStoreFilesMetaData.failures()[i].getDetailedMessage());
}
logger.trace(sb.toString());
}
}
logListActionFailures(shard, "stores", nodesStoreFilesMetaData.failures());
for (TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStoreFilesMetaData : nodesStoreFilesMetaData) {
if (nodeStoreFilesMetaData.storeFilesMetaData() != null) {

ShardStateInfo.java

@@ -58,9 +58,6 @@ public final class ShardStateInfo {
@Override
public String toString() {
return "ShardStateInfo{" +
"version=" + version +
", primary=" + primary +
'}';
return "version [" + version + "], primary [" + primary + "]";
}
}

TransportNodesListGatewayStartedShards.java

@@ -117,10 +117,13 @@ public class TransportNodesListGatewayStartedShards extends TransportNodesOperat
@Override
protected NodeGatewayStartedShards nodeOperation(NodeRequest request) throws ElasticsearchException {
try {
logger.trace("loading shard state info for {}", request.shardId);
ShardStateInfo shardStateInfo = shardsState.loadShardInfo(request.shardId);
if (shardStateInfo != null) {
logger.debug("{} shard state info found: [{}]", request.shardId, shardStateInfo);
return new NodeGatewayStartedShards(clusterService.localNode(), shardStateInfo.version);
}
logger.trace("no shard info found for {}", request.shardId);
return new NodeGatewayStartedShards(clusterService.localNode(), -1);
} catch (Exception e) {
throw new ElasticsearchException("failed to load started shards", e);

Store.java

@@ -22,7 +22,6 @@ package org.elasticsearch.index.store;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
@@ -717,13 +716,16 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
} else {
builder.put(segmentsFile, new StoreFileMetaData(segmentsFile, directory.fileLength(segmentsFile), legacyChecksum, maxVersion, hashFile(directory, segmentsFile)));
}
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
} catch (CorruptIndexException | IndexNotFoundException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
// we either know the index is corrupted or it's just not there
throw ex;
} catch (Throwable ex) {
try {
// Lucene checks the checksum after it tries to lookup the codec etc.
// in that case we might get only IAE or similar exceptions while we are really corrupt...
// TODO we should check the checksum in lucene if we hit an exception
logger.warn("failed to build store metadata. checking segment info integrity (with commit [{}])",
ex, commit == null ? "no" : "yes");
Lucene.checkSegmentInfoIntegrity(directory);
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException cex) {
cex.addSuppressed(ex);

TransportNodesListShardStoreMetaData.java

@@ -41,8 +41,8 @@ import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.env.NodeEnvironment;
import org.elasticsearch.index.IndexService;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.store.Store;
import org.elasticsearch.index.store.StoreFileMetaData;
import org.elasticsearch.indices.IndicesService;
@@ -51,7 +51,10 @@ import org.elasticsearch.transport.TransportService;
import java.io.IOException;
import java.nio.file.Path;
import java.util.*;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReferenceArray;
/**
@@ -143,38 +146,51 @@ public class TransportNodesListShardStoreMetaData extends TransportNodesOperatio
}
private StoreFilesMetaData listStoreMetaData(ShardId shardId) throws IOException {
IndexService indexService = indicesService.indexService(shardId.index().name());
if (indexService != null) {
IndexShard indexShard = indexService.shard(shardId.id());
if (indexShard != null) {
final Store store = indexShard.store();
store.incRef();
try {
return new StoreFilesMetaData(true, shardId, store.getMetadataOrEmpty().asMap());
} finally {
store.decRef();
logger.trace("listing store meta data for {}", shardId);
long startTime = System.currentTimeMillis();
boolean exists = false;
try {
IndexService indexService = indicesService.indexService(shardId.index().name());
if (indexService != null) {
IndexShard indexShard = indexService.shard(shardId.id());
if (indexShard != null) {
final Store store = indexShard.store();
store.incRef();
try {
exists = true;
return new StoreFilesMetaData(true, shardId, store.getMetadataOrEmpty().asMap());
} finally {
store.decRef();
}
}
}
// try and see if we can list unallocated
IndexMetaData metaData = clusterService.state().metaData().index(shardId.index().name());
if (metaData == null) {
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
}
String storeType = metaData.settings().get("index.store.type", "fs");
if (!storeType.contains("fs")) {
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
}
Path[] shardLocations = nodeEnv.shardDataPaths(shardId, metaData.settings());
Path[] shardIndexLocations = new Path[shardLocations.length];
for (int i = 0; i < shardLocations.length; i++) {
shardIndexLocations[i] = shardLocations[i].resolve("index");
}
exists = FileSystemUtils.exists(shardIndexLocations);
if (!exists) {
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
}
return new StoreFilesMetaData(false, shardId, Store.readMetadataSnapshot(shardIndexLocations, logger).asMap());
} finally {
TimeValue took = new TimeValue(System.currentTimeMillis() - startTime);
if (exists) {
logger.debug("loaded store meta data for {} (took [{}])", shardId, took);
} else {
logger.trace("loaded store meta data for {} (took [{}])", shardId, took);
}
}
// try and see if we an list unallocated
IndexMetaData metaData = clusterService.state().metaData().index(shardId.index().name());
if (metaData == null) {
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
}
String storeType = metaData.settings().get("index.store.type", "fs");
if (!storeType.contains("fs")) {
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
}
Path[] shardLocations = nodeEnv.shardDataPaths(shardId, metaData.settings());
Path[] shardIndexLocations = new Path[shardLocations.length];
for (int i = 0; i < shardLocations.length; i++) {
shardIndexLocations[i] = shardLocations[i].resolve("index");
}
final boolean exists = FileSystemUtils.exists(shardIndexLocations);
if (!exists) {
return new StoreFilesMetaData(false, shardId, ImmutableMap.<String, StoreFileMetaData>of());
}
return new StoreFilesMetaData(false, shardId, Store.readMetadataSnapshot(shardIndexLocations, logger).asMap());
}
@Override