From a6e6c4efc48bc23501995c6c864567311d93db72 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Thu, 20 Nov 2014 16:29:43 +0100 Subject: [PATCH] [CORE] Ensure shards are deleted under lock on close Today there is a race condition between the actual deletion of the shard and the release of the lock in the store. This race can cause rare imports of dangeling indices if the cluster state update loop tires to import the dangeling index in that particular windonw. This commit adds more safety to the import of dangeling indices and removes the race condition by holding on to the lock on store closing while the listener is notified. --- .../state/meta/LocalGatewayMetaState.java | 26 ++++++++++++++----- .../org/elasticsearch/index/store/Store.java | 14 +++++----- .../elasticsearch/indices/IndicesService.java | 4 ++- .../indices/InternalIndicesService.java | 4 ++- 4 files changed, 32 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/elasticsearch/gateway/local/state/meta/LocalGatewayMetaState.java b/src/main/java/org/elasticsearch/gateway/local/state/meta/LocalGatewayMetaState.java index f937e288609..fd01ea29e9d 100644 --- a/src/main/java/org/elasticsearch/gateway/local/state/meta/LocalGatewayMetaState.java +++ b/src/main/java/org/elasticsearch/gateway/local/state/meta/LocalGatewayMetaState.java @@ -45,6 +45,7 @@ import org.elasticsearch.common.util.concurrent.ConcurrentCollections; import org.elasticsearch.common.util.concurrent.FutureUtils; import org.elasticsearch.common.xcontent.*; import org.elasticsearch.env.NodeEnvironment; +import org.elasticsearch.env.ShardLock; import org.elasticsearch.index.Index; import org.elasticsearch.threadpool.ThreadPool; @@ -287,6 +288,25 @@ public class LocalGatewayMetaState extends AbstractComponent implements ClusterS } final IndexMetaData indexMetaData = loadIndexState(indexName); if (indexMetaData != null) { + final Index index = new Index(indexName); + try { + // the index deletion might not have worked due to shards still being locked + // we have three cases here: + // - we acquired all shards locks here --> we can import the dangeling index + // - we failed to acquire the lock --> somebody else uses it - DON'T IMPORT + // - we acquired successfully but the lock list is empty --> no shards present - DON'T IMPORT + // in the last case we should in-fact try to delete the directory since it might be a leftover... + final List shardLocks = nodeEnv.lockAllForIndex(index); + if (shardLocks.isEmpty()) { + // no shards - try to remove the directory + nodeEnv.deleteIndexDirectorySafe(index); + continue; + } + IOUtils.closeWhileHandlingException(shardLocks); + } catch (IOException ex) { + logger.warn("[{}] skipping locked dangling index, exists on local file system, but not in cluster metadata, auto import to cluster state is set to [{}]", ex, indexName, autoImportDangled); + continue; + } if(autoImportDangled.shouldImport()){ logger.info("[{}] dangling index, exists on local file system, but not in cluster metadata, auto import to cluster state [{}]", indexName, autoImportDangled); danglingIndices.put(indexName, new DanglingIndex(indexName, null)); @@ -300,12 +320,6 @@ public class LocalGatewayMetaState extends AbstractComponent implements ClusterS logger.warn("[{}] failed to delete dangling index", ex, indexName); } } else { - try { // the index deletion might not have worked due to shards still being locked - IOUtils.closeWhileHandlingException(nodeEnv.lockAllForIndex(new Index(indexName))); - } catch (IOException ex) { - logger.warn("[{}] skipping locked dangling index, exists on local file system, but not in cluster metadata, auto import to cluster state is set to [{}]", ex, indexName, autoImportDangled); - continue; - } logger.info("[{}] dangling index, exists on local file system, but not in cluster metadata, scheduling to delete in [{}], auto import to cluster state [{}]", indexName, danglingTimeout, autoImportDangled); danglingIndices.put(indexName, new DanglingIndex(indexName, threadPool.schedule(danglingTimeout, ThreadPool.Names.SAME, new RemoveDanglingIndex(indexName)))); } diff --git a/src/main/java/org/elasticsearch/index/store/Store.java b/src/main/java/org/elasticsearch/index/store/Store.java index 51c51907e93..ce166cec892 100644 --- a/src/main/java/org/elasticsearch/index/store/Store.java +++ b/src/main/java/org/elasticsearch/index/store/Store.java @@ -364,15 +364,13 @@ public class Store extends AbstractIndexShardComponent implements CloseableIndex logger.debug("failed to close directory", e); } finally { try { - IOUtils.closeWhileHandlingException(shardLock); - } finally { - try { - if (listener != null) { - listener.onClose(shardId); - } - } catch (Exception ex){ - logger.debug("OnCloseListener threw an exception", ex); + if (listener != null) { + listener.onClose(shardId); } + } catch (Exception ex){ + logger.debug("OnCloseListener threw an exception", ex); + } finally { + IOUtils.closeWhileHandlingException(shardLock); } diff --git a/src/main/java/org/elasticsearch/indices/IndicesService.java b/src/main/java/org/elasticsearch/indices/IndicesService.java index d64b1481911..33d97ed5e13 100644 --- a/src/main/java/org/elasticsearch/indices/IndicesService.java +++ b/src/main/java/org/elasticsearch/indices/IndicesService.java @@ -113,7 +113,9 @@ public interface IndicesService extends Iterable, LifecycleCompone public void onAllShardsClosed(Index index, List failures); /** - * Invoked once the last resource using the given shard ID is released + * Invoked once the last resource using the given shard ID is released. + * Yet, this method is called while still holding the shards lock such that + * operations on the shards data can safely be executed in this callback. */ public void onShardClosed(ShardId shardId); diff --git a/src/main/java/org/elasticsearch/indices/InternalIndicesService.java b/src/main/java/org/elasticsearch/indices/InternalIndicesService.java index 13421bef179..3a91c9f7f9a 100644 --- a/src/main/java/org/elasticsearch/indices/InternalIndicesService.java +++ b/src/main/java/org/elasticsearch/indices/InternalIndicesService.java @@ -20,6 +20,7 @@ package org.elasticsearch.indices; import com.google.common.collect.*; +import org.apache.lucene.util.IOUtils; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.ElasticsearchIllegalStateException; import org.elasticsearch.action.admin.indices.stats.CommonStats; @@ -357,7 +358,8 @@ public class InternalIndicesService extends AbstractLifecycleComponent