Report more details of unobtainable ShardLock (#61255)

Today a common reason for a `ShardLockObtainFailedException` is when a
shard is removed from a node and then assigned straight back to it again
before the node has had a chance to shut the previous shard instance
down. For instance, this can happen if a node briefly leaves the cluster
holding a primary with no in-sync replicas.

The message in this case is typically as follows:

    obtaining shard lock timed out after 5000ms, previous lock details: [shard creation] trying to lock for [shard creation]

This is pretty hard to interpret, and doesn't raise the important
question: "why didn't the shard shut down sooner?"

With this change we reword the message a bit, report the age of the
shard lock, and adjust the details to report that the lock is held by a
closing shard:

    obtaining shard lock for [starting shard] timed out after [5000ms], lock already held for [closing shard] with age [12345ms]

Relates #38807
This commit is contained in:
David Turner 2020-08-19 06:35:55 +01:00
parent 79390bc80b
commit 389f7779e7
4 changed files with 33 additions and 7 deletions

View File

@ -40,6 +40,7 @@ import org.elasticsearch.common.CheckedFunction;
import org.elasticsearch.common.Randomness;
import org.elasticsearch.common.SuppressForbidden;
import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.io.FileSystemUtils;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.settings.Setting;
@ -698,6 +699,11 @@ public final class NodeEnvironment implements Closeable {
shardLock.release();
logger.trace("released shard lock for [{}]", shardId);
}
@Override
public void setDetails(String details) {
shardLock.setDetails(details);
}
};
}
@ -729,13 +735,13 @@ public final class NodeEnvironment implements Closeable {
*/
private final Semaphore mutex = new Semaphore(1);
private int waitCount = 1; // guarded by shardLocks
private String lockDetails;
private final ShardId shardId;
private volatile Tuple<Long, String> lockDetails;
InternalShardLock(final ShardId shardId, final String details) {
this.shardId = shardId;
mutex.acquireUninterruptibly();
lockDetails = details;
lockDetails = Tuple.tuple(System.nanoTime(), details);
}
protected void release() {
@ -766,17 +772,23 @@ public final class NodeEnvironment implements Closeable {
void acquire(long timeoutInMillis, final String details) throws ShardLockObtainFailedException {
try {
if (mutex.tryAcquire(timeoutInMillis, TimeUnit.MILLISECONDS)) {
lockDetails = details;
setDetails(details);
} else {
final Tuple<Long, String> lockDetails = this.lockDetails; // single volatile read
throw new ShardLockObtainFailedException(shardId,
"obtaining shard lock timed out after " + timeoutInMillis + "ms, previous lock details: [" + lockDetails +
"] trying to lock for [" + details + "]");
"obtaining shard lock for [" + details + "] timed out after [" + timeoutInMillis +
"ms], lock already held for [" + lockDetails.v2() + "] with age [" +
TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - lockDetails.v1()) + "ms]");
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new ShardLockObtainFailedException(shardId, "thread interrupted while trying to obtain shard lock", e);
}
}
public void setDetails(String details) {
lockDetails = Tuple.tuple(System.nanoTime(), details);
}
}
public boolean hasNodeFile() {

View File

@ -54,7 +54,14 @@ public abstract class ShardLock implements Closeable {
}
}
protected abstract void closeInternal();
protected abstract void closeInternal();
/**
* Update the details of the holder of this lock. These details are displayed alongside a {@link ShardLockObtainFailedException}. Must
* only be called by the holder of this lock.
*/
public void setDetails(String details) {
}
@Override
public String toString() {

View File

@ -407,7 +407,7 @@ public class IndexService extends AbstractIndexComponent implements IndicesClust
IndexShard indexShard = null;
ShardLock lock = null;
try {
lock = nodeEnv.shardLock(shardId, "shard creation", TimeUnit.SECONDS.toMillis(5));
lock = nodeEnv.shardLock(shardId, "starting shard", TimeUnit.SECONDS.toMillis(5));
eventListener.beforeIndexShardCreated(shardId, indexSettings);
ShardPath path;
try {
@ -516,6 +516,9 @@ public class IndexService extends AbstractIndexComponent implements IndicesClust
private void closeShard(String reason, ShardId sId, IndexShard indexShard, Store store, IndexEventListener listener) {
final int shardId = sId.id();
final Settings indexSettings = this.getIndexSettings().getSettings();
if (store != null) {
store.beforeClose();
}
try {
try {
listener.beforeIndexShardClosed(sId, indexShard, indexSettings);

View File

@ -686,6 +686,10 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
return refCounter.refCount();
}
public void beforeClose() {
shardLock.setDetails("closing shard");
}
static final class StoreDirectory extends FilterDirectory {
private final Logger deletesLogger;