Report more details of unobtainable ShardLock (#61255)
Today a common reason for a `ShardLockObtainFailedException` is when a shard is removed from a node and then assigned straight back to it again before the node has had a chance to shut the previous shard instance down. For instance, this can happen if a node briefly leaves the cluster holding a primary with no in-sync replicas. The message in this case is typically as follows: obtaining shard lock timed out after 5000ms, previous lock details: [shard creation] trying to lock for [shard creation] This is pretty hard to interpret, and doesn't raise the important question: "why didn't the shard shut down sooner?" With this change we reword the message a bit, report the age of the shard lock, and adjust the details to report that the lock is held by a closing shard: obtaining shard lock for [starting shard] timed out after [5000ms], lock already held for [closing shard] with age [12345ms] Relates #38807
This commit is contained in:
parent
79390bc80b
commit
389f7779e7
|
@ -40,6 +40,7 @@ import org.elasticsearch.common.CheckedFunction;
|
|||
import org.elasticsearch.common.Randomness;
|
||||
import org.elasticsearch.common.SuppressForbidden;
|
||||
import org.elasticsearch.common.UUIDs;
|
||||
import org.elasticsearch.common.collect.Tuple;
|
||||
import org.elasticsearch.common.io.FileSystemUtils;
|
||||
import org.elasticsearch.common.lease.Releasable;
|
||||
import org.elasticsearch.common.settings.Setting;
|
||||
|
@ -698,6 +699,11 @@ public final class NodeEnvironment implements Closeable {
|
|||
shardLock.release();
|
||||
logger.trace("released shard lock for [{}]", shardId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setDetails(String details) {
|
||||
shardLock.setDetails(details);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -729,13 +735,13 @@ public final class NodeEnvironment implements Closeable {
|
|||
*/
|
||||
private final Semaphore mutex = new Semaphore(1);
|
||||
private int waitCount = 1; // guarded by shardLocks
|
||||
private String lockDetails;
|
||||
private final ShardId shardId;
|
||||
private volatile Tuple<Long, String> lockDetails;
|
||||
|
||||
InternalShardLock(final ShardId shardId, final String details) {
|
||||
this.shardId = shardId;
|
||||
mutex.acquireUninterruptibly();
|
||||
lockDetails = details;
|
||||
lockDetails = Tuple.tuple(System.nanoTime(), details);
|
||||
}
|
||||
|
||||
protected void release() {
|
||||
|
@ -766,17 +772,23 @@ public final class NodeEnvironment implements Closeable {
|
|||
void acquire(long timeoutInMillis, final String details) throws ShardLockObtainFailedException {
|
||||
try {
|
||||
if (mutex.tryAcquire(timeoutInMillis, TimeUnit.MILLISECONDS)) {
|
||||
lockDetails = details;
|
||||
setDetails(details);
|
||||
} else {
|
||||
final Tuple<Long, String> lockDetails = this.lockDetails; // single volatile read
|
||||
throw new ShardLockObtainFailedException(shardId,
|
||||
"obtaining shard lock timed out after " + timeoutInMillis + "ms, previous lock details: [" + lockDetails +
|
||||
"] trying to lock for [" + details + "]");
|
||||
"obtaining shard lock for [" + details + "] timed out after [" + timeoutInMillis +
|
||||
"ms], lock already held for [" + lockDetails.v2() + "] with age [" +
|
||||
TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - lockDetails.v1()) + "ms]");
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new ShardLockObtainFailedException(shardId, "thread interrupted while trying to obtain shard lock", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void setDetails(String details) {
|
||||
lockDetails = Tuple.tuple(System.nanoTime(), details);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasNodeFile() {
|
||||
|
|
|
@ -54,7 +54,14 @@ public abstract class ShardLock implements Closeable {
|
|||
}
|
||||
}
|
||||
|
||||
protected abstract void closeInternal();
|
||||
protected abstract void closeInternal();
|
||||
|
||||
/**
|
||||
* Update the details of the holder of this lock. These details are displayed alongside a {@link ShardLockObtainFailedException}. Must
|
||||
* only be called by the holder of this lock.
|
||||
*/
|
||||
public void setDetails(String details) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
|
|
|
@ -407,7 +407,7 @@ public class IndexService extends AbstractIndexComponent implements IndicesClust
|
|||
IndexShard indexShard = null;
|
||||
ShardLock lock = null;
|
||||
try {
|
||||
lock = nodeEnv.shardLock(shardId, "shard creation", TimeUnit.SECONDS.toMillis(5));
|
||||
lock = nodeEnv.shardLock(shardId, "starting shard", TimeUnit.SECONDS.toMillis(5));
|
||||
eventListener.beforeIndexShardCreated(shardId, indexSettings);
|
||||
ShardPath path;
|
||||
try {
|
||||
|
@ -516,6 +516,9 @@ public class IndexService extends AbstractIndexComponent implements IndicesClust
|
|||
private void closeShard(String reason, ShardId sId, IndexShard indexShard, Store store, IndexEventListener listener) {
|
||||
final int shardId = sId.id();
|
||||
final Settings indexSettings = this.getIndexSettings().getSettings();
|
||||
if (store != null) {
|
||||
store.beforeClose();
|
||||
}
|
||||
try {
|
||||
try {
|
||||
listener.beforeIndexShardClosed(sId, indexShard, indexSettings);
|
||||
|
|
|
@ -686,6 +686,10 @@ public class Store extends AbstractIndexShardComponent implements Closeable, Ref
|
|||
return refCounter.refCount();
|
||||
}
|
||||
|
||||
public void beforeClose() {
|
||||
shardLock.setDetails("closing shard");
|
||||
}
|
||||
|
||||
static final class StoreDirectory extends FilterDirectory {
|
||||
|
||||
private final Logger deletesLogger;
|
||||
|
|
Loading…
Reference in New Issue