Fix ref count handling in Engine.failEngine (#48639) (#48646)

We can run into an already closed store here, in which case trying to
increment the ref count throws. Move to the guarded ref count increment
(`tryIncRef`) instead.

closes #48625
This commit is contained in:
Armin Braun 2019-10-30 10:10:48 +01:00 committed by GitHub
parent f2d68934e8
commit 01e326d2e3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 13 additions and 9 deletions

View File

@ -1157,7 +1157,6 @@ public abstract class Engine implements Closeable {
maybeDie(reason, failure);
}
if (failEngineLock.tryLock()) {
store.incRef();
try {
if (failedEngine.get() != null) {
logger.warn(() ->
@ -1179,11 +1178,19 @@ public abstract class Engine implements Closeable {
// on the same node that we don't see the corrupted marker file when
// the shard is initializing
if (Lucene.isCorruptionException(failure)) {
if (store.tryIncRef()) {
try {
store.markStoreCorrupted(new IOException("failed engine (reason: [" + reason + "])",
ExceptionsHelper.unwrapCorruption(failure)));
} catch (IOException e) {
logger.warn("Couldn't mark store corrupted", e);
} finally {
store.decRef();
}
} else {
logger.warn(() ->
new ParameterizedMessage("tried to mark store as corrupted but store is already closed. [{}]", reason),
failure);
}
}
eventListener.onFailedEngine(reason, failure);
@ -1192,8 +1199,6 @@ public abstract class Engine implements Closeable {
if (failure != null) inner.addSuppressed(failure);
// don't bubble up these exceptions up
logger.warn("failEngine threw exception", inner);
} finally {
store.decRef();
}
} else {
logger.debug(() -> new ParameterizedMessage("tried to fail engine but could not acquire lock - engine should " +

View File

@ -490,7 +490,6 @@ public class RelocationIT extends ESIntegTestCase {
docs[i] = client().prepareIndex("test", "type1", id).setSource("field1", English.intToEnglish(numDocs + i));
}
indexRandom(true, docs);
numDocs *= 2;
logger.info(" --> waiting for relocation to complete");
ensureGreen(TimeValue.timeValueSeconds(60), "test"); // move all shards to the new nodes (it waits on relocation)