Avoid self-deadlock in the translog (#29520)

Today when reading an operation from the current generation fails
tragically we attempt to close the translog. However, by invoking close
before releasing the read lock we end up in self-deadlock because
closing tries to acquire the write lock and the read lock can not be
upgraded to a write lock. To avoid this, we move the close invocation
outside of the try-with-resources that acquired the read lock. As an
extra guard against this, we document the problem and add an assertion
that we are not trying to invoke close while holding the read lock.
This commit is contained in:
Jason Tedor 2018-04-15 16:26:09 -04:00 committed by GitHub
parent 9125684d86
commit 00fd73acc4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 10 deletions

View File

@ -583,12 +583,7 @@ public class Translog extends AbstractIndexShardComponent implements IndexShardC
if (current.generation == location.generation) { if (current.generation == location.generation) {
// no need to fsync here the read operation will ensure that buffers are written to disk // no need to fsync here the read operation will ensure that buffers are written to disk
// if they are still in RAM and we are reading onto that position // if they are still in RAM and we are reading onto that position
try {
return current.read(location); return current.read(location);
} catch (final Exception ex) {
closeOnTragicEvent(ex);
throw ex;
}
} else { } else {
// read backwards - it's likely we need to read on that is recent // read backwards - it's likely we need to read on that is recent
for (int i = readers.size() - 1; i >= 0; i--) { for (int i = readers.size() - 1; i >= 0; i--) {
@ -598,6 +593,9 @@ public class Translog extends AbstractIndexShardComponent implements IndexShardC
} }
} }
} }
} catch (final Exception ex) {
closeOnTragicEvent(ex);
throw ex;
} }
return null; return null;
} }
@ -735,15 +733,28 @@ public class Translog extends AbstractIndexShardComponent implements IndexShardC
} }
} }
/**
* Closes the translog if the current translog writer experienced a tragic exception.
*
* Note that in case this thread closes the translog it must not already be holding a read lock on the translog as it will acquire a
* write lock in the course of closing the translog
*
* @param ex if an exception occurs closing the translog, it will be suppressed into the provided exception
*/
private void closeOnTragicEvent(final Exception ex) { private void closeOnTragicEvent(final Exception ex) {
// we can not hold a read lock here because closing will attempt to obtain a write lock and that would result in self-deadlock
assert readLock.isHeldByCurrentThread() == false : Thread.currentThread().getName();
if (current.getTragicException() != null) { if (current.getTragicException() != null) {
try { try {
close(); close();
} catch (final AlreadyClosedException inner) { } catch (final AlreadyClosedException inner) {
// don't do anything in this case. The AlreadyClosedException comes from TranslogWriter and we should not add it as suppressed because /*
// will contain the Exception ex as cause. See also https://github.com/elastic/elasticsearch/issues/15941 * Don't do anything in this case. The AlreadyClosedException comes from TranslogWriter and we should not add it as
* suppressed because it will contain the provided exception as its cause. See also
* https://github.com/elastic/elasticsearch/issues/15941.
*/
} catch (final Exception inner) { } catch (final Exception inner) {
assert (ex != inner.getCause()); assert ex != inner.getCause();
ex.addSuppressed(inner); ex.addSuppressed(inner);
} }
} }

View File

@ -1812,7 +1812,6 @@ public class TranslogTests extends ESTestCase {
assertTrue(translog.getTragicException() instanceof UnknownException); assertTrue(translog.getTragicException() instanceof UnknownException);
} }
@AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/29509")
public void testFatalIOExceptionsWhileWritingConcurrently() throws IOException, InterruptedException { public void testFatalIOExceptionsWhileWritingConcurrently() throws IOException, InterruptedException {
Path tempDir = createTempDir(); Path tempDir = createTempDir();
final FailSwitch fail = new FailSwitch(); final FailSwitch fail = new FailSwitch();