mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-03-09 14:34:43 +00:00
Ensure no circular reference in translog tragic exception (#55959)
We generate a circular reference exception in the translog in 6.8 in the following scenario: - The first rollGeneration hits a "too many open files" exception while copying a checkpoint file. We set the tragic exception and close the translog. - The second rollGeneration hits an AlreadyClosedException because the current writer is already closed. We add that ACE as a suppressed exception to the current tragic exception. Unfortunately, this creates a circular reference, because the ACE already suppresses the tragic exception. Other factors that help this bug manifest: - We do not fail the engine on AlreadyClosedException during flush. - We do not call ensureOpen before rolling a new generation. Closes #55893
This commit is contained in:
parent
c204353249
commit
2fd257add2
@ -239,7 +239,7 @@ public final class ExceptionsHelper {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
private static <T extends Throwable> Optional<T> unwrapCausesAndSuppressed(Throwable cause, Predicate<Throwable> predicate) {
|
public static <T extends Throwable> Optional<T> unwrapCausesAndSuppressed(Throwable cause, Predicate<Throwable> predicate) {
|
||||||
if (predicate.test(cause)) {
|
if (predicate.test(cause)) {
|
||||||
return Optional.of((T) cause);
|
return Optional.of((T) cause);
|
||||||
}
|
}
|
||||||
|
@ -1808,6 +1808,7 @@ public class InternalEngine extends Engine {
|
|||||||
refresh("version_table_flush", SearcherScope.INTERNAL, true);
|
refresh("version_table_flush", SearcherScope.INTERNAL, true);
|
||||||
translog.trimUnreferencedReaders();
|
translog.trimUnreferencedReaders();
|
||||||
} catch (AlreadyClosedException e) {
|
} catch (AlreadyClosedException e) {
|
||||||
|
failOnTragicEvent(e);
|
||||||
throw e;
|
throw e;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new FlushFailedEngineException(shardId, e);
|
throw new FlushFailedEngineException(shardId, e);
|
||||||
|
@ -19,6 +19,9 @@
|
|||||||
|
|
||||||
package org.elasticsearch.index.translog;
|
package org.elasticsearch.index.translog;
|
||||||
|
|
||||||
|
import org.apache.lucene.store.AlreadyClosedException;
|
||||||
|
import org.elasticsearch.ExceptionsHelper;
|
||||||
|
|
||||||
import java.util.concurrent.atomic.AtomicReference;
|
import java.util.concurrent.atomic.AtomicReference;
|
||||||
|
|
||||||
public class TragicExceptionHolder {
|
public class TragicExceptionHolder {
|
||||||
@ -30,10 +33,15 @@ public class TragicExceptionHolder {
|
|||||||
*/
|
*/
|
||||||
public void setTragicException(Exception ex) {
|
public void setTragicException(Exception ex) {
|
||||||
assert ex != null;
|
assert ex != null;
|
||||||
if (tragedy.compareAndSet(null, ex) == false) {
|
if (tragedy.compareAndSet(null, ex)) {
|
||||||
if (tragedy.get() != ex) { // to ensure there is no self-suppression
|
return; // first exception
|
||||||
tragedy.get().addSuppressed(ex);
|
|
||||||
}
|
}
|
||||||
|
final Exception tragedy = this.tragedy.get();
|
||||||
|
// ensure no circular reference
|
||||||
|
if (ExceptionsHelper.unwrapCausesAndSuppressed(ex, e -> e == tragedy).isPresent()) {
|
||||||
|
assert ex == tragedy || ex instanceof AlreadyClosedException : new AssertionError("must be ACE or tragic exception", ex);
|
||||||
|
} else {
|
||||||
|
tragedy.addSuppressed(ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1623,10 +1623,9 @@ public class Translog extends AbstractIndexShardComponent implements IndexShardC
|
|||||||
* @throws IOException if an I/O exception occurred during any file operations
|
* @throws IOException if an I/O exception occurred during any file operations
|
||||||
*/
|
*/
|
||||||
public void rollGeneration() throws IOException {
|
public void rollGeneration() throws IOException {
|
||||||
// make sure we move most of the data to disk outside of the writeLock
|
syncBeforeRollGeneration();
|
||||||
// in order to reduce the time the lock is held since it's blocking all threads
|
|
||||||
sync();
|
|
||||||
try (Releasable ignored = writeLock.acquire()) {
|
try (Releasable ignored = writeLock.acquire()) {
|
||||||
|
ensureOpen();
|
||||||
try {
|
try {
|
||||||
final TranslogReader reader = current.closeIntoReader();
|
final TranslogReader reader = current.closeIntoReader();
|
||||||
readers.add(reader);
|
readers.add(reader);
|
||||||
@ -1643,6 +1642,12 @@ public class Translog extends AbstractIndexShardComponent implements IndexShardC
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void syncBeforeRollGeneration() throws IOException {
|
||||||
|
// make sure we move most of the data to disk outside of the writeLock
|
||||||
|
// in order to reduce the time the lock is held since it's blocking all threads
|
||||||
|
sync();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Trims unreferenced translog generations by asking {@link TranslogDeletionPolicy} for the minimum
|
* Trims unreferenced translog generations by asking {@link TranslogDeletionPolicy} for the minimum
|
||||||
* required generation
|
* required generation
|
||||||
|
@ -100,9 +100,12 @@ import java.util.Comparator;
|
|||||||
import java.util.Deque;
|
import java.util.Deque;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.IdentityHashMap;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Queue;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.ArrayBlockingQueue;
|
import java.util.concurrent.ArrayBlockingQueue;
|
||||||
import java.util.concurrent.BlockingQueue;
|
import java.util.concurrent.BlockingQueue;
|
||||||
@ -3307,4 +3310,64 @@ public class TranslogTests extends ESTestCase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testEnsureNoCircularException() throws Exception {
|
||||||
|
final AtomicBoolean failedToSyncCheckpoint = new AtomicBoolean();
|
||||||
|
final ChannelFactory channelFactory = (file, openOption) -> {
|
||||||
|
final FileChannel channel = FileChannel.open(file, openOption);
|
||||||
|
return new FilterFileChannel(channel) {
|
||||||
|
@Override
|
||||||
|
public void force(boolean metaData) throws IOException {
|
||||||
|
if (failedToSyncCheckpoint.get()) {
|
||||||
|
throw new IOException("simulated");
|
||||||
|
}
|
||||||
|
super.force(metaData);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
};
|
||||||
|
final TranslogConfig config = getTranslogConfig(createTempDir());
|
||||||
|
final String translogUUID = Translog.createEmptyTranslog(
|
||||||
|
config.getTranslogPath(), SequenceNumbers.NO_OPS_PERFORMED, shardId, channelFactory, primaryTerm.get());
|
||||||
|
final Translog translog = new Translog(config, translogUUID, createTranslogDeletionPolicy(),
|
||||||
|
() -> SequenceNumbers.NO_OPS_PERFORMED, primaryTerm::get,
|
||||||
|
seqNo -> {}) {
|
||||||
|
@Override
|
||||||
|
ChannelFactory getChannelFactory() {
|
||||||
|
return channelFactory;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
void syncBeforeRollGeneration() {
|
||||||
|
// make it a noop like the old versions
|
||||||
|
}
|
||||||
|
};
|
||||||
|
try {
|
||||||
|
translog.add(new Translog.Index("1", "_doc", 1, primaryTerm.get(), new byte[]{1}));
|
||||||
|
failedToSyncCheckpoint.set(true);
|
||||||
|
expectThrows(IOException.class, translog::rollGeneration);
|
||||||
|
final AlreadyClosedException alreadyClosedException = expectThrows(AlreadyClosedException.class, translog::rollGeneration);
|
||||||
|
if (hasCircularReference(alreadyClosedException)) {
|
||||||
|
throw new AssertionError("detect circular reference exception", alreadyClosedException);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
IOUtils.close(translog);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static boolean hasCircularReference(Exception cause) {
|
||||||
|
final Queue<Throwable> queue = new LinkedList<>();
|
||||||
|
queue.add(cause);
|
||||||
|
final Set<Throwable> seen = Collections.newSetFromMap(new IdentityHashMap<>());
|
||||||
|
while (queue.isEmpty() == false) {
|
||||||
|
final Throwable current = queue.remove();
|
||||||
|
if (seen.add(current) == false) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
Collections.addAll(queue, current.getSuppressed());
|
||||||
|
if (current.getCause() != null) {
|
||||||
|
queue.add(current.getCause());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user