Never block on key in `LiveVersionMap#pruneTombstones` (#28736)

Pruning tombstones is best effort and should not block if a key is currently
locked. This can cause a deadlock in rare situations if we switch off the
append-only optimization while heavily updating the same key in the engine
while the LiveVersionMap is locked. This is very rare since this code
path is only executed every 15 seconds by default, since that is the interval
at which we try to prune the deletes in the version map.

Closes #28714
This commit is contained in:
Simon Willnauer 2018-02-20 16:35:05 +01:00 committed by GitHub
parent 8bbb3c9ffa
commit b00870600b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 190 additions and 29 deletions

View File

@ -63,22 +63,54 @@ public final class KeyedLock<T> {
while (true) {
KeyLock perNodeLock = map.get(key);
if (perNodeLock == null) {
KeyLock newLock = new KeyLock(fair);
perNodeLock = map.putIfAbsent(key, newLock);
if (perNodeLock == null) {
newLock.lock();
return new ReleasableLock(key, newLock);
ReleasableLock newLock = tryCreateNewLock(key);
if (newLock != null) {
return newLock;
}
} else {
assert perNodeLock != null;
int i = perNodeLock.count.get();
if (i > 0 && perNodeLock.count.compareAndSet(i, i + 1)) {
perNodeLock.lock();
return new ReleasableLock(key, perNodeLock);
}
}
assert perNodeLock != null;
int i = perNodeLock.count.get();
if (i > 0 && perNodeLock.count.compareAndSet(i, i + 1)) {
perNodeLock.lock();
return new ReleasableLock(key, perNodeLock);
}
}
}
/**
 * Tries to acquire the lock for the given key and returns it without ever blocking.
 * If the lock can't be acquired (another thread holds it, or the count bookkeeping
 * race below is lost) {@code null} is returned and the caller does not own the lock.
 *
 * @param key the key to lock by
 * @return a {@code Releasable} that releases the lock on close, or {@code null} if the lock could not be acquired
 */
public Releasable tryAcquire(T key) {
    final KeyLock perNodeLock = map.get(key);
    if (perNodeLock == null) {
        // no lock registered for this key yet - try to install a fresh, already-locked one
        return tryCreateNewLock(key);
    }
    if (perNodeLock.tryLock()) { // ok we got it - make sure we increment it accordingly otherwise release it again
        int i;
        while ((i = perNodeLock.count.get()) > 0) {
            // we have to do this in a loop here since even if the count is > 0
            // there could be a concurrent blocking acquire that changes the count and then this CAS fails. Since we already got
            // the lock we should retry and see if we can still get it or if the count is 0. If that is the case we give up.
            if (perNodeLock.count.compareAndSet(i, i + 1)) {
                return new ReleasableLock(key, perNodeLock);
            }
        }
        // count dropped to 0: the lock is being removed from the map by its releaser,
        // so we must not hand it out - unlock and report failure instead
        perNodeLock.unlock(); // make sure we unlock and don't leave the lock in a locked state
    }
    return null;
}
/**
 * Creates a fresh, already-locked {@code KeyLock} and tries to publish it under the
 * given key. Returns the corresponding {@code ReleasableLock} if this thread won the
 * publication race, otherwise {@code null}.
 */
private ReleasableLock tryCreateNewLock(T key) {
    // lock the candidate up front so it is never observable in an unlocked state
    final KeyLock candidate = new KeyLock(fair);
    candidate.lock();
    // putIfAbsent returns null iff our candidate was installed; otherwise another
    // thread raced us and the (still locked, unpublished) candidate is simply dropped
    final KeyLock existing = map.putIfAbsent(key, candidate);
    return existing == null ? new ReleasableLock(key, candidate) : null;
}
/**
* Returns <code>true</code> iff the caller thread holds the lock for the given key
*/
@ -92,11 +124,12 @@ public final class KeyedLock<T> {
private void release(T key, KeyLock lock) {
assert lock == map.get(key);
final int decrementAndGet = lock.count.decrementAndGet();
lock.unlock();
int decrementAndGet = lock.count.decrementAndGet();
if (decrementAndGet == 0) {
map.remove(key, lock);
}
assert decrementAndGet >= 0 : decrementAndGet + " must be >= 0 but wasn't";
}

View File

@ -32,7 +32,6 @@ import java.util.Collection;
import java.util.Collections;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
/** Maps _uid value to its version information. */
final class LiveVersionMap implements ReferenceManager.RefreshListener, Accountable {
@ -378,20 +377,25 @@ final class LiveVersionMap implements ReferenceManager.RefreshListener, Accounta
void pruneTombstones(long currentTime, long pruneInterval) {
for (Map.Entry<BytesRef, DeleteVersionValue> entry : tombstones.entrySet()) {
BytesRef uid = entry.getKey();
try (Releasable ignored = acquireLock(uid)) { // can we do it without this lock on each value? maybe batch to a set and get
// the lock once per set?
// Must re-get it here, vs using entry.getValue(), in case the uid was indexed/deleted since we pulled the iterator:
DeleteVersionValue versionValue = tombstones.get(uid);
if (versionValue != null) {
// check if the value is old enough to be removed
final boolean isTooOld = currentTime - versionValue.time > pruneInterval;
if (isTooOld) {
// version value can't be removed it's
// not yet flushed to lucene ie. it's part of this current maps object
final boolean isNotTrackedByCurrentMaps = versionValue.time < maps.getMinDeleteTimestamp();
if (isNotTrackedByCurrentMaps) {
removeTombstoneUnderLock(uid);
final BytesRef uid = entry.getKey();
try (Releasable lock = keyedLock.tryAcquire(uid)) {
// we use tryAcquire here since this is a best effort and we try to be least disruptive
// this method is also called under lock in the engine under certain situations such that this can lead to deadlocks
// if we do use a blocking acquire. see #28714
if (lock != null) { // did we get the lock?
// can we do it without this lock on each value? maybe batch to a set and get the lock once per set?
// Must re-get it here, vs using entry.getValue(), in case the uid was indexed/deleted since we pulled the iterator:
final DeleteVersionValue versionValue = tombstones.get(uid);
if (versionValue != null) {
// check if the value is old enough to be removed
final boolean isTooOld = currentTime - versionValue.time > pruneInterval;
if (isTooOld) {
// version value can't be removed it's
// not yet flushed to lucene ie. it's part of this current maps object
final boolean isNotTrackedByCurrentMaps = versionValue.time < maps.getMinDeleteTimestamp();
if (isNotTrackedByCurrentMaps) {
removeTombstoneUnderLock(uid);
}
}
}
}

View File

@ -20,7 +20,6 @@
package org.elasticsearch.common.util.concurrent;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.util.concurrent.KeyedLock;
import org.elasticsearch.test.ESTestCase;
import org.hamcrest.Matchers;
@ -31,6 +30,7 @@ import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import static org.hamcrest.Matchers.equalTo;
@ -79,6 +79,34 @@ public class KeyedLockTests extends ESTestCase {
assertFalse(lock.hasLockedKeys());
}
/**
 * Verifies that {@code tryAcquire} is reentrant, that keys are tracked/untracked as
 * locks are closed, and that a blocking {@code acquire} on another thread only
 * proceeds once the try-acquired lock has been released.
 */
public void testTryAcquire() throws InterruptedException {
    KeyedLock<String> keyedLock = new KeyedLock<>();
    Releasable firstRef = keyedLock.tryAcquire("foo");
    Releasable secondRef = keyedLock.tryAcquire("foo");
    assertTrue(keyedLock.hasLockedKeys());
    firstRef.close();
    assertTrue(keyedLock.hasLockedKeys());
    secondRef.close();
    assertFalse(keyedLock.hasLockedKeys());
    // lock again
    Releasable reacquired = keyedLock.tryAcquire("foo");
    assertNotNull(reacquired);
    final AtomicBoolean flag = new AtomicBoolean(false);
    CountDownLatch startedLatch = new CountDownLatch(1);
    Thread blocker = new Thread(() -> {
        startedLatch.countDown();
        // blocks until the main thread releases "foo"; by then the flag must be set
        try (Releasable ignore = keyedLock.acquire("foo")) {
            assertTrue(flag.get());
        }
    });
    blocker.start();
    startedLatch.await();
    flag.set(true);
    reacquired.close();
    firstRef.close();
    blocker.join();
}
public void testLockIsReentrant() throws InterruptedException {
KeyedLock<String> lock = new KeyedLock<>();
Releasable foo = lock.acquire("foo");
@ -137,7 +165,24 @@ public class KeyedLockTests extends ESTestCase {
for (int i = 0; i < numRuns; i++) {
String curName = names[randomInt(names.length - 1)];
assert connectionLock.isHeldByCurrentThread(curName) == false;
try (Releasable ignored = connectionLock.acquire(curName)) {
Releasable lock;
if (randomIntBetween(0, 10) < 4) {
int tries = 0;
boolean stepOut = false;
while ((lock = connectionLock.tryAcquire(curName)) == null) {
assertFalse(connectionLock.isHeldByCurrentThread(curName));
if (tries++ == 10) {
stepOut = true;
break;
}
}
if (stepOut) {
break;
}
} else {
lock = connectionLock.acquire(curName);
}
try (Releasable ignore = lock) {
assert connectionLock.isHeldByCurrentThread(curName);
assert connectionLock.isHeldByCurrentThread(curName + "bla") == false;
if (randomBoolean()) {

View File

@ -4517,4 +4517,60 @@ public class InternalEngineTests extends EngineTestCase {
assertThat(engine.getLastCommittedSegmentInfos(), not(sameInstance(lastCommitInfo)));
assertThat(engine.getTranslog().uncommittedOperations(), equalTo(0));
}
/**
 * Reproduction of the deadlock in https://github.com/elastic/elasticsearch/issues/28714:
 * a concurrent get (which refreshes and, with GC deletes enabled, prunes tombstones)
 * races another get holding the per-uid lock that pruning needs. With a blocking
 * acquire in pruneTombstones this deadlocks; with tryAcquire it must complete.
 */
public void testStressUpdateSameDocWhileGettingIt() throws IOException, InterruptedException {
    final int iters = randomIntBetween(1, 15);
    for (int i = 0; i < iters; i++) {
        // this is a reproduction of https://github.com/elastic/elasticsearch/issues/28714
        try (Store store = createStore(); InternalEngine engine = createEngine(store, createTempDir())) {
            // shrink the GC deletes interval so tombstones become prunable almost immediately
            final IndexSettings indexSettings = engine.config().getIndexSettings();
            final IndexMetaData indexMetaData = IndexMetaData.builder(indexSettings.getIndexMetaData())
                .settings(Settings.builder().put(indexSettings.getSettings())
                .put(IndexSettings.INDEX_GC_DELETES_SETTING.getKey(), TimeValue.timeValueMillis(1))).build();
            engine.engineConfig.getIndexSettings().updateIndexMetaData(indexMetaData);
            engine.onSettingsChanged();
            ParsedDocument document = testParsedDocument(Integer.toString(0), null, testDocumentWithTextField(), SOURCE, null);
            final Engine.Index doc = new Engine.Index(newUid(document), document, SequenceNumbers.UNASSIGNED_SEQ_NO, 0,
                Versions.MATCH_ANY, VersionType.INTERNAL, Engine.Operation.Origin.PRIMARY, System.nanoTime(), 0, false);
            // first index an append only document and then delete it. such that we have it in the tombstones
            engine.index(doc);
            engine.delete(new Engine.Delete(doc.type(), doc.id(), doc.uid()));
            // now index more append only docs and refresh so we re-enable the optimization for unsafe version map
            ParsedDocument document1 = testParsedDocument(Integer.toString(1), null, testDocumentWithTextField(), SOURCE, null);
            engine.index(new Engine.Index(newUid(document1), document1, SequenceNumbers.UNASSIGNED_SEQ_NO, 0,
                Versions.MATCH_ANY, VersionType.INTERNAL, Engine.Operation.Origin.PRIMARY, System.nanoTime(), 0, false));
            engine.refresh("test");
            ParsedDocument document2 = testParsedDocument(Integer.toString(2), null, testDocumentWithTextField(), SOURCE, null);
            engine.index(new Engine.Index(newUid(document2), document2, SequenceNumbers.UNASSIGNED_SEQ_NO, 0,
                Versions.MATCH_ANY, VersionType.INTERNAL, Engine.Operation.Origin.PRIMARY, System.nanoTime(), 0, false));
            engine.refresh("test");
            ParsedDocument document3 = testParsedDocument(Integer.toString(3), null, testDocumentWithTextField(), SOURCE, null);
            final Engine.Index doc3 = new Engine.Index(newUid(document3), document3, SequenceNumbers.UNASSIGNED_SEQ_NO, 0,
                Versions.MATCH_ANY, VersionType.INTERNAL, Engine.Operation.Origin.PRIMARY, System.nanoTime(), 0, false);
            engine.index(doc3);
            engine.engineConfig.setEnableGcDeletes(true);
            // once we are here the version map is unsafe again and we need to do a refresh inside the get calls to ensure we
            // de-optimize. We also enabled GCDeletes which now causes pruning tombstones inside that refresh that is done internally
            // to ensure we de-optimize. One get call will prune and the other will try to lock the version map concurrently while
            // holding the lock that pruneTombstones needs and we have a deadlock
            CountDownLatch awaitStarted = new CountDownLatch(1);
            Thread thread = new Thread(() -> {
                awaitStarted.countDown();
                try (Engine.GetResult getResult = engine.get(new Engine.Get(true, doc3.type(), doc3.id(), doc3.uid()),
                    engine::acquireSearcher)) {
                    assertTrue(getResult.exists());
                }
            });
            thread.start();
            awaitStarted.await();
            // concurrent get for the deleted doc: must see the tombstone and not deadlock against the other get
            try (Engine.GetResult getResult = engine.get(new Engine.Get(true, doc.type(), doc.id(), doc.uid()),
                engine::acquireSearcher)) {
                assertFalse(getResult.exists());
            }
            thread.join();
        }
    }
}
}

View File

@ -348,4 +348,27 @@ public class LiveVersionMapTests extends ESTestCase {
}
}
}
/**
 * Ensures that {@code pruneTombstones} never blocks on a key that is currently locked:
 * while this thread holds the per-uid lock the prune must skip (not remove, not block on)
 * the tombstone; once the lock is released a subsequent prune removes it. See #28714.
 */
public void testPruneTombstonesWhileLocked() throws InterruptedException, IOException {
    LiveVersionMap map = new LiveVersionMap();
    BytesRef uid = uid("1");
    try (Releasable ignore = map.acquireLock(uid)) {
        map.putUnderLock(uid, new DeleteVersionValue(0, 0, 0, 0));
        map.beforeRefresh(); // refresh otherwise we won't prune since it's tracked by the current map
        map.afterRefresh(false);
        // prune on a separate thread while this thread still holds the uid lock;
        // tryAcquire must fail for the locked key so the tombstone survives
        Thread thread = new Thread(() -> map.pruneTombstones(Long.MAX_VALUE, 0));
        thread.start();
        thread.join();
        assertEquals(1, map.getAllTombstones().size());
    }
    // lock released - now pruning is allowed to remove the tombstone
    Thread thread = new Thread(() -> map.pruneTombstones(Long.MAX_VALUE, 0));
    thread.start();
    thread.join();
    assertEquals(0, map.getAllTombstones().size());
}
}