SOLR-7585: Fix NoSuchElementException in LFUCache

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1681449 13f79535-47bb-0310-9956-ffa450edef68
Shawn Heisey 2015-05-24 13:20:04 +00:00
parent 4bbc5470e7
commit 88b5ea4935
3 changed files with 81 additions and 30 deletions

solr/CHANGES.txt

@@ -351,6 +351,9 @@ Bug Fixes
* SOLR-7335: Fix doc boosts to no longer be multiplied in each field value in multivalued fields that
are not used in copyFields (Shingo Sasaki via hossman)
* SOLR-7585: Fix NoSuchElementException in LFUCache resulting from heavy writes
making concurrent put() calls. (Maciej Zasada via Shawn Heisey)
Optimizations
----------------------

solr/core/src/java/org/apache/solr/util/ConcurrentLFUCache.java

@@ -51,6 +51,7 @@ public class ConcurrentLFUCache<K, V> implements Cache<K,V> {
  private final boolean newThreadForCleanup;
  private volatile boolean islive = true;
  private final Stats stats = new Stats();
  @SuppressWarnings("unused")
  private final int acceptableWaterMark;
  private long lowHitCount = 0;  // not volatile, only accessed in the cleaning method
  private final EvictionListener<K, V> evictionListener;
@@ -154,57 +155,64 @@ public class ConcurrentLFUCache<K, V> implements Cache<K,V> {
  }
  /**
   * Removes items from the cache to bring the size down to the lowerWaterMark.
   */
  private void markAndSweep() {
    if (!markAndSweepLock.tryLock()) return;
    try {
      long lowHitCount = this.lowHitCount;
      isCleaning = true;
      this.lowHitCount = lowHitCount;  // volatile write to make isCleaning visible

      int sz = stats.size.get();
      if (sz <= upperWaterMark) {
        /* SOLR-7585: Even though we acquired a lock, multiple threads might detect a need for calling this method.
         * Locking keeps these from executing at the same time, so they run sequentially.  The second and subsequent
         * sequential runs of this method don't need to be done, since there are no elements to remove.
         */
        return;
      }
      int wantToRemove = sz - lowerWaterMark;

      TreeSet<CacheEntry<K, V>> tree = new TreeSet<>();
      for (CacheEntry<K, V> ce : map.values()) {
        // set hitsCopy to avoid later Atomic reads.  Primitive types are faster than the atomic get().
        ce.hitsCopy = ce.hits.get();
        ce.lastAccessedCopy = ce.lastAccessed;
        if (timeDecay) {
          ce.hits.set(ce.hitsCopy >>> 1);
        }
        if (tree.size() < wantToRemove) {
          tree.add(ce);
        } else {
          /*
           * SOLR-7585: Before doing this part, make sure the TreeSet actually has an element, since the first()
           * method fails with NoSuchElementException if the set is empty.  If that test passes, check hits.
           * This test may never actually fail due to the upperWaterMark check above, but we'll do it anyway.
           */
          if (tree.size() > 0) {
            /* If hits are not equal, we can remove before adding, which is slightly faster.  I can no longer
             * remember why removing first is faster, but I vaguely remember being sure about it!
             */
            if (ce.hitsCopy < tree.first().hitsCopy) {
              tree.remove(tree.first());
              tree.add(ce);
            } else if (ce.hitsCopy == tree.first().hitsCopy) {
              tree.add(ce);
              tree.remove(tree.first());
            }
          }
        }
      }
      for (CacheEntry<K, V> e : tree) {
        evictEntry(e.key);
      }
    } finally {
      isCleaning = false;  // set before markAndSweep.unlock() for visibility
      markAndSweepLock.unlock();
    }
  }
@@ -230,12 +238,12 @@ public class ConcurrentLFUCache<K, V> implements Cache<K,V> {
    Map<K, V> result = new LinkedHashMap<>();
    if (n <= 0)
      return result;

    TreeSet<CacheEntry<K, V>> tree = new TreeSet<>();
    // we need to grab the lock since we are changing the copy variables
    markAndSweepLock.lock();
    try {
      for (Map.Entry<Object, CacheEntry<K, V>> entry : map.entrySet()) {
        CacheEntry<K, V> ce = entry.getValue();
        ce.hitsCopy = ce.hits.get();
        ce.lastAccessedCopy = ce.lastAccessed;
        if (tree.size() < n) {
@@ -274,7 +282,7 @@ public class ConcurrentLFUCache<K, V> implements Cache<K,V> {
    Map<K, V> result = new LinkedHashMap<>();
    if (n <= 0)
      return result;

    TreeSet<CacheEntry<K, V>> tree = new TreeSet<>();
    // we need to grab the lock since we are changing the copy variables
    markAndSweepLock.lock();
    try {

solr/core/src/test/org/apache/solr/search/TestLFUCache.java

@@ -18,8 +18,10 @@ package org.apache.solr.search;
*/
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.util.ConcurrentLFUCache;
import org.apache.solr.util.DefaultSolrThreadFactory;
import org.apache.solr.util.RefCounted;
import org.junit.BeforeClass;
import org.junit.Test;
@@ -28,6 +30,9 @@ import java.io.IOException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
/**
@@ -360,6 +365,41 @@ public class TestLFUCache extends SolrTestCaseJ4 {
    }
  }
  @Test
  public void testConcurrentAccess() throws InterruptedException {
    /* Set up a tiny cache (upper water mark 10, lower water mark 9) and a fixed pool of ten
     * threads, so that concurrent puts trigger the cleanup method constantly. */
    final ConcurrentLFUCache<Integer,Long> cache = new ConcurrentLFUCache<>(10, 9);
    ExecutorService executorService = ExecutorUtil.newMDCAwareFixedThreadPool(10,
        new DefaultSolrThreadFactory("testConcurrentAccess"));
    final AtomicReference<Throwable> error = new AtomicReference<>();

    /*
     * Use the thread pool to execute at least two million puts into the cache.
     * Without the fix on SOLR-7585, NoSuchElementException is thrown.
     * Simultaneous calls to markAndSweep are protected from each other by a
     * lock, so they run sequentially, and in the previous design a sequential
     * run that had nothing left to evict still called first() on an empty TreeSet.
     */
    int puts = atLeast(2_000_000);
    for (int i = 0; i < puts; ++i) {
      executorService.submit(new Runnable() {
        @Override
        public void run() {
          try {
            cache.put(random().nextInt(100), random().nextLong());
          } catch (Throwable t) {
            error.compareAndSet(null, t);
          }
        }
      });
    }

    executorService.shutdown();
    executorService.awaitTermination(1, TimeUnit.MINUTES);

    // then:
    assertNull("Exception during concurrent access: " + error.get(), error.get());
  }
// From the original LRU cache tests, they're commented out there too because they take a while.
// void doPerfTest(int iter, int cacheSize, int maxKey) {
// long start = System.currentTimeMillis();