diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 5927f76b24a..8b900785bae 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -351,6 +351,9 @@ Bug Fixes
 * SOLR-7335: Fix doc boosts to no longer be multiplied in each field value in
   multivalued fields that are not used in copyFields (Shingo Sasaki via hossman)
 
+* SOLR-7585: Fix NoSuchElementException in LFUCache resulting from heavy writes
+  making concurrent put() calls. (Maciej Zasada via Shawn Heisey)
+
 Optimizations
 ----------------------
 
diff --git a/solr/core/src/java/org/apache/solr/util/ConcurrentLFUCache.java b/solr/core/src/java/org/apache/solr/util/ConcurrentLFUCache.java
index ade820720ab..2cb4e2e913e 100644
--- a/solr/core/src/java/org/apache/solr/util/ConcurrentLFUCache.java
+++ b/solr/core/src/java/org/apache/solr/util/ConcurrentLFUCache.java
@@ -51,6 +51,7 @@ public class ConcurrentLFUCache<K,V> implements Cache<K,V> {
   private final boolean newThreadForCleanup;
   private volatile boolean islive = true;
   private final Stats stats = new Stats();
+  @SuppressWarnings("unused")
   private final int acceptableWaterMark;
   private long lowHitCount = 0;  // not volatile, only accessed in the cleaning method
   private final EvictionListener<K,V> evictionListener;
@@ -154,57 +155,64 @@ public class ConcurrentLFUCache<K,V> implements Cache<K,V> {
   }
 
   /**
-   * Removes items from the cache to bring the size down
-   * to an acceptable value ('acceptableWaterMark').
-   * <p/>
-   * It is done in two stages. In the first stage, least recently used items are evicted.
-   * If, after the first stage, the cache size is still greater than 'acceptableSize'
-   * config parameter, the second stage takes over.
-   * <p/>
-   * The second stage is more intensive and tries to bring down the cache size
-   * to the 'lowerWaterMark' config parameter.
+   * Removes items from the cache to bring the size down to the lowerWaterMark.
    */
   private void markAndSweep() {
     if (!markAndSweepLock.tryLock()) return;
     try {
       long lowHitCount = this.lowHitCount;
       isCleaning = true;
-      this.lowHitCount = lowHitCount; // volatile write to make isCleaning visible
-
+      this.lowHitCount = lowHitCount;  // volatile write to make isCleaning visible
+
       int sz = stats.size.get();
-
+      if (sz <= upperWaterMark) {
+        /* SOLR-7585: Even though we acquired a lock, multiple threads might detect a need for calling this method.
+         * Locking keeps these from executing at the same time, so they run sequentially.  The second and subsequent
+         * sequential runs of this method don't need to be done, since there are no elements to remove.
+         */
+        return;
+      }
+
       int wantToRemove = sz - lowerWaterMark;
-
-      TreeSet<CacheEntry> tree = new TreeSet<>();
-
+
+      TreeSet<CacheEntry<K, V>> tree = new TreeSet<>();
+
       for (CacheEntry<K, V> ce : map.values()) {
-        // set hitsCopy to avoid later Atomic reads
+        // set hitsCopy to avoid later Atomic reads.  Primitive types are faster than the atomic get().
         ce.hitsCopy = ce.hits.get();
         ce.lastAccessedCopy = ce.lastAccessed;
         if (timeDecay) {
           ce.hits.set(ce.hitsCopy >>> 1);
         }
-
+
         if (tree.size() < wantToRemove) {
           tree.add(ce);
         } else {
-          // If the hits are not equal, we can remove before adding
-          // which is slightly faster
-          if (ce.hitsCopy < tree.first().hitsCopy) {
-            tree.remove(tree.first());
-            tree.add(ce);
-          } else if (ce.hitsCopy == tree.first().hitsCopy) {
-            tree.add(ce);
-            tree.remove(tree.first());
+          /*
+           * SOLR-7585: Before doing this part, make sure the TreeSet actually has an element, since the first() method
+           * fails with NoSuchElementException if the set is empty.  If that test passes, check hits.  This test may
+           * never actually fail due to the upperWaterMark check above, but we'll do it anyway.
+           */
+          if (tree.size() > 0) {
+            /* If hits are not equal, we can remove before adding which is slightly faster.  I can no longer remember
+             * why removing first is faster, but I vaguely remember being sure about it!
+             */
+            if (ce.hitsCopy < tree.first().hitsCopy) {
+              tree.remove(tree.first());
+              tree.add(ce);
+            } else if (ce.hitsCopy == tree.first().hitsCopy) {
+              tree.add(ce);
+              tree.remove(tree.first());
+            }
           }
         }
       }
-
+
       for (CacheEntry<K, V> e : tree) {
         evictEntry(e.key);
       }
     } finally {
-      isCleaning = false; // set before markAndSweep.unlock() for visibility
+      isCleaning = false;  // set before markAndSweep.unlock() for visibility
       markAndSweepLock.unlock();
     }
   }
@@ -230,12 +238,12 @@ public class ConcurrentLFUCache<K,V> implements Cache<K,V> {
     Map<K, V> result = new LinkedHashMap<>();
     if (n <= 0)
       return result;
-    TreeSet<CacheEntry> tree = new TreeSet<>();
+    TreeSet<CacheEntry<K, V>> tree = new TreeSet<>();
     // we need to grab the lock since we are changing the copy variables
     markAndSweepLock.lock();
     try {
       for (Map.Entry<Object, CacheEntry<K, V>> entry : map.entrySet()) {
-        CacheEntry ce = entry.getValue();
+        CacheEntry<K, V> ce = entry.getValue();
         ce.hitsCopy = ce.hits.get();
         ce.lastAccessedCopy = ce.lastAccessed;
         if (tree.size() < n) {
@@ -274,7 +282,7 @@ public class ConcurrentLFUCache<K,V> implements Cache<K,V> {
     Map<K, V> result = new LinkedHashMap<>();
     if (n <= 0)
       return result;
-    TreeSet<CacheEntry> tree = new TreeSet<>();
+    TreeSet<CacheEntry<K, V>> tree = new TreeSet<>();
     // we need to grab the lock since we are changing the copy variables
     markAndSweepLock.lock();
     try {
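The two guards added above target the same root cause: java.util.TreeSet.first() throws NoSuchElementException when the set is empty, and the candidate set can legitimately stay empty when a sweep runs after another thread has already evicted enough entries (wantToRemove ends up at or below zero, so nothing is ever added). The following minimal, standalone sketch is illustration only, not Solr code (the class name TreeSetFirstDemo and its local variables are invented); it reproduces that failure mode and shows the kind of emptiness check that prevents it:

import java.util.NoSuchElementException;
import java.util.TreeSet;

// Illustration only (not Solr code): first() on an empty TreeSet throws
// NoSuchElementException, which is the exception reported in SOLR-7585.
public class TreeSetFirstDemo {
  public static void main(String[] args) {
    TreeSet<Long> tree = new TreeSet<>();
    int wantToRemove = 0;  // a sweep that finds nothing left to remove

    // "tree.size() < wantToRemove" is never true when wantToRemove <= 0,
    // so the set stays empty and the else branch reaches first().
    if (!(tree.size() < wantToRemove)) {
      try {
        tree.first();  // throws NoSuchElementException
      } catch (NoSuchElementException e) {
        System.out.println("first() on an empty TreeSet: " + e);
      }
    }

    // The pattern applied by the fix: make sure the set is non-empty
    // (or bail out earlier) before calling first().
    if (!tree.isEmpty()) {
      System.out.println("weakest candidate: " + tree.first());
    }
  }
}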
diff --git a/solr/core/src/test/org/apache/solr/search/TestLFUCache.java b/solr/core/src/test/org/apache/solr/search/TestLFUCache.java
index 3835af3f180..157b49430e8 100644
--- a/solr/core/src/test/org/apache/solr/search/TestLFUCache.java
+++ b/solr/core/src/test/org/apache/solr/search/TestLFUCache.java
@@ -18,8 +18,10 @@ package org.apache.solr.search;
  */
 
 import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.util.ExecutorUtil;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.util.ConcurrentLFUCache;
+import org.apache.solr.util.DefaultSolrThreadFactory;
 import org.apache.solr.util.RefCounted;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -28,6 +30,9 @@ import java.io.IOException;
 import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
 
 
 /**
@@ -360,6 +365,41 @@ public class TestLFUCache extends SolrTestCaseJ4 {
     }
   }
 
+  @Test
+  public void testConcurrentAccess() throws InterruptedException {
+    /* Set up a small cache and a pool of 10 worker threads to hammer it with concurrent puts. */
+    final ConcurrentLFUCache<Integer, Long> cache = new ConcurrentLFUCache<>(10, 9);
+    ExecutorService executorService = ExecutorUtil.newMDCAwareFixedThreadPool(10,
+        new DefaultSolrThreadFactory("testConcurrentAccess"));
+    final AtomicReference<Throwable> error = new AtomicReference<>();
+
+    /*
+     * Use the thread pool to execute at least two million puts into the cache.
+     * Without the fix on SOLR-7585, NoSuchElementException is thrown.
+     * Simultaneous calls to markAndSweep are protected from each other by a
+     * lock, so they run sequentially, and due to a problem in the previous
+     * design, the cache eviction doesn't work right.
+     */
+    for (int i = 0; i < atLeast(2_000_000); ++i) {
+      executorService.submit(new Runnable() {
+        @Override
+        public void run() {
+          try {
+            cache.put(random().nextInt(100), random().nextLong());
+          } catch (Throwable t) {
+            error.compareAndSet(null, t);
+          }
+        }
+      });
+    }
+
+    executorService.shutdown();
+    executorService.awaitTermination(1, TimeUnit.MINUTES);
+
+    // No put() call should have observed an exception.
+    assertNull("Exception during concurrent access: " + error.get(), error.get());
+  }
+
   // From the original LRU cache tests, they're commented out there too because they take a while.
   //  void doPerfTest(int iter, int cacheSize, int maxKey) {
   //    long start = System.currentTimeMillis();
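Stripped of locking and statistics, the sweep in markAndSweep() is a bounded top-N selection: keep the wantToRemove least-used entries in a TreeSet whose first() element is the most-hit of the current candidates, and replace that element whenever a less-used entry comes along. The sketch below is a simplified rendering of that pattern under assumed names (LfuSweepSketch, Entry, and selectForEviction are invented for illustration; Solr's real CacheEntry comparator also uses last-access time as a tiebreaker), showing where the upperWaterMark early return and the emptiness guard fit:

import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

// Simplified sketch (not the Solr implementation) of the eviction-candidate
// selection pattern used by markAndSweep().
public class LfuSweepSketch {

  static final class Entry implements Comparable<Entry> {
    final String key;
    final long hits;

    Entry(String key, long hits) {
      this.key = key;
      this.hits = hits;
    }

    // Order descending by hits so that first() is the most-hit (weakest)
    // eviction candidate; break ties on key so distinct entries never
    // compare as equal (a TreeSet drops "duplicates" under its ordering).
    @Override
    public int compareTo(Entry that) {
      int byHits = Long.compare(that.hits, this.hits);
      return byHits != 0 ? byHits : this.key.compareTo(that.key);
    }
  }

  static List<Entry> selectForEviction(List<Entry> all, int currentSize,
                                       int upperWaterMark, int lowerWaterMark) {
    // The SOLR-7585-style early return: nothing above the high-water mark
    // means there is nothing to remove, so don't build an empty candidate set.
    if (currentSize <= upperWaterMark) {
      return new ArrayList<>();
    }
    int wantToRemove = currentSize - lowerWaterMark;
    TreeSet<Entry> tree = new TreeSet<>();
    for (Entry e : all) {
      if (tree.size() < wantToRemove) {
        tree.add(e);
      } else if (!tree.isEmpty() && e.hits <= tree.first().hits) {
        // Replace the weakest candidate, guarding against an empty set
        // before calling first() -- the same check the fix adds.
        tree.remove(tree.first());
        tree.add(e);
      }
    }
    return new ArrayList<>(tree);
  }

  public static void main(String[] args) {
    List<Entry> entries = new ArrayList<>();
    for (int i = 0; i < 12; i++) {
      entries.add(new Entry("key" + i, i % 5));
    }
    // 12 entries, upperWaterMark 10, lowerWaterMark 9: evict the 3 least used.
    for (Entry victim : selectForEviction(entries, entries.size(), 10, 9)) {
      System.out.println("evict " + victim.key + " (hits=" + victim.hits + ")");
    }
  }
}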