mirror of https://github.com/apache/lucene.git
SOLR-7585: Fix NoSuchElementException in LFUCache
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1681449 13f79535-47bb-0310-9956-ffa450edef68
parent 4bbc5470e7
commit 88b5ea4935
@@ -351,6 +351,9 @@ Bug Fixes
 * SOLR-7335: Fix doc boosts to no longer be multiplied in each field value in multivalued fields that
   are not used in copyFields (Shingo Sasaki via hossman)
 
+* SOLR-7585: Fix NoSuchElementException in LFUCache resulting from heavy writes
+  making concurrent put() calls. (Maciej Zasada via Shawn Heisey)
+
 Optimizations
 ----------------------
 
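The exception named in the new CHANGES entry comes from java.util.TreeSet: first() throws rather than returning null when the set is empty. A minimal standalone sketch (editor illustration, not part of the patch; the class name is made up) of that JDK behavior and of the guard the fix relies on:

import java.util.NoSuchElementException;
import java.util.TreeSet;

public class FirstOnEmptyTreeSet {
  public static void main(String[] args) {
    TreeSet<Integer> tree = new TreeSet<>();

    try {
      tree.first();                     // throws: an empty TreeSet has no first element
    } catch (NoSuchElementException e) {
      System.out.println("first() on an empty TreeSet throws NoSuchElementException");
    }

    if (!tree.isEmpty()) {              // guarded form, analogous to the tree.size() > 0 check added below
      Integer smallest = tree.first();  // safe: only reached when the set is non-empty
      System.out.println(smallest);
    }
  }
}
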
@@ -51,6 +51,7 @@ public class ConcurrentLFUCache<K, V> implements Cache<K,V> {
   private final boolean newThreadForCleanup;
   private volatile boolean islive = true;
   private final Stats stats = new Stats();
+  @SuppressWarnings("unused")
   private final int acceptableWaterMark;
   private long lowHitCount = 0; // not volatile, only accessed in the cleaning method
   private final EvictionListener<K, V> evictionListener;
 
@@ -154,57 +155,64 @@ public class ConcurrentLFUCache<K, V> implements Cache<K,V> {
   }
 
   /**
-   * Removes items from the cache to bring the size down
-   * to an acceptable value ('acceptableWaterMark').
-   * <p/>
-   * It is done in two stages. In the first stage, least recently used items are evicted.
-   * If, after the first stage, the cache size is still greater than 'acceptableSize'
-   * config parameter, the second stage takes over.
-   * <p/>
-   * The second stage is more intensive and tries to bring down the cache size
-   * to the 'lowerWaterMark' config parameter.
+   * Removes items from the cache to bring the size down to the lowerWaterMark.
    */
   private void markAndSweep() {
     if (!markAndSweepLock.tryLock()) return;
     try {
       long lowHitCount = this.lowHitCount;
       isCleaning = true;
       this.lowHitCount = lowHitCount; // volatile write to make isCleaning visible
 
       int sz = stats.size.get();
 
+      if (sz <= upperWaterMark) {
+        /* SOLR-7585: Even though we acquired a lock, multiple threads might detect a need for calling this method.
+         * Locking keeps these from executing at the same time, so they run sequentially. The second and subsequent
+         * sequential runs of this method don't need to be done, since there are no elements to remove.
+         */
+        return;
+      }
+
       int wantToRemove = sz - lowerWaterMark;
 
-      TreeSet<CacheEntry> tree = new TreeSet<>();
+      TreeSet<CacheEntry<K, V>> tree = new TreeSet<>();
 
       for (CacheEntry<K, V> ce : map.values()) {
-        // set hitsCopy to avoid later Atomic reads
+        // set hitsCopy to avoid later Atomic reads. Primitive types are faster than the atomic get().
         ce.hitsCopy = ce.hits.get();
         ce.lastAccessedCopy = ce.lastAccessed;
         if (timeDecay) {
           ce.hits.set(ce.hitsCopy >>> 1);
         }
 
         if (tree.size() < wantToRemove) {
           tree.add(ce);
         } else {
-          // If the hits are not equal, we can remove before adding
-          // which is slightly faster
-          if (ce.hitsCopy < tree.first().hitsCopy) {
-            tree.remove(tree.first());
-            tree.add(ce);
-          } else if (ce.hitsCopy == tree.first().hitsCopy) {
-            tree.add(ce);
-            tree.remove(tree.first());
+          /*
+           * SOLR-7585: Before doing this part, make sure the TreeSet actually has an element, since the first() method
+           * fails with NoSuchElementException if the set is empty. If that test passes, check hits. This test may
+           * never actually fail due to the upperWaterMark check above, but we'll do it anyway.
+           */
+          if (tree.size() > 0) {
+            /* If hits are not equal, we can remove before adding which is slightly faster. I can no longer remember
+             * why removing first is faster, but I vaguely remember being sure about it!
+             */
+            if (ce.hitsCopy < tree.first().hitsCopy) {
+              tree.remove(tree.first());
+              tree.add(ce);
+            } else if (ce.hitsCopy == tree.first().hitsCopy) {
+              tree.add(ce);
+              tree.remove(tree.first());
+            }
           }
         }
       }
 
       for (CacheEntry<K, V> e : tree) {
         evictEntry(e.key);
       }
     } finally {
       isCleaning = false; // set before markAndSweep.unlock() for visibility
       markAndSweepLock.unlock();
     }
   }
 
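The two additions in this hunk close the same race from opposite ends: the early return makes a second, back-to-back sweep a no-op, and the tree.size() > 0 check keeps tree.first() from being called when wantToRemove ends up non-positive and nothing is ever added to the tree, which is the situation the pre-fix code hit under heavy concurrent put() traffic. A toy, self-contained model of that scenario (editor sketch with made-up names, not Solr code; markAndSweep itself is presumably triggered from put() once the size exceeds upperWaterMark):

import java.util.NoSuchElementException;
import java.util.TreeSet;
import java.util.concurrent.locks.ReentrantLock;

public class RedundantSweepSketch {
  static final ReentrantLock lock = new ReentrantLock();

  // 'guarded' toggles the two SOLR-7585 checks on and off.
  static void sweep(int size, int lowerWaterMark, int upperWaterMark, boolean guarded) {
    if (!lock.tryLock()) return;                          // same tryLock pattern as markAndSweep()
    try {
      if (guarded && size <= upperWaterMark) {
        System.out.println("redundant sweep skipped: nothing to evict");
        return;
      }

      int wantToRemove = size - lowerWaterMark;           // non-positive on a redundant run
      TreeSet<Integer> tree = new TreeSet<>();
      for (int hits : new int[] {5, 3, 7}) {
        if (tree.size() < wantToRemove) {
          tree.add(hits);                                 // still collecting eviction candidates
        } else {
          if (guarded && tree.isEmpty()) continue;        // post-fix guard before calling first()
          if (hits < tree.first()) {                      // pre-fix code reached this on an empty tree
            tree.remove(tree.first());
            tree.add(hits);
          }
        }
      }
      System.out.println("candidates: " + tree);
    } finally {
      lock.unlock();
    }
  }

  public static void main(String[] args) {
    sweep(12, 9, 10, true);    // normal run: size above the watermarks, candidates collected
    sweep(9, 9, 10, true);     // redundant run: guarded version exits quietly
    try {
      sweep(9, 9, 10, false);  // unguarded version: empty tree, first() throws as in the pre-fix cache
    } catch (NoSuchElementException expected) {
      System.out.println("pre-fix behavior reproduced: " + expected);
    }
  }
}
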
@@ -230,12 +238,12 @@ public class ConcurrentLFUCache<K, V> implements Cache<K,V> {
     Map<K, V> result = new LinkedHashMap<>();
     if (n <= 0)
       return result;
-    TreeSet<CacheEntry> tree = new TreeSet<>();
+    TreeSet<CacheEntry<K, V>> tree = new TreeSet<>();
     // we need to grab the lock since we are changing the copy variables
     markAndSweepLock.lock();
     try {
       for (Map.Entry<Object, CacheEntry<K, V>> entry : map.entrySet()) {
-        CacheEntry ce = entry.getValue();
+        CacheEntry<K, V> ce = entry.getValue();
         ce.hitsCopy = ce.hits.get();
         ce.lastAccessedCopy = ce.lastAccessed;
         if (tree.size() < n) {
 
@@ -274,7 +282,7 @@ public class ConcurrentLFUCache<K, V> implements Cache<K,V> {
     Map<K, V> result = new LinkedHashMap<>();
     if (n <= 0)
       return result;
-    TreeSet<CacheEntry> tree = new TreeSet<>();
+    TreeSet<CacheEntry<K, V>> tree = new TreeSet<>();
     // we need to grab the lock since we are changing the copy variables
     markAndSweepLock.lock();
     try {
 
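These last two ConcurrentLFUCache hunks only swap the raw TreeSet<CacheEntry> for the parameterized TreeSet<CacheEntry<K, V>>, a generics cleanup with no behavior change. In both methods the TreeSet serves as a bounded, sorted buffer of the best candidates seen so far, where a new candidate only displaces tree.first() once the buffer is full. A minimal integer version of that pattern (editor illustration, not Solr code):

import java.util.TreeSet;

public class BoundedCandidateSet {
  // Keep the 'limit' largest values seen so far; smaller values displace nothing.
  static TreeSet<Integer> keepLargest(int[] values, int limit) {
    TreeSet<Integer> tree = new TreeSet<>();
    for (int v : values) {
      if (tree.size() < limit) {
        tree.add(v);                   // still filling the buffer
      } else if (v > tree.first()) {
        tree.remove(tree.first());     // drop the current smallest to make room
        tree.add(v);
      }
    }
    return tree;
  }

  public static void main(String[] args) {
    System.out.println(keepLargest(new int[] {5, 1, 9, 3, 7, 8}, 3)); // prints [7, 8, 9]
  }
}
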
@@ -18,8 +18,10 @@ package org.apache.solr.search;
  */
 
 import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.util.ExecutorUtil;
 import org.apache.solr.common.util.NamedList;
 import org.apache.solr.util.ConcurrentLFUCache;
+import org.apache.solr.util.DefaultSolrThreadFactory;
 import org.apache.solr.util.RefCounted;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -28,6 +30,9 @@ import java.io.IOException;
 import java.util.HashMap;
 import java.util.Locale;
 import java.util.Map;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicReference;
 
 
 /**
 
@@ -360,6 +365,41 @@ public class TestLFUCache extends SolrTestCaseJ4 {
     }
   }
 
+  @Test
+  public void testConcurrentAccess() throws InterruptedException {
+    /* Set up a thread pool with twice as many threads as there are CPUs. */
+    final ConcurrentLFUCache<Integer,Long> cache = new ConcurrentLFUCache<>(10, 9);
+    ExecutorService executorService = ExecutorUtil.newMDCAwareFixedThreadPool(10,
+        new DefaultSolrThreadFactory("testConcurrentAccess"));
+    final AtomicReference<Throwable> error = new AtomicReference<>();
+
+    /*
+     * Use the thread pool to execute at least two million puts into the cache.
+     * Without the fix on SOLR-7585, NoSuchElementException is thrown.
+     * Simultaneous calls to markAndSweep are protected from each other by a
+     * lock, so they run sequentially, and due to a problem in the previous
+     * design, the cache eviction doesn't work right.
+     */
+    for (int i = 0; i < atLeast(2_000_000); ++i) {
+      executorService.submit(new Runnable() {
+        @Override
+        public void run() {
+          try {
+            cache.put(random().nextInt(100), random().nextLong());
+          } catch (Throwable t) {
+            error.compareAndSet(null, t);
+          }
+        }
+      });
+    }
+
+    executorService.shutdown();
+    executorService.awaitTermination(1, TimeUnit.MINUTES);
+
+    // then:
+    assertNull("Exception during concurrent access: " + error.get(), error.get());
+  }
+
   // From the original LRU cache tests, they're commented out there too because they take a while.
   //  void doPerfTest(int iter, int cacheSize, int maxKey) {
   //    long start = System.currentTimeMillis();
 
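For readers who want to exercise the same stress loop outside the Solr test framework, a rough standalone equivalent is sketched below (editor illustration). It assumes org.apache.solr.util.ConcurrentLFUCache is on the classpath and substitutes plain JDK pieces (Executors, ThreadLocalRandom, a fixed two million iterations) for the Solr test utilities ExecutorUtil, DefaultSolrThreadFactory, atLeast() and random(); the class name is made up.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.solr.util.ConcurrentLFUCache;

public class LfuPutStress {
  public static void main(String[] args) throws InterruptedException {
    // Same (10, 9) sizing as the test above, so eviction is triggered constantly.
    final ConcurrentLFUCache<Integer, Long> cache = new ConcurrentLFUCache<>(10, 9);
    final AtomicReference<Throwable> error = new AtomicReference<>();
    ExecutorService pool = Executors.newFixedThreadPool(10);

    for (int i = 0; i < 2_000_000; i++) {
      pool.submit(new Runnable() {
        @Override
        public void run() {
          try {
            cache.put(ThreadLocalRandom.current().nextInt(100), ThreadLocalRandom.current().nextLong());
          } catch (Throwable t) {
            error.compareAndSet(null, t); // remember only the first failure
          }
        }
      });
    }

    pool.shutdown();
    pool.awaitTermination(1, TimeUnit.MINUTES);

    if (error.get() != null) {
      throw new AssertionError("put() failed under concurrency", error.get());
    }
    System.out.println("no exception observed in 2,000,000 concurrent puts");
  }
}
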