From 5a9639c3e629d6af02d20acfee58d33d2d35b1a6 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 19 Dec 2014 11:22:13 +0000 Subject: [PATCH] LUCENE-6118: Improve the efficiency of the history structure for recently-used filters. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1646674 13f79535-47bb-0310-9956-ffa450edef68 --- .../UsageTrackingFilterCachingPolicy.java | 19 +- .../util/FrequencyTrackingRingBuffer.java | 215 ++++++++++++++---- .../util/TestFrequencyTrackingRingBuffer.java | 33 ++- 3 files changed, 208 insertions(+), 59 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/UsageTrackingFilterCachingPolicy.java b/lucene/core/src/java/org/apache/lucene/search/UsageTrackingFilterCachingPolicy.java index db4341e2a07..684ef471f8f 100644 --- a/lucene/core/src/java/org/apache/lucene/search/UsageTrackingFilterCachingPolicy.java +++ b/lucene/core/src/java/org/apache/lucene/search/UsageTrackingFilterCachingPolicy.java @@ -36,6 +36,9 @@ import org.apache.lucene.util.FrequencyTrackingRingBuffer; */ public final class UsageTrackingFilterCachingPolicy implements FilterCachingPolicy { + // the hash code that we use as a sentinel in the ring buffer. + private static final int SENTINEL = Integer.MIN_VALUE; + static boolean isCostly(Filter filter) { // This does not measure the cost of iterating over the filter (for this we // already have the DocIdSetIterator#cost API) but the cost to build the @@ -51,7 +54,7 @@ public final class UsageTrackingFilterCachingPolicy implements FilterCachingPoli } private final FilterCachingPolicy.CacheOnLargeSegments segmentPolicy; - private final FrequencyTrackingRingBuffer recentlyUsedFilters; + private final FrequencyTrackingRingBuffer recentlyUsedFilters; private final int minFrequencyCostlyFilters; private final int minFrequencyCheapFilters; private final int minFrequencyOtherFilters; @@ -96,7 +99,7 @@ public final class UsageTrackingFilterCachingPolicy implements FilterCachingPoli if (minFrequencyCheapFilters > historySize || minFrequencyCostlyFilters > historySize || minFrequencyOtherFilters > historySize) { throw new IllegalArgumentException("The minimum frequencies should be less than the size of the history of filters that are being tracked"); } - this.recentlyUsedFilters = new FrequencyTrackingRingBuffer<>(historySize); + this.recentlyUsedFilters = new FrequencyTrackingRingBuffer(historySize, SENTINEL); this.minFrequencyCostlyFilters = minFrequencyCostlyFilters; this.minFrequencyCheapFilters = minFrequencyCheapFilters; this.minFrequencyOtherFilters = minFrequencyOtherFilters; @@ -104,9 +107,10 @@ public final class UsageTrackingFilterCachingPolicy implements FilterCachingPoli @Override public void onUse(Filter filter) { - // Using the filter hash codes might help keep memory usage a bit lower - // since some filters might have non-negligible memory usage? - recentlyUsedFilters.add(filter.hashCode()); + // we only track hash codes, which + synchronized (this) { + recentlyUsedFilters.add(filter.hashCode()); + } } @Override @@ -114,7 +118,10 @@ public final class UsageTrackingFilterCachingPolicy implements FilterCachingPoli if (segmentPolicy.shouldCache(filter, context, set) == false) { return false; } - final int frequency = recentlyUsedFilters.frequency(filter.hashCode()); + final int frequency; + synchronized (this) { + frequency = recentlyUsedFilters.frequency(filter.hashCode()); + } if (frequency >= minFrequencyOtherFilters) { return true; } else if (isCostly(filter) && frequency >= minFrequencyCostlyFilters) { diff --git a/lucene/core/src/java/org/apache/lucene/util/FrequencyTrackingRingBuffer.java b/lucene/core/src/java/org/apache/lucene/util/FrequencyTrackingRingBuffer.java index 7fee08f25d0..a362e951102 100644 --- a/lucene/core/src/java/org/apache/lucene/util/FrequencyTrackingRingBuffer.java +++ b/lucene/core/src/java/org/apache/lucene/util/FrequencyTrackingRingBuffer.java @@ -17,75 +17,202 @@ package org.apache.lucene.util; * limitations under the License. */ -import java.util.ArrayDeque; -import java.util.Collections; -import java.util.Deque; +import java.util.Arrays; +import java.util.HashMap; import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.ConcurrentMap; /** - * A ring buffer that tracks the frequency of the items that it contains. - * This is typically useful to track popular recently-used items. + * A ring buffer that tracks the frequency of the integers that it contains. + * This is typically useful to track the hash codes of popular recently-used + * items. * - * This class is thread-safe. + * This data-structure requires 22 bytes per entry on average (between 16 and + * 28). * * @lucene.internal */ -public final class FrequencyTrackingRingBuffer { +public final class FrequencyTrackingRingBuffer implements Accountable { + + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FrequencyTrackingRingBuffer.class); private final int maxSize; - private final Deque ringBuffer; - private final ConcurrentMap frequencies; + private final int[] buffer; + private int position; + private final IntBag frequencies; - /** Create a new ring buffer that will contain at most size items. */ - public FrequencyTrackingRingBuffer(int maxSize) { + /** Create a new ring buffer that will contain at most maxSize items. + * This buffer will initially contain maxSize times the + * sentinel value. */ + public FrequencyTrackingRingBuffer(int maxSize, int sentinel) { + if (maxSize < 2) { + throw new IllegalArgumentException("maxSize must be at least 2"); + } this.maxSize = maxSize; - this.ringBuffer = new ArrayDeque<>(maxSize); - this.frequencies = new ConcurrentHashMap<>(); + buffer = new int[maxSize]; + position = 0; + frequencies = new IntBag(maxSize); + + Arrays.fill(buffer, sentinel); + for (int i = 0; i < maxSize; ++i) { + frequencies.add(sentinel); + } + assert frequencies.frequency(sentinel) == maxSize; + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED + + frequencies.ramBytesUsed() + + RamUsageEstimator.sizeOf(buffer); } /** * Add a new item to this ring buffer, potentially removing the oldest * entry from this buffer if it is already full. */ - public synchronized void add(T item) { - // we need this method to be protected by a lock since it is important for - // correctness that the ring buffer and the frequencies table have - // consistent content - if (item == null) { - throw new IllegalArgumentException("null items are not supported"); + public void add(int i) { + // remove the previous value + final int removed = buffer[position]; + final boolean removedFromBag = frequencies.remove(removed); + assert removedFromBag; + // add the new value + buffer[position] = i; + frequencies.add(i); + // increment the position + position += 1; + if (position == maxSize) { + position = 0; } - assert ringBuffer.size() <= maxSize; - if (ringBuffer.size() == maxSize) { - // evict the oldest entry - final T removed = ringBuffer.removeFirst(); - final int newFrequency = frequency(removed) - 1; - if (newFrequency == 0) { - // free for GC - frequencies.remove(removed); - } else { - frequencies.put(removed, newFrequency); - } - } - - // add the new entry and update frequencies - ringBuffer.addLast(item); - frequencies.put(item, frequency(item) + 1); } /** - * Returns the frequency of the provided item in the ring buffer. + * Returns the frequency of the provided key in the ring buffer. */ - public int frequency(T item) { - // The use of a concurrent hash map allows us to not use a lock for this read-only method - final Integer freq = frequencies.get(item); - return freq == null ? 0 : freq; + public int frequency(int key) { + return frequencies.frequency(key); } // pkg-private for testing - Map asFrequencyMap() { - return Collections.unmodifiableMap(frequencies); + Map asFrequencyMap() { + return frequencies.asMap(); + } + + /** + * A bag of integers. + * Since in the context of the ring buffer the maximum size is known up-front + * there is no need to worry about resizing the underlying storage. + */ + private static class IntBag implements Accountable { + + private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(IntBag.class); + + private final int[] keys; + private final int[] freqs; + private final int mask; + + IntBag(int maxSize) { + // load factor of 2/3 + int capacity = Math.max(2, maxSize * 3 / 2); + // round up to the next power of two + capacity = Integer.highestOneBit(capacity - 1) << 1; + assert capacity > maxSize; + keys = new int[capacity]; + freqs = new int[capacity]; + mask = capacity - 1; + } + + @Override + public long ramBytesUsed() { + return BASE_RAM_BYTES_USED + + RamUsageEstimator.sizeOf(keys) + + RamUsageEstimator.sizeOf(freqs); + } + + /** Return the frequency of the give key in the bag. */ + int frequency(int key) { + for (int slot = key & mask; ; slot = (slot + 1) & mask) { + if (keys[slot] == key) { + return freqs[slot]; + } else if (freqs[slot] == 0) { + return 0; + } + } + } + + /** Increment the frequency of the given key by 1 and return its new frequency. */ + int add(int key) { + for (int slot = key & mask; ; slot = (slot + 1) & mask) { + if (freqs[slot] == 0) { + keys[slot] = key; + return freqs[slot] = 1; + } else if (keys[slot] == key) { + return ++freqs[slot]; + } + } + } + + /** Decrement the frequency of the given key by one, or do nothing if the + * key is not present in the bag. Returns true iff the key was contained + * in the bag. */ + boolean remove(int key) { + for (int slot = key & mask; ; slot = (slot + 1) & mask) { + if (freqs[slot] == 0) { + // no such key in the bag + return false; + } else if (keys[slot] == key) { + final int newFreq = --freqs[slot]; + if (newFreq == 0) { // removed + relocateAdjacentKeys(slot); + } + return true; + } + } + } + + private void relocateAdjacentKeys(int freeSlot) { + for (int slot = (freeSlot + 1) & mask; ; slot = (slot + 1) & mask) { + final int freq = freqs[slot]; + if (freq == 0) { + // end of the collision chain, we're done + break; + } + final int key = keys[slot]; + // the slot where key should be if there were no collisions + final int expectedSlot = key & mask; + // if the free slot is between the expected slot and the slot where the + // key is, then we can relocate there + if (between(expectedSlot, slot, freeSlot)) { + keys[freeSlot] = key; + freqs[freeSlot] = freq; + // slot is the new free slot + freqs[slot] = 0; + freeSlot = slot; + } + } + } + + /** Given a chain of occupied slots between chainStart + * and chainEnd, return whether slot is + * between the start and end of the chain. */ + private static boolean between(int chainStart, int chainEnd, int slot) { + if (chainStart <= chainEnd) { + return chainStart <= slot && slot <= chainEnd; + } else { + // the chain is across the end of the array + return slot >= chainStart || slot <= chainEnd; + } + } + + Map asMap() { + Map map = new HashMap<>(); + for (int i = 0; i < keys.length; ++i) { + if (freqs[i] > 0) { + map.put(keys[i], freqs[i]); + } + } + return map; + } + } } diff --git a/lucene/core/src/test/org/apache/lucene/util/TestFrequencyTrackingRingBuffer.java b/lucene/core/src/test/org/apache/lucene/util/TestFrequencyTrackingRingBuffer.java index f73de48977d..5fe1a499cd1 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestFrequencyTrackingRingBuffer.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestFrequencyTrackingRingBuffer.java @@ -24,15 +24,19 @@ import java.util.Map; public class TestFrequencyTrackingRingBuffer extends LuceneTestCase { - private static void assertBuffer(FrequencyTrackingRingBuffer buffer, int maxSize, List items) { - final List recentItems; + private static void assertBuffer(FrequencyTrackingRingBuffer buffer, int maxSize, int sentinel, List items) { + final List recentItems; if (items.size() <= maxSize) { - recentItems = items; + recentItems = new ArrayList<>(); + for (int i = items.size(); i < maxSize; ++i) { + recentItems.add(sentinel); + } + recentItems.addAll(items); } else { recentItems = items.subList(items.size() - maxSize, items.size()); } - final Map expectedFrequencies = new HashMap(); - for (T item : recentItems) { + final Map expectedFrequencies = new HashMap(); + for (Integer item : recentItems) { final Integer freq = expectedFrequencies.get(item); if (freq == null) { expectedFrequencies.put(item, 1); @@ -46,18 +50,29 @@ public class TestFrequencyTrackingRingBuffer extends LuceneTestCase { public void test() { final int iterations = atLeast(100); for (int i = 0; i < iterations; ++i) { - final int maxSize = 1 + random().nextInt(100); - final int numitems = random().nextInt(500); + final int maxSize = 2 + random().nextInt(100); + final int numitems = random().nextInt(5000); final int maxitem = 1 + random().nextInt(100); List items = new ArrayList<>(); - FrequencyTrackingRingBuffer buffer = new FrequencyTrackingRingBuffer<>(maxSize); + final int sentinel = random().nextInt(200); + FrequencyTrackingRingBuffer buffer = new FrequencyTrackingRingBuffer(maxSize, sentinel); for (int j = 0; j < numitems; ++j) { final Integer item = random().nextInt(maxitem); items.add(item); buffer.add(item); } - assertBuffer(buffer, maxSize, items); + assertBuffer(buffer, maxSize, sentinel, items); } } + public void testRamBytesUsed() { + final int maxSize = 2 + random().nextInt(10000); + final int sentinel = random().nextInt(); + FrequencyTrackingRingBuffer buffer = new FrequencyTrackingRingBuffer(maxSize, sentinel); + for (int i = 0; i < 10000; ++i) { + buffer.add(random().nextInt()); + } + assertEquals(RamUsageTester.sizeOf(buffer), buffer.ramBytesUsed()); + } + }