LUCENE-6118: Improve the efficiency of the history structure for recently-used filters.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1646674 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2014-12-19 11:22:13 +00:00
parent 8079368b8b
commit 5a9639c3e6
3 changed files with 208 additions and 59 deletions

View File

@ -36,6 +36,9 @@ import org.apache.lucene.util.FrequencyTrackingRingBuffer;
*/
public final class UsageTrackingFilterCachingPolicy implements FilterCachingPolicy {
// the hash code that we use as a sentinel in the ring buffer.
private static final int SENTINEL = Integer.MIN_VALUE;
static boolean isCostly(Filter filter) {
// This does not measure the cost of iterating over the filter (for this we
// already have the DocIdSetIterator#cost API) but the cost to build the
@ -51,7 +54,7 @@ public final class UsageTrackingFilterCachingPolicy implements FilterCachingPoli
}
private final FilterCachingPolicy.CacheOnLargeSegments segmentPolicy;
private final FrequencyTrackingRingBuffer<Integer> recentlyUsedFilters;
private final FrequencyTrackingRingBuffer recentlyUsedFilters;
private final int minFrequencyCostlyFilters;
private final int minFrequencyCheapFilters;
private final int minFrequencyOtherFilters;
@ -96,7 +99,7 @@ public final class UsageTrackingFilterCachingPolicy implements FilterCachingPoli
if (minFrequencyCheapFilters > historySize || minFrequencyCostlyFilters > historySize || minFrequencyOtherFilters > historySize) {
throw new IllegalArgumentException("The minimum frequencies should be less than the size of the history of filters that are being tracked");
}
this.recentlyUsedFilters = new FrequencyTrackingRingBuffer<>(historySize);
this.recentlyUsedFilters = new FrequencyTrackingRingBuffer(historySize, SENTINEL);
this.minFrequencyCostlyFilters = minFrequencyCostlyFilters;
this.minFrequencyCheapFilters = minFrequencyCheapFilters;
this.minFrequencyOtherFilters = minFrequencyOtherFilters;
@ -104,9 +107,10 @@ public final class UsageTrackingFilterCachingPolicy implements FilterCachingPoli
@Override
public void onUse(Filter filter) {
// Using the filter hash codes might help keep memory usage a bit lower
// since some filters might have non-negligible memory usage?
recentlyUsedFilters.add(filter.hashCode());
// we only track hash codes, which
synchronized (this) {
recentlyUsedFilters.add(filter.hashCode());
}
}
@Override
@ -114,7 +118,10 @@ public final class UsageTrackingFilterCachingPolicy implements FilterCachingPoli
if (segmentPolicy.shouldCache(filter, context, set) == false) {
return false;
}
final int frequency = recentlyUsedFilters.frequency(filter.hashCode());
final int frequency;
synchronized (this) {
frequency = recentlyUsedFilters.frequency(filter.hashCode());
}
if (frequency >= minFrequencyOtherFilters) {
return true;
} else if (isCostly(filter) && frequency >= minFrequencyCostlyFilters) {

View File

@ -17,75 +17,202 @@ package org.apache.lucene.util;
* limitations under the License.
*/
import java.util.ArrayDeque;
import java.util.Collections;
import java.util.Deque;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
/**
* A ring buffer that tracks the frequency of the items that it contains.
* This is typically useful to track popular recently-used items.
* A ring buffer that tracks the frequency of the integers that it contains.
* This is typically useful to track the hash codes of popular recently-used
* items.
*
* This class is thread-safe.
* This data-structure requires 22 bytes per entry on average (between 16 and
* 28).
*
* @lucene.internal
*/
public final class FrequencyTrackingRingBuffer<T> {
public final class FrequencyTrackingRingBuffer implements Accountable {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(FrequencyTrackingRingBuffer.class);
private final int maxSize;
private final Deque<T> ringBuffer;
private final ConcurrentMap<T, Integer> frequencies;
private final int[] buffer;
private int position;
private final IntBag frequencies;
/** Create a new ring buffer that will contain at most <code>size</code> items. */
public FrequencyTrackingRingBuffer(int maxSize) {
/** Create a new ring buffer that will contain at most <code>maxSize</code> items.
* This buffer will initially contain <code>maxSize</code> times the
* <code>sentinel</code> value. */
public FrequencyTrackingRingBuffer(int maxSize, int sentinel) {
if (maxSize < 2) {
throw new IllegalArgumentException("maxSize must be at least 2");
}
this.maxSize = maxSize;
this.ringBuffer = new ArrayDeque<>(maxSize);
this.frequencies = new ConcurrentHashMap<>();
buffer = new int[maxSize];
position = 0;
frequencies = new IntBag(maxSize);
Arrays.fill(buffer, sentinel);
for (int i = 0; i < maxSize; ++i) {
frequencies.add(sentinel);
}
assert frequencies.frequency(sentinel) == maxSize;
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED
+ frequencies.ramBytesUsed()
+ RamUsageEstimator.sizeOf(buffer);
}
/**
* Add a new item to this ring buffer, potentially removing the oldest
* entry from this buffer if it is already full.
*/
public synchronized void add(T item) {
// we need this method to be protected by a lock since it is important for
// correctness that the ring buffer and the frequencies table have
// consistent content
if (item == null) {
throw new IllegalArgumentException("null items are not supported");
public void add(int i) {
// remove the previous value
final int removed = buffer[position];
final boolean removedFromBag = frequencies.remove(removed);
assert removedFromBag;
// add the new value
buffer[position] = i;
frequencies.add(i);
// increment the position
position += 1;
if (position == maxSize) {
position = 0;
}
assert ringBuffer.size() <= maxSize;
if (ringBuffer.size() == maxSize) {
// evict the oldest entry
final T removed = ringBuffer.removeFirst();
final int newFrequency = frequency(removed) - 1;
if (newFrequency == 0) {
// free for GC
frequencies.remove(removed);
} else {
frequencies.put(removed, newFrequency);
}
}
// add the new entry and update frequencies
ringBuffer.addLast(item);
frequencies.put(item, frequency(item) + 1);
}
/**
* Returns the frequency of the provided item in the ring buffer.
* Returns the frequency of the provided key in the ring buffer.
*/
public int frequency(T item) {
// The use of a concurrent hash map allows us to not use a lock for this read-only method
final Integer freq = frequencies.get(item);
return freq == null ? 0 : freq;
public int frequency(int key) {
return frequencies.frequency(key);
}
// pkg-private for testing
Map<T, Integer> asFrequencyMap() {
return Collections.unmodifiableMap(frequencies);
Map<Integer, Integer> asFrequencyMap() {
return frequencies.asMap();
}
/**
* A bag of integers.
* Since in the context of the ring buffer the maximum size is known up-front
* there is no need to worry about resizing the underlying storage.
*/
private static class IntBag implements Accountable {
private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(IntBag.class);
private final int[] keys;
private final int[] freqs;
private final int mask;
IntBag(int maxSize) {
// load factor of 2/3
int capacity = Math.max(2, maxSize * 3 / 2);
// round up to the next power of two
capacity = Integer.highestOneBit(capacity - 1) << 1;
assert capacity > maxSize;
keys = new int[capacity];
freqs = new int[capacity];
mask = capacity - 1;
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED
+ RamUsageEstimator.sizeOf(keys)
+ RamUsageEstimator.sizeOf(freqs);
}
/** Return the frequency of the give key in the bag. */
int frequency(int key) {
for (int slot = key & mask; ; slot = (slot + 1) & mask) {
if (keys[slot] == key) {
return freqs[slot];
} else if (freqs[slot] == 0) {
return 0;
}
}
}
/** Increment the frequency of the given key by 1 and return its new frequency. */
int add(int key) {
for (int slot = key & mask; ; slot = (slot + 1) & mask) {
if (freqs[slot] == 0) {
keys[slot] = key;
return freqs[slot] = 1;
} else if (keys[slot] == key) {
return ++freqs[slot];
}
}
}
/** Decrement the frequency of the given key by one, or do nothing if the
* key is not present in the bag. Returns true iff the key was contained
* in the bag. */
boolean remove(int key) {
for (int slot = key & mask; ; slot = (slot + 1) & mask) {
if (freqs[slot] == 0) {
// no such key in the bag
return false;
} else if (keys[slot] == key) {
final int newFreq = --freqs[slot];
if (newFreq == 0) { // removed
relocateAdjacentKeys(slot);
}
return true;
}
}
}
private void relocateAdjacentKeys(int freeSlot) {
for (int slot = (freeSlot + 1) & mask; ; slot = (slot + 1) & mask) {
final int freq = freqs[slot];
if (freq == 0) {
// end of the collision chain, we're done
break;
}
final int key = keys[slot];
// the slot where <code>key</code> should be if there were no collisions
final int expectedSlot = key & mask;
// if the free slot is between the expected slot and the slot where the
// key is, then we can relocate there
if (between(expectedSlot, slot, freeSlot)) {
keys[freeSlot] = key;
freqs[freeSlot] = freq;
// slot is the new free slot
freqs[slot] = 0;
freeSlot = slot;
}
}
}
/** Given a chain of occupied slots between <code>chainStart</code>
* and <code>chainEnd</code>, return whether <code>slot</code> is
* between the start and end of the chain. */
private static boolean between(int chainStart, int chainEnd, int slot) {
if (chainStart <= chainEnd) {
return chainStart <= slot && slot <= chainEnd;
} else {
// the chain is across the end of the array
return slot >= chainStart || slot <= chainEnd;
}
}
Map<Integer, Integer> asMap() {
Map<Integer, Integer> map = new HashMap<>();
for (int i = 0; i < keys.length; ++i) {
if (freqs[i] > 0) {
map.put(keys[i], freqs[i]);
}
}
return map;
}
}
}

View File

@ -24,15 +24,19 @@ import java.util.Map;
public class TestFrequencyTrackingRingBuffer extends LuceneTestCase {
private static <T> void assertBuffer(FrequencyTrackingRingBuffer<T> buffer, int maxSize, List<T> items) {
final List<T> recentItems;
private static void assertBuffer(FrequencyTrackingRingBuffer buffer, int maxSize, int sentinel, List<Integer> items) {
final List<Integer> recentItems;
if (items.size() <= maxSize) {
recentItems = items;
recentItems = new ArrayList<>();
for (int i = items.size(); i < maxSize; ++i) {
recentItems.add(sentinel);
}
recentItems.addAll(items);
} else {
recentItems = items.subList(items.size() - maxSize, items.size());
}
final Map<T, Integer> expectedFrequencies = new HashMap<T, Integer>();
for (T item : recentItems) {
final Map<Integer, Integer> expectedFrequencies = new HashMap<Integer, Integer>();
for (Integer item : recentItems) {
final Integer freq = expectedFrequencies.get(item);
if (freq == null) {
expectedFrequencies.put(item, 1);
@ -46,18 +50,29 @@ public class TestFrequencyTrackingRingBuffer extends LuceneTestCase {
public void test() {
final int iterations = atLeast(100);
for (int i = 0; i < iterations; ++i) {
final int maxSize = 1 + random().nextInt(100);
final int numitems = random().nextInt(500);
final int maxSize = 2 + random().nextInt(100);
final int numitems = random().nextInt(5000);
final int maxitem = 1 + random().nextInt(100);
List<Integer> items = new ArrayList<>();
FrequencyTrackingRingBuffer<Integer> buffer = new FrequencyTrackingRingBuffer<>(maxSize);
final int sentinel = random().nextInt(200);
FrequencyTrackingRingBuffer buffer = new FrequencyTrackingRingBuffer(maxSize, sentinel);
for (int j = 0; j < numitems; ++j) {
final Integer item = random().nextInt(maxitem);
items.add(item);
buffer.add(item);
}
assertBuffer(buffer, maxSize, items);
assertBuffer(buffer, maxSize, sentinel, items);
}
}
public void testRamBytesUsed() {
final int maxSize = 2 + random().nextInt(10000);
final int sentinel = random().nextInt();
FrequencyTrackingRingBuffer buffer = new FrequencyTrackingRingBuffer(maxSize, sentinel);
for (int i = 0; i < 10000; ++i) {
buffer.add(random().nextInt());
}
assertEquals(RamUsageTester.sizeOf(buffer), buffer.ramBytesUsed());
}
}