LUCENE-2075: share the terms dict cache across threads

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@884895 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2009-11-27 15:32:57 +00:00
parent 48fe1847ab
commit 1aab3f64f0
9 changed files with 373 additions and 67 deletions

View File

@ -30,6 +30,13 @@ Optimizations
* LUCENE-2086: When resolving deleted terms, do so in term sort order
for better performance. (Bogdan Ghidireac via Mike McCandless)
* LUCENE-2075: Terms dict cache is now shared across threads instead
of being stored separately in thread local storage. Also fixed
terms dict so that the cache is used when seeking the thread local
term enum, which will be important for MultiTermQuery impls that do
lots of seeking (Mike McCandless, Uwe Schindler, Robert Muir, Yonik
Seeley)
Build
======================= Release 3.0.0 2009-11-25 =======================

View File

@ -19,7 +19,7 @@ package org.apache.lucene.index;
/** A TermInfo is the record of information stored for a term.*/
final class TermInfo {
class TermInfo {
/** The number of documents which contain the term. */
int docFreq = 0;
@ -42,6 +42,28 @@ final class TermInfo {
skipOffset = ti.skipOffset;
}
/**
 * Two TermInfos are equal when all four stored fields (docFreq and the
 * three file positions) match.
 * NOTE(review): instanceof (rather than getClass()) is intentional — it
 * lets a TermInfoAndOrd subclass compare equal to a plain TermInfo, which
 * the terms-cache assertions in TermInfosReader rely on; confirm before
 * tightening.
 */
@Override
public boolean equals(Object obj) {
  if (!(obj instanceof TermInfo)) {
    return false;
  }
  final TermInfo other = (TermInfo) obj;
  return other.docFreq == docFreq
      && other.freqPointer == freqPointer
      && other.proxPointer == proxPointer
      && other.skipOffset == skipOffset;
}
/**
 * Hash code consistent with {@link #equals}: mixes the same four fields.
 * The long pointers fold their high 32 bits in via xor-shift (the
 * standard Long.hashCode recipe) instead of being silently truncated by
 * the narrowing (int) cast, which previously discarded the high bits of
 * freqPointer/proxPointer entirely.
 */
@Override
public int hashCode() {
  final int PRIME = 17;
  int result = 1;
  result = PRIME * result + docFreq;
  result = PRIME * result + (int) (freqPointer ^ (freqPointer >>> 32));
  result = PRIME * result + (int) (proxPointer ^ (proxPointer >>> 32));
  result = PRIME * result + skipOffset;
  return result;
}
final void set(int docFreq,
long freqPointer, long proxPointer, int skipOffset) {
this.docFreq = docFreq;

View File

@ -21,7 +21,7 @@ import java.io.IOException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.cache.Cache;
import org.apache.lucene.util.cache.SimpleLRUCache;
import org.apache.lucene.util.cache.DoubleBarrelLRUCache;
import org.apache.lucene.util.CloseableThreadLocal;
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
@ -44,15 +44,23 @@ final class TermInfosReader {
private final int totalIndexInterval;
private final static int DEFAULT_CACHE_SIZE = 1024;
// Extends TermInfo with the term's ordinal position ("ord") in the terms
// dictionary; on a later cache hit the ord lets the reader seek the term
// enum directly (termOrd / totalIndexInterval) instead of redoing the
// binary search over the index terms.
private final static class TermInfoAndOrd extends TermInfo {
// NOTE(review): ord stored as int — presumably a segment never holds more
// than ~2B terms; confirm upstream.
final int termOrd;
public TermInfoAndOrd(TermInfo ti, int termOrd) {
super(ti);
this.termOrd = termOrd;
}
}
private final Cache<Term,TermInfoAndOrd> termsCache = new DoubleBarrelLRUCache<Term,TermInfoAndOrd>(DEFAULT_CACHE_SIZE);
/**
 * Per-thread resources managed by ThreadLocal: each thread keeps its own
 * SegmentTermEnum so sequential scans don't contend across threads.
 */
private static final class ThreadResources {
SegmentTermEnum termEnum;
// Used for caching the least recently looked-up Terms
// NOTE(review): this hunk removes the per-thread termInfoCache in favor
// of the shared termsCache field; the field below is the pre-patch state.
Cache<Term,TermInfo> termInfoCache;
}
TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize, int indexDivisor)
@ -130,6 +138,7 @@ final class TermInfosReader {
if (origEnum != null)
origEnum.close();
threadResources.close();
termsCache.close();
}
/** Returns the number of term/value pairs in the set. */
@ -142,8 +151,6 @@ final class TermInfosReader {
if (resources == null) {
resources = new ThreadResources();
resources.termEnum = terms();
// Cache does not have to be thread-safe, it is only used by one thread at the same time
resources.termInfoCache = new SimpleLRUCache<Term,TermInfo>(DEFAULT_CACHE_SIZE);
threadResources.set(resources);
}
return resources;
@ -176,26 +183,20 @@ final class TermInfosReader {
/** Returns the TermInfo for a Term in the set, or null. */
TermInfo get(Term term) throws IOException {
return get(term, true);
return get(term, false);
}
/** Returns the TermInfo for a Term in the set, or null. */
private TermInfo get(Term term, boolean useCache) throws IOException {
private TermInfo get(Term term, boolean mustSeekEnum) throws IOException {
if (size == 0) return null;
ensureIndexIsRead();
TermInfo ti;
TermInfoAndOrd tiOrd = termsCache.get(term);
ThreadResources resources = getThreadResources();
Cache<Term,TermInfo> cache = null;
if (useCache) {
cache = resources.termInfoCache;
// check the cache first if the term was recently looked up
ti = cache.get(term);
if (ti != null) {
return ti;
}
if (!mustSeekEnum && tiOrd != null) {
return tiOrd;
}
// optimize sequential access: first try scanning cached enum w/o seeking
@ -208,16 +209,23 @@ final class TermInfosReader {
|| term.compareTo(indexTerms[enumOffset]) < 0) {
// no need to seek
final TermInfo ti;
int numScans = enumerator.scanTo(term);
if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
ti = enumerator.termInfo();
if (cache != null && numScans > 1) {
if (numScans > 1) {
// we only want to put this TermInfo into the cache if
// scanEnum skipped more than one dictionary entry.
// This prevents RangeQueries or WildcardQueries from
// wiping out the cache when they iterate over large
// numbers of terms in order
cache.put(term, ti);
if (tiOrd == null) {
termsCache.put(term, new TermInfoAndOrd(ti, (int) enumerator.position));
} else {
assert ti.equals(tiOrd);
assert (int) enumerator.position == tiOrd.termOrd;
}
}
} else {
ti = null;
@ -228,12 +236,24 @@ final class TermInfosReader {
}
// random-access: must seek
seekEnum(enumerator, getIndexOffset(term));
final int indexPos;
if (tiOrd != null) {
indexPos = tiOrd.termOrd / totalIndexInterval;
} else {
// Must do binary search:
indexPos = getIndexOffset(term);
}
seekEnum(enumerator, indexPos);
enumerator.scanTo(term);
final TermInfo ti;
if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
ti = enumerator.termInfo();
if (cache != null) {
cache.put(term, ti);
if (tiOrd == null) {
termsCache.put(term, new TermInfoAndOrd(ti, (int) enumerator.position));
} else {
assert ti.equals(tiOrd);
assert (int) enumerator.position == tiOrd.termOrd;
}
} else {
ti = null;
@ -294,9 +314,7 @@ final class TermInfosReader {
/** Returns an enumeration of terms starting at or after the named term. */
public SegmentTermEnum terms(Term term) throws IOException {
// don't use the cache in this call because we want to reposition the
// enumeration
get(term, false);
get(term, true);
return (SegmentTermEnum)getThreadResources().termEnum.clone();
}
}

View File

@ -0,0 +1,128 @@
package org.apache.lucene.util.cache;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.Map;
/**
* Simple concurrent LRU cache, using a "double barrel"
* approach where two ConcurrentHashMaps record entries.
*
* <p>At any given time, one hash is primary and the other
* is secondary. {@link #get} first checks primary, and if
* that's a miss, checks secondary. If secondary has the
* entry, it's promoted to primary. Once primary is full,
* the secondary is cleared and the two are swapped.</p>
*
* <p>This is not as space efficient as other possible
* concurrent approaches (see LUCENE-2075): to achieve
* perfect LRU(N) it requires 2*N storage. But, this
* approach is relatively simple and seems in practice to
* not grow unbounded in size when under hideously high
* load.</p>
*
* <p>NOTE: this class is meant only to be used internally
* by Lucene; it's only public so it can be shared across
* packages. This means the API is freely subject to
* change, and, the class could be removed entirely, in any
* Lucene release. Use directly at your own risk!
*/
final public class DoubleBarrelLRUCache<K,V> extends Cache<K,V> {
  private final Map<K,V> cache1;
  private final Map<K,V> cache2;
  // Counts down from maxSize; the put() that drives it to 0 swaps the barrels.
  private final AtomicInteger countdown;
  // When false, cache1 is primary and cache2 secondary; when true, the
  // reverse. Volatile so readers observe swaps without locking.
  private volatile boolean swapped;
  private final int maxSize;

  /**
   * Creates a cache that approximates LRU(maxSize); actual storage may
   * reach 2*maxSize entries across the two barrels.
   */
  public DoubleBarrelLRUCache(int maxSize) {
    this.maxSize = maxSize;
    countdown = new AtomicInteger(maxSize);
    cache1 = new ConcurrentHashMap<K,V>();
    cache2 = new ConcurrentHashMap<K,V>();
  }

  /**
   * Not supported: always returns false. Callers must use {@link #get}
   * and test for null instead.
   */
  @Override
  public boolean containsKey(Object k) {
    return false;
  }

  /** No resources to release. */
  @Override
  public void close() {
  }

  /**
   * Looks up key in the primary barrel first, then the secondary; a
   * secondary hit is promoted back into primary so it survives the next
   * swap. Returns null on a miss.
   */
  @Override @SuppressWarnings("unchecked")
  public V get(Object key) {
    // Read "swapped" once so primary/secondary stay consistent even if a
    // swap happens while this call is in flight.
    final Map<K,V> primary;
    final Map<K,V> secondary;
    if (swapped) {
      primary = cache2;
      secondary = cache1;
    } else {
      primary = cache1;
      secondary = cache2;
    }

    // Try primary first
    V result = primary.get(key);
    if (result == null) {
      // Not found -- try secondary
      result = secondary.get(key);
      if (result != null) {
        // Promote to primary
        put((K) key, result);
      }
    }
    return result;
  }

  /**
   * Inserts into the primary barrel; every maxSize puts, clears the
   * secondary barrel and swaps the two.
   */
  @Override
  public void put(K key, V value) {
    final Map<K,V> primary;
    final Map<K,V> secondary;
    if (swapped) {
      primary = cache2;
      secondary = cache1;
    } else {
      primary = cache1;
      secondary = cache2;
    }
    primary.put(key, value);

    if (countdown.decrementAndGet() == 0) {
      // Time to swap

      // NOTE: there is saturation risk here, that the
      // thread that's doing the clear() takes too long to
      // do so, while other threads continue to add to
      // primary, but in practice this seems not to be an
      // issue (see LUCENE-2075 for benchmark & details)

      // First, clear secondary
      secondary.clear();

      // Second, swap
      swapped = !swapped;

      // Third, reset countdown
      countdown.set(maxSize);
    }
  }
}

View File

@ -24,7 +24,9 @@ import java.util.Map;
* Simple LRU cache implementation that uses a LinkedHashMap.
* This cache is not synchronized, use {@link Cache#synchronizedCache(Cache)}
* if needed.
*
*
* @deprecated Lucene's internal use of this class has now
* switched to {@link DoubleBarrelLRUCache}.
*/
public class SimpleLRUCache<K,V> extends SimpleMapCache<K,V> {
private final static float LOADFACTOR = 0.75f;

View File

@ -25,6 +25,9 @@ import java.util.Set;
* Simple cache implementation that uses a HashMap to store (key, value) pairs.
* This cache is not synchronized, use {@link Cache#synchronizedCache(Cache)}
* if needed.
*
* @deprecated Lucene's internal use of this class has now
* switched to {@link DoubleBarrelLRUCache}.
*/
public class SimpleMapCache<K,V> extends Cache<K,V> {
protected Map<K,V> map;

View File

@ -0,0 +1,60 @@
package org.apache.lucene.util.cache;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase;
public class BaseTestLRU extends LuceneTestCase {

  /**
   * Shared LRU-behavior check: fills the cache, touches a subset of
   * entries, then adds enough new entries to evict the untouched ones,
   * verifying that the recently-touched entries survive.
   */
  protected void testCache(Cache cache, int n) throws Exception {
    final Object value = new Object();

    // Fill the cache with n entries.
    for (int key = 0; key < n; key++) {
      cache.put(Integer.valueOf(key), value);
    }

    // Touch every 2nd entry so it counts as recently used.
    for (int key = 0; key < n; key += 2) {
      assertNotNull(cache.get(Integer.valueOf(key)));
    }

    // Insert n/2 fresh entries; entries not touched above should be
    // the ones evicted.
    for (int key = n; key < n + (n / 2); key++) {
      cache.put(Integer.valueOf(key), value);
    }

    // Every 4th original entry was touched above, so it must survive.
    for (int key = 0; key < n; key += 4) {
      assertNotNull(cache.get(Integer.valueOf(key)));
    }

    // Insert 3n/4 more; again only the recently-touched entries should
    // remain.
    for (int key = n; key < n + (n * 3 / 4); key++) {
      cache.put(Integer.valueOf(key), value);
    }

    // The every-4th entries were touched most recently; verify them.
    for (int key = 0; key < n; key += 4) {
      assertNotNull(cache.get(Integer.valueOf(key)));
    }
  }
}

View File

@ -0,0 +1,103 @@
package org.apache.lucene.util.cache;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TestDoubleBarrelLRUCache extends BaseTestLRU {

  /** Single-threaded LRU behavior, via the shared BaseTestLRU check. */
  public void testLRUCache() throws Exception {
    final int n = 100;
    testCache(new DoubleBarrelLRUCache(n), n);
  }

  /**
   * Hammers the shared cache with get/put until endTime; accumulates
   * hit/miss counts and flags any failure for the main thread to check
   * after join().
   */
  private class CacheThread extends Thread {
    private final Object[] objs;
    private final Cache<Object,Object> c;
    private final long endTime;
    volatile boolean failed;

    public CacheThread(Cache<Object,Object> c,
                     Object[] objs, long endTime) {
      this.c = c;
      this.objs = objs;
      this.endTime = endTime;
    }

    @Override
    public void run() {
      try {
        long count = 0;
        long miss = 0;
        long hit = 0;
        final int limit = objs.length;

        while(true) {
          final Object obj = objs[(int) ((count/2) % limit)];
          Object v = c.get(obj);
          if (v == null) {
            c.put(obj, obj);
            miss++;
          } else {
            // The cache must hand back the exact object we stored.
            // (Was a bare "assert", which is a no-op unless the JVM
            // runs with -ea; throw so the failure is always detected.)
            if (obj != v) {
              throw new RuntimeException("cache returned wrong value");
            }
            hit++;
          }
          // Check the clock only every 10000 iterations to keep the
          // hot loop cheap.
          if ((++count % 10000) == 0) {
            if (System.currentTimeMillis() >= endTime) {
              break;
            }
          }
        }

        addResults(miss, hit);
      } catch (Throwable t) {
        failed = true;
        throw new RuntimeException(t);
      }
    }
  }

  long totMiss, totHit;

  // synchronized: called concurrently from multiple CacheThreads; the
  // unsynchronized "+=" updates could otherwise lose increments.
  synchronized void addResults(long miss, long hit) {
    totMiss += miss;
    totHit += hit;
  }

  /**
   * Runs NUM_THREADS threads against one shared cache for ~1 second and
   * asserts none of them observed a corrupted entry.
   */
  public void testThreadCorrectness() throws Exception {
    final int NUM_THREADS = 4;
    final int CACHE_SIZE = 512;
    final int OBJ_COUNT = 3*CACHE_SIZE;

    Cache<Object,Object> c = new DoubleBarrelLRUCache<Object,Object>(1024);

    Object[] objs = new Object[OBJ_COUNT];
    for(int i=0;i<OBJ_COUNT;i++) {
      objs[i] = new Object();
    }

    final CacheThread[] threads = new CacheThread[NUM_THREADS];
    final long endTime = System.currentTimeMillis()+((long) 1000);
    for(int i=0;i<NUM_THREADS;i++) {
      threads[i] = new CacheThread(c, objs, endTime);
      threads[i].start();
    }
    for(int i=0;i<NUM_THREADS;i++) {
      threads[i].join();
      // assertFalse (JUnit) instead of a bare "assert", which is
      // silently disabled unless the JVM runs with -ea.
      assertFalse(threads[i].failed);
    }
    //System.out.println("hits=" + totHit + " misses=" + totMiss);
  }
}

View File

@ -17,47 +17,10 @@ package org.apache.lucene.util.cache;
* limitations under the License.
*/
import org.apache.lucene.util.LuceneTestCase;
public class TestSimpleLRUCache extends LuceneTestCase {
/** @deprecated */
public class TestSimpleLRUCache extends BaseTestLRU {
public void testLRUCache() throws Exception {
final int n = 100;
Object dummy = new Object();
Cache cache = new SimpleLRUCache(n);
for (int i = 0; i < n; i++) {
cache.put(Integer.valueOf(i), dummy);
}
// access every 2nd item in cache
for (int i = 0; i < n; i+=2) {
assertNotNull(cache.get(Integer.valueOf(i)));
}
// add n/2 elements to cache, the ones that weren't
// touched in the previous loop should now be thrown away
for (int i = n; i < n + (n / 2); i++) {
cache.put(Integer.valueOf(i), dummy);
}
// access every 4th item in cache
for (int i = 0; i < n; i+=4) {
assertNotNull(cache.get(Integer.valueOf(i)));
}
// add 3/4n elements to cache, the ones that weren't
// touched in the previous loops should now be thrown away
for (int i = n; i < n + (n * 3 / 4); i++) {
cache.put(Integer.valueOf(i), dummy);
}
// access every 4th item in cache
for (int i = 0; i < n; i+=4) {
assertNotNull(cache.get(Integer.valueOf(i)));
}
testCache(new SimpleLRUCache(n), n);
}
}