LUCENE-1195: Improve term lookup performance by adding an LRU cache to the TermInfosReader.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@659602 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Busch 2008-05-23 17:20:31 +00:00
parent a7f1cbc5be
commit eb0596c721
7 changed files with 414 additions and 38 deletions

View File

@ -183,6 +183,11 @@ Optimizations
5. LUCENE-1183: Optimized TRStringDistance class (in contrib/spell) that uses 5. LUCENE-1183: Optimized TRStringDistance class (in contrib/spell) that uses
less memory than the previous version. (Cédrik LIME via Otis Gospodnetic) less memory than the previous version. (Cédrik LIME via Otis Gospodnetic)
6. LUCENE-1195: Improve term lookup performance by adding an LRU cache to the
TermInfosReader. In performance experiments the speedup was about 25% on
average on mid-size indexes with ~500,000 documents for queries with 3
terms and about 7% on larger indexes with ~4.3M documents. (Michael Busch)
Documentation Documentation
1. LUCENE-1236: Added some clarifying remarks to EdgeNGram*.java (Hiroaki Kawai via Grant Ingersoll) 1. LUCENE-1236: Added some clarifying remarks to EdgeNGram*.java (Hiroaki Kawai via Grant Ingersoll)

View File

@ -150,10 +150,15 @@ final class SegmentTermEnum extends TermEnum implements Cloneable {
return true; return true;
} }
/** Optimized scan, without allocating new terms. */ /** Optimized scan, without allocating new terms.
final void scanTo(Term term) throws IOException { * Return number of invocations to next(). */
final int scanTo(Term term) throws IOException {
scanBuffer.set(term); scanBuffer.set(term);
while (scanBuffer.compareTo(termBuffer) > 0 && next()) {} int count = 0;
while (scanBuffer.compareTo(termBuffer) > 0 && next()) {
count++;
}
return count;
} }
/** Returns the current Term in the enumeration. /** Returns the current Term in the enumeration.

View File

@ -21,6 +21,8 @@ import java.io.IOException;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.BufferedIndexInput;
import org.apache.lucene.util.cache.Cache;
import org.apache.lucene.util.cache.SimpleLRUCache;
/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
* Directory. Pairs are accessed either by Term or by ordinal position the * Directory. Pairs are accessed either by Term or by ordinal position the
@ -31,7 +33,7 @@ final class TermInfosReader {
private String segment; private String segment;
private FieldInfos fieldInfos; private FieldInfos fieldInfos;
private ThreadLocal enumerators = new ThreadLocal(); private ThreadLocal threadResources = new ThreadLocal();
private SegmentTermEnum origEnum; private SegmentTermEnum origEnum;
private long size; private long size;
@ -44,6 +46,18 @@ final class TermInfosReader {
private int indexDivisor = 1; private int indexDivisor = 1;
private int totalIndexInterval; private int totalIndexInterval;
private final static int DEFAULT_CACHE_SIZE = 1024;
/**
* Per-thread resources managed by ThreadLocal. One instance holds
* everything a single thread keeps for itself: its own term enumerator
* and its own TermInfo cache.
*/
private static final class ThreadResources {
// The term enumerator used by this thread
SegmentTermEnum termEnum;
// Used for caching the TermInfos of the most recently looked-up Terms
Cache termInfoCache;
}
TermInfosReader(Directory dir, String seg, FieldInfos fis) TermInfosReader(Directory dir, String seg, FieldInfos fis)
throws CorruptIndexException, IOException { throws CorruptIndexException, IOException {
this(dir, seg, fis, BufferedIndexInput.BUFFER_SIZE); this(dir, seg, fis, BufferedIndexInput.BUFFER_SIZE);
@ -129,7 +143,7 @@ final class TermInfosReader {
origEnum.close(); origEnum.close();
if (indexEnum != null) if (indexEnum != null)
indexEnum.close(); indexEnum.close();
enumerators.set(null); threadResources.set(null);
} }
/** Returns the number of term/value pairs in the set. */ /** Returns the number of term/value pairs in the set. */
@ -137,13 +151,16 @@ final class TermInfosReader {
return size; return size;
} }
private SegmentTermEnum getEnum() { private ThreadResources getThreadResources() {
SegmentTermEnum termEnum = (SegmentTermEnum)enumerators.get(); ThreadResources resources = (ThreadResources)threadResources.get();
if (termEnum == null) { if (resources == null) {
termEnum = terms(); resources = new ThreadResources();
enumerators.set(termEnum); resources.termEnum = terms();
// Cache does not have to be thread-safe, it is only used by one thread at the same time
resources.termInfoCache = new SimpleLRUCache(DEFAULT_CACHE_SIZE);
threadResources.set(resources);
} }
return termEnum; return resources;
} }
private synchronized void ensureIndexIsRead() throws IOException { private synchronized void ensureIndexIsRead() throws IOException {
@ -189,60 +206,94 @@ final class TermInfosReader {
return hi; return hi;
} }
private final void seekEnum(int indexOffset) throws IOException { private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
getEnum().seek(indexPointers[indexOffset], enumerator.seek(indexPointers[indexOffset],
(indexOffset * totalIndexInterval) - 1, (indexOffset * totalIndexInterval) - 1,
indexTerms[indexOffset], indexInfos[indexOffset]); indexTerms[indexOffset], indexInfos[indexOffset]);
} }
/** Returns the TermInfo for a Term in the set, or null. */ /** Returns the TermInfo for a Term in the set, or null. */
TermInfo get(Term term) throws IOException { TermInfo get(Term term) throws IOException {
return get(term, true);
}
/** Returns the TermInfo for a Term in the set, or null. */
private TermInfo get(Term term, boolean useCache) throws IOException {
if (size == 0) return null; if (size == 0) return null;
ensureIndexIsRead(); ensureIndexIsRead();
TermInfo ti;
ThreadResources resources = getThreadResources();
Cache cache = null;
if (useCache) {
cache = resources.termInfoCache;
// check the cache first if the term was recently looked up
ti = (TermInfo) cache.get(term);
if (ti != null) {
return ti;
}
}
// optimize sequential access: first try scanning cached enum w/o seeking // optimize sequential access: first try scanning cached enum w/o seeking
SegmentTermEnum enumerator = getEnum(); SegmentTermEnum enumerator = resources.termEnum;
if (enumerator.term() != null // term is at or past current if (enumerator.term() != null // term is at or past current
&& ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0) && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
|| term.compareTo(enumerator.term()) >= 0)) { || term.compareTo(enumerator.term()) >= 0)) {
int enumOffset = (int)(enumerator.position/totalIndexInterval)+1; int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
if (indexTerms.length == enumOffset // but before end of block if (indexTerms.length == enumOffset // but before end of block
|| term.compareTo(indexTerms[enumOffset]) < 0) || term.compareTo(indexTerms[enumOffset]) < 0) {
return scanEnum(term); // no need to seek // no need to seek
int numScans = enumerator.scanTo(term);
if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
ti = enumerator.termInfo();
if (cache != null && numScans > 1) {
// we only want to put this TermInfo into the cache if
// scanTo advanced over more than one dictionary entry.
// This prevents RangeQueries or WildcardQueries from
// wiping out the cache when they iterate over a large number
// of terms in order
cache.put(term, ti);
}
} else {
ti = null;
}
return ti;
}
} }
// random-access: must seek // random-access: must seek
seekEnum(getIndexOffset(term)); seekEnum(enumerator, getIndexOffset(term));
return scanEnum(term);
}
/** Scans within block for matching term. */
private final TermInfo scanEnum(Term term) throws IOException {
SegmentTermEnum enumerator = getEnum();
enumerator.scanTo(term); enumerator.scanTo(term);
if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
return enumerator.termInfo(); ti = enumerator.termInfo();
else if (cache != null) {
return null; cache.put(term, ti);
}
} else {
ti = null;
}
return ti;
} }
/** Returns the nth term in the set. */ /** Returns the nth term in the set. */
final Term get(int position) throws IOException { final Term get(int position) throws IOException {
if (size == 0) return null; if (size == 0) return null;
SegmentTermEnum enumerator = getEnum(); SegmentTermEnum enumerator = getThreadResources().termEnum;
if (enumerator != null && enumerator.term() != null && if (enumerator != null && enumerator.term() != null &&
position >= enumerator.position && position >= enumerator.position &&
position < (enumerator.position + totalIndexInterval)) position < (enumerator.position + totalIndexInterval))
return scanEnum(position); // can avoid seek return scanEnum(enumerator, position); // can avoid seek
seekEnum(position/totalIndexInterval); // must seek seekEnum(enumerator, position/totalIndexInterval); // must seek
return scanEnum(position); return scanEnum(enumerator, position);
} }
private final Term scanEnum(int position) throws IOException { private final Term scanEnum(SegmentTermEnum enumerator, int position) throws IOException {
SegmentTermEnum enumerator = getEnum();
while(enumerator.position < position) while(enumerator.position < position)
if (!enumerator.next()) if (!enumerator.next())
return null; return null;
@ -256,9 +307,10 @@ final class TermInfosReader {
ensureIndexIsRead(); ensureIndexIsRead();
int indexOffset = getIndexOffset(term); int indexOffset = getIndexOffset(term);
seekEnum(indexOffset);
SegmentTermEnum enumerator = getEnum(); SegmentTermEnum enumerator = getThreadResources().termEnum;
seekEnum(enumerator, indexOffset);
while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {} while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {}
if (term.compareTo(enumerator.term()) == 0) if (term.compareTo(enumerator.term()) == 0)
@ -274,7 +326,9 @@ final class TermInfosReader {
/** Returns an enumeration of terms starting at or after the named term. */ /** Returns an enumeration of terms starting at or after the named term. */
public SegmentTermEnum terms(Term term) throws IOException { public SegmentTermEnum terms(Term term) throws IOException {
get(term); // don't use the cache in this call because we want to reposition the
return (SegmentTermEnum)getEnum().clone(); // enumeration
get(term, false);
return (SegmentTermEnum)getThreadResources().termEnum.clone();
} }
} }

View File

@ -0,0 +1,100 @@
package org.apache.lucene.util.cache;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Abstract base class that every cache implementation extends.
 */
public abstract class Cache {

  /**
   * Stores the given (key, value) pair in the cache.
   */
  public abstract void put(Object key, Object value);

  /**
   * Looks up the value stored for the given key.
   */
  public abstract Object get(Object key);

  /**
   * Tells whether the cache currently holds the given key.
   */
  public abstract boolean containsKey(Object key);

  /**
   * Closes the cache.
   */
  public abstract void close();

  /**
   * Returns a synchronized view of the given cache. The actual wrapper is
   * chosen by the cache itself through {@link #getSynchronizedCache()}.
   */
  public static Cache synchronizedCache(Cache cache) {
    return cache.getSynchronizedCache();
  }

  /**
   * Hook used by {@link #synchronizedCache(Cache)}. The default
   * implementation wraps this instance in a new {@link SynchronizedCache}.
   * Implementations may override it to return e. g. a subclass of
   * {@link SynchronizedCache}, or this instance itself when it is already
   * synchronized.
   */
  Cache getSynchronizedCache() {
    return new SynchronizedCache(this);
  }

  /**
   * Decorator that delegates to another cache and performs every
   * delegated call while holding a lock on a mutex object.
   */
  static class SynchronizedCache extends Cache {
    Object mutex;
    Cache cache;

    /** Wraps the delegate and uses this wrapper itself as the mutex. */
    SynchronizedCache(Cache cache) {
      this.cache = cache;
      this.mutex = this;
    }

    /** Wraps the delegate and locks on the supplied mutex. */
    SynchronizedCache(Cache cache, Object mutex) {
      this.cache = cache;
      this.mutex = mutex;
    }

    public void put(Object key, Object value) {
      synchronized (mutex) {
        cache.put(key, value);
      }
    }

    public Object get(Object key) {
      final Object value;
      synchronized (mutex) {
        value = cache.get(key);
      }
      return value;
    }

    public boolean containsKey(Object key) {
      final boolean present;
      synchronized (mutex) {
        present = cache.containsKey(key);
      }
      return present;
    }

    public void close() {
      synchronized (mutex) {
        cache.close();
      }
    }

    /** Already synchronized, so no additional wrapper is created. */
    Cache getSynchronizedCache() {
      return this;
    }
  }
}

View File

@ -0,0 +1,49 @@
package org.apache.lucene.util.cache;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * LRU cache backed by an access-ordered {@link LinkedHashMap}. Whenever
 * an insertion makes the map larger than the configured cache size, the
 * eldest entry in access order is removed.
 * This cache is not synchronized, use {@link Cache#synchronizedCache(Cache)}
 * if needed.
 */
public class SimpleLRUCache extends SimpleMapCache {
  private final static float LOADFACTOR = 0.75f;

  private int cacheSize;

  /**
   * Creates a least-recently-used cache with the specified size.
   */
  public SimpleLRUCache(int cacheSize) {
    // the backing map reads this instance's cacheSize, so it can only be
    // created after the super constructor has run
    super(null);
    this.cacheSize = cacheSize;
    final int initialCapacity = 1 + (int) Math.ceil(cacheSize / LOADFACTOR);
    this.map = new EvictingMap(initialCapacity);
  }

  /**
   * Access-ordered map that asks for the eldest entry to be dropped as
   * soon as it holds more entries than the enclosing cache allows.
   */
  private final class EvictingMap extends LinkedHashMap {
    EvictingMap(int initialCapacity) {
      super(initialCapacity, LOADFACTOR, true);
    }

    protected boolean removeEldestEntry(Map.Entry eldest) {
      return SimpleLRUCache.this.cacheSize < size();
    }
  }
}

View File

@ -0,0 +1,100 @@
package org.apache.lucene.util.cache;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
/**
 * Cache implementation that keeps its (key, value) pairs in a {@link Map};
 * a HashMap is used unless another map is passed to the constructor.
 * This cache is not synchronized, use {@link Cache#synchronizedCache(Cache)}
 * if needed.
 */
public class SimpleMapCache extends Cache {
  Map map;

  /** Creates a cache backed by a new HashMap. */
  public SimpleMapCache() {
    this(new HashMap());
  }

  /** Creates a cache backed by the given map. */
  public SimpleMapCache(Map map) {
    this.map = map;
  }

  public void put(Object key, Object value) {
    map.put(key, value);
  }

  public Object get(Object key) {
    return map.get(key);
  }

  public boolean containsKey(Object key) {
    return map.containsKey(key);
  }

  public void close() {
    // nothing to release
  }

  /**
   * Returns the key set of the backing map, i. e. all keys in this cache.
   */
  public Set keySet() {
    return map.keySet();
  }

  /** Wraps this cache in a synchronized decorator that also exposes keySet(). */
  Cache getSynchronizedCache() {
    return new SynchronizedSimpleMapCache(this);
  }

  /**
   * Synchronized decorator for a SimpleMapCache. Every call is forwarded
   * to the wrapped cache while holding the lock on the mutex, which is
   * this decorator itself.
   */
  private static class SynchronizedSimpleMapCache extends SimpleMapCache {
    Object mutex;
    SimpleMapCache cache;

    SynchronizedSimpleMapCache(SimpleMapCache cache) {
      this.cache = cache;
      this.mutex = this;
    }

    public void put(Object key, Object value) {
      synchronized (mutex) {
        cache.put(key, value);
      }
    }

    public Object get(Object key) {
      final Object value;
      synchronized (mutex) {
        value = cache.get(key);
      }
      return value;
    }

    public boolean containsKey(Object key) {
      final boolean present;
      synchronized (mutex) {
        present = cache.containsKey(key);
      }
      return present;
    }

    public void close() {
      synchronized (mutex) {
        cache.close();
      }
    }

    public Set keySet() {
      // only the call that obtains the key set is made under the lock
      final Set keys;
      synchronized (mutex) {
        keys = cache.keySet();
      }
      return keys;
    }

    /** Already synchronized, so no additional wrapper is created. */
    Cache getSynchronizedCache() {
      return this;
    }
  }
}

View File

@ -0,0 +1,63 @@
package org.apache.lucene.util.cache;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
/**
 * Checks that {@link SimpleLRUCache} keeps the entries that were accessed
 * recently and evicts the untouched ones once more entries are added than
 * the cache can hold.
 */
public class TestSimpleLRUCache extends TestCase {

  public void testLRUCache() throws Exception {
    final int n = 100;
    Object dummy = new Object();

    Cache cache = new SimpleLRUCache(n);

    // fill the cache exactly up to its size
    for (int i = 0; i < n; i++) {
      cache.put(new Integer(i), dummy);
    }

    // access every 2nd item in cache
    for (int i = 0; i < n; i+=2) {
      assertNotNull(cache.get(new Integer(i)));
    }

    // add n/2 elements to cache, the ones that weren't
    // touched in the previous loop should now be thrown away
    for (int i = n; i < n + (n / 2); i++) {
      cache.put(new Integer(i), dummy);
    }

    // the odd keys were never accessed, so they must be gone now;
    // looking up an absent key does not change the access order
    for (int i = 1; i < n; i+=2) {
      assertNull(cache.get(new Integer(i)));
    }

    // access every 4th item in cache
    for (int i = 0; i < n; i+=4) {
      assertNotNull(cache.get(new Integer(i)));
    }

    // put 3/4n elements into the cache; the first n/2 of these keys are
    // already cached, so only n/4 entries are new. The ones that weren't
    // touched in the previous loops should now be thrown away
    for (int i = n; i < n + (n * 3 / 4); i++) {
      cache.put(new Integer(i), dummy);
    }

    // every 4th item must have survived
    for (int i = 0; i < n; i+=4) {
      assertNotNull(cache.get(new Integer(i)));
    }

    // the even keys that are not a multiple of 4 were the eldest
    // entries and must have been evicted
    for (int i = 2; i < n; i+=4) {
      assertNull(cache.get(new Integer(i)));
    }
  }
}