LUCENE-3354: FieldCache can cache DocTermOrds.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1158393 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Martijn van Groningen 2011-08-16 17:44:16 +00:00
parent 7eab19aff7
commit 4dfa638783
5 changed files with 160 additions and 42 deletions

View File

@ -473,6 +473,8 @@ New features
different usecases like merging, flushing and reading.
(Simon Willnauer, Mike McCandless, Varun Thacker)
* LUCENE-3354: FieldCache can cache DocTermOrds. (Martijn van Groningen)
Optimizations
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms

View File

@ -17,6 +17,7 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.index.DocTermOrds;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.cache.EntryCreator;
@ -653,6 +654,18 @@ public interface FieldCache {
public DocTermsIndex getTermsIndex (IndexReader reader, String field, boolean fasterButMoreRAM)
throws IOException;
/**
* Checks the internal cache for an appropriate entry, and if none is found, reads the term values
* in <code>field</code> and returns a {@link DocTermOrds} instance, providing a method to retrieve
* the terms (as ords) per document.
*
* @param reader Used to build a {@link DocTermOrds} instance
* @param field Which field contains the strings.
* @return a {@link DocTermOrds} instance
* @throws IOException If any error occurs.
*/
public DocTermOrds getDocTermOrds(IndexReader reader, String field) throws IOException;
/**
* EXPERT: A unique Identifier/Description for each item in the FieldCache.
* Can be useful for logging/debugging.

View File

@ -17,31 +17,15 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.index.DocTermOrds;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.cache.*;
import org.apache.lucene.search.cache.CachedArray.*;
import org.apache.lucene.util.FieldCacheSanityChecker;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.WeakHashMap;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.cache.ByteValuesCreator;
import org.apache.lucene.search.cache.DocTermsCreator;
import org.apache.lucene.search.cache.DocTermsIndexCreator;
import org.apache.lucene.search.cache.DoubleValuesCreator;
import org.apache.lucene.search.cache.EntryCreator;
import org.apache.lucene.search.cache.FloatValuesCreator;
import org.apache.lucene.search.cache.IntValuesCreator;
import org.apache.lucene.search.cache.LongValuesCreator;
import org.apache.lucene.search.cache.ShortValuesCreator;
import org.apache.lucene.search.cache.CachedArray.ByteValues;
import org.apache.lucene.search.cache.CachedArray.DoubleValues;
import org.apache.lucene.search.cache.CachedArray.FloatValues;
import org.apache.lucene.search.cache.CachedArray.IntValues;
import org.apache.lucene.search.cache.CachedArray.LongValues;
import org.apache.lucene.search.cache.CachedArray.ShortValues;
import org.apache.lucene.util.FieldCacheSanityChecker;
import java.util.*;
/**
* Expert: The default cache implementation, storing all values in memory.
@ -70,6 +54,7 @@ public class FieldCacheImpl implements FieldCache { // Made Public so that
caches.put(Double.TYPE, new Cache<DoubleValues>(this));
caches.put(DocTermsIndex.class, new Cache<DocTermsIndex>(this));
caches.put(DocTerms.class, new Cache<DocTerms>(this));
caches.put(DocTermOrds.class, new Cache<DocTermOrds>(this));
}
public synchronized void purgeAllCaches() {
@ -393,6 +378,11 @@ public class FieldCacheImpl implements FieldCache { // Made Public so that
return (DocTerms)caches.get(DocTerms.class).get(reader, new Entry(field, creator));
}
@SuppressWarnings("unchecked")
public DocTermOrds getDocTermOrds(IndexReader reader, String field) throws IOException {
return (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new Entry(field, new DocTermOrdsCreator(field, 0)));
}
private volatile PrintStream infoStream;
public void setInfoStream(PrintStream stream) {

View File

@ -0,0 +1,51 @@
package org.apache.lucene.search.cache;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.DocTermOrds;
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
/**
* Creates {@link DocTermOrds} instances.
*/
public class DocTermOrdsCreator extends EntryCreatorWithOptions<DocTermOrds> {
private final String field;
public DocTermOrdsCreator(String field, int flag) {
super(flag);
this.field = field;
}
@Override
public DocTermOrds create(IndexReader reader) throws IOException {
return new DocTermOrds(reader, field);
}
@Override
public DocTermOrds validate(DocTermOrds entry, IndexReader reader) throws IOException {
return entry;
}
@Override
public EntryKey getCacheKey() {
return new SimpleEntryKey(DocTermOrdsCreator.class, field);
}
}

View File

@ -19,28 +19,33 @@ package org.apache.lucene.search;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.BytesRef;
import java.io.IOException;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
public class TestFieldCache extends LuceneTestCase {
protected IndexReader reader;
private int NUM_DOCS;
private int NUM_ORDS;
private String[] unicodeStrings;
private BytesRef[][] multiValued;
private Directory directory;
@Override
public void setUp() throws Exception {
super.setUp();
NUM_DOCS = atLeast(1000);
NUM_ORDS = atLeast(2);
directory = newDirectory();
RandomIndexWriter writer= new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
long theLong = Long.MAX_VALUE;
@ -50,6 +55,7 @@ public class TestFieldCache extends LuceneTestCase {
int theInt = Integer.MAX_VALUE;
float theFloat = Float.MAX_VALUE;
unicodeStrings = new String[NUM_DOCS];
multiValued = new BytesRef[NUM_DOCS][NUM_ORDS];
if (VERBOSE) {
System.out.println("TEST: setUp");
}
@ -65,21 +71,19 @@ public class TestFieldCache extends LuceneTestCase {
// sometimes skip the field:
if (random.nextInt(40) != 17) {
String s = null;
if (i > 0 && random.nextInt(3) == 1) {
// reuse past string -- try to find one that's not null
for(int iter=0;iter<10 && s==null;iter++) {
s = unicodeStrings[random.nextInt(i)];
}
if (s == null) {
s = _TestUtil.randomUnicodeString(random, 250);
}
} else {
s = _TestUtil.randomUnicodeString(random, 250);
}
unicodeStrings[i] = s;
unicodeStrings[i] = generateString(i);
doc.add(newField("theRandomUnicodeString", unicodeStrings[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
}
// sometimes skip the field:
if (random.nextInt(10) != 8) {
for (int j = 0; j < NUM_ORDS; j++) {
String newValue = generateString(i);
multiValued[i][j] = new BytesRef(newValue);
doc.add(newField("theRandomUnicodeMultiValuedField", newValue, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
}
Arrays.sort(multiValued[i]);
}
writer.addDocument(doc);
}
reader = writer.getReader();
@ -210,6 +214,47 @@ public class TestFieldCache extends LuceneTestCase {
// test bad field
terms = cache.getTerms(reader, "bogusfield");
// getDocTermOrds
DocTermOrds termOrds = cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField");
TermsEnum termsEnum = termOrds.getOrdTermsEnum(reader);
assertSame("Second request to cache return same DocTermOrds", termOrds, cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField"));
DocTermOrds.TermOrdsIterator reuse = null;
for (int i = 0; i < NUM_DOCS; i++) {
reuse = termOrds.lookup(i, reuse);
final int[] buffer = new int[5];
// This will remove identical terms. A DocTermOrds doesn't return duplicate ords for a docId
List<BytesRef> values = new ArrayList<BytesRef>(new LinkedHashSet<BytesRef>(Arrays.asList(multiValued[i])));
for (;;) {
int chunk = reuse.read(buffer);
if (chunk == 0) {
for (int ord = 0; ord < values.size(); ord++) {
BytesRef term = values.get(ord);
assertNull(String.format("Document[%d] misses field must be null. Has value %s for ord %d", i, term, ord), term);
}
break;
}
for(int idx=0; idx < chunk; idx++) {
int key = buffer[idx];
termsEnum.seekExact((long) key);
String actual = termsEnum.term().utf8ToString();
String expected = values.get(idx).utf8ToString();
if (!expected.equals(actual)) {
reuse = termOrds.lookup(i, reuse);
reuse.read(buffer);
}
assertTrue(String.format("Expected value %s for doc %d and ord %d, but was %s", expected, i, idx, actual), expected.equals(actual));
}
if (chunk < buffer.length) {
break;
}
}
}
// test bad field
termOrds = cache.getDocTermOrds(reader, "bogusfield");
FieldCache.DEFAULT.purge(reader);
}
@ -223,4 +268,21 @@ public class TestFieldCache extends LuceneTestCase {
r.close();
dir.close();
}
private String generateString(int i) {
String s = null;
if (i > 0 && random.nextInt(3) == 1) {
// reuse past string -- try to find one that's not null
for(int iter = 0; iter < 10 && s == null;iter++) {
s = unicodeStrings[random.nextInt(i)];
}
if (s == null) {
s = _TestUtil.randomUnicodeString(random, 250);
}
} else {
s = _TestUtil.randomUnicodeString(random, 250);
}
return s;
}
}