mirror of https://github.com/apache/lucene.git
LUCENE-3354: FieldCache can cache DocTermOrds.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1158393 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7eab19aff7
commit
4dfa638783
|
@ -473,6 +473,8 @@ New features
|
||||||
different usecases like merging, flushing and reading.
|
different usecases like merging, flushing and reading.
|
||||||
(Simon Willnauer, Mike McCandless, Varun Thacker)
|
(Simon Willnauer, Mike McCandless, Varun Thacker)
|
||||||
|
|
||||||
|
* LUCENE-3354: FieldCache can cache DocTermOrds. (Martijn van Groningen)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
|
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
|
||||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.search;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.index.DocTermOrds;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.search.cache.EntryCreator;
|
import org.apache.lucene.search.cache.EntryCreator;
|
||||||
|
@ -653,6 +654,18 @@ public interface FieldCache {
|
||||||
public DocTermsIndex getTermsIndex (IndexReader reader, String field, boolean fasterButMoreRAM)
|
public DocTermsIndex getTermsIndex (IndexReader reader, String field, boolean fasterButMoreRAM)
|
||||||
throws IOException;
|
throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks the internal cache for an appropriate entry and, if none is found, reads the term values
|
||||||
|
* in <code>field</code> and returns a {@link DocTermOrds} instance, which provides a method to retrieve
|
||||||
|
* the terms (as ords) for each document.
|
||||||
|
*
|
||||||
|
* @param reader Reader used to build the {@link DocTermOrds} instance.
|
||||||
|
* @param field Name of the field whose term values are read.
|
||||||
|
* @return the {@link DocTermOrds} instance for <code>field</code>
|
||||||
|
* @throws IOException If any error occurs.
|
||||||
|
*/
|
||||||
|
public DocTermOrds getDocTermOrds(IndexReader reader, String field) throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* EXPERT: A unique Identifier/Description for each item in the FieldCache.
|
* EXPERT: A unique Identifier/Description for each item in the FieldCache.
|
||||||
* Can be useful for logging/debugging.
|
* Can be useful for logging/debugging.
|
||||||
|
|
|
@ -17,31 +17,15 @@ package org.apache.lucene.search;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.index.DocTermOrds;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
import org.apache.lucene.search.cache.*;
|
||||||
|
import org.apache.lucene.search.cache.CachedArray.*;
|
||||||
|
import org.apache.lucene.util.FieldCacheSanityChecker;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.WeakHashMap;
|
|
||||||
|
|
||||||
import org.apache.lucene.index.IndexReader;
|
|
||||||
import org.apache.lucene.search.cache.ByteValuesCreator;
|
|
||||||
import org.apache.lucene.search.cache.DocTermsCreator;
|
|
||||||
import org.apache.lucene.search.cache.DocTermsIndexCreator;
|
|
||||||
import org.apache.lucene.search.cache.DoubleValuesCreator;
|
|
||||||
import org.apache.lucene.search.cache.EntryCreator;
|
|
||||||
import org.apache.lucene.search.cache.FloatValuesCreator;
|
|
||||||
import org.apache.lucene.search.cache.IntValuesCreator;
|
|
||||||
import org.apache.lucene.search.cache.LongValuesCreator;
|
|
||||||
import org.apache.lucene.search.cache.ShortValuesCreator;
|
|
||||||
import org.apache.lucene.search.cache.CachedArray.ByteValues;
|
|
||||||
import org.apache.lucene.search.cache.CachedArray.DoubleValues;
|
|
||||||
import org.apache.lucene.search.cache.CachedArray.FloatValues;
|
|
||||||
import org.apache.lucene.search.cache.CachedArray.IntValues;
|
|
||||||
import org.apache.lucene.search.cache.CachedArray.LongValues;
|
|
||||||
import org.apache.lucene.search.cache.CachedArray.ShortValues;
|
|
||||||
import org.apache.lucene.util.FieldCacheSanityChecker;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Expert: The default cache implementation, storing all values in memory.
|
* Expert: The default cache implementation, storing all values in memory.
|
||||||
|
@ -70,6 +54,7 @@ public class FieldCacheImpl implements FieldCache { // Made Public so that
|
||||||
caches.put(Double.TYPE, new Cache<DoubleValues>(this));
|
caches.put(Double.TYPE, new Cache<DoubleValues>(this));
|
||||||
caches.put(DocTermsIndex.class, new Cache<DocTermsIndex>(this));
|
caches.put(DocTermsIndex.class, new Cache<DocTermsIndex>(this));
|
||||||
caches.put(DocTerms.class, new Cache<DocTerms>(this));
|
caches.put(DocTerms.class, new Cache<DocTerms>(this));
|
||||||
|
caches.put(DocTermOrds.class, new Cache<DocTermOrds>(this));
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized void purgeAllCaches() {
|
public synchronized void purgeAllCaches() {
|
||||||
|
@ -393,6 +378,11 @@ public class FieldCacheImpl implements FieldCache { // Made Public so that
|
||||||
return (DocTerms)caches.get(DocTerms.class).get(reader, new Entry(field, creator));
|
return (DocTerms)caches.get(DocTerms.class).get(reader, new Entry(field, creator));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
public DocTermOrds getDocTermOrds(IndexReader reader, String field) throws IOException {
|
||||||
|
return (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new Entry(field, new DocTermOrdsCreator(field, 0)));
|
||||||
|
}
|
||||||
|
|
||||||
private volatile PrintStream infoStream;
|
private volatile PrintStream infoStream;
|
||||||
|
|
||||||
public void setInfoStream(PrintStream stream) {
|
public void setInfoStream(PrintStream stream) {
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
package org.apache.lucene.search.cache;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.index.DocTermOrds;
|
||||||
|
import org.apache.lucene.index.IndexReader;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates {@link DocTermOrds} instances.
|
||||||
|
*/
|
||||||
|
public class DocTermOrdsCreator extends EntryCreatorWithOptions<DocTermOrds> {
|
||||||
|
|
||||||
|
private final String field;
|
||||||
|
|
||||||
|
public DocTermOrdsCreator(String field, int flag) {
|
||||||
|
super(flag);
|
||||||
|
this.field = field;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocTermOrds create(IndexReader reader) throws IOException {
|
||||||
|
return new DocTermOrds(reader, field);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DocTermOrds validate(DocTermOrds entry, IndexReader reader) throws IOException {
|
||||||
|
return entry;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public EntryKey getCacheKey() {
|
||||||
|
return new SimpleEntryKey(DocTermOrdsCreator.class, field);
|
||||||
|
}
|
||||||
|
}
|
|
@ -19,28 +19,33 @@ package org.apache.lucene.search;
|
||||||
import org.apache.lucene.analysis.MockAnalyzer;
|
import org.apache.lucene.analysis.MockAnalyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.*;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
|
||||||
import org.apache.lucene.index.RandomIndexWriter;
|
|
||||||
import org.apache.lucene.index.TermsEnum;
|
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util._TestUtil;
|
import org.apache.lucene.util._TestUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.LinkedHashSet;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
public class TestFieldCache extends LuceneTestCase {
|
public class TestFieldCache extends LuceneTestCase {
|
||||||
protected IndexReader reader;
|
protected IndexReader reader;
|
||||||
private int NUM_DOCS;
|
private int NUM_DOCS;
|
||||||
|
private int NUM_ORDS;
|
||||||
private String[] unicodeStrings;
|
private String[] unicodeStrings;
|
||||||
|
private BytesRef[][] multiValued;
|
||||||
private Directory directory;
|
private Directory directory;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
super.setUp();
|
super.setUp();
|
||||||
NUM_DOCS = atLeast(1000);
|
NUM_DOCS = atLeast(1000);
|
||||||
|
NUM_ORDS = atLeast(2);
|
||||||
directory = newDirectory();
|
directory = newDirectory();
|
||||||
RandomIndexWriter writer= new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
RandomIndexWriter writer= new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
||||||
long theLong = Long.MAX_VALUE;
|
long theLong = Long.MAX_VALUE;
|
||||||
|
@ -50,6 +55,7 @@ public class TestFieldCache extends LuceneTestCase {
|
||||||
int theInt = Integer.MAX_VALUE;
|
int theInt = Integer.MAX_VALUE;
|
||||||
float theFloat = Float.MAX_VALUE;
|
float theFloat = Float.MAX_VALUE;
|
||||||
unicodeStrings = new String[NUM_DOCS];
|
unicodeStrings = new String[NUM_DOCS];
|
||||||
|
multiValued = new BytesRef[NUM_DOCS][NUM_ORDS];
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println("TEST: setUp");
|
System.out.println("TEST: setUp");
|
||||||
}
|
}
|
||||||
|
@ -65,21 +71,19 @@ public class TestFieldCache extends LuceneTestCase {
|
||||||
|
|
||||||
// sometimes skip the field:
|
// sometimes skip the field:
|
||||||
if (random.nextInt(40) != 17) {
|
if (random.nextInt(40) != 17) {
|
||||||
String s = null;
|
unicodeStrings[i] = generateString(i);
|
||||||
if (i > 0 && random.nextInt(3) == 1) {
|
|
||||||
// reuse past string -- try to find one that's not null
|
|
||||||
for(int iter=0;iter<10 && s==null;iter++) {
|
|
||||||
s = unicodeStrings[random.nextInt(i)];
|
|
||||||
}
|
|
||||||
if (s == null) {
|
|
||||||
s = _TestUtil.randomUnicodeString(random, 250);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
s = _TestUtil.randomUnicodeString(random, 250);
|
|
||||||
}
|
|
||||||
unicodeStrings[i] = s;
|
|
||||||
doc.add(newField("theRandomUnicodeString", unicodeStrings[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
|
doc.add(newField("theRandomUnicodeString", unicodeStrings[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sometimes skip the field:
|
||||||
|
if (random.nextInt(10) != 8) {
|
||||||
|
for (int j = 0; j < NUM_ORDS; j++) {
|
||||||
|
String newValue = generateString(i);
|
||||||
|
multiValued[i][j] = new BytesRef(newValue);
|
||||||
|
doc.add(newField("theRandomUnicodeMultiValuedField", newValue, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
|
||||||
|
}
|
||||||
|
Arrays.sort(multiValued[i]);
|
||||||
|
}
|
||||||
writer.addDocument(doc);
|
writer.addDocument(doc);
|
||||||
}
|
}
|
||||||
reader = writer.getReader();
|
reader = writer.getReader();
|
||||||
|
@ -210,6 +214,47 @@ public class TestFieldCache extends LuceneTestCase {
|
||||||
// test bad field
|
// test bad field
|
||||||
terms = cache.getTerms(reader, "bogusfield");
|
terms = cache.getTerms(reader, "bogusfield");
|
||||||
|
|
||||||
|
// getDocTermOrds
|
||||||
|
DocTermOrds termOrds = cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField");
|
||||||
|
TermsEnum termsEnum = termOrds.getOrdTermsEnum(reader);
|
||||||
|
assertSame("Second request to cache return same DocTermOrds", termOrds, cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField"));
|
||||||
|
DocTermOrds.TermOrdsIterator reuse = null;
|
||||||
|
for (int i = 0; i < NUM_DOCS; i++) {
|
||||||
|
reuse = termOrds.lookup(i, reuse);
|
||||||
|
final int[] buffer = new int[5];
|
||||||
|
// This will remove identical terms. A DocTermOrds doesn't return duplicate ords for a docId
|
||||||
|
List<BytesRef> values = new ArrayList<BytesRef>(new LinkedHashSet<BytesRef>(Arrays.asList(multiValued[i])));
|
||||||
|
for (;;) {
|
||||||
|
int chunk = reuse.read(buffer);
|
||||||
|
if (chunk == 0) {
|
||||||
|
for (int ord = 0; ord < values.size(); ord++) {
|
||||||
|
BytesRef term = values.get(ord);
|
||||||
|
assertNull(String.format("Document[%d] misses field must be null. Has value %s for ord %d", i, term, ord), term);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int idx=0; idx < chunk; idx++) {
|
||||||
|
int key = buffer[idx];
|
||||||
|
termsEnum.seekExact((long) key);
|
||||||
|
String actual = termsEnum.term().utf8ToString();
|
||||||
|
String expected = values.get(idx).utf8ToString();
|
||||||
|
if (!expected.equals(actual)) {
|
||||||
|
reuse = termOrds.lookup(i, reuse);
|
||||||
|
reuse.read(buffer);
|
||||||
|
}
|
||||||
|
assertTrue(String.format("Expected value %s for doc %d and ord %d, but was %s", expected, i, idx, actual), expected.equals(actual));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (chunk < buffer.length) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// test bad field
|
||||||
|
termOrds = cache.getDocTermOrds(reader, "bogusfield");
|
||||||
|
|
||||||
FieldCache.DEFAULT.purge(reader);
|
FieldCache.DEFAULT.purge(reader);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -223,4 +268,21 @@ public class TestFieldCache extends LuceneTestCase {
|
||||||
r.close();
|
r.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String generateString(int i) {
|
||||||
|
String s = null;
|
||||||
|
if (i > 0 && random.nextInt(3) == 1) {
|
||||||
|
// reuse past string -- try to find one that's not null
|
||||||
|
for(int iter = 0; iter < 10 && s == null;iter++) {
|
||||||
|
s = unicodeStrings[random.nextInt(i)];
|
||||||
|
}
|
||||||
|
if (s == null) {
|
||||||
|
s = _TestUtil.randomUnicodeString(random, 250);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
s = _TestUtil.randomUnicodeString(random, 250);
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue