mirror of https://github.com/apache/lucene.git
LUCENE-3354: FieldCache can cache DocTermOrds.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1158393 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
7eab19aff7
commit
4dfa638783
|
@ -473,6 +473,8 @@ New features
|
|||
different usecases like merging, flushing and reading.
|
||||
(Simon Willnauer, Mike McCandless, Varun Thacker)
|
||||
|
||||
* LUCENE-3354: FieldCache can cache DocTermOrds. (Martijn van Groningen)
|
||||
|
||||
Optimizations
|
||||
|
||||
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
|
||||
|
|
|
@ -17,6 +17,7 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.DocTermOrds;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.search.cache.EntryCreator;
|
||||
|
@ -653,6 +654,18 @@ public interface FieldCache {
|
|||
public DocTermsIndex getTermsIndex (IndexReader reader, String field, boolean fasterButMoreRAM)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Checks the internal cache for an appropriate entry, and if none is found, reads the term values
|
||||
* in <code>field</code> and returns a {@link DocTermOrds} instance, providing a method to retrieve
|
||||
* the terms (as ords) per document.
|
||||
*
|
||||
* @param reader Used to build a {@link DocTermOrds} instance
|
||||
* @param field Which field contains the strings.
|
||||
* @return a {@link DocTermOrds} instance
|
||||
* @throws IOException If any error occurs.
|
||||
*/
|
||||
public DocTermOrds getDocTermOrds(IndexReader reader, String field) throws IOException;
|
||||
|
||||
/**
|
||||
* EXPERT: A unique Identifier/Description for each item in the FieldCache.
|
||||
* Can be useful for logging/debugging.
|
||||
|
|
|
@ -17,31 +17,15 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.DocTermOrds;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.cache.*;
|
||||
import org.apache.lucene.search.cache.CachedArray.*;
|
||||
import org.apache.lucene.util.FieldCacheSanityChecker;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.WeakHashMap;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.cache.ByteValuesCreator;
|
||||
import org.apache.lucene.search.cache.DocTermsCreator;
|
||||
import org.apache.lucene.search.cache.DocTermsIndexCreator;
|
||||
import org.apache.lucene.search.cache.DoubleValuesCreator;
|
||||
import org.apache.lucene.search.cache.EntryCreator;
|
||||
import org.apache.lucene.search.cache.FloatValuesCreator;
|
||||
import org.apache.lucene.search.cache.IntValuesCreator;
|
||||
import org.apache.lucene.search.cache.LongValuesCreator;
|
||||
import org.apache.lucene.search.cache.ShortValuesCreator;
|
||||
import org.apache.lucene.search.cache.CachedArray.ByteValues;
|
||||
import org.apache.lucene.search.cache.CachedArray.DoubleValues;
|
||||
import org.apache.lucene.search.cache.CachedArray.FloatValues;
|
||||
import org.apache.lucene.search.cache.CachedArray.IntValues;
|
||||
import org.apache.lucene.search.cache.CachedArray.LongValues;
|
||||
import org.apache.lucene.search.cache.CachedArray.ShortValues;
|
||||
import org.apache.lucene.util.FieldCacheSanityChecker;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Expert: The default cache implementation, storing all values in memory.
|
||||
|
@ -70,6 +54,7 @@ public class FieldCacheImpl implements FieldCache { // Made Public so that
|
|||
caches.put(Double.TYPE, new Cache<DoubleValues>(this));
|
||||
caches.put(DocTermsIndex.class, new Cache<DocTermsIndex>(this));
|
||||
caches.put(DocTerms.class, new Cache<DocTerms>(this));
|
||||
caches.put(DocTermOrds.class, new Cache<DocTermOrds>(this));
|
||||
}
|
||||
|
||||
public synchronized void purgeAllCaches() {
|
||||
|
@ -393,6 +378,11 @@ public class FieldCacheImpl implements FieldCache { // Made Public so that
|
|||
return (DocTerms)caches.get(DocTerms.class).get(reader, new Entry(field, creator));
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public DocTermOrds getDocTermOrds(IndexReader reader, String field) throws IOException {
|
||||
return (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new Entry(field, new DocTermOrdsCreator(field, 0)));
|
||||
}
|
||||
|
||||
private volatile PrintStream infoStream;
|
||||
|
||||
public void setInfoStream(PrintStream stream) {
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
package org.apache.lucene.search.cache;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.DocTermOrds;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Creates {@link DocTermOrds} instances.
|
||||
*/
|
||||
public class DocTermOrdsCreator extends EntryCreatorWithOptions<DocTermOrds> {
|
||||
|
||||
private final String field;
|
||||
|
||||
public DocTermOrdsCreator(String field, int flag) {
|
||||
super(flag);
|
||||
this.field = field;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocTermOrds create(IndexReader reader) throws IOException {
|
||||
return new DocTermOrds(reader, field);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocTermOrds validate(DocTermOrds entry, IndexReader reader) throws IOException {
|
||||
return entry;
|
||||
}
|
||||
|
||||
@Override
|
||||
public EntryKey getCacheKey() {
|
||||
return new SimpleEntryKey(DocTermOrdsCreator.class, field);
|
||||
}
|
||||
}
|
|
@ -19,28 +19,33 @@ package org.apache.lucene.search;
|
|||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util._TestUtil;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import java.io.IOException;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
|
||||
public class TestFieldCache extends LuceneTestCase {
|
||||
protected IndexReader reader;
|
||||
private int NUM_DOCS;
|
||||
private int NUM_ORDS;
|
||||
private String[] unicodeStrings;
|
||||
private BytesRef[][] multiValued;
|
||||
private Directory directory;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
NUM_DOCS = atLeast(1000);
|
||||
NUM_ORDS = atLeast(2);
|
||||
directory = newDirectory();
|
||||
RandomIndexWriter writer= new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
||||
long theLong = Long.MAX_VALUE;
|
||||
|
@ -50,6 +55,7 @@ public class TestFieldCache extends LuceneTestCase {
|
|||
int theInt = Integer.MAX_VALUE;
|
||||
float theFloat = Float.MAX_VALUE;
|
||||
unicodeStrings = new String[NUM_DOCS];
|
||||
multiValued = new BytesRef[NUM_DOCS][NUM_ORDS];
|
||||
if (VERBOSE) {
|
||||
System.out.println("TEST: setUp");
|
||||
}
|
||||
|
@ -65,21 +71,19 @@ public class TestFieldCache extends LuceneTestCase {
|
|||
|
||||
// sometimes skip the field:
|
||||
if (random.nextInt(40) != 17) {
|
||||
String s = null;
|
||||
if (i > 0 && random.nextInt(3) == 1) {
|
||||
// reuse past string -- try to find one that's not null
|
||||
for(int iter=0;iter<10 && s==null;iter++) {
|
||||
s = unicodeStrings[random.nextInt(i)];
|
||||
}
|
||||
if (s == null) {
|
||||
s = _TestUtil.randomUnicodeString(random, 250);
|
||||
}
|
||||
} else {
|
||||
s = _TestUtil.randomUnicodeString(random, 250);
|
||||
}
|
||||
unicodeStrings[i] = s;
|
||||
unicodeStrings[i] = generateString(i);
|
||||
doc.add(newField("theRandomUnicodeString", unicodeStrings[i], Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
|
||||
}
|
||||
|
||||
// sometimes skip the field:
|
||||
if (random.nextInt(10) != 8) {
|
||||
for (int j = 0; j < NUM_ORDS; j++) {
|
||||
String newValue = generateString(i);
|
||||
multiValued[i][j] = new BytesRef(newValue);
|
||||
doc.add(newField("theRandomUnicodeMultiValuedField", newValue, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));
|
||||
}
|
||||
Arrays.sort(multiValued[i]);
|
||||
}
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
reader = writer.getReader();
|
||||
|
@ -210,6 +214,47 @@ public class TestFieldCache extends LuceneTestCase {
|
|||
// test bad field
|
||||
terms = cache.getTerms(reader, "bogusfield");
|
||||
|
||||
// getDocTermOrds
|
||||
DocTermOrds termOrds = cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField");
|
||||
TermsEnum termsEnum = termOrds.getOrdTermsEnum(reader);
|
||||
assertSame("Second request to cache return same DocTermOrds", termOrds, cache.getDocTermOrds(reader, "theRandomUnicodeMultiValuedField"));
|
||||
DocTermOrds.TermOrdsIterator reuse = null;
|
||||
for (int i = 0; i < NUM_DOCS; i++) {
|
||||
reuse = termOrds.lookup(i, reuse);
|
||||
final int[] buffer = new int[5];
|
||||
// This will remove identical terms. A DocTermOrds doesn't return duplicate ords for a docId
|
||||
List<BytesRef> values = new ArrayList<BytesRef>(new LinkedHashSet<BytesRef>(Arrays.asList(multiValued[i])));
|
||||
for (;;) {
|
||||
int chunk = reuse.read(buffer);
|
||||
if (chunk == 0) {
|
||||
for (int ord = 0; ord < values.size(); ord++) {
|
||||
BytesRef term = values.get(ord);
|
||||
assertNull(String.format("Document[%d] misses field must be null. Has value %s for ord %d", i, term, ord), term);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
for(int idx=0; idx < chunk; idx++) {
|
||||
int key = buffer[idx];
|
||||
termsEnum.seekExact((long) key);
|
||||
String actual = termsEnum.term().utf8ToString();
|
||||
String expected = values.get(idx).utf8ToString();
|
||||
if (!expected.equals(actual)) {
|
||||
reuse = termOrds.lookup(i, reuse);
|
||||
reuse.read(buffer);
|
||||
}
|
||||
assertTrue(String.format("Expected value %s for doc %d and ord %d, but was %s", expected, i, idx, actual), expected.equals(actual));
|
||||
}
|
||||
|
||||
if (chunk < buffer.length) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// test bad field
|
||||
termOrds = cache.getDocTermOrds(reader, "bogusfield");
|
||||
|
||||
FieldCache.DEFAULT.purge(reader);
|
||||
}
|
||||
|
||||
|
@ -223,4 +268,21 @@ public class TestFieldCache extends LuceneTestCase {
|
|||
r.close();
|
||||
dir.close();
|
||||
}
|
||||
|
||||
private String generateString(int i) {
|
||||
String s = null;
|
||||
if (i > 0 && random.nextInt(3) == 1) {
|
||||
// reuse past string -- try to find one that's not null
|
||||
for(int iter = 0; iter < 10 && s == null;iter++) {
|
||||
s = unicodeStrings[random.nextInt(i)];
|
||||
}
|
||||
if (s == null) {
|
||||
s = _TestUtil.randomUnicodeString(random, 250);
|
||||
}
|
||||
} else {
|
||||
s = _TestUtil.randomUnicodeString(random, 250);
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue