LUCENE-5666: actually make a single-valued fc if the field is not multi-valued

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5666@1594095 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2014-05-12 21:20:09 +00:00
parent 5aba5fda67
commit 003b2e9e11
4 changed files with 94 additions and 11 deletions

View File

@@ -239,6 +239,11 @@ interface FieldCache {
* subsequent calls will share the same cache entry. */ * subsequent calls will share the same cache entry. */
public SortedDocValues getTermsIndex(AtomicReader reader, String field, float acceptableOverheadRatio) throws IOException; public SortedDocValues getTermsIndex(AtomicReader reader, String field, float acceptableOverheadRatio) throws IOException;
/** Can be passed to {@link #getDocTermOrds} to filter for 32-bit numeric terms */
public static final BytesRef INT32_TERM_PREFIX = new BytesRef(new byte[] { NumericUtils.SHIFT_START_INT });
/** Can be passed to {@link #getDocTermOrds} to filter for 64-bit numeric terms */
public static final BytesRef INT64_TERM_PREFIX = new BytesRef(new byte[] { NumericUtils.SHIFT_START_LONG });
/** /**
* Checks the internal cache for an appropriate entry, and if none is found, reads the term values * Checks the internal cache for an appropriate entry, and if none is found, reads the term values
* in <code>field</code> and returns a {@link DocTermOrds} instance, providing a method to retrieve * in <code>field</code> and returns a {@link DocTermOrds} instance, providing a method to retrieve
@@ -246,11 +251,12 @@ interface FieldCache {
* *
* @param reader Used to build a {@link DocTermOrds} instance * @param reader Used to build a {@link DocTermOrds} instance
* @param field Which field contains the strings. * @param field Which field contains the strings.
* @param prefix prefix for a subset of the terms which should be uninverted. Can be null. * @param prefix prefix for a subset of the terms which should be uninverted. Can be null or
* {@link #INT32_TERM_PREFIX} or {@link #INT64_TERM_PREFIX}
*
* @return a {@link DocTermOrds} instance * @return a {@link DocTermOrds} instance
* @throws IOException If any error occurs. * @throws IOException If any error occurs.
*/ */
// TODO: change this to take Parser
public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field, BytesRef prefix) throws IOException; public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field, BytesRef prefix) throws IOException;
/** /**

View File

@@ -841,6 +841,9 @@ class FieldCacheImpl implements FieldCache {
// TODO: this if DocTermsIndex was already created, we // TODO: this if DocTermsIndex was already created, we
// should share it... // should share it...
public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field, BytesRef prefix) throws IOException { public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field, BytesRef prefix) throws IOException {
// not a general purpose filtering mechanism...
assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX;
SortedSetDocValues dv = reader.getSortedSetDocValues(field); SortedSetDocValues dv = reader.getSortedSetDocValues(field);
if (dv != null) { if (dv != null) {
return dv; return dv;
@@ -860,6 +863,21 @@ class FieldCacheImpl implements FieldCache {
return DocValues.EMPTY_SORTED_SET; return DocValues.EMPTY_SORTED_SET;
} }
// ok we need to uninvert. check if we can optimize a bit.
Terms terms = reader.terms(field);
if (terms == null) {
return DocValues.EMPTY_SORTED_SET;
} else {
// if #postings = #docswithfield we know that the field is "single valued enough".
// its possible the same term might appear twice in the same document, but SORTED_SET discards frequency.
// its still ok with filtering (which we limit to numerics), it just means precisionStep = Inf
long numPostings = terms.getSumDocFreq();
if (numPostings != -1 && numPostings == terms.getDocCount()) {
return DocValues.singleton(getTermsIndex(reader, field));
}
}
DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix), false); DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix), false);
return dto.iterator(reader); return dto.iterator(reader);
} }

View File

@@ -248,10 +248,6 @@ public class UninvertingReader extends FilterAtomicReader {
} }
} }
// TODO: clean this up to instead just pass parsers...
static final BytesRef INT32_TERM_PREFIX = new BytesRef(new byte[] { NumericUtils.SHIFT_START_INT });
static final BytesRef INT64_TERM_PREFIX = new BytesRef(new byte[] { NumericUtils.SHIFT_START_LONG });
@Override @Override
public SortedSetDocValues getSortedSetDocValues(String field) throws IOException { public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
Type v = mapping.get(field); Type v = mapping.get(field);
@@ -259,10 +255,10 @@ public class UninvertingReader extends FilterAtomicReader {
switch (mapping.get(field)) { switch (mapping.get(field)) {
case SORTED_SET_INTEGER: case SORTED_SET_INTEGER:
case SORTED_SET_FLOAT: case SORTED_SET_FLOAT:
return FieldCache.DEFAULT.getDocTermOrds(in, field, INT32_TERM_PREFIX); return FieldCache.DEFAULT.getDocTermOrds(in, field, FieldCache.INT32_TERM_PREFIX);
case SORTED_SET_LONG: case SORTED_SET_LONG:
case SORTED_SET_DOUBLE: case SORTED_SET_DOUBLE:
return FieldCache.DEFAULT.getDocTermOrds(in, field, INT64_TERM_PREFIX); return FieldCache.DEFAULT.getDocTermOrds(in, field, FieldCache.INT64_TERM_PREFIX);
case SORTED_SET_BINARY: case SORTED_SET_BINARY:
return FieldCache.DEFAULT.getDocTermOrds(in, field, null); return FieldCache.DEFAULT.getDocTermOrds(in, field, null);
} }

View File

@@ -36,6 +36,7 @@ import org.apache.lucene.document.StringField;
import org.apache.lucene.index.AtomicReader; import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig;
@@ -402,6 +403,8 @@ public class TestDocTermOrds extends LuceneTestCase {
doc = new Document(); doc = new Document();
doc.add(newStringField("foo", "baz", Field.Store.NO)); doc.add(newStringField("foo", "baz", Field.Store.NO));
// we need a second value for a doc, or we don't actually test DocTermOrds!
doc.add(newStringField("foo", "car", Field.Store.NO));
iw.addDocument(doc); iw.addDocument(doc);
DirectoryReader r1 = DirectoryReader.open(iw, true); DirectoryReader r1 = DirectoryReader.open(iw, true);
@@ -412,7 +415,7 @@ public class TestDocTermOrds extends LuceneTestCase {
FieldCache.DEFAULT.getDocTermOrds(getOnlySegmentReader(r2), "foo", null); FieldCache.DEFAULT.getDocTermOrds(getOnlySegmentReader(r2), "foo", null);
SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(getOnlySegmentReader(r1), "foo", null); SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(getOnlySegmentReader(r1), "foo", null);
assertEquals(2, v.getValueCount()); assertEquals(3, v.getValueCount());
v.setDocument(1); v.setDocument(1);
assertEquals(1, v.nextOrd()); assertEquals(1, v.nextOrd());
@@ -441,7 +444,7 @@ public class TestDocTermOrds extends LuceneTestCase {
DirectoryReader ir = DirectoryReader.open(dir); DirectoryReader ir = DirectoryReader.open(dir);
AtomicReader ar = getOnlySegmentReader(ir); AtomicReader ar = getOnlySegmentReader(ir);
SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(ar, "foo", UninvertingReader.INT32_TERM_PREFIX); SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(ar, "foo", FieldCache.INT32_TERM_PREFIX);
assertEquals(2, v.getValueCount()); assertEquals(2, v.getValueCount());
v.setDocument(0); v.setDocument(0);
@@ -483,7 +486,7 @@ public class TestDocTermOrds extends LuceneTestCase {
DirectoryReader ir = DirectoryReader.open(dir); DirectoryReader ir = DirectoryReader.open(dir);
AtomicReader ar = getOnlySegmentReader(ir); AtomicReader ar = getOnlySegmentReader(ir);
SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(ar, "foo", UninvertingReader.INT64_TERM_PREFIX); SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(ar, "foo", FieldCache.INT64_TERM_PREFIX);
assertEquals(2, v.getValueCount()); assertEquals(2, v.getValueCount());
v.setDocument(0); v.setDocument(0);
@@ -519,6 +522,8 @@ public class TestDocTermOrds extends LuceneTestCase {
doc = new Document(); doc = new Document();
doc.add(new StringField("field", "world", Field.Store.NO)); doc.add(new StringField("field", "world", Field.Store.NO));
// we need a second value for a doc, or we don't actually test DocTermOrds!
doc.add(new StringField("field", "hello", Field.Store.NO));
iwriter.addDocument(doc); iwriter.addDocument(doc);
doc = new Document(); doc = new Document();
@@ -577,4 +582,62 @@ public class TestDocTermOrds extends LuceneTestCase {
ireader.close(); ireader.close();
directory.close(); directory.close();
} }
/**
 * Verifies that {@code getDocTermOrds} detects an effectively single-valued field
 * (sumDocFreq == docCount) and returns a singleton-wrapped SortedDocValues
 * instead of building a full DocTermOrds uninvert structure.
 */
public void testActuallySingleValued() throws IOException {
Directory dir = newDirectory();
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, null);
// single segment so the optimization check sees global field statistics
iwconfig.setMergePolicy(newLogMergePolicy());
IndexWriter iw = new IndexWriter(dir, iwconfig);
// doc 0: one value "bar"
Document doc = new Document();
doc.add(new StringField("foo", "bar", Field.Store.NO));
iw.addDocument(doc);
// doc 1: one value "baz"
doc = new Document();
doc.add(new StringField("foo", "baz", Field.Store.NO));
iw.addDocument(doc);
// doc 2: no value for the field at all
doc = new Document();
iw.addDocument(doc);
// doc 3: the same term twice — SORTED_SET discards frequency, so this
// still counts as "single valued enough" (postings == docsWithField)
doc = new Document();
doc.add(new StringField("foo", "baz", Field.Store.NO));
doc.add(new StringField("foo", "baz", Field.Store.NO));
iw.addDocument(doc);
iw.forceMerge(1);
iw.shutdown();
DirectoryReader ir = DirectoryReader.open(dir);
AtomicReader ar = getOnlySegmentReader(ir);
SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(ar, "foo", null);
assertNotNull(DocValues.unwrapSingleton(v)); // actually a single-valued field
assertEquals(2, v.getValueCount());
// doc 0 -> ord 0 ("bar") only
v.setDocument(0);
assertEquals(0, v.nextOrd());
assertEquals(SortedSetDocValues.NO_MORE_ORDS, v.nextOrd());
// doc 1 -> ord 1 ("baz") only
v.setDocument(1);
assertEquals(1, v.nextOrd());
assertEquals(SortedSetDocValues.NO_MORE_ORDS, v.nextOrd());
// doc 2 has no value: iterator is immediately exhausted
v.setDocument(2);
assertEquals(SortedSetDocValues.NO_MORE_ORDS, v.nextOrd());
// doc 3 -> the duplicate term yields a single ord
v.setDocument(3);
assertEquals(1, v.nextOrd());
assertEquals(SortedSetDocValues.NO_MORE_ORDS, v.nextOrd());
// ords are assigned in term sort order: bar < baz
BytesRef value = new BytesRef();
v.lookupOrd(0, value);
assertEquals("bar", value.utf8ToString());
v.lookupOrd(1, value);
assertEquals("baz", value.utf8ToString());
ir.close();
dir.close();
}
} }