mirror of https://github.com/apache/lucene.git
LUCENE-5666: actually make a single-valued fc if the field is not multi-valued
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene5666@1594095 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
5aba5fda67
commit
003b2e9e11
|
@ -239,6 +239,11 @@ interface FieldCache {
|
||||||
* subsequent calls will share the same cache entry. */
|
* subsequent calls will share the same cache entry. */
|
||||||
public SortedDocValues getTermsIndex(AtomicReader reader, String field, float acceptableOverheadRatio) throws IOException;
|
public SortedDocValues getTermsIndex(AtomicReader reader, String field, float acceptableOverheadRatio) throws IOException;
|
||||||
|
|
||||||
|
/** Can be passed to {@link #getDocTermOrds} to filter for 32-bit numeric terms */
|
||||||
|
public static final BytesRef INT32_TERM_PREFIX = new BytesRef(new byte[] { NumericUtils.SHIFT_START_INT });
|
||||||
|
/** Can be passed to {@link #getDocTermOrds} to filter for 64-bit numeric terms */
|
||||||
|
public static final BytesRef INT64_TERM_PREFIX = new BytesRef(new byte[] { NumericUtils.SHIFT_START_LONG });
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks the internal cache for an appropriate entry, and if none is found, reads the term values
|
* Checks the internal cache for an appropriate entry, and if none is found, reads the term values
|
||||||
* in <code>field</code> and returns a {@link DocTermOrds} instance, providing a method to retrieve
|
* in <code>field</code> and returns a {@link DocTermOrds} instance, providing a method to retrieve
|
||||||
|
@ -246,11 +251,12 @@ interface FieldCache {
|
||||||
*
|
*
|
||||||
* @param reader Used to build a {@link DocTermOrds} instance
|
* @param reader Used to build a {@link DocTermOrds} instance
|
||||||
* @param field Which field contains the strings.
|
* @param field Which field contains the strings.
|
||||||
* @param prefix prefix for a subset of the terms which should be uninverted. Can be null.
|
* @param prefix prefix for a subset of the terms which should be uninverted. Can be null or
|
||||||
|
* {@link #INT32_TERM_PREFIX} or {@link #INT64_TERM_PREFIX}
|
||||||
|
*
|
||||||
* @return a {@link DocTermOrds} instance
|
* @return a {@link DocTermOrds} instance
|
||||||
* @throws IOException If any error occurs.
|
* @throws IOException If any error occurs.
|
||||||
*/
|
*/
|
||||||
// TODO: change this to take Parser
|
|
||||||
public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field, BytesRef prefix) throws IOException;
|
public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field, BytesRef prefix) throws IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -841,6 +841,9 @@ class FieldCacheImpl implements FieldCache {
|
||||||
// TODO: if DocTermsIndex was already created, we
|
// TODO: if DocTermsIndex was already created, we
|
||||||
// should share it...
|
// should share it...
|
||||||
public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field, BytesRef prefix) throws IOException {
|
public SortedSetDocValues getDocTermOrds(AtomicReader reader, String field, BytesRef prefix) throws IOException {
|
||||||
|
// not a general purpose filtering mechanism...
|
||||||
|
assert prefix == null || prefix == INT32_TERM_PREFIX || prefix == INT64_TERM_PREFIX;
|
||||||
|
|
||||||
SortedSetDocValues dv = reader.getSortedSetDocValues(field);
|
SortedSetDocValues dv = reader.getSortedSetDocValues(field);
|
||||||
if (dv != null) {
|
if (dv != null) {
|
||||||
return dv;
|
return dv;
|
||||||
|
@ -860,6 +863,21 @@ class FieldCacheImpl implements FieldCache {
|
||||||
return DocValues.EMPTY_SORTED_SET;
|
return DocValues.EMPTY_SORTED_SET;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ok we need to uninvert. check if we can optimize a bit.
|
||||||
|
|
||||||
|
Terms terms = reader.terms(field);
|
||||||
|
if (terms == null) {
|
||||||
|
return DocValues.EMPTY_SORTED_SET;
|
||||||
|
} else {
|
||||||
|
// if #postings = #docswithfield we know that the field is "single valued enough".
|
||||||
|
// it's possible the same term might appear twice in the same document, but SORTED_SET discards frequency.
|
||||||
|
// it's still ok with filtering (which we limit to numerics); it just means precisionStep = Inf
|
||||||
|
long numPostings = terms.getSumDocFreq();
|
||||||
|
if (numPostings != -1 && numPostings == terms.getDocCount()) {
|
||||||
|
return DocValues.singleton(getTermsIndex(reader, field));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix), false);
|
DocTermOrds dto = (DocTermOrds) caches.get(DocTermOrds.class).get(reader, new CacheKey(field, prefix), false);
|
||||||
return dto.iterator(reader);
|
return dto.iterator(reader);
|
||||||
}
|
}
|
||||||
|
|
|
@ -248,10 +248,6 @@ public class UninvertingReader extends FilterAtomicReader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: clean this up to instead just pass parsers...
|
|
||||||
static final BytesRef INT32_TERM_PREFIX = new BytesRef(new byte[] { NumericUtils.SHIFT_START_INT });
|
|
||||||
static final BytesRef INT64_TERM_PREFIX = new BytesRef(new byte[] { NumericUtils.SHIFT_START_LONG });
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
|
public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
|
||||||
Type v = mapping.get(field);
|
Type v = mapping.get(field);
|
||||||
|
@ -259,10 +255,10 @@ public class UninvertingReader extends FilterAtomicReader {
|
||||||
switch (mapping.get(field)) {
|
switch (mapping.get(field)) {
|
||||||
case SORTED_SET_INTEGER:
|
case SORTED_SET_INTEGER:
|
||||||
case SORTED_SET_FLOAT:
|
case SORTED_SET_FLOAT:
|
||||||
return FieldCache.DEFAULT.getDocTermOrds(in, field, INT32_TERM_PREFIX);
|
return FieldCache.DEFAULT.getDocTermOrds(in, field, FieldCache.INT32_TERM_PREFIX);
|
||||||
case SORTED_SET_LONG:
|
case SORTED_SET_LONG:
|
||||||
case SORTED_SET_DOUBLE:
|
case SORTED_SET_DOUBLE:
|
||||||
return FieldCache.DEFAULT.getDocTermOrds(in, field, INT64_TERM_PREFIX);
|
return FieldCache.DEFAULT.getDocTermOrds(in, field, FieldCache.INT64_TERM_PREFIX);
|
||||||
case SORTED_SET_BINARY:
|
case SORTED_SET_BINARY:
|
||||||
return FieldCache.DEFAULT.getDocTermOrds(in, field, null);
|
return FieldCache.DEFAULT.getDocTermOrds(in, field, null);
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,6 +36,7 @@ import org.apache.lucene.document.StringField;
|
||||||
import org.apache.lucene.index.AtomicReader;
|
import org.apache.lucene.index.AtomicReader;
|
||||||
import org.apache.lucene.index.AtomicReaderContext;
|
import org.apache.lucene.index.AtomicReaderContext;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
import org.apache.lucene.index.DocValues;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.IndexWriterConfig;
|
import org.apache.lucene.index.IndexWriterConfig;
|
||||||
|
@ -402,6 +403,8 @@ public class TestDocTermOrds extends LuceneTestCase {
|
||||||
|
|
||||||
doc = new Document();
|
doc = new Document();
|
||||||
doc.add(newStringField("foo", "baz", Field.Store.NO));
|
doc.add(newStringField("foo", "baz", Field.Store.NO));
|
||||||
|
// we need a second value for a doc, or we don't actually test DocTermOrds!
|
||||||
|
doc.add(newStringField("foo", "car", Field.Store.NO));
|
||||||
iw.addDocument(doc);
|
iw.addDocument(doc);
|
||||||
|
|
||||||
DirectoryReader r1 = DirectoryReader.open(iw, true);
|
DirectoryReader r1 = DirectoryReader.open(iw, true);
|
||||||
|
@ -412,7 +415,7 @@ public class TestDocTermOrds extends LuceneTestCase {
|
||||||
FieldCache.DEFAULT.getDocTermOrds(getOnlySegmentReader(r2), "foo", null);
|
FieldCache.DEFAULT.getDocTermOrds(getOnlySegmentReader(r2), "foo", null);
|
||||||
|
|
||||||
SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(getOnlySegmentReader(r1), "foo", null);
|
SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(getOnlySegmentReader(r1), "foo", null);
|
||||||
assertEquals(2, v.getValueCount());
|
assertEquals(3, v.getValueCount());
|
||||||
v.setDocument(1);
|
v.setDocument(1);
|
||||||
assertEquals(1, v.nextOrd());
|
assertEquals(1, v.nextOrd());
|
||||||
|
|
||||||
|
@ -441,7 +444,7 @@ public class TestDocTermOrds extends LuceneTestCase {
|
||||||
DirectoryReader ir = DirectoryReader.open(dir);
|
DirectoryReader ir = DirectoryReader.open(dir);
|
||||||
AtomicReader ar = getOnlySegmentReader(ir);
|
AtomicReader ar = getOnlySegmentReader(ir);
|
||||||
|
|
||||||
SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(ar, "foo", UninvertingReader.INT32_TERM_PREFIX);
|
SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(ar, "foo", FieldCache.INT32_TERM_PREFIX);
|
||||||
assertEquals(2, v.getValueCount());
|
assertEquals(2, v.getValueCount());
|
||||||
|
|
||||||
v.setDocument(0);
|
v.setDocument(0);
|
||||||
|
@ -483,7 +486,7 @@ public class TestDocTermOrds extends LuceneTestCase {
|
||||||
DirectoryReader ir = DirectoryReader.open(dir);
|
DirectoryReader ir = DirectoryReader.open(dir);
|
||||||
AtomicReader ar = getOnlySegmentReader(ir);
|
AtomicReader ar = getOnlySegmentReader(ir);
|
||||||
|
|
||||||
SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(ar, "foo", UninvertingReader.INT64_TERM_PREFIX);
|
SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(ar, "foo", FieldCache.INT64_TERM_PREFIX);
|
||||||
assertEquals(2, v.getValueCount());
|
assertEquals(2, v.getValueCount());
|
||||||
|
|
||||||
v.setDocument(0);
|
v.setDocument(0);
|
||||||
|
@ -519,6 +522,8 @@ public class TestDocTermOrds extends LuceneTestCase {
|
||||||
|
|
||||||
doc = new Document();
|
doc = new Document();
|
||||||
doc.add(new StringField("field", "world", Field.Store.NO));
|
doc.add(new StringField("field", "world", Field.Store.NO));
|
||||||
|
// we need a second value for a doc, or we don't actually test DocTermOrds!
|
||||||
|
doc.add(new StringField("field", "hello", Field.Store.NO));
|
||||||
iwriter.addDocument(doc);
|
iwriter.addDocument(doc);
|
||||||
|
|
||||||
doc = new Document();
|
doc = new Document();
|
||||||
|
@ -577,4 +582,62 @@ public class TestDocTermOrds extends LuceneTestCase {
|
||||||
ireader.close();
|
ireader.close();
|
||||||
directory.close();
|
directory.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testActuallySingleValued() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriterConfig iwconfig = newIndexWriterConfig(TEST_VERSION_CURRENT, null);
|
||||||
|
iwconfig.setMergePolicy(newLogMergePolicy());
|
||||||
|
IndexWriter iw = new IndexWriter(dir, iwconfig);
|
||||||
|
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(new StringField("foo", "bar", Field.Store.NO));
|
||||||
|
iw.addDocument(doc);
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new StringField("foo", "baz", Field.Store.NO));
|
||||||
|
iw.addDocument(doc);
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
iw.addDocument(doc);
|
||||||
|
|
||||||
|
doc = new Document();
|
||||||
|
doc.add(new StringField("foo", "baz", Field.Store.NO));
|
||||||
|
doc.add(new StringField("foo", "baz", Field.Store.NO));
|
||||||
|
iw.addDocument(doc);
|
||||||
|
|
||||||
|
iw.forceMerge(1);
|
||||||
|
iw.shutdown();
|
||||||
|
|
||||||
|
DirectoryReader ir = DirectoryReader.open(dir);
|
||||||
|
AtomicReader ar = getOnlySegmentReader(ir);
|
||||||
|
|
||||||
|
SortedSetDocValues v = FieldCache.DEFAULT.getDocTermOrds(ar, "foo", null);
|
||||||
|
assertNotNull(DocValues.unwrapSingleton(v)); // actually a single-valued field
|
||||||
|
assertEquals(2, v.getValueCount());
|
||||||
|
|
||||||
|
v.setDocument(0);
|
||||||
|
assertEquals(0, v.nextOrd());
|
||||||
|
assertEquals(SortedSetDocValues.NO_MORE_ORDS, v.nextOrd());
|
||||||
|
|
||||||
|
v.setDocument(1);
|
||||||
|
assertEquals(1, v.nextOrd());
|
||||||
|
assertEquals(SortedSetDocValues.NO_MORE_ORDS, v.nextOrd());
|
||||||
|
|
||||||
|
v.setDocument(2);
|
||||||
|
assertEquals(SortedSetDocValues.NO_MORE_ORDS, v.nextOrd());
|
||||||
|
|
||||||
|
v.setDocument(3);
|
||||||
|
assertEquals(1, v.nextOrd());
|
||||||
|
assertEquals(SortedSetDocValues.NO_MORE_ORDS, v.nextOrd());
|
||||||
|
|
||||||
|
BytesRef value = new BytesRef();
|
||||||
|
v.lookupOrd(0, value);
|
||||||
|
assertEquals("bar", value.utf8ToString());
|
||||||
|
|
||||||
|
v.lookupOrd(1, value);
|
||||||
|
assertEquals("baz", value.utf8ToString());
|
||||||
|
|
||||||
|
ir.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue