LUCENE-6529: Removed an optimization in UninvertingReader that was causing incorrect results for Numeric fields using precisionStep

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1684704 13f79535-47bb-0310-9956-ffa450edef68
Chris M. Hostetter 2015-06-10 16:23:00 +00:00
parent 507ebfc247
commit 40f4e7de5e
7 changed files with 407 additions and 46 deletions

View File: lucene/CHANGES.txt

@ -81,6 +81,10 @@ API Changes
SpanPayloadNearCheckQuery have moved from the .spans package to the .payloads
package. (Alan Woodward, David Smiley, Paul Elschot, Robert Muir)
* LUCENE-6529: Removed an optimization in UninvertingReader that was causing
incorrect results for Numeric fields using precisionStep.
(hossman, Robert Muir)
Bug fixes
* LUCENE-6500: ParallelCompositeReader did not always call
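Editorial note on the LUCENE-6529 entry above: a trie-encoded numeric field (precisionStep smaller than the value's bit width) indexes several prefix-coded terms per value, one per shift, and only the shift==0 terms correspond to actual document values; the lower-precision terms exist solely to accelerate range queries. A rough sketch of the per-value term count under the classic 32-bit trie encoding (illustrative only, not code from this commit):

    // Illustrative only: how many prefix-coded terms one 32-bit int value
    // produces at a given precisionStep (one term per shift: 0, step, 2*step, ... < 32).
    static int trieTermsPerIntValue(int precisionStep) {
      if (precisionStep >= 32) {
        return 1;                      // effectively no trie: just the full-precision term
      }
      return (31 / precisionStep) + 1; // e.g. precisionStep=8 -> shifts 0,8,16,24 -> 4 terms
    }

With precisionStep=8 a raw TermsEnum therefore sees about four terms per value, and the shift-8 prefixes of small values decode to spurious values such as 0; the new TestTrieFacet below asserts those no longer show up in facet counts.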

View File: DocTermOrds.java

@ -67,13 +67,11 @@ import org.apache.lucene.util.StringHelper;
* are also de-dup'd (ie if doc has same term more than once
* in this field, you'll only get that ord back once).
*
- * This class tests whether the provided reader is able to
- * retrieve terms by ord (ie, it's single segment, and it
- * uses an ord-capable terms index). If not, this class
+ * This class
* will create its own term index internally, allowing to
* create a wrapped TermsEnum that can handle ord. The
* {@link #getOrdTermsEnum} method then provides this
- * wrapped enum, if necessary.
+ * wrapped enum.
*
* The RAM consumption of this class can be high!
*
@ -152,7 +150,7 @@ public class DocTermOrds implements Accountable {
protected long sizeOfIndexedStrings;
/** Holds the indexed (by default every 128th) terms. */
- protected BytesRef[] indexedTermsArray;
+ protected BytesRef[] indexedTermsArray = new BytesRef[0];
/** If non-null, only terms matching this prefix were
* indexed. */
@ -219,27 +217,27 @@ public class DocTermOrds implements Accountable {
indexInterval = 1 << indexIntervalBits;
}
- /** Returns a TermsEnum that implements ord. If the
-  * provided reader supports ord, we just return its
-  * TermsEnum; if it does not, we build a "private" terms
+ /**
+  * Returns a TermsEnum that implements ord, or null if no terms in field.
+  * <p>
+  * we build a "private" terms
* index internally (WARNING: consumes RAM) and use that
* index to implement ord. This also enables ord on top
* of a composite reader. The returned TermsEnum is
- * unpositioned. This returns null if there are no terms.
- *
+ * unpositioned.
+ * </p>
* <p><b>NOTE</b>: you must pass the same reader that was
- * used when creating this class */
+ * used when creating this class
+ */
public TermsEnum getOrdTermsEnum(LeafReader reader) throws IOException {
-   if (indexedTermsArray == null) {
-     //System.out.println("GET normal enum");
-     final Terms terms = reader.terms(field);
-     if (terms == null) {
-       return null;
-     } else {
-       return terms.iterator();
-     }
+   // NOTE: see LUCENE-6529 before attempting to optimize this method to
+   // return a TermsEnum directly from the reader if it already supports ord().
+   assert null != indexedTermsArray;
+   if (0 == indexedTermsArray.length) {
+     return null;
} else {
//System.out.println("GET wrapped enum ordBase=" + ordBase);
return new OrdWrappedTermsEnum(reader);
}
}
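The NOTE above is the heart of the fix: the removed fast path returned the reader's own TermsEnum whenever it supported ord(), but for trie numerics DocTermOrds is built over only the full-precision subset of terms, so the raw enum's ords do not line up with the uninverted term numbers. A hedged sketch of the distinction, assuming the Lucene 5.x prefix-coded int encoding (the helper name is ours, not the patch's):

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.NumericUtils;

    // Hypothetical helper: only shift==0 prefix-coded terms represent real
    // document values; higher-shift terms are lower-precision prefixes that
    // exist only to speed up numeric range queries.
    static boolean isFullPrecisionIntTerm(BytesRef term) {
      return NumericUtils.getPrefixCodedIntShift(term) == 0;
    }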
@ -297,12 +295,9 @@ public class DocTermOrds implements Accountable {
return;
}
- // If we need our "term index wrapper", these will be
- // init'd below:
- List<BytesRef> indexedTerms = null;
- PagedBytes indexedTermsBytes = null;
- boolean testedOrd = false;
+ // For our "term index wrapper"
+ final List<BytesRef> indexedTerms = new ArrayList<>();
+ final PagedBytes indexedTermsBytes = new PagedBytes(15);
// we need a minimum of 9 bytes, but round up to 12 since the space would
// be wasted with most allocators anyway.
@ -336,23 +331,9 @@ public class DocTermOrds implements Accountable {
}
//System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);
- if (!testedOrd) {
-   try {
-     ordBase = (int) te.ord();
-     //System.out.println("got ordBase=" + ordBase);
-   } catch (UnsupportedOperationException uoe) {
-     // Reader cannot provide ord support, so we wrap
-     // our own support by creating our own terms index:
-     indexedTerms = new ArrayList<>();
-     indexedTermsBytes = new PagedBytes(15);
-     //System.out.println("NO ORDS");
-   }
-   testedOrd = true;
- }
visitTerm(te, termNum);
- if (indexedTerms != null && (termNum & indexIntervalMask) == 0) {
+ if ((termNum & indexIntervalMask) == 0) {
// Index this term
sizeOfIndexedStrings += t.length;
BytesRef indexedTerm = new BytesRef();
@ -547,9 +528,7 @@ public class DocTermOrds implements Accountable {
}
}
- if (indexedTerms != null) {
-   indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);
- }
+ indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]);
long endTime = System.currentTimeMillis();
@ -598,9 +577,10 @@ public class DocTermOrds implements Accountable {
return pos;
}
- /* Only used if original IndexReader doesn't implement
-  * ord; in this case we "wrap" our own terms index
-  * around it. */
+ /**
+  * "wrap" our own terms index around the original IndexReader.
+  * Only valid if there are terms for this field from the original reader
+  */
private final class OrdWrappedTermsEnum extends TermsEnum {
private final TermsEnum termsEnum;
private BytesRef term;
@ -608,6 +588,7 @@ public class DocTermOrds implements Accountable {
public OrdWrappedTermsEnum(LeafReader reader) throws IOException {
assert indexedTermsArray != null;
+ assert 0 != indexedTermsArray.length;
termsEnum = reader.fields().terms(field).iterator();
}
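For reference, the wrapped enum's ord support rests on indexedTermsArray holding (by default) every 128th term: resolving an ord is a seek to the nearest indexed term followed by a short linear scan, which is why the assertions above insist on a non-empty index. A minimal sketch of that lookup, with illustrative names (the real OrdWrappedTermsEnum also tracks ordBase and its current position):

    import java.io.IOException;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    // Sketch: resolve an ord via an index holding every `indexInterval`-th term.
    static BytesRef termForOrd(TermsEnum te, BytesRef[] indexedTerms,
                               int indexInterval, long ord) throws IOException {
      int block = (int) (ord / indexInterval);  // nearest indexed term at or before ord
      te.seekCeil(indexedTerms[block]);         // position the enum on that term
      for (long left = ord % indexInterval; left > 0; left--) {
        te.next();                              // scan forward the remainder
      }
      return te.term();                         // the term whose ord() == ord
    }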

View File: TestDocTermOrds.java

@ -63,6 +63,36 @@ import org.apache.lucene.util.TestUtil;
public class TestDocTermOrds extends LuceneTestCase {
public void testEmptyIndex() throws IOException {
final Directory dir = newDirectory();
final IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())));
iw.close();
final DirectoryReader ir = DirectoryReader.open(dir);
TestUtil.checkReader(ir);
final LeafReader composite = SlowCompositeReaderWrapper.wrap(ir);
TestUtil.checkReader(composite);
// check the leaves
// (normally there are none for an empty index, so this is really just future
// proofing in case that changes for some reason)
for (LeafReaderContext rc : ir.leaves()) {
final LeafReader r = rc.reader();
final DocTermOrds dto = new DocTermOrds(r, r.getLiveDocs(), "any_field");
assertNull("OrdTermsEnum should be null (leaf)", dto.getOrdTermsEnum(r));
assertEquals("iterator should be empty (leaf)", 0, dto.iterator(r).getValueCount());
}
// check the composite
final DocTermOrds dto = new DocTermOrds(composite, composite.getLiveDocs(), "any_field");
assertNull("OrdTermsEnum should be null (composite)", dto.getOrdTermsEnum(composite));
assertEquals("iterator should be empty (composite)", 0, dto.iterator(composite).getValueCount());
ir.close();
dir.close();
}
public void testSimple() throws Exception {
Directory dir = newDirectory();
final RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy()));
@ -82,6 +112,7 @@ public class TestDocTermOrds extends LuceneTestCase {
w.close();
final LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
TestUtil.checkReader(ar);
final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field");
SortedSetDocValues iter = dto.iterator(ar);
@ -185,6 +216,7 @@ public class TestDocTermOrds extends LuceneTestCase {
System.out.println("TEST: top reader");
}
LeafReader slowR = SlowCompositeReaderWrapper.wrap(r);
TestUtil.checkReader(slowR);
verify(slowR, idToOrds, termsArray, null);
FieldCache.DEFAULT.purgeByCacheKey(slowR.getCoreCacheKey());
@ -270,6 +302,7 @@ public class TestDocTermOrds extends LuceneTestCase {
}
LeafReader slowR = SlowCompositeReaderWrapper.wrap(r);
TestUtil.checkReader(slowR);
for(String prefix : prefixesArray) {
final BytesRef prefixRef = prefix == null ? null : new BytesRef(prefix);

View File: TestFieldCache.java

@ -119,6 +119,7 @@ public class TestFieldCache extends LuceneTestCase {
}
IndexReader r = writer.getReader();
reader = SlowCompositeReaderWrapper.wrap(r);
TestUtil.checkReader(reader);
writer.close();
}
@ -293,6 +294,7 @@ public class TestFieldCache extends LuceneTestCase {
writer.close();
IndexReader r = DirectoryReader.open(dir);
LeafReader reader = SlowCompositeReaderWrapper.wrap(r);
TestUtil.checkReader(reader);
FieldCache.DEFAULT.getTerms(reader, "foobar", true);
FieldCache.DEFAULT.getTermsIndex(reader, "foobar");
FieldCache.DEFAULT.purgeByCacheKey(reader.getCoreCacheKey());

View File: TestFieldCacheVsDocValues.java

@ -192,6 +192,7 @@ public class TestFieldCacheVsDocValues extends LuceneTestCase {
w.close();
LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
TestUtil.checkReader(ar);
BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field", false);
for(int docID=0;docID<docBytes.size();docID++) {
@ -265,6 +266,8 @@ public class TestFieldCacheVsDocValues extends LuceneTestCase {
w.close();
LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
TestUtil.checkReader(ar);
BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field", false);
for(int docID=0;docID<docBytes.size();docID++) {

View File: TestUninvertingReader.java

@ -18,15 +18,23 @@ package org.apache.lucene.uninverting;
*/
import java.io.IOException;
import java.util.EnumSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.Collections;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.LongField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.Directory;
import org.apache.lucene.uninverting.UninvertingReader.Type;
@ -201,4 +209,133 @@ public class TestUninvertingReader extends LuceneTestCase {
ir.close();
dir.close();
}
/** Tests {@link Type#SORTED_SET_INTEGER} using Integer based fields, with and w/o precision steps */
public void testSortedSetIntegerManyValues() throws IOException {
final Directory dir = newDirectory();
final IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
final FieldType NO_TRIE_TYPE = new FieldType(IntField.TYPE_NOT_STORED);
NO_TRIE_TYPE.setNumericPrecisionStep(Integer.MAX_VALUE);
final Map<String,Type> UNINVERT_MAP = new LinkedHashMap<String,Type>();
UNINVERT_MAP.put("notrie_single", Type.SORTED_SET_INTEGER);
UNINVERT_MAP.put("notrie_multi", Type.SORTED_SET_INTEGER);
UNINVERT_MAP.put("trie_single", Type.SORTED_SET_INTEGER);
UNINVERT_MAP.put("trie_multi", Type.SORTED_SET_INTEGER);
final Set<String> MULTI_VALUES = new LinkedHashSet<String>();
MULTI_VALUES.add("trie_multi");
MULTI_VALUES.add("notrie_multi");
final int NUM_DOCS = TestUtil.nextInt(random(), 200, 1500);
final int MIN = TestUtil.nextInt(random(), 10, 100);
final int MAX = MIN + TestUtil.nextInt(random(), 10, 100);
final long EXPECTED_VALSET_SIZE = 1 + MAX - MIN;
{ // (at least) one doc should have every value, so that at least one segment has every value
final Document doc = new Document();
for (int i = MIN; i <= MAX; i++) {
doc.add(new IntField("trie_multi", i, Field.Store.NO));
doc.add(new IntField("notrie_multi", i, NO_TRIE_TYPE));
}
iw.addDocument(doc);
}
// now add some more random docs (note: starting at i=1 because of previously added doc)
for (int i = 1; i < NUM_DOCS; i++) {
final Document doc = new Document();
if (0 != TestUtil.nextInt(random(), 0, 9)) {
int val = TestUtil.nextInt(random(), MIN, MAX);
doc.add(new IntField("trie_single", val, Field.Store.NO));
doc.add(new IntField("notrie_single", val, NO_TRIE_TYPE));
}
if (0 != TestUtil.nextInt(random(), 0, 9)) {
int numMulti = atLeast(1);
while (0 < numMulti--) {
int val = TestUtil.nextInt(random(), MIN, MAX);
doc.add(new IntField("trie_multi", val, Field.Store.NO));
doc.add(new IntField("notrie_multi", val, NO_TRIE_TYPE));
}
}
iw.addDocument(doc);
}
iw.close();
final DirectoryReader ir = UninvertingReader.wrap(DirectoryReader.open(dir), UNINVERT_MAP);
TestUtil.checkReader(ir);
final int NUM_LEAVES = ir.leaves().size();
// check the leaves: no more than the total set size
for (LeafReaderContext rc : ir.leaves()) {
final LeafReader ar = rc.reader();
for (String f : UNINVERT_MAP.keySet()) {
final SortedSetDocValues v = ar.getSortedSetDocValues(f);
final long valSetSize = v.getValueCount();
assertTrue(f + ": Expected no more than " + EXPECTED_VALSET_SIZE + " values per segment, got " +
valSetSize + " from: " + ar.toString(),
valSetSize <= EXPECTED_VALSET_SIZE);
if (1 == NUM_LEAVES && MULTI_VALUES.contains(f)) {
// tighter check on multi fields in single segment index since we know one doc has all of them
assertEquals(f + ": Single segment LeafReader's value set should have had exactly expected size",
EXPECTED_VALSET_SIZE, valSetSize);
}
}
}
// check the composite of all leaves: exact expectation of set size
final LeafReader composite = SlowCompositeReaderWrapper.wrap(ir);
TestUtil.checkReader(composite);
for (String f : MULTI_VALUES) {
final SortedSetDocValues v = composite.getSortedSetDocValues(f);
final long valSetSize = v.getValueCount();
assertEquals(f + ": Composite reader value set should have had exactly expected size",
EXPECTED_VALSET_SIZE, valSetSize);
}
ir.close();
dir.close();
}
public void testSortedSetEmptyIndex() throws IOException {
final Directory dir = newDirectory();
final IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
iw.close();
final Map<String,Type> UNINVERT_MAP = new LinkedHashMap<String,Type>();
for (Type t : EnumSet.allOf(Type.class)) {
UNINVERT_MAP.put(t.name(), t);
}
final DirectoryReader ir = UninvertingReader.wrap(DirectoryReader.open(dir), UNINVERT_MAP);
TestUtil.checkReader(ir);
final LeafReader composite = SlowCompositeReaderWrapper.wrap(ir);
TestUtil.checkReader(composite);
for (String f : UNINVERT_MAP.keySet()) {
// check the leaves
// (normally there are none for an empty index, so this is really just future
// proofing in case that changes for some reason)
for (LeafReaderContext rc : ir.leaves()) {
final LeafReader ar = rc.reader();
assertNull(f + ": Expected no doc values from empty index (leaf)",
ar.getSortedSetDocValues(f));
}
// check the composite
assertNull(f + ": Expected no doc values from empty index (composite)",
composite.getSortedSetDocValues(f));
}
ir.close();
dir.close();
}
}

View File: TestTrieFacet.java

@ -0,0 +1,201 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search;
import org.apache.lucene.util.TestUtil;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.TrieIntField;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
public class TestTrieFacet extends SolrTestCaseJ4 {
final static int MIN_VALUE = 20;
final static int MAX_VALUE = 60;
final static String TRIE_INT_P8_S_VALUED = "foo_ti1";
final static String TRIE_INT_P8_M_VALUED = "foo_ti";
final static String TRIE_INT_P0_S_VALUED = "foo_i1";
final static String TRIE_INT_P0_M_VALUED = "foo_i";
final static String[] M_VALUED = new String[] { TRIE_INT_P0_M_VALUED, TRIE_INT_P8_M_VALUED };
final static String[] S_VALUED = new String[] { TRIE_INT_P0_S_VALUED, TRIE_INT_P8_S_VALUED };
final static String[] P0 = new String[] { TRIE_INT_P0_M_VALUED, TRIE_INT_P0_S_VALUED };
final static String[] P8 = new String[] { TRIE_INT_P8_M_VALUED, TRIE_INT_P8_S_VALUED };
static int NUM_DOCS;
private static TrieIntField assertCastFieldType(SchemaField f) {
assertTrue("who changed the schema? test isn't valid: " + f.getName(),
f.getType() instanceof TrieIntField);
return (TrieIntField) f.getType();
}
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-tlog.xml","schema.xml");
// don't break the test
assertTrue("min value must be less then max value", MIN_VALUE < MAX_VALUE);
assertTrue("min value must be greater then zero", 0 < MIN_VALUE);
// sanity check no one breaks the schema out from under us...
for (String f : M_VALUED) {
SchemaField sf = h.getCore().getLatestSchema().getField(f);
assertTrue("who changed the schema? test isn't valid: " + f, sf.multiValued());
}
for (String f : S_VALUED) {
SchemaField sf = h.getCore().getLatestSchema().getField(f);
assertFalse("who changed the schema? test isn't valid: " + f, sf.multiValued());
}
for (String f : P0) {
SchemaField sf = h.getCore().getLatestSchema().getField(f);
assertEquals("who changed the schema? test isn't valid: " + f,
0, assertCastFieldType(sf).getPrecisionStep());
}
for (String f : P8) {
SchemaField sf = h.getCore().getLatestSchema().getField(f);
assertEquals("who changed the schema? test isn't valid: " + f,
8, assertCastFieldType(sf).getPrecisionStep());
}
// we don't need a lot of docs -- at least one failure only had ~1000
NUM_DOCS = TestUtil.nextInt(random(), 200, 1500);
{ // ensure at least one doc has every valid value in the multivalued fields
SolrInputDocument doc = sdoc("id", "0");
for (int val = MIN_VALUE; val <= MAX_VALUE; val++) {
for (String f : M_VALUED) {
doc.addField(f, val);
}
}
assertU(adoc(doc));
}
// randomized docs (note: starting at i=1)
for (int i=1; i < NUM_DOCS; i++) {
SolrInputDocument doc = sdoc("id", i+"");
if (useField()) {
int val = TestUtil.nextInt(random(), MIN_VALUE, MAX_VALUE);
for (String f : S_VALUED) {
doc.addField(f, val);
}
}
if (useField()) {
int numMulti = atLeast(1);
while (0 < numMulti--) {
int val = TestUtil.nextInt(random(), MIN_VALUE, MAX_VALUE);
for (String f: M_VALUED) {
doc.addField(f, val);
}
}
}
assertU(adoc(doc));
}
assertU(commit());
}
/**
* Similar to usually() but we want it to happen just as often regardless
* of test multiplier and nightly status
*/
private static boolean useField() {
return 0 != TestUtil.nextInt(random(), 0, 9);
}
private static void doTestNoZeros(final String field, final String method) throws Exception {
assertQ("sanity check # docs in index: " + NUM_DOCS,
req("q", "*:*", "rows", "0")
,"//result[@numFound="+NUM_DOCS+"]");
assertQ("sanity check that no docs match 0 failed",
req("q", field+":0", "rows", "0")
,"//result[@numFound=0]");
assertQ("sanity check that no docs match [0 TO 0] failed",
req("q", field+":[0 TO 0]", "rows", "0")
,"//result[@numFound=0]");
assertQ("*:* facet with mincount 0 found unexpected 0 value",
req("q", "*:*"
,"rows", "0"
,"indent","true"
,"facet", "true"
,"facet.field", field
,"facet.limit", "-1"
,"facet.mincount", "0"
,"facet.method", method
)
// trivial sanity check we're at least getting facet counts in output
,"*[count(//lst[@name='facet_fields']/lst[@name='"+field+"']/int)!=0]"
// main point of test
,"*[count(//lst[@name='facet_fields']/lst[@name='"+field+"']/int[@name='0'])=0]"
);
}
// enum
public void testSingleValuedTrieP0_enum() throws Exception {
doTestNoZeros(TRIE_INT_P0_S_VALUED, "enum");
}
public void testMultiValuedTrieP0_enum() throws Exception {
doTestNoZeros(TRIE_INT_P0_M_VALUED, "enum");
}
public void testSingleValuedTrieP8_enum() throws Exception {
doTestNoZeros(TRIE_INT_P8_S_VALUED, "enum");
}
public void testMultiValuedTrieP8_enum() throws Exception {
doTestNoZeros(TRIE_INT_P8_M_VALUED, "enum");
}
// fc
public void testSingleValuedTrieP0_fc() throws Exception {
doTestNoZeros(TRIE_INT_P0_S_VALUED, "fc");
}
public void testMultiValuedTrieP0_fc() throws Exception {
doTestNoZeros(TRIE_INT_P0_M_VALUED, "fc");
}
public void testSingleValuedTrieP8_fc() throws Exception {
doTestNoZeros(TRIE_INT_P8_S_VALUED, "fc");
}
public void testMultiValuedTrieP8_fc() throws Exception {
doTestNoZeros(TRIE_INT_P8_M_VALUED, "fc");
}
// fcs
public void testSingleValuedTrieP0_fcs() throws Exception {
doTestNoZeros(TRIE_INT_P0_S_VALUED, "fcs");
}
public void testMultiValuedTrieP0_fcs() throws Exception {
doTestNoZeros(TRIE_INT_P0_M_VALUED, "fcs");
}
public void testSingleValuedTrieP8_fcs() throws Exception {
doTestNoZeros(TRIE_INT_P8_S_VALUED, "fcs");
}
public void testMultiValuedTrieP8_fcs() throws Exception {
doTestNoZeros(TRIE_INT_P8_M_VALUED, "fcs");
}
}