From 40f4e7de5ebeb9bf7e46cd8baeea18ab56e25466 Mon Sep 17 00:00:00 2001 From: "Chris M. Hostetter" Date: Wed, 10 Jun 2015 16:23:00 +0000 Subject: [PATCH] LUCENE-6529: Removed an optimization in UninvertingReader that was causing incorrect results for Numeric fields using precisionStep git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1684704 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 + .../lucene/uninverting/DocTermOrds.java | 73 +++---- .../lucene/uninverting/TestDocTermOrds.java | 33 +++ .../lucene/uninverting/TestFieldCache.java | 2 + .../TestFieldCacheVsDocValues.java | 3 + .../uninverting/TestUninvertingReader.java | 137 ++++++++++++ .../org/apache/solr/search/TestTrieFacet.java | 201 ++++++++++++++++++ 7 files changed, 407 insertions(+), 46 deletions(-) create mode 100644 solr/core/src/test/org/apache/solr/search/TestTrieFacet.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8cf21b9728a..d2ae4027459 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -81,6 +81,10 @@ API Changes SpanPayloadNearCheckQuery have moved from the .spans package to the .payloads package. (Alan Woodward, David Smiley, Paul Elschot, Robert Muir) +* LUCENE-6529: Removed an optimization in UninvertingReader that was causing + incorrect results for Numeric fields using precisionStep + (hossman, Robert Muir) + Bug fixes * LUCENE-6500: ParallelCompositeReader did not always call diff --git a/lucene/misc/src/java/org/apache/lucene/uninverting/DocTermOrds.java b/lucene/misc/src/java/org/apache/lucene/uninverting/DocTermOrds.java index b49cd27db25..39dbad4e588 100644 --- a/lucene/misc/src/java/org/apache/lucene/uninverting/DocTermOrds.java +++ b/lucene/misc/src/java/org/apache/lucene/uninverting/DocTermOrds.java @@ -67,13 +67,11 @@ import org.apache.lucene.util.StringHelper; * are also de-dup'd (ie if doc has same term more than once * in this field, you'll only get that ord back once). * - * This class tests whether the provided reader is able to - * retrieve terms by ord (ie, it's single segment, and it - * uses an ord-capable terms index). If not, this class + * This class * will create its own term index internally, allowing to * create a wrapped TermsEnum that can handle ord. The * {@link #getOrdTermsEnum} method then provides this - * wrapped enum, if necessary. + * wrapped enum. * * The RAM consumption of this class can be high! * @@ -152,7 +150,7 @@ public class DocTermOrds implements Accountable { protected long sizeOfIndexedStrings; /** Holds the indexed (by default every 128th) terms. */ - protected BytesRef[] indexedTermsArray; + protected BytesRef[] indexedTermsArray = new BytesRef[0]; /** If non-null, only terms matching this prefix were * indexed. */ @@ -219,27 +217,27 @@ public class DocTermOrds implements Accountable { indexInterval = 1 << indexIntervalBits; } - /** Returns a TermsEnum that implements ord. If the - * provided reader supports ord, we just return its - * TermsEnum; if it does not, we build a "private" terms + /** + * Returns a TermsEnum that implements ord, or null if no terms in field. + *

+ * we build a "private" terms * index internally (WARNING: consumes RAM) and use that * index to implement ord. This also enables ord on top * of a composite reader. The returned TermsEnum is * unpositioned. This returns null if there are no terms. - * + *

*

NOTE: you must pass the same reader that was - * used when creating this class */ + * used when creating this class + */ public TermsEnum getOrdTermsEnum(LeafReader reader) throws IOException { - if (indexedTermsArray == null) { - //System.out.println("GET normal enum"); - final Terms terms = reader.terms(field); - if (terms == null) { - return null; - } else { - return terms.iterator(); - } + // NOTE: see LUCENE-6529 before attempting to optimize this method to + // return a TermsEnum directly from the reader if it already supports ord(). + + assert null != indexedTermsArray; + + if (0 == indexedTermsArray.length) { + return null; } else { - //System.out.println("GET wrapped enum ordBase=" + ordBase); return new OrdWrappedTermsEnum(reader); } } @@ -297,12 +295,9 @@ public class DocTermOrds implements Accountable { return; } - // If we need our "term index wrapper", these will be - // init'd below: - List indexedTerms = null; - PagedBytes indexedTermsBytes = null; - - boolean testedOrd = false; + // For our "term index wrapper" + final List indexedTerms = new ArrayList<>(); + final PagedBytes indexedTermsBytes = new PagedBytes(15); // we need a minimum of 9 bytes, but round up to 12 since the space would // be wasted with most allocators anyway. @@ -336,23 +331,9 @@ public class DocTermOrds implements Accountable { } //System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum); - if (!testedOrd) { - try { - ordBase = (int) te.ord(); - //System.out.println("got ordBase=" + ordBase); - } catch (UnsupportedOperationException uoe) { - // Reader cannot provide ord support, so we wrap - // our own support by creating our own terms index: - indexedTerms = new ArrayList<>(); - indexedTermsBytes = new PagedBytes(15); - //System.out.println("NO ORDS"); - } - testedOrd = true; - } - visitTerm(te, termNum); - if (indexedTerms != null && (termNum & indexIntervalMask) == 0) { + if ((termNum & indexIntervalMask) == 0) { // Index this term sizeOfIndexedStrings += t.length; BytesRef indexedTerm = new BytesRef(); @@ -547,9 +528,7 @@ public class DocTermOrds implements Accountable { } } - if (indexedTerms != null) { - indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]); - } + indexedTermsArray = indexedTerms.toArray(new BytesRef[indexedTerms.size()]); long endTime = System.currentTimeMillis(); @@ -598,9 +577,10 @@ public class DocTermOrds implements Accountable { return pos; } - /* Only used if original IndexReader doesn't implement - * ord; in this case we "wrap" our own terms index - * around it. */ + /** + * "wrap" our own terms index around the original IndexReader. + * Only valid if there are terms for this field rom the original reader + */ private final class OrdWrappedTermsEnum extends TermsEnum { private final TermsEnum termsEnum; private BytesRef term; @@ -608,6 +588,7 @@ public class DocTermOrds implements Accountable { public OrdWrappedTermsEnum(LeafReader reader) throws IOException { assert indexedTermsArray != null; + assert 0 != indexedTermsArray.length; termsEnum = reader.fields().terms(field).iterator(); } diff --git a/lucene/misc/src/test/org/apache/lucene/uninverting/TestDocTermOrds.java b/lucene/misc/src/test/org/apache/lucene/uninverting/TestDocTermOrds.java index 9b5f09b4d70..c8a712340c5 100644 --- a/lucene/misc/src/test/org/apache/lucene/uninverting/TestDocTermOrds.java +++ b/lucene/misc/src/test/org/apache/lucene/uninverting/TestDocTermOrds.java @@ -63,6 +63,36 @@ import org.apache.lucene.util.TestUtil; public class TestDocTermOrds extends LuceneTestCase { + public void testEmptyIndex() throws IOException { + final Directory dir = newDirectory(); + final IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random()))); + iw.close(); + + final DirectoryReader ir = DirectoryReader.open(dir); + TestUtil.checkReader(ir); + + final LeafReader composite = SlowCompositeReaderWrapper.wrap(ir); + TestUtil.checkReader(composite); + + // check the leaves + // (normally there are none for an empty index, so this is really just future + // proofing in case that changes for some reason) + for (LeafReaderContext rc : ir.leaves()) { + final LeafReader r = rc.reader(); + final DocTermOrds dto = new DocTermOrds(r, r.getLiveDocs(), "any_field"); + assertNull("OrdTermsEnum should be null (leaf)", dto.getOrdTermsEnum(r)); + assertEquals("iterator should be empty (leaf)", 0, dto.iterator(r).getValueCount()); + } + + // check the composite + final DocTermOrds dto = new DocTermOrds(composite, composite.getLiveDocs(), "any_field"); + assertNull("OrdTermsEnum should be null (composite)", dto.getOrdTermsEnum(composite)); + assertEquals("iterator should be empty (composite)", 0, dto.iterator(composite).getValueCount()); + + ir.close(); + dir.close(); + } + public void testSimple() throws Exception { Directory dir = newDirectory(); final RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(newLogMergePolicy())); @@ -82,6 +112,7 @@ public class TestDocTermOrds extends LuceneTestCase { w.close(); final LeafReader ar = SlowCompositeReaderWrapper.wrap(r); + TestUtil.checkReader(ar); final DocTermOrds dto = new DocTermOrds(ar, ar.getLiveDocs(), "field"); SortedSetDocValues iter = dto.iterator(ar); @@ -185,6 +216,7 @@ public class TestDocTermOrds extends LuceneTestCase { System.out.println("TEST: top reader"); } LeafReader slowR = SlowCompositeReaderWrapper.wrap(r); + TestUtil.checkReader(slowR); verify(slowR, idToOrds, termsArray, null); FieldCache.DEFAULT.purgeByCacheKey(slowR.getCoreCacheKey()); @@ -270,6 +302,7 @@ public class TestDocTermOrds extends LuceneTestCase { } LeafReader slowR = SlowCompositeReaderWrapper.wrap(r); + TestUtil.checkReader(slowR); for(String prefix : prefixesArray) { final BytesRef prefixRef = prefix == null ? null : new BytesRef(prefix); diff --git a/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCache.java b/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCache.java index 19acf263dcf..d2d117b68e4 100644 --- a/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCache.java +++ b/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCache.java @@ -119,6 +119,7 @@ public class TestFieldCache extends LuceneTestCase { } IndexReader r = writer.getReader(); reader = SlowCompositeReaderWrapper.wrap(r); + TestUtil.checkReader(reader); writer.close(); } @@ -293,6 +294,7 @@ public class TestFieldCache extends LuceneTestCase { writer.close(); IndexReader r = DirectoryReader.open(dir); LeafReader reader = SlowCompositeReaderWrapper.wrap(r); + TestUtil.checkReader(reader); FieldCache.DEFAULT.getTerms(reader, "foobar", true); FieldCache.DEFAULT.getTermsIndex(reader, "foobar"); FieldCache.DEFAULT.purgeByCacheKey(reader.getCoreCacheKey()); diff --git a/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCacheVsDocValues.java b/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCacheVsDocValues.java index c7c2986c9b2..adb8591ede8 100644 --- a/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCacheVsDocValues.java +++ b/lucene/misc/src/test/org/apache/lucene/uninverting/TestFieldCacheVsDocValues.java @@ -192,6 +192,7 @@ public class TestFieldCacheVsDocValues extends LuceneTestCase { w.close(); LeafReader ar = SlowCompositeReaderWrapper.wrap(r); + TestUtil.checkReader(ar); BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field", false); for(int docID=0;docID UNINVERT_MAP = new LinkedHashMap(); + UNINVERT_MAP.put("notrie_single", Type.SORTED_SET_INTEGER); + UNINVERT_MAP.put("notrie_multi", Type.SORTED_SET_INTEGER); + UNINVERT_MAP.put("trie_single", Type.SORTED_SET_INTEGER); + UNINVERT_MAP.put("trie_multi", Type.SORTED_SET_INTEGER); + final Set MULTI_VALUES = new LinkedHashSet(); + MULTI_VALUES.add("trie_multi"); + MULTI_VALUES.add("notrie_multi"); + + + final int NUM_DOCS = TestUtil.nextInt(random(), 200, 1500); + final int MIN = TestUtil.nextInt(random(), 10, 100); + final int MAX = MIN + TestUtil.nextInt(random(), 10, 100); + final long EXPECTED_VALSET_SIZE = 1 + MAX - MIN; + + { // (at least) one doc should have every value, so that at least one segment has every value + final Document doc = new Document(); + for (int i = MIN; i <= MAX; i++) { + doc.add(new IntField("trie_multi", i, Field.Store.NO)); + doc.add(new IntField("notrie_multi", i, NO_TRIE_TYPE)); + } + iw.addDocument(doc); + } + + // now add some more random docs (note: starting at i=1 because of previously added doc) + for (int i = 1; i < NUM_DOCS; i++) { + final Document doc = new Document(); + if (0 != TestUtil.nextInt(random(), 0, 9)) { + int val = TestUtil.nextInt(random(), MIN, MAX); + doc.add(new IntField("trie_single", val, Field.Store.NO)); + doc.add(new IntField("notrie_single", val, NO_TRIE_TYPE)); + } + if (0 != TestUtil.nextInt(random(), 0, 9)) { + int numMulti = atLeast(1); + while (0 < numMulti--) { + int val = TestUtil.nextInt(random(), MIN, MAX); + doc.add(new IntField("trie_multi", val, Field.Store.NO)); + doc.add(new IntField("notrie_multi", val, NO_TRIE_TYPE)); + } + } + iw.addDocument(doc); + } + + iw.close(); + + final DirectoryReader ir = UninvertingReader.wrap(DirectoryReader.open(dir), UNINVERT_MAP); + TestUtil.checkReader(ir); + + final int NUM_LEAVES = ir.leaves().size(); + + // check the leaves: no more then total set size + for (LeafReaderContext rc : ir.leaves()) { + final LeafReader ar = rc.reader(); + for (String f : UNINVERT_MAP.keySet()) { + final SortedSetDocValues v = ar.getSortedSetDocValues(f); + final long valSetSize = v.getValueCount(); + assertTrue(f + ": Expected no more then " + EXPECTED_VALSET_SIZE + " values per segment, got " + + valSetSize + " from: " + ar.toString(), + valSetSize <= EXPECTED_VALSET_SIZE); + + if (1 == NUM_LEAVES && MULTI_VALUES.contains(f)) { + // tighter check on multi fields in single segment index since we know one doc has all of them + assertEquals(f + ": Single segment LeafReader's value set should have had exactly expected size", + EXPECTED_VALSET_SIZE, valSetSize); + } + } + } + + // check the composite of all leaves: exact expectation of set size + final LeafReader composite = SlowCompositeReaderWrapper.wrap(ir); + TestUtil.checkReader(composite); + + for (String f : MULTI_VALUES) { + final SortedSetDocValues v = composite.getSortedSetDocValues(f); + final long valSetSize = v.getValueCount(); + assertEquals(f + ": Composite reader value set should have had exactly expected size", + EXPECTED_VALSET_SIZE, valSetSize); + } + + ir.close(); + dir.close(); + } + + public void testSortedSetEmptyIndex() throws IOException { + final Directory dir = newDirectory(); + final IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null)); + iw.close(); + + final Map UNINVERT_MAP = new LinkedHashMap(); + for (Type t : EnumSet.allOf(Type.class)) { + UNINVERT_MAP.put(t.name(), t); + } + + final DirectoryReader ir = UninvertingReader.wrap(DirectoryReader.open(dir), UNINVERT_MAP); + TestUtil.checkReader(ir); + + final LeafReader composite = SlowCompositeReaderWrapper.wrap(ir); + TestUtil.checkReader(composite); + + for (String f : UNINVERT_MAP.keySet()) { + // check the leaves + // (normally there are none for an empty index, so this is really just future + // proofing in case that changes for some reason) + for (LeafReaderContext rc : ir.leaves()) { + final LeafReader ar = rc.reader(); + assertNull(f + ": Expected no doc values from empty index (leaf)", + ar.getSortedSetDocValues(f)); + } + + // check the composite + assertNull(f + ": Expected no doc values from empty index (composite)", + composite.getSortedSetDocValues(f)); + + } + + ir.close(); + dir.close(); + } + } diff --git a/solr/core/src/test/org/apache/solr/search/TestTrieFacet.java b/solr/core/src/test/org/apache/solr/search/TestTrieFacet.java new file mode 100644 index 00000000000..b647a66863a --- /dev/null +++ b/solr/core/src/test/org/apache/solr/search/TestTrieFacet.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.solr.search; + +import org.apache.lucene.util.TestUtil; + +import org.apache.solr.schema.SchemaField; +import org.apache.solr.schema.TrieIntField; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.SolrTestCaseJ4; + +import org.junit.BeforeClass; + +public class TestTrieFacet extends SolrTestCaseJ4 { + + final static int MIN_VALUE = 20; + final static int MAX_VALUE = 60; + + final static String TRIE_INT_P8_S_VALUED = "foo_ti1"; + final static String TRIE_INT_P8_M_VALUED = "foo_ti"; + + final static String TRIE_INT_P0_S_VALUED = "foo_i1"; + final static String TRIE_INT_P0_M_VALUED = "foo_i"; + + final static String[] M_VALUED = new String[] { TRIE_INT_P0_M_VALUED, TRIE_INT_P8_M_VALUED }; + final static String[] S_VALUED = new String[] { TRIE_INT_P0_S_VALUED, TRIE_INT_P8_S_VALUED }; + + final static String[] P0 = new String[] { TRIE_INT_P0_M_VALUED, TRIE_INT_P0_S_VALUED }; + final static String[] P8 = new String[] { TRIE_INT_P8_M_VALUED, TRIE_INT_P8_S_VALUED }; + + static int NUM_DOCS; + + private static TrieIntField assertCastFieldType(SchemaField f) { + assertTrue("who changed the schema? test isn't valid: " + f.getName(), + f.getType() instanceof TrieIntField); + return (TrieIntField) f.getType(); + } + + @BeforeClass + public static void beforeClass() throws Exception { + + initCore("solrconfig-tlog.xml","schema.xml"); + + // don't break the test + assertTrue("min value must be less then max value", MIN_VALUE < MAX_VALUE); + assertTrue("min value must be greater then zero", 0 < MIN_VALUE); + + // sanity check no one breaks the schema out from under us... + for (String f : M_VALUED) { + SchemaField sf = h.getCore().getLatestSchema().getField(f); + assertTrue("who changed the schema? test isn't valid: " + f, sf.multiValued()); + } + + for (String f : S_VALUED) { + SchemaField sf = h.getCore().getLatestSchema().getField(f); + assertFalse("who changed the schema? test isn't valid: " + f, sf.multiValued()); + } + + for (String f : P0) { + SchemaField sf = h.getCore().getLatestSchema().getField(f); + assertEquals("who changed the schema? test isn't valid: " + f, + 0, assertCastFieldType(sf).getPrecisionStep()); + } + for (String f : P8) { + SchemaField sf = h.getCore().getLatestSchema().getField(f); + assertEquals("who changed the schema? test isn't valid: " + f, + 8, assertCastFieldType(sf).getPrecisionStep()); + } + + // we don't need a lot of docs -- at least one failure only had ~1000 + NUM_DOCS = TestUtil.nextInt(random(), 200, 1500); + + { // ensure at least one doc has every valid value in the multivalued fields + SolrInputDocument doc = sdoc("id", "0"); + for (int val = MIN_VALUE; val <= MAX_VALUE; val++) { + for (String f : M_VALUED) { + doc.addField(f, val); + } + } + assertU(adoc(doc)); + } + + // randomized docs (note: starting at i=1) + for (int i=1; i < NUM_DOCS; i++) { + SolrInputDocument doc = sdoc("id", i+""); + if (useField()) { + int val = TestUtil.nextInt(random(), MIN_VALUE, MAX_VALUE); + for (String f : S_VALUED) { + doc.addField(f, val); + } + } + if (useField()) { + int numMulti = atLeast(1); + while (0 < numMulti--) { + int val = TestUtil.nextInt(random(), MIN_VALUE, MAX_VALUE); + for (String f: M_VALUED) { + doc.addField(f, val); + } + } + } + assertU(adoc(doc)); + } + assertU(commit()); + } + + /** + * Similar to usually() but we want it to happen just as often regardless + * of test multiplier and nightly status + */ + private static boolean useField() { + return 0 != TestUtil.nextInt(random(), 0, 9); + } + + private static void doTestNoZeros(final String field, final String method) throws Exception { + + assertQ("sanity check # docs in index: " + NUM_DOCS, + req("q", "*:*", "rows", "0") + ,"//result[@numFound="+NUM_DOCS+"]"); + assertQ("sanity check that no docs match 0 failed", + req("q", field+":0", "rows", "0") + ,"//result[@numFound=0]"); + assertQ("sanity check that no docs match [0 TO 0] failed", + req("q", field+":[0 TO 0]", "rows", "0") + ,"//result[@numFound=0]"); + + assertQ("*:* facet with mincount 0 found unexpected 0 value", + req("q", "*:*" + ,"rows", "0" + ,"indent","true" + ,"facet", "true" + ,"facet.field", field + ,"facet.limit", "-1" + ,"facet.mincount", "0" + ,"facet.method", method + ) + // trivial sanity check we're at least getting facet counts in output + ,"*[count(//lst[@name='facet_fields']/lst[@name='"+field+"']/int)!=0]" + // main point of test + ,"*[count(//lst[@name='facet_fields']/lst[@name='"+field+"']/int[@name='0'])=0]" + ); + } + + // enum + public void testSingleValuedTrieP0_enum() throws Exception { + doTestNoZeros(TRIE_INT_P0_S_VALUED, "enum"); + } + public void testMultiValuedTrieP0_enum() throws Exception { + doTestNoZeros(TRIE_INT_P0_M_VALUED, "enum"); + } + public void testSingleValuedTrieP8_enum() throws Exception { + doTestNoZeros(TRIE_INT_P8_S_VALUED, "enum"); + } + public void testMultiValuedTrieP8_enum() throws Exception { + doTestNoZeros(TRIE_INT_P8_M_VALUED, "enum"); + } + + // fc + public void testSingleValuedTrieP0_fc() throws Exception { + doTestNoZeros(TRIE_INT_P0_S_VALUED, "fc"); + } + public void testMultiValuedTrieP0_fc() throws Exception { + doTestNoZeros(TRIE_INT_P0_M_VALUED, "fc"); + } + public void testSingleValuedTrieP8_fc() throws Exception { + doTestNoZeros(TRIE_INT_P8_S_VALUED, "fc"); + } + public void testMultiValuedTrieP8_fc() throws Exception { + doTestNoZeros(TRIE_INT_P8_M_VALUED, "fc"); + } + + // fcs + public void testSingleValuedTrieP0_fcs() throws Exception { + doTestNoZeros(TRIE_INT_P0_S_VALUED, "fcs"); + } + public void testMultiValuedTrieP0_fcs() throws Exception { + doTestNoZeros(TRIE_INT_P0_M_VALUED, "fcs"); + } + public void testSingleValuedTrieP8_fcs() throws Exception { + doTestNoZeros(TRIE_INT_P8_S_VALUED, "fcs"); + } + public void testMultiValuedTrieP8_fcs() throws Exception { + doTestNoZeros(TRIE_INT_P8_M_VALUED, "fcs"); + } + +} +