SOLR-11363: handle repeated values in points docvalues fields

This commit is contained in:
yonik 2017-09-19 13:39:45 -04:00
parent 8e12f20113
commit 7cc9ee6563
7 changed files with 58 additions and 10 deletions

View File

@ -133,6 +133,10 @@ Bug Fixes
* SOLR-11348: Fix the DIH database example (James Dyer)
* SOLR-11363: JSON Facet API: repeated values in a numeric points field with docValues enabled
were double counted. (Hossman, yonik)
Optimizations
----------------------

View File

@ -390,9 +390,16 @@ class FacetFieldProcessorByHashDV extends FacetFieldProcessor {
values.advance(segDoc);
}
if (segDoc == values.docID()) {
for (int i = 0; i < values.docValueCount(); i++) {
collectValFirstPhase(segDoc, values.nextValue());
long l = values.nextValue(); // This document must have at least one value
collectValFirstPhase(segDoc, l);
for (int i = 1; i < values.docValueCount(); i++) {
long lnew = values.nextValue();
if (lnew != l) { // Skip the value if it's equal to the last one, we don't want to double-count it
collectValFirstPhase(segDoc, lnew);
}
l = lnew;
}
}
}
});

View File

@ -235,6 +235,7 @@ public class HLLAgg extends StrAggValueSource {
@Override
protected void collectValues(int doc, HLL hll) throws IOException {
for (int i = 0; i < values.docValueCount(); i++) {
// duplicates may be produced for a single doc, but won't matter here.
long val = values.nextValue();
long hash = Hash.fmix64(val);
hll.addRaw(hash);

View File

@ -254,6 +254,7 @@ public class UniqueAgg extends StrAggValueSource {
@Override
protected void collectValues(int doc, LongSet set) throws IOException {
  // Drain every value of the current document into the set. The same value may be
  // returned more than once for a single doc; adding it to the set again has no
  // effect, so no de-duplication is needed here.
  for (int remaining = values.docValueCount(); remaining > 0; remaining--) {
    long value = values.nextValue();
    set.add(value);
  }
}

View File

@ -60,6 +60,7 @@ public class GraphPointsCollector extends GraphEdgeCollector {
if (valuesDoc == doc) {
int count = values.docValueCount();
for (int i = 0; i < count; i++) {
// duplicates may be produced for a single doc, but won't matter here.
long v = values.nextValue();
set.add(v);
}

View File

@ -17,7 +17,6 @@
package org.apache.solr.cloud;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Path;
import java.nio.file.Paths;
@ -29,10 +28,9 @@ import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.util.TestUtil;
import org.apache.solr.SolrTestCaseJ4.SuppressPointFields;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
@ -46,7 +44,6 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.slf4j.Logger;
@ -62,7 +59,6 @@ import org.slf4j.LoggerFactory;
*
* @see TestCloudPivotFacet
*/
@SuppressPointFields(bugUrl="https://issues.apache.org/jira/browse/SOLR-10939")
public class TestCloudJSONFacetJoinDomain extends SolrCloudTestCase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@ -600,9 +596,21 @@ public class TestCloudJSONFacetJoinDomain extends SolrCloudTestCase {
final String[] suffixes = random().nextBoolean() ? STR_FIELD_SUFFIXES : INT_FIELD_SUFFIXES;
final boolean noJoin = random().nextBoolean();
final String from = noJoin ? null : field(suffixes, random().nextInt(MAX_FIELD_NUM));
final String to = noJoin ? null : field(suffixes, random().nextInt(MAX_FIELD_NUM));
String from = null;
String to = null;
for (;;) {
if (noJoin) break;
from = field(suffixes, random().nextInt(MAX_FIELD_NUM));
to = field(suffixes, random().nextInt(MAX_FIELD_NUM));
// HACK: joined numeric point fields need docValues. For now, just skip _is fields if we are dealing with points.
if (Boolean.getBoolean(NUMERIC_POINTS_SYSPROP) && (from.endsWith("_is") || to.endsWith("_is")))
{
continue;
}
break;
}
// keep it simple, only filter on string fields - not point of test
final String filterField = strfield(random().nextInt(MAX_FIELD_NUM));

View File

@ -42,6 +42,10 @@ import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
// Related tests:
// TestCloudJSONFacetJoinDomain for random field faceting tests with domain modifications
// TestJsonFacetRefinement for refinement tests
@LuceneTestCase.SuppressCodecs({"Lucene3x","Lucene40","Lucene41","Lucene42","Lucene45","Appending"})
public class TestJsonFacets extends SolrTestCaseHS {
@ -208,6 +212,28 @@ public class TestJsonFacets extends SolrTestCaseHS {
client.commit();
}
@Test
public void testRepeatedNumerics() throws Exception {
  // docValues of a multi-valued points field can contain duplicate values, so index
  // the same value twice in one document and make sure the counts are not inflated.
  final Client client = Client.localClient();
  final String field = "num_is";

  client.add(
      sdoc("id", "1", "cat_s", "A", "where_s", "NY", "num_d", "4", "num_i", "2",
           "val_b", "true", "sparse_s", "one",
           field, "0",
           field, "0"),
      null);
  client.commit();

  // One terms facet, one hll aggregation and one range facet over the repeated field.
  final String facetRequest =
      "{"
      + "f1:{terms:${field}}"
      + ",f2:'hll(${field})'"
      + ",f3:{type:range, field:${field}, start:0, end:1, gap:1}"
      + "}";

  // Each facet must report the single document (and single distinct value) exactly once.
  final String expectedResponse =
      "facets=={count:1, "
      + "f1:{buckets:[{val:0, count:1}]}"
      + ",f2:1"
      + ",f3:{buckets:[{val:0, count:1}]}"
      + "}";

  client.testJQ(
      params("q", "id:1", "field", field, "json.facet", facetRequest),
      expectedResponse);
}
public void testDomainJoinSelf() throws Exception {
Client client = Client.localClient();
indexSimple(client);