From 650cad19a2ccf780a8ce30a6a8dfcf0d9d85022e Mon Sep 17 00:00:00 2001 From: Greg Miller Date: Mon, 3 May 2021 08:18:38 -0700 Subject: [PATCH] LUCENE-9948: Automatically detect multi- vs. single-valued cases in LongValueFacetCounts (#122) The public API in LongValueFacetCounts previously required the user to specify whether-or-not a field being counted should be single- or multi-valued (i.e., is it NumericDocValues or SortedNumericDocValues). Since we can detect this automatically, it seems unnecessary to ask users to specify. Co-authored-by: Greg Miller --- lucene/CHANGES.txt | 5 +- lucene/MIGRATE.md | 6 + .../lucene/facet/LongValueFacetCounts.java | 172 ++++++------------ .../facet/TestLongValueFacetCounts.java | 40 ++-- 4 files changed, 88 insertions(+), 135 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 61fbee6c8b0..45dde7284a3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -110,7 +110,10 @@ API Changes Adrien Grand, Simon Willnauer) * LUCENE-9047: Directory API is now little endian. (Ignacio Vera, Adrien Grand) - + +* LUCENE-9948: No longer require the user to specify whether-or-not a field is multi-valued in + LongValueFacetCounts (detect automatically based on what is indexed). (Greg Miller) + Improvements * LUCENE-9687: Hunspell support improvements: add API for spell-checking and suggestions, support compound words, diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md index 783efffeb05..29f5f7b5129 100644 --- a/lucene/MIGRATE.md +++ b/lucene/MIGRATE.md @@ -400,3 +400,9 @@ field. Lucene index readers are now using so little memory with the default codec that it was decided to remove the ability to estimate their RAM usage. + +## LongValueFacetCounts no longer accepts multiValued param in constructors (LUCENE-9948) + +LongValueFacetCounts will now automatically detect whether-or-not an indexed field is single- or +multi-valued. The user no longer needs to provide this information to the ctors. Migrating should +be as simple as no longer providing this boolean. \ No newline at end of file diff --git a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java index 4379ab72f7c..520d931d66c 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/LongValueFacetCounts.java @@ -54,52 +54,35 @@ public class LongValueFacetCounts extends Facets { /** Used for all values that are >= 1K. */ private final LongIntScatterMap hashCounts = new LongIntScatterMap(); + /** Field being counted. */ private final String field; /** - * Total number of values counted, which is the subset of hits that had a value for this field. + * Total value count. For single-value cases, this is the subset of hits that had a value for this + * field. */ private int totCount; /** * Create {@code LongValueFacetCounts}, using either single-valued {@link NumericDocValues} or - * multi-valued {@link SortedNumericDocValues} from the specified field. + * multi-valued {@link SortedNumericDocValues} from the specified field (depending on what has + * been indexed). */ - public LongValueFacetCounts(String field, FacetsCollector hits, boolean multiValued) - throws IOException { - this(field, null, hits, multiValued); + public LongValueFacetCounts(String field, FacetsCollector hits) throws IOException { + this(field, null, hits); } /** - * Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource}. If hits is - * null then all facets are counted. + * Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource} if non-null. + * If {@code valueSource} is null, doc values from the provided {@code field} will be used. */ public LongValueFacetCounts(String field, LongValuesSource valueSource, FacetsCollector hits) throws IOException { - this(field, valueSource, hits, false); - } - - /** - * Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource}. random access - * (implement {@link org.apache.lucene.search.DocIdSet#bits}). - */ - public LongValueFacetCounts( - String field, LongValuesSource valueSource, FacetsCollector hits, boolean multiValued) - throws IOException { this.field = field; - if (valueSource == null) { - if (multiValued) { - countMultiValued(field, hits.getMatchingDocs()); - } else { - count(field, hits.getMatchingDocs()); - } - } else { - // value source is always single valued - if (multiValued) { - throw new IllegalArgumentException( - "can only compute multi-valued facets directly from doc values (when valueSource is null)"); - } + if (valueSource != null) { count(valueSource, hits.getMatchingDocs()); + } else { + count(field, hits.getMatchingDocs()); } } @@ -107,31 +90,32 @@ public class LongValueFacetCounts extends Facets { * Counts all facet values for this reader. This produces the same result as computing facets on a * {@link org.apache.lucene.search.MatchAllDocsQuery}, but is more efficient. */ - public LongValueFacetCounts(String field, IndexReader reader, boolean multiValued) + public LongValueFacetCounts(String field, IndexReader reader) throws IOException { + this(field, null, reader); + } + + /** + * Counts all facet values for the provided {@link LongValuesSource} if non-null. If {@code + * valueSource} is null, doc values from the provided {@code field} will be used. This produces + * the same result as computing facets on a {@link org.apache.lucene.search.MatchAllDocsQuery}, + * but is more efficient. + */ + public LongValueFacetCounts(String field, LongValuesSource valueSource, IndexReader reader) throws IOException { this.field = field; - if (multiValued) { - countAllMultiValued(reader, field); + if (valueSource != null) { + countAll(reader, valueSource); } else { countAll(reader, field); } } - /** - * Counts all facet values for the provided {@link LongValuesSource}. This produces the same - * result as computing facets on a {@link org.apache.lucene.search.MatchAllDocsQuery}, but is more - * efficient. - */ - public LongValueFacetCounts(String field, LongValuesSource valueSource, IndexReader reader) - throws IOException { - this.field = field; - countAll(valueSource, field, reader); - } - + /** Counts from the provided valueSource. */ private void count(LongValuesSource valueSource, List matchingDocs) throws IOException { for (MatchingDocs hits : matchingDocs) { + LongValues fv = valueSource.getValues(hits.context, null); // NOTE: this is not as efficient as working directly with the doc values APIs in the sparse @@ -152,80 +136,41 @@ public class LongValueFacetCounts extends Facets { } } + /** Counts from the field's indexed doc values. */ private void count(String field, List matchingDocs) throws IOException { - for (MatchingDocs hits : matchingDocs) { - NumericDocValues fv = hits.context.reader().getNumericDocValues(field); - if (fv == null) { - continue; - } - countOneSegment(fv, hits); - } - } - - private void countOneSegment(NumericDocValues values, MatchingDocs hits) throws IOException { - DocIdSetIterator it = - ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), values)); - - for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { - increment(values.longValue()); - totCount++; - } - } - - /** Counts directly from SortedNumericDocValues. */ - private void countMultiValued(String field, List matchingDocs) throws IOException { for (MatchingDocs hits : matchingDocs) { - SortedNumericDocValues values = hits.context.reader().getSortedNumericDocValues(field); - if (values == null) { - // this field has no doc values for this segment - continue; - } - NumericDocValues singleValues = DocValues.unwrapSingleton(values); + SortedNumericDocValues multiValues = DocValues.getSortedNumeric(hits.context.reader(), field); + NumericDocValues singleValues = DocValues.unwrapSingleton(multiValues); if (singleValues != null) { - countOneSegment(singleValues, hits); + + DocIdSetIterator it = + ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), singleValues)); + + for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { + increment(singleValues.longValue()); + totCount++; + } } else { DocIdSetIterator it = - ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), values)); + ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), multiValues)); for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { - int limit = values.docValueCount(); + int limit = multiValues.docValueCount(); totCount += limit; for (int i = 0; i < limit; i++) { - increment(values.nextValue()); + increment(multiValues.nextValue()); } } } } } - /** Optimized version that directly counts all doc values. */ - private void countAll(IndexReader reader, String field) throws IOException { - - for (LeafReaderContext context : reader.leaves()) { - - NumericDocValues values = context.reader().getNumericDocValues(field); - if (values == null) { - // this field has no doc values for this segment - continue; - } - - countAllOneSegment(values); - } - } - - private void countAllOneSegment(NumericDocValues values) throws IOException { - while (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - totCount++; - increment(values.longValue()); - } - } - - private void countAll(LongValuesSource valueSource, String field, IndexReader reader) - throws IOException { + /** Count everything in the provided valueSource. */ + private void countAll(IndexReader reader, LongValuesSource valueSource) throws IOException { for (LeafReaderContext context : reader.leaves()) { LongValues fv = valueSource.getValues(context, null); @@ -241,24 +186,27 @@ public class LongValueFacetCounts extends Facets { } } - private void countAllMultiValued(IndexReader reader, String field) throws IOException { + /** Count everything in the specified field. */ + private void countAll(IndexReader reader, String field) throws IOException { for (LeafReaderContext context : reader.leaves()) { - SortedNumericDocValues values = context.reader().getSortedNumericDocValues(field); - if (values == null) { - // this field has no doc values for this segment - continue; - } - NumericDocValues singleValues = DocValues.unwrapSingleton(values); + SortedNumericDocValues multiValues = DocValues.getSortedNumeric(context.reader(), field); + NumericDocValues singleValues = DocValues.unwrapSingleton(multiValues); + if (singleValues != null) { - countAllOneSegment(singleValues); + + while (singleValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + totCount++; + increment(singleValues.longValue()); + } } else { - while (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - int limit = values.docValueCount(); + + while (multiValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + int limit = multiValues.docValueCount(); totCount += limit; for (int i = 0; i < limit; i++) { - increment(values.nextValue()); + increment(multiValues.nextValue()); } } } @@ -294,7 +242,7 @@ public class LongValueFacetCounts extends Facets { /** Returns the specified top number of facets, sorted by count. */ public FacetResult getTopChildrenSortByCount(int topN) { PriorityQueue pq = - new PriorityQueue(Math.min(topN, counts.length + hashCounts.size())) { + new PriorityQueue<>(Math.min(topN, counts.length + hashCounts.size())) { @Override protected boolean lessThan(Entry a, Entry b) { // sort by count descending, breaking ties by value ascending: @@ -409,13 +357,13 @@ public class LongValueFacetCounts extends Facets { } @Override - public Number getSpecificValue(String dim, String... path) throws IOException { + public Number getSpecificValue(String dim, String... path) { // TODO: should we impl this? throw new UnsupportedOperationException(); } @Override - public List getAllDims(int topN) throws IOException { + public List getAllDims(int topN) { return Collections.singletonList(getTopChildren(topN, field)); } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java b/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java index 1c4b5eb943b..5821052d12e 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/TestLongValueFacetCounts.java @@ -19,7 +19,7 @@ package org.apache.lucene.facet; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -60,7 +60,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { IndexSearcher s = newSearcher(r); s.search(new MatchAllDocsQuery(), fc); - LongValueFacetCounts facets = new LongValueFacetCounts("field", fc, false); + LongValueFacetCounts facets = new LongValueFacetCounts("field", fc); FacetResult result = facets.getAllChildrenSortByValue(); assertEquals( @@ -87,7 +87,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { IndexSearcher s = newSearcher(r); s.search(new MatchAllDocsQuery(), fc); - LongValueFacetCounts facets = new LongValueFacetCounts("field", fc, false); + LongValueFacetCounts facets = new LongValueFacetCounts("field", fc); FacetResult result = facets.getAllChildrenSortByValue(); assertEquals( @@ -119,7 +119,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { IndexSearcher s = newSearcher(r); s.search(new MatchAllDocsQuery(), fc); - Facets facets = new LongValueFacetCounts("field", fc, false); + Facets facets = new LongValueFacetCounts("field", fc); List result = facets.getAllDims(10); assertEquals(1, result.size()); @@ -199,7 +199,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { List> expectedCounts = new ArrayList<>(expected.entrySet()); // sort by value - Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey()))); + expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey)); LongValueFacetCounts facetCounts; if (random().nextBoolean()) { @@ -214,7 +214,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { if (VERBOSE) { System.out.println(" use doc values"); } - facetCounts = new LongValueFacetCounts("field", fc, false); + facetCounts = new LongValueFacetCounts("field", fc); } } else { // optimized count all: @@ -228,7 +228,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { if (VERBOSE) { System.out.println(" count all doc values"); } - facetCounts = new LongValueFacetCounts("field", r, false); + facetCounts = new LongValueFacetCounts("field", r); } } @@ -242,8 +242,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { Integer.MAX_VALUE); // sort by count - Collections.sort( - expectedCounts, + expectedCounts.sort( (a, b) -> { int cmp = -Integer.compare(a.getValue(), b.getValue()); if (cmp == 0) { @@ -288,7 +287,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { if (VERBOSE) { System.out.println(" use doc values"); } - facetCounts = new LongValueFacetCounts("field", fc, false); + facetCounts = new LongValueFacetCounts("field", fc); } else { if (VERBOSE) { System.out.println(" use value source"); @@ -314,7 +313,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { expectedCounts = new ArrayList<>(expected.entrySet()); // sort by value - Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey()))); + expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey)); actual = facetCounts.getAllChildrenSortByValue(); assertSame( "id " + minId + "-" + maxId + ", sort facets by value", @@ -325,8 +324,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { Integer.MAX_VALUE); // sort by count - Collections.sort( - expectedCounts, + expectedCounts.sort( (a, b) -> { int cmp = -Integer.compare(a.getValue(), b.getValue()); if (cmp == 0) { @@ -447,7 +445,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { List> expectedCounts = new ArrayList<>(expected.entrySet()); // sort by value - Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey()))); + expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey)); LongValueFacetCounts facetCounts; if (random().nextBoolean()) { @@ -455,13 +453,13 @@ public class TestLongValueFacetCounts extends LuceneTestCase { if (VERBOSE) { System.out.println(" use doc values"); } - facetCounts = new LongValueFacetCounts("field", fc, true); + facetCounts = new LongValueFacetCounts("field", fc); } else { // optimized count all: if (VERBOSE) { System.out.println(" count all doc values"); } - facetCounts = new LongValueFacetCounts("field", r, true); + facetCounts = new LongValueFacetCounts("field", r); } FacetResult actual = facetCounts.getAllChildrenSortByValue(); @@ -474,8 +472,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { Integer.MAX_VALUE); // sort by count - Collections.sort( - expectedCounts, + expectedCounts.sort( (a, b) -> { int cmp = -Integer.compare(a.getValue(), b.getValue()); if (cmp == 0) { @@ -517,7 +514,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { fc = new FacetsCollector(); s.search(IntPoint.newRangeQuery("id", minId, maxId), fc); // cannot use value source here because we are multi valued - facetCounts = new LongValueFacetCounts("field", fc, true); + facetCounts = new LongValueFacetCounts("field", fc); expected = new HashMap<>(); expectedChildCount = 0; @@ -538,7 +535,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { expectedCounts = new ArrayList<>(expected.entrySet()); // sort by value - Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey()))); + expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey)); actual = facetCounts.getAllChildrenSortByValue(); assertSame( "id " + minId + "-" + maxId + ", sort facets by value", @@ -549,8 +546,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase { Integer.MAX_VALUE); // sort by count - Collections.sort( - expectedCounts, + expectedCounts.sort( (a, b) -> { int cmp = -Integer.compare(a.getValue(), b.getValue()); if (cmp == 0) {