LUCENE-9948: Automatically detect multi- vs. single-valued cases in LongValueFacetCounts (#122)

The public API in LongValueFacetCounts previously required the user to specify whether-or-not a field being counted should be single- or multi-valued (i.e., is it NumericDocValues or SortedNumericDocValues). Since we can detect this automatically, it seems unnecessary to ask users to specify.

Co-authored-by: Greg Miller <gmiller@amazon.com>
This commit is contained in:
Greg Miller 2021-05-03 08:18:38 -07:00 committed by GitHub
parent a91bde5104
commit 650cad19a2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 88 additions and 135 deletions

View File

@ -111,6 +111,9 @@ API Changes
* LUCENE-9047: Directory API is now little endian. (Ignacio Vera, Adrien Grand) * LUCENE-9047: Directory API is now little endian. (Ignacio Vera, Adrien Grand)
* LUCENE-9948: No longer require the user to specify whether-or-not a field is multi-valued in
LongValueFacetCounts (detect automatically based on what is indexed). (Greg Miller)
Improvements Improvements
* LUCENE-9687: Hunspell support improvements: add API for spell-checking and suggestions, support compound words, * LUCENE-9687: Hunspell support improvements: add API for spell-checking and suggestions, support compound words,

View File

@ -400,3 +400,9 @@ field.
Lucene index readers are now using so little memory with the default codec that Lucene index readers are now using so little memory with the default codec that
it was decided to remove the ability to estimate their RAM usage. it was decided to remove the ability to estimate their RAM usage.
## LongValueFacetCounts no longer accepts multiValued param in constructors (LUCENE-9948)
LongValueFacetCounts will now automatically detect whether-or-not an indexed field is single- or
multi-valued. The user no longer needs to provide this information to the ctors. Migrating should
be as simple as no longer providing this boolean.

View File

@ -54,52 +54,35 @@ public class LongValueFacetCounts extends Facets {
/** Used for all values that are >= 1K. */ /** Used for all values that are >= 1K. */
private final LongIntScatterMap hashCounts = new LongIntScatterMap(); private final LongIntScatterMap hashCounts = new LongIntScatterMap();
/** Field being counted. */
private final String field; private final String field;
/** /**
* Total number of values counted, which is the subset of hits that had a value for this field. * Total value count. For single-value cases, this is the subset of hits that had a value for this
* field.
*/ */
private int totCount; private int totCount;
/** /**
* Create {@code LongValueFacetCounts}, using either single-valued {@link NumericDocValues} or * Create {@code LongValueFacetCounts}, using either single-valued {@link NumericDocValues} or
* multi-valued {@link SortedNumericDocValues} from the specified field. * multi-valued {@link SortedNumericDocValues} from the specified field (depending on what has
* been indexed).
*/ */
public LongValueFacetCounts(String field, FacetsCollector hits, boolean multiValued) public LongValueFacetCounts(String field, FacetsCollector hits) throws IOException {
throws IOException { this(field, null, hits);
this(field, null, hits, multiValued);
} }
/** /**
* Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource}. If hits is * Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource} if non-null.
* null then all facets are counted. * If {@code valueSource} is null, doc values from the provided {@code field} will be used.
*/ */
public LongValueFacetCounts(String field, LongValuesSource valueSource, FacetsCollector hits) public LongValueFacetCounts(String field, LongValuesSource valueSource, FacetsCollector hits)
throws IOException { throws IOException {
this(field, valueSource, hits, false);
}
/**
* Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource}. random access
* (implement {@link org.apache.lucene.search.DocIdSet#bits}).
*/
public LongValueFacetCounts(
String field, LongValuesSource valueSource, FacetsCollector hits, boolean multiValued)
throws IOException {
this.field = field; this.field = field;
if (valueSource == null) { if (valueSource != null) {
if (multiValued) {
countMultiValued(field, hits.getMatchingDocs());
} else {
count(field, hits.getMatchingDocs());
}
} else {
// value source is always single valued
if (multiValued) {
throw new IllegalArgumentException(
"can only compute multi-valued facets directly from doc values (when valueSource is null)");
}
count(valueSource, hits.getMatchingDocs()); count(valueSource, hits.getMatchingDocs());
} else {
count(field, hits.getMatchingDocs());
} }
} }
@ -107,31 +90,32 @@ public class LongValueFacetCounts extends Facets {
* Counts all facet values for this reader. This produces the same result as computing facets on a * Counts all facet values for this reader. This produces the same result as computing facets on a
* {@link org.apache.lucene.search.MatchAllDocsQuery}, but is more efficient. * {@link org.apache.lucene.search.MatchAllDocsQuery}, but is more efficient.
*/ */
public LongValueFacetCounts(String field, IndexReader reader, boolean multiValued) public LongValueFacetCounts(String field, IndexReader reader) throws IOException {
this(field, null, reader);
}
/**
* Counts all facet values for the provided {@link LongValuesSource} if non-null. If {@code
* valueSource} is null, doc values from the provided {@code field} will be used. This produces
* the same result as computing facets on a {@link org.apache.lucene.search.MatchAllDocsQuery},
* but is more efficient.
*/
public LongValueFacetCounts(String field, LongValuesSource valueSource, IndexReader reader)
throws IOException { throws IOException {
this.field = field; this.field = field;
if (multiValued) { if (valueSource != null) {
countAllMultiValued(reader, field); countAll(reader, valueSource);
} else { } else {
countAll(reader, field); countAll(reader, field);
} }
} }
/** /** Counts from the provided valueSource. */
* Counts all facet values for the provided {@link LongValuesSource}. This produces the same
* result as computing facets on a {@link org.apache.lucene.search.MatchAllDocsQuery}, but is more
* efficient.
*/
public LongValueFacetCounts(String field, LongValuesSource valueSource, IndexReader reader)
throws IOException {
this.field = field;
countAll(valueSource, field, reader);
}
private void count(LongValuesSource valueSource, List<MatchingDocs> matchingDocs) private void count(LongValuesSource valueSource, List<MatchingDocs> matchingDocs)
throws IOException { throws IOException {
for (MatchingDocs hits : matchingDocs) { for (MatchingDocs hits : matchingDocs) {
LongValues fv = valueSource.getValues(hits.context, null); LongValues fv = valueSource.getValues(hits.context, null);
// NOTE: this is not as efficient as working directly with the doc values APIs in the sparse // NOTE: this is not as efficient as working directly with the doc values APIs in the sparse
@ -152,80 +136,41 @@ public class LongValueFacetCounts extends Facets {
} }
} }
/** Counts from the field's indexed doc values. */
private void count(String field, List<MatchingDocs> matchingDocs) throws IOException { private void count(String field, List<MatchingDocs> matchingDocs) throws IOException {
for (MatchingDocs hits : matchingDocs) {
NumericDocValues fv = hits.context.reader().getNumericDocValues(field);
if (fv == null) {
continue;
}
countOneSegment(fv, hits);
}
}
private void countOneSegment(NumericDocValues values, MatchingDocs hits) throws IOException {
DocIdSetIterator it =
ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), values));
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
increment(values.longValue());
totCount++;
}
}
/** Counts directly from SortedNumericDocValues. */
private void countMultiValued(String field, List<MatchingDocs> matchingDocs) throws IOException {
for (MatchingDocs hits : matchingDocs) { for (MatchingDocs hits : matchingDocs) {
SortedNumericDocValues values = hits.context.reader().getSortedNumericDocValues(field);
if (values == null) {
// this field has no doc values for this segment
continue;
}
NumericDocValues singleValues = DocValues.unwrapSingleton(values); SortedNumericDocValues multiValues = DocValues.getSortedNumeric(hits.context.reader(), field);
NumericDocValues singleValues = DocValues.unwrapSingleton(multiValues);
if (singleValues != null) { if (singleValues != null) {
countOneSegment(singleValues, hits);
DocIdSetIterator it =
ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), singleValues));
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
increment(singleValues.longValue());
totCount++;
}
} else { } else {
DocIdSetIterator it = DocIdSetIterator it =
ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), values)); ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), multiValues));
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) { for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
int limit = values.docValueCount(); int limit = multiValues.docValueCount();
totCount += limit; totCount += limit;
for (int i = 0; i < limit; i++) { for (int i = 0; i < limit; i++) {
increment(values.nextValue()); increment(multiValues.nextValue());
} }
} }
} }
} }
} }
/** Optimized version that directly counts all doc values. */ /** Count everything in the provided valueSource. */
private void countAll(IndexReader reader, String field) throws IOException { private void countAll(IndexReader reader, LongValuesSource valueSource) throws IOException {
for (LeafReaderContext context : reader.leaves()) {
NumericDocValues values = context.reader().getNumericDocValues(field);
if (values == null) {
// this field has no doc values for this segment
continue;
}
countAllOneSegment(values);
}
}
private void countAllOneSegment(NumericDocValues values) throws IOException {
while (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
totCount++;
increment(values.longValue());
}
}
private void countAll(LongValuesSource valueSource, String field, IndexReader reader)
throws IOException {
for (LeafReaderContext context : reader.leaves()) { for (LeafReaderContext context : reader.leaves()) {
LongValues fv = valueSource.getValues(context, null); LongValues fv = valueSource.getValues(context, null);
@ -241,24 +186,27 @@ public class LongValueFacetCounts extends Facets {
} }
} }
private void countAllMultiValued(IndexReader reader, String field) throws IOException { /** Count everything in the specified field. */
private void countAll(IndexReader reader, String field) throws IOException {
for (LeafReaderContext context : reader.leaves()) { for (LeafReaderContext context : reader.leaves()) {
SortedNumericDocValues values = context.reader().getSortedNumericDocValues(field); SortedNumericDocValues multiValues = DocValues.getSortedNumeric(context.reader(), field);
if (values == null) { NumericDocValues singleValues = DocValues.unwrapSingleton(multiValues);
// this field has no doc values for this segment
continue;
}
NumericDocValues singleValues = DocValues.unwrapSingleton(values);
if (singleValues != null) { if (singleValues != null) {
countAllOneSegment(singleValues);
while (singleValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
totCount++;
increment(singleValues.longValue());
}
} else { } else {
while (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int limit = values.docValueCount(); while (multiValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
int limit = multiValues.docValueCount();
totCount += limit; totCount += limit;
for (int i = 0; i < limit; i++) { for (int i = 0; i < limit; i++) {
increment(values.nextValue()); increment(multiValues.nextValue());
} }
} }
} }
@ -294,7 +242,7 @@ public class LongValueFacetCounts extends Facets {
/** Returns the specified top number of facets, sorted by count. */ /** Returns the specified top number of facets, sorted by count. */
public FacetResult getTopChildrenSortByCount(int topN) { public FacetResult getTopChildrenSortByCount(int topN) {
PriorityQueue<Entry> pq = PriorityQueue<Entry> pq =
new PriorityQueue<Entry>(Math.min(topN, counts.length + hashCounts.size())) { new PriorityQueue<>(Math.min(topN, counts.length + hashCounts.size())) {
@Override @Override
protected boolean lessThan(Entry a, Entry b) { protected boolean lessThan(Entry a, Entry b) {
// sort by count descending, breaking ties by value ascending: // sort by count descending, breaking ties by value ascending:
@ -409,13 +357,13 @@ public class LongValueFacetCounts extends Facets {
} }
@Override @Override
public Number getSpecificValue(String dim, String... path) throws IOException { public Number getSpecificValue(String dim, String... path) {
// TODO: should we impl this? // TODO: should we impl this?
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
@Override @Override
public List<FacetResult> getAllDims(int topN) throws IOException { public List<FacetResult> getAllDims(int topN) {
return Collections.singletonList(getTopChildren(topN, field)); return Collections.singletonList(getTopChildren(topN, field));
} }

View File

@ -19,7 +19,7 @@ package org.apache.lucene.facet;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -60,7 +60,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
IndexSearcher s = newSearcher(r); IndexSearcher s = newSearcher(r);
s.search(new MatchAllDocsQuery(), fc); s.search(new MatchAllDocsQuery(), fc);
LongValueFacetCounts facets = new LongValueFacetCounts("field", fc, false); LongValueFacetCounts facets = new LongValueFacetCounts("field", fc);
FacetResult result = facets.getAllChildrenSortByValue(); FacetResult result = facets.getAllChildrenSortByValue();
assertEquals( assertEquals(
@ -87,7 +87,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
IndexSearcher s = newSearcher(r); IndexSearcher s = newSearcher(r);
s.search(new MatchAllDocsQuery(), fc); s.search(new MatchAllDocsQuery(), fc);
LongValueFacetCounts facets = new LongValueFacetCounts("field", fc, false); LongValueFacetCounts facets = new LongValueFacetCounts("field", fc);
FacetResult result = facets.getAllChildrenSortByValue(); FacetResult result = facets.getAllChildrenSortByValue();
assertEquals( assertEquals(
@ -119,7 +119,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
IndexSearcher s = newSearcher(r); IndexSearcher s = newSearcher(r);
s.search(new MatchAllDocsQuery(), fc); s.search(new MatchAllDocsQuery(), fc);
Facets facets = new LongValueFacetCounts("field", fc, false); Facets facets = new LongValueFacetCounts("field", fc);
List<FacetResult> result = facets.getAllDims(10); List<FacetResult> result = facets.getAllDims(10);
assertEquals(1, result.size()); assertEquals(1, result.size());
@ -199,7 +199,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet()); List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet());
// sort by value // sort by value
Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey()))); expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
LongValueFacetCounts facetCounts; LongValueFacetCounts facetCounts;
if (random().nextBoolean()) { if (random().nextBoolean()) {
@ -214,7 +214,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
if (VERBOSE) { if (VERBOSE) {
System.out.println(" use doc values"); System.out.println(" use doc values");
} }
facetCounts = new LongValueFacetCounts("field", fc, false); facetCounts = new LongValueFacetCounts("field", fc);
} }
} else { } else {
// optimized count all: // optimized count all:
@ -228,7 +228,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
if (VERBOSE) { if (VERBOSE) {
System.out.println(" count all doc values"); System.out.println(" count all doc values");
} }
facetCounts = new LongValueFacetCounts("field", r, false); facetCounts = new LongValueFacetCounts("field", r);
} }
} }
@ -242,8 +242,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
Integer.MAX_VALUE); Integer.MAX_VALUE);
// sort by count // sort by count
Collections.sort( expectedCounts.sort(
expectedCounts,
(a, b) -> { (a, b) -> {
int cmp = -Integer.compare(a.getValue(), b.getValue()); int cmp = -Integer.compare(a.getValue(), b.getValue());
if (cmp == 0) { if (cmp == 0) {
@ -288,7 +287,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
if (VERBOSE) { if (VERBOSE) {
System.out.println(" use doc values"); System.out.println(" use doc values");
} }
facetCounts = new LongValueFacetCounts("field", fc, false); facetCounts = new LongValueFacetCounts("field", fc);
} else { } else {
if (VERBOSE) { if (VERBOSE) {
System.out.println(" use value source"); System.out.println(" use value source");
@ -314,7 +313,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
expectedCounts = new ArrayList<>(expected.entrySet()); expectedCounts = new ArrayList<>(expected.entrySet());
// sort by value // sort by value
Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey()))); expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
actual = facetCounts.getAllChildrenSortByValue(); actual = facetCounts.getAllChildrenSortByValue();
assertSame( assertSame(
"id " + minId + "-" + maxId + ", sort facets by value", "id " + minId + "-" + maxId + ", sort facets by value",
@ -325,8 +324,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
Integer.MAX_VALUE); Integer.MAX_VALUE);
// sort by count // sort by count
Collections.sort( expectedCounts.sort(
expectedCounts,
(a, b) -> { (a, b) -> {
int cmp = -Integer.compare(a.getValue(), b.getValue()); int cmp = -Integer.compare(a.getValue(), b.getValue());
if (cmp == 0) { if (cmp == 0) {
@ -447,7 +445,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet()); List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet());
// sort by value // sort by value
Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey()))); expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
LongValueFacetCounts facetCounts; LongValueFacetCounts facetCounts;
if (random().nextBoolean()) { if (random().nextBoolean()) {
@ -455,13 +453,13 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
if (VERBOSE) { if (VERBOSE) {
System.out.println(" use doc values"); System.out.println(" use doc values");
} }
facetCounts = new LongValueFacetCounts("field", fc, true); facetCounts = new LongValueFacetCounts("field", fc);
} else { } else {
// optimized count all: // optimized count all:
if (VERBOSE) { if (VERBOSE) {
System.out.println(" count all doc values"); System.out.println(" count all doc values");
} }
facetCounts = new LongValueFacetCounts("field", r, true); facetCounts = new LongValueFacetCounts("field", r);
} }
FacetResult actual = facetCounts.getAllChildrenSortByValue(); FacetResult actual = facetCounts.getAllChildrenSortByValue();
@ -474,8 +472,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
Integer.MAX_VALUE); Integer.MAX_VALUE);
// sort by count // sort by count
Collections.sort( expectedCounts.sort(
expectedCounts,
(a, b) -> { (a, b) -> {
int cmp = -Integer.compare(a.getValue(), b.getValue()); int cmp = -Integer.compare(a.getValue(), b.getValue());
if (cmp == 0) { if (cmp == 0) {
@ -517,7 +514,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
fc = new FacetsCollector(); fc = new FacetsCollector();
s.search(IntPoint.newRangeQuery("id", minId, maxId), fc); s.search(IntPoint.newRangeQuery("id", minId, maxId), fc);
// cannot use value source here because we are multi valued // cannot use value source here because we are multi valued
facetCounts = new LongValueFacetCounts("field", fc, true); facetCounts = new LongValueFacetCounts("field", fc);
expected = new HashMap<>(); expected = new HashMap<>();
expectedChildCount = 0; expectedChildCount = 0;
@ -538,7 +535,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
expectedCounts = new ArrayList<>(expected.entrySet()); expectedCounts = new ArrayList<>(expected.entrySet());
// sort by value // sort by value
Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey()))); expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
actual = facetCounts.getAllChildrenSortByValue(); actual = facetCounts.getAllChildrenSortByValue();
assertSame( assertSame(
"id " + minId + "-" + maxId + ", sort facets by value", "id " + minId + "-" + maxId + ", sort facets by value",
@ -549,8 +546,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
Integer.MAX_VALUE); Integer.MAX_VALUE);
// sort by count // sort by count
Collections.sort( expectedCounts.sort(
expectedCounts,
(a, b) -> { (a, b) -> {
int cmp = -Integer.compare(a.getValue(), b.getValue()); int cmp = -Integer.compare(a.getValue(), b.getValue());
if (cmp == 0) { if (cmp == 0) {