mirror of https://github.com/apache/lucene.git
LUCENE-9948: Automatically detect multi- vs. single-valued cases in LongValueFacetCounts (#122)
The public API in LongValueFacetCounts previously required the user to specify whether-or-not a field being counted should be single- or multi-valued (i.e., is it NumericDocValues or SortedNumericDocValues). Since we can detect this automatically, it seems unnecessary to ask users to specify. Co-authored-by: Greg Miller <gmiller@amazon.com>
This commit is contained in:
parent
a91bde5104
commit
650cad19a2
lucene
|
@ -110,7 +110,10 @@ API Changes
|
|||
Adrien Grand, Simon Willnauer)
|
||||
|
||||
* LUCENE-9047: Directory API is now little endian. (Ignacio Vera, Adrien Grand)
|
||||
|
||||
|
||||
* LUCENE-9948: No longer require the user to specify whether-or-not a field is multi-valued in
|
||||
LongValueFacetCounts (detect automatically based on what is indexed). (Greg Miller)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-9687: Hunspell support improvements: add API for spell-checking and suggestions, support compound words,
|
||||
|
|
|
@ -400,3 +400,9 @@ field.
|
|||
|
||||
Lucene index readers are now using so little memory with the default codec that
|
||||
it was decided to remove the ability to estimate their RAM usage.
|
||||
|
||||
## LongValueFacetCounts no longer accepts multiValued param in constructors (LUCENE-9948)
|
||||
|
||||
LongValueFacetCounts will now automatically detect whether-or-not an indexed field is single- or
|
||||
multi-valued. The user no longer needs to provide this information to the ctors. Migrating should
|
||||
be as simple as no longer providing this boolean.
|
|
@ -54,52 +54,35 @@ public class LongValueFacetCounts extends Facets {
|
|||
/** Used for all values that are >= 1K. */
|
||||
private final LongIntScatterMap hashCounts = new LongIntScatterMap();
|
||||
|
||||
/** Field being counted. */
|
||||
private final String field;
|
||||
|
||||
/**
|
||||
* Total number of values counted, which is the subset of hits that had a value for this field.
|
||||
* Total value count. For single-value cases, this is the subset of hits that had a value for this
|
||||
* field.
|
||||
*/
|
||||
private int totCount;
|
||||
|
||||
/**
|
||||
* Create {@code LongValueFacetCounts}, using either single-valued {@link NumericDocValues} or
|
||||
* multi-valued {@link SortedNumericDocValues} from the specified field.
|
||||
* multi-valued {@link SortedNumericDocValues} from the specified field (depending on what has
|
||||
* been indexed).
|
||||
*/
|
||||
public LongValueFacetCounts(String field, FacetsCollector hits, boolean multiValued)
|
||||
throws IOException {
|
||||
this(field, null, hits, multiValued);
|
||||
public LongValueFacetCounts(String field, FacetsCollector hits) throws IOException {
|
||||
this(field, null, hits);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource}. If hits is
|
||||
* null then all facets are counted.
|
||||
* Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource} if non-null.
|
||||
* If {@code valueSource} is null, doc values from the provided {@code field} will be used.
|
||||
*/
|
||||
public LongValueFacetCounts(String field, LongValuesSource valueSource, FacetsCollector hits)
|
||||
throws IOException {
|
||||
this(field, valueSource, hits, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create {@code LongValueFacetCounts}, using the provided {@link LongValuesSource}. random access
|
||||
* (implement {@link org.apache.lucene.search.DocIdSet#bits}).
|
||||
*/
|
||||
public LongValueFacetCounts(
|
||||
String field, LongValuesSource valueSource, FacetsCollector hits, boolean multiValued)
|
||||
throws IOException {
|
||||
this.field = field;
|
||||
if (valueSource == null) {
|
||||
if (multiValued) {
|
||||
countMultiValued(field, hits.getMatchingDocs());
|
||||
} else {
|
||||
count(field, hits.getMatchingDocs());
|
||||
}
|
||||
} else {
|
||||
// value source is always single valued
|
||||
if (multiValued) {
|
||||
throw new IllegalArgumentException(
|
||||
"can only compute multi-valued facets directly from doc values (when valueSource is null)");
|
||||
}
|
||||
if (valueSource != null) {
|
||||
count(valueSource, hits.getMatchingDocs());
|
||||
} else {
|
||||
count(field, hits.getMatchingDocs());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -107,31 +90,32 @@ public class LongValueFacetCounts extends Facets {
|
|||
* Counts all facet values for this reader. This produces the same result as computing facets on a
|
||||
* {@link org.apache.lucene.search.MatchAllDocsQuery}, but is more efficient.
|
||||
*/
|
||||
public LongValueFacetCounts(String field, IndexReader reader, boolean multiValued)
|
||||
public LongValueFacetCounts(String field, IndexReader reader) throws IOException {
|
||||
this(field, null, reader);
|
||||
}
|
||||
|
||||
/**
|
||||
* Counts all facet values for the provided {@link LongValuesSource} if non-null. If {@code
|
||||
* valueSource} is null, doc values from the provided {@code field} will be used. This produces
|
||||
* the same result as computing facets on a {@link org.apache.lucene.search.MatchAllDocsQuery},
|
||||
* but is more efficient.
|
||||
*/
|
||||
public LongValueFacetCounts(String field, LongValuesSource valueSource, IndexReader reader)
|
||||
throws IOException {
|
||||
this.field = field;
|
||||
if (multiValued) {
|
||||
countAllMultiValued(reader, field);
|
||||
if (valueSource != null) {
|
||||
countAll(reader, valueSource);
|
||||
} else {
|
||||
countAll(reader, field);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Counts all facet values for the provided {@link LongValuesSource}. This produces the same
|
||||
* result as computing facets on a {@link org.apache.lucene.search.MatchAllDocsQuery}, but is more
|
||||
* efficient.
|
||||
*/
|
||||
public LongValueFacetCounts(String field, LongValuesSource valueSource, IndexReader reader)
|
||||
throws IOException {
|
||||
this.field = field;
|
||||
countAll(valueSource, field, reader);
|
||||
}
|
||||
|
||||
/** Counts from the provided valueSource. */
|
||||
private void count(LongValuesSource valueSource, List<MatchingDocs> matchingDocs)
|
||||
throws IOException {
|
||||
|
||||
for (MatchingDocs hits : matchingDocs) {
|
||||
|
||||
LongValues fv = valueSource.getValues(hits.context, null);
|
||||
|
||||
// NOTE: this is not as efficient as working directly with the doc values APIs in the sparse
|
||||
|
@ -152,80 +136,41 @@ public class LongValueFacetCounts extends Facets {
|
|||
}
|
||||
}
|
||||
|
||||
/** Counts from the field's indexed doc values. */
|
||||
private void count(String field, List<MatchingDocs> matchingDocs) throws IOException {
|
||||
for (MatchingDocs hits : matchingDocs) {
|
||||
NumericDocValues fv = hits.context.reader().getNumericDocValues(field);
|
||||
if (fv == null) {
|
||||
continue;
|
||||
}
|
||||
countOneSegment(fv, hits);
|
||||
}
|
||||
}
|
||||
|
||||
private void countOneSegment(NumericDocValues values, MatchingDocs hits) throws IOException {
|
||||
DocIdSetIterator it =
|
||||
ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), values));
|
||||
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
increment(values.longValue());
|
||||
totCount++;
|
||||
}
|
||||
}
|
||||
|
||||
/** Counts directly from SortedNumericDocValues. */
|
||||
private void countMultiValued(String field, List<MatchingDocs> matchingDocs) throws IOException {
|
||||
|
||||
for (MatchingDocs hits : matchingDocs) {
|
||||
SortedNumericDocValues values = hits.context.reader().getSortedNumericDocValues(field);
|
||||
if (values == null) {
|
||||
// this field has no doc values for this segment
|
||||
continue;
|
||||
}
|
||||
|
||||
NumericDocValues singleValues = DocValues.unwrapSingleton(values);
|
||||
SortedNumericDocValues multiValues = DocValues.getSortedNumeric(hits.context.reader(), field);
|
||||
NumericDocValues singleValues = DocValues.unwrapSingleton(multiValues);
|
||||
|
||||
if (singleValues != null) {
|
||||
countOneSegment(singleValues, hits);
|
||||
|
||||
DocIdSetIterator it =
|
||||
ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), singleValues));
|
||||
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
increment(singleValues.longValue());
|
||||
totCount++;
|
||||
}
|
||||
} else {
|
||||
|
||||
DocIdSetIterator it =
|
||||
ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), values));
|
||||
ConjunctionDISI.intersectIterators(Arrays.asList(hits.bits.iterator(), multiValues));
|
||||
|
||||
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
|
||||
int limit = values.docValueCount();
|
||||
int limit = multiValues.docValueCount();
|
||||
totCount += limit;
|
||||
for (int i = 0; i < limit; i++) {
|
||||
increment(values.nextValue());
|
||||
increment(multiValues.nextValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Optimized version that directly counts all doc values. */
|
||||
private void countAll(IndexReader reader, String field) throws IOException {
|
||||
|
||||
for (LeafReaderContext context : reader.leaves()) {
|
||||
|
||||
NumericDocValues values = context.reader().getNumericDocValues(field);
|
||||
if (values == null) {
|
||||
// this field has no doc values for this segment
|
||||
continue;
|
||||
}
|
||||
|
||||
countAllOneSegment(values);
|
||||
}
|
||||
}
|
||||
|
||||
private void countAllOneSegment(NumericDocValues values) throws IOException {
|
||||
while (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
totCount++;
|
||||
increment(values.longValue());
|
||||
}
|
||||
}
|
||||
|
||||
private void countAll(LongValuesSource valueSource, String field, IndexReader reader)
|
||||
throws IOException {
|
||||
/** Count everything in the provided valueSource. */
|
||||
private void countAll(IndexReader reader, LongValuesSource valueSource) throws IOException {
|
||||
|
||||
for (LeafReaderContext context : reader.leaves()) {
|
||||
LongValues fv = valueSource.getValues(context, null);
|
||||
|
@ -241,24 +186,27 @@ public class LongValueFacetCounts extends Facets {
|
|||
}
|
||||
}
|
||||
|
||||
private void countAllMultiValued(IndexReader reader, String field) throws IOException {
|
||||
/** Count everything in the specified field. */
|
||||
private void countAll(IndexReader reader, String field) throws IOException {
|
||||
|
||||
for (LeafReaderContext context : reader.leaves()) {
|
||||
|
||||
SortedNumericDocValues values = context.reader().getSortedNumericDocValues(field);
|
||||
if (values == null) {
|
||||
// this field has no doc values for this segment
|
||||
continue;
|
||||
}
|
||||
NumericDocValues singleValues = DocValues.unwrapSingleton(values);
|
||||
SortedNumericDocValues multiValues = DocValues.getSortedNumeric(context.reader(), field);
|
||||
NumericDocValues singleValues = DocValues.unwrapSingleton(multiValues);
|
||||
|
||||
if (singleValues != null) {
|
||||
countAllOneSegment(singleValues);
|
||||
|
||||
while (singleValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
totCount++;
|
||||
increment(singleValues.longValue());
|
||||
}
|
||||
} else {
|
||||
while (values.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
int limit = values.docValueCount();
|
||||
|
||||
while (multiValues.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
int limit = multiValues.docValueCount();
|
||||
totCount += limit;
|
||||
for (int i = 0; i < limit; i++) {
|
||||
increment(values.nextValue());
|
||||
increment(multiValues.nextValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -294,7 +242,7 @@ public class LongValueFacetCounts extends Facets {
|
|||
/** Returns the specified top number of facets, sorted by count. */
|
||||
public FacetResult getTopChildrenSortByCount(int topN) {
|
||||
PriorityQueue<Entry> pq =
|
||||
new PriorityQueue<Entry>(Math.min(topN, counts.length + hashCounts.size())) {
|
||||
new PriorityQueue<>(Math.min(topN, counts.length + hashCounts.size())) {
|
||||
@Override
|
||||
protected boolean lessThan(Entry a, Entry b) {
|
||||
// sort by count descending, breaking ties by value ascending:
|
||||
|
@ -409,13 +357,13 @@ public class LongValueFacetCounts extends Facets {
|
|||
}
|
||||
|
||||
@Override
|
||||
public Number getSpecificValue(String dim, String... path) throws IOException {
|
||||
public Number getSpecificValue(String dim, String... path) {
|
||||
// TODO: should we impl this?
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<FacetResult> getAllDims(int topN) throws IOException {
|
||||
public List<FacetResult> getAllDims(int topN) {
|
||||
return Collections.singletonList(getTopChildren(topN, field));
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@ package org.apache.lucene.facet;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -60,7 +60,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
IndexSearcher s = newSearcher(r);
|
||||
s.search(new MatchAllDocsQuery(), fc);
|
||||
|
||||
LongValueFacetCounts facets = new LongValueFacetCounts("field", fc, false);
|
||||
LongValueFacetCounts facets = new LongValueFacetCounts("field", fc);
|
||||
|
||||
FacetResult result = facets.getAllChildrenSortByValue();
|
||||
assertEquals(
|
||||
|
@ -87,7 +87,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
IndexSearcher s = newSearcher(r);
|
||||
s.search(new MatchAllDocsQuery(), fc);
|
||||
|
||||
LongValueFacetCounts facets = new LongValueFacetCounts("field", fc, false);
|
||||
LongValueFacetCounts facets = new LongValueFacetCounts("field", fc);
|
||||
|
||||
FacetResult result = facets.getAllChildrenSortByValue();
|
||||
assertEquals(
|
||||
|
@ -119,7 +119,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
IndexSearcher s = newSearcher(r);
|
||||
s.search(new MatchAllDocsQuery(), fc);
|
||||
|
||||
Facets facets = new LongValueFacetCounts("field", fc, false);
|
||||
Facets facets = new LongValueFacetCounts("field", fc);
|
||||
|
||||
List<FacetResult> result = facets.getAllDims(10);
|
||||
assertEquals(1, result.size());
|
||||
|
@ -199,7 +199,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet());
|
||||
|
||||
// sort by value
|
||||
Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey())));
|
||||
expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
|
||||
|
||||
LongValueFacetCounts facetCounts;
|
||||
if (random().nextBoolean()) {
|
||||
|
@ -214,7 +214,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(" use doc values");
|
||||
}
|
||||
facetCounts = new LongValueFacetCounts("field", fc, false);
|
||||
facetCounts = new LongValueFacetCounts("field", fc);
|
||||
}
|
||||
} else {
|
||||
// optimized count all:
|
||||
|
@ -228,7 +228,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(" count all doc values");
|
||||
}
|
||||
facetCounts = new LongValueFacetCounts("field", r, false);
|
||||
facetCounts = new LongValueFacetCounts("field", r);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -242,8 +242,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
Integer.MAX_VALUE);
|
||||
|
||||
// sort by count
|
||||
Collections.sort(
|
||||
expectedCounts,
|
||||
expectedCounts.sort(
|
||||
(a, b) -> {
|
||||
int cmp = -Integer.compare(a.getValue(), b.getValue());
|
||||
if (cmp == 0) {
|
||||
|
@ -288,7 +287,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(" use doc values");
|
||||
}
|
||||
facetCounts = new LongValueFacetCounts("field", fc, false);
|
||||
facetCounts = new LongValueFacetCounts("field", fc);
|
||||
} else {
|
||||
if (VERBOSE) {
|
||||
System.out.println(" use value source");
|
||||
|
@ -314,7 +313,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
expectedCounts = new ArrayList<>(expected.entrySet());
|
||||
|
||||
// sort by value
|
||||
Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey())));
|
||||
expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
|
||||
actual = facetCounts.getAllChildrenSortByValue();
|
||||
assertSame(
|
||||
"id " + minId + "-" + maxId + ", sort facets by value",
|
||||
|
@ -325,8 +324,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
Integer.MAX_VALUE);
|
||||
|
||||
// sort by count
|
||||
Collections.sort(
|
||||
expectedCounts,
|
||||
expectedCounts.sort(
|
||||
(a, b) -> {
|
||||
int cmp = -Integer.compare(a.getValue(), b.getValue());
|
||||
if (cmp == 0) {
|
||||
|
@ -447,7 +445,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
List<Map.Entry<Long, Integer>> expectedCounts = new ArrayList<>(expected.entrySet());
|
||||
|
||||
// sort by value
|
||||
Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey())));
|
||||
expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
|
||||
|
||||
LongValueFacetCounts facetCounts;
|
||||
if (random().nextBoolean()) {
|
||||
|
@ -455,13 +453,13 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(" use doc values");
|
||||
}
|
||||
facetCounts = new LongValueFacetCounts("field", fc, true);
|
||||
facetCounts = new LongValueFacetCounts("field", fc);
|
||||
} else {
|
||||
// optimized count all:
|
||||
if (VERBOSE) {
|
||||
System.out.println(" count all doc values");
|
||||
}
|
||||
facetCounts = new LongValueFacetCounts("field", r, true);
|
||||
facetCounts = new LongValueFacetCounts("field", r);
|
||||
}
|
||||
|
||||
FacetResult actual = facetCounts.getAllChildrenSortByValue();
|
||||
|
@ -474,8 +472,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
Integer.MAX_VALUE);
|
||||
|
||||
// sort by count
|
||||
Collections.sort(
|
||||
expectedCounts,
|
||||
expectedCounts.sort(
|
||||
(a, b) -> {
|
||||
int cmp = -Integer.compare(a.getValue(), b.getValue());
|
||||
if (cmp == 0) {
|
||||
|
@ -517,7 +514,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
fc = new FacetsCollector();
|
||||
s.search(IntPoint.newRangeQuery("id", minId, maxId), fc);
|
||||
// cannot use value source here because we are multi valued
|
||||
facetCounts = new LongValueFacetCounts("field", fc, true);
|
||||
facetCounts = new LongValueFacetCounts("field", fc);
|
||||
|
||||
expected = new HashMap<>();
|
||||
expectedChildCount = 0;
|
||||
|
@ -538,7 +535,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
expectedCounts = new ArrayList<>(expected.entrySet());
|
||||
|
||||
// sort by value
|
||||
Collections.sort(expectedCounts, (a, b) -> (Long.compare(a.getKey(), b.getKey())));
|
||||
expectedCounts.sort(Comparator.comparingLong(Map.Entry::getKey));
|
||||
actual = facetCounts.getAllChildrenSortByValue();
|
||||
assertSame(
|
||||
"id " + minId + "-" + maxId + ", sort facets by value",
|
||||
|
@ -549,8 +546,7 @@ public class TestLongValueFacetCounts extends LuceneTestCase {
|
|||
Integer.MAX_VALUE);
|
||||
|
||||
// sort by count
|
||||
Collections.sort(
|
||||
expectedCounts,
|
||||
expectedCounts.sort(
|
||||
(a, b) -> {
|
||||
int cmp = -Integer.compare(a.getValue(), b.getValue());
|
||||
if (cmp == 0) {
|
||||
|
|
Loading…
Reference in New Issue