Add dynamic range facets (#13689)

This commit is contained in:
Stefan Vodita 2024-09-10 17:24:28 +01:00 committed by GitHub
parent 634eff1851
commit 0d579338ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 597 additions and 1 deletions

View File

@ -312,6 +312,13 @@ New Features
* GITHUB#13678: Add support for JDK 23 to the Panama Vectorization Provider. (Chris Hegarty)
* GITHUB#13689: Add a new faceting feature, dynamic range facets, which automatically picks a balanced set of numeric
ranges based on the distribution of values that occur across all hits. For use cases that have a highly variable
numeric doc values field, such as "price" in an e-commerce application, this facet method is powerful as it allows the
presented ranges to adapt depending on what hits the query actually matches. This is in contrast to existing range
faceting that requires the application to provide the specific fixed ranges up front. (Yuting Gan, Greg Miller,
Stefan Vodita)
Improvements
---------------------

View File

@ -0,0 +1,158 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.demo.facet;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollectorManager;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.range.DynamicRangeUtil;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LongValuesSource;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.NamedThreadFactory;
/**
 * Demo dynamic range faceting.
 *
 * <p>The results look like so:
 *
 * <pre>
 * min: 63 max: 75 centroid: 69.000000 count: 2 weight: 137
 * min: 79 max: 96 centroid: 86.000000 count: 3 weight: 83
 * </pre>
 *
 * <p>We've computed dynamic ranges over popularity weighted by number of books. We can read the
 * results like so: There are 137 books written by authors in the 63 to 75 popularity range.
 *
 * <p>How it works: We collect all the values (popularity) and their weights (book counts). We sort
 * the values and find the approximate weight per range. In this case the total weight is 220 (total
 * books by all authors) and we want 2 ranges, so we're aiming for 110 books in each range. We add
 * Chesterton to the first range, since he is the least popular author. He's written a lot of books,
 * so the range's weight is now 90. We add Tolstoy to the first range, since he is next in line of
 * popularity. He's written another 47 books, which brings the total weight to 137. We're over the
 * 110 target weight, so we stop and add everyone left to the second range.
 */
public class DynamicRangeFacetsExample {
  private final Directory indexDir = new ByteBuffersDirectory();
  private final FacetsConfig config = new FacetsConfig();

  /** Empty constructor */
  public DynamicRangeFacetsExample() {}

  /**
   * Build the example index: one document per author, carrying the author's popularity and number
   * of books as numeric doc values.
   *
   * @throws IOException if writing to the index fails
   */
  private void index() throws IOException {
    // try-with-resources: the writer is closed even when adding a document fails.
    try (IndexWriter indexWriter =
        new IndexWriter(
            indexDir,
            new IndexWriterConfig(new WhitespaceAnalyzer())
                .setOpenMode(IndexWriterConfig.OpenMode.CREATE))) {
      addAuthor(indexWriter, "J. R. R. Tolkien", 96, 24);
      addAuthor(indexWriter, "C. S. Lewis", 83, 48);
      addAuthor(indexWriter, "G. K. Chesterton", 63, 90);

      // Intermediate commit; presumably so that the documents end up in more than one segment.
      indexWriter.commit();

      addAuthor(indexWriter, "Fyodor Dostoevsky", 79, 11);
      addAuthor(indexWriter, "Leo Tolstoy", 75, 47);
    }
  }

  /**
   * Adds one author document to the index.
   *
   * @param indexWriter writer to add the document to
   * @param author value of the "Author" field
   * @param popularity value of the "Popularity" doc values field
   * @param books value of the "Books" doc values field
   * @throws IOException if the document cannot be added
   */
  private void addAuthor(IndexWriter indexWriter, String author, long popularity, long books)
      throws IOException {
    Document doc = new Document();
    doc.add(new StringField("Author", author, Field.Store.NO));
    doc.add(new NumericDocValuesField("Popularity", popularity));
    doc.add(new NumericDocValuesField("Books", books));
    indexWriter.addDocument(config.build(doc));
  }

  /**
   * User runs a query and counts facets.
   *
   * @return the dynamic ranges computed over "Popularity", weighted by "Books"
   * @throws IOException if the index cannot be read
   */
  private List<DynamicRangeUtil.DynamicRangeInfo> search() throws IOException {
    // The reader is closed once the ranges are computed; the returned records hold only
    // primitive values, so they stay usable afterwards.
    try (DirectoryReader indexReader = DirectoryReader.open(indexDir)) {
      IndexSearcher searcher = new IndexSearcher(indexReader);

      LongValuesSource valuesSource = LongValuesSource.fromLongField("Popularity");
      LongValuesSource weightsSource = LongValuesSource.fromLongField("Books");

      // Aggregates the facet counts
      FacetsCollectorManager fcm = new FacetsCollectorManager();

      // MatchAllDocsQuery is for "browsing" (counts facets
      // for all non-deleted docs in the index); normally
      // you'd use a "normal" query:
      FacetsCollector fc =
          FacetsCollectorManager.search(searcher, new MatchAllDocsQuery(), 10, fcm)
              .facetsCollector();

      try (ExecutorService executor =
          Executors.newFixedThreadPool(2, new NamedThreadFactory("dynamic-ranges"))) {
        // We ask for 2 ranges over popularity weighted by book count
        return DynamicRangeUtil.computeDynamicRanges(
            "Books", weightsSource, valuesSource, fc, 2, executor);
      }
    }
  }

  /**
   * Runs the search example.
   *
   * @return the dynamic ranges found by the example search
   * @throws IOException if indexing or searching fails
   */
  public List<DynamicRangeUtil.DynamicRangeInfo> runSearch() throws IOException {
    index();
    return search();
  }

  /**
   * Runs the search example and prints the results.
   *
   * @param args ignored
   * @throws Exception if indexing or searching fails
   */
  public static void main(String[] args) throws Exception {
    System.out.println("Dynamic range facets example:");
    System.out.println("-----------------------");
    DynamicRangeFacetsExample example = new DynamicRangeFacetsExample();
    List<DynamicRangeUtil.DynamicRangeInfo> results = example.runSearch();
    for (DynamicRangeUtil.DynamicRangeInfo range : results) {
      System.out.printf(
          Locale.ROOT,
          "min: %d max: %d centroid: %f count: %d weight: %d%n",
          range.min(),
          range.max(),
          range.centroid(),
          range.count(),
          range.weight());
    }
  }
}

View File

@ -385,6 +385,12 @@
* <p>Sampling support is implemented in {@link
* org.apache.lucene.facet.RandomSamplingFacetsCollector}.
*
* <h3 id="drf">Dynamic Range Facets</h3>
*
* <p>We can build ranges over numeric fields and count the number of values falling in each range. The
* values can be weighted and the number of desired ranges can be specified. To see an example,
* check {@link org.apache.lucene.demo.facet.DynamicRangeFacetsExample}.
*
* <h2 id="concurrent_indexing_search">Concurrent Indexing and Search</h2>
*
* <p>Sometimes, indexing is done once, and when the index is fully prepared, searching starts.

View File

@ -215,6 +215,17 @@ by score (i.e. relevance).</p>
a more complete set of embeddings is needed to get reasonable results.
</p>
</div>
<h2 id="Faceting" class="boxed">Working with facets</h2>
<div class="section">
<p>Lucene also provides aggregation capabilities over the index, e.g. counting results across a category
(<a href="src-html/org/apache/lucene/demo/facet/SimpleFacetsExample.html">SimpleFacetsExample</a>),
computing expressions (<a href=
"src-html/org/apache/lucene/demo/facet/ExpressionAggregationFacetsExample.html">
ExpressionAggregationFacetsExample</a>), and computing dynamic ranges (<a href=
"src-html/org/apache/lucene/demo/facet/DynamicRangeFacetsExample.html">DynamicRangeFacetsExample</a>).
For more details, see the dedicated
<a href="org/apache/lucene/demo/facet/package-summary.html">faceting guide</a>.
</p>
</div>
</body>
</html>

View File

@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.demo.facet;
import java.util.List;
import org.apache.lucene.facet.range.DynamicRangeUtil;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.junit.Test;
/** Checks that the dynamic range facets demo produces the ranges described in its Javadoc. */
public class TestDynamicRangeFacetsExample extends LuceneTestCase {

  /** Runs the demo end to end and compares the two resulting ranges against fixed values. */
  @Test
  public void testExample() throws Exception {
    // Record components are (count, weight, min, max, centroid).
    DynamicRangeUtil.DynamicRangeInfo lowerRange =
        new DynamicRangeUtil.DynamicRangeInfo(2, 137, 63, 75, 69d);
    DynamicRangeUtil.DynamicRangeInfo upperRange =
        new DynamicRangeUtil.DynamicRangeInfo(3, 83, 79, 96, 86);
    List<DynamicRangeUtil.DynamicRangeInfo> expected = List.of(lowerRange, upperRange);

    List<DynamicRangeUtil.DynamicRangeInfo> actual = new DynamicRangeFacetsExample().runSearch();

    assertEquals(expected, actual);
  }
}

View File

@ -0,0 +1,277 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.range;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LongValues;
import org.apache.lucene.search.LongValuesSource;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InPlaceMergeSorter;
/**
* Methods to create dynamic ranges for numeric fields.
*
* @lucene.experimental
*/
public final class DynamicRangeUtil {
/** Private constructor: this utility class is not meant to be instantiated. */
private DynamicRangeUtil() {}
/**
 * Construct dynamic ranges using the specified weight field to generate equi-weight range for the
 * specified numeric bin field.
 *
 * <p>Values and weights are read per segment by tasks submitted to {@code exec}; this method
 * blocks until all of them have finished and then merges their output.
 *
 * @param weightFieldName Name of the specified weight field (only used in error messages)
 * @param weightValueSource Value source of the weight field
 * @param fieldValueSource Value source of the value field
 * @param facetsCollector FacetsCollector
 * @param topN Number of requested ranges
 * @param exec An executor service that is used to do the computation
 * @return A list of DynamicRangeInfo that contains count, weight, min, max, and centroid for each
 *     range
 * @throws IOException if setting up or running a segment task fails with an I/O error
 * @throws IllegalArgumentException if the summed weights do not fit in a {@code long}
 */
public static List<DynamicRangeInfo> computeDynamicRanges(
    String weightFieldName,
    LongValuesSource weightValueSource,
    LongValuesSource fieldValueSource,
    FacetsCollector facetsCollector,
    int topN,
    ExecutorService exec)
    throws IOException {

  List<FacetsCollector.MatchingDocs> matchingDocsList = facetsCollector.getMatchingDocs();
  int totalDoc = matchingDocsList.stream().mapToInt(matchingDoc -> matchingDoc.totalHits).sum();
  long[] values = new long[totalDoc];
  long[] weights = new long[totalDoc];
  long totalWeight = 0;
  int overallLength = 0;

  List<Future<?>> futures = new ArrayList<>();
  List<SegmentTask> tasks = new ArrayList<>();
  for (FacetsCollector.MatchingDocs matchingDocs : matchingDocsList) {
    if (matchingDocs.totalHits > 0) {
      SegmentOutput segmentOutput = new SegmentOutput(matchingDocs.totalHits);

      // [1] retrieve values and associated weights concurrently
      SegmentTask task =
          new SegmentTask(matchingDocs, fieldValueSource, weightValueSource, segmentOutput);
      tasks.add(task);
      futures.add(exec.submit(task));
    }
  }

  // [2] wait for all segment runs to finish
  for (Future<?> future : futures) {
    try {
      future.get();
    } catch (InterruptedException ie) {
      // Restore the interrupt flag so callers further up the stack can still see the interruption.
      Thread.currentThread().interrupt();
      throw new RuntimeException(ie);
    } catch (ExecutionException ee) {
      // Surface the task's own failure rather than the ExecutionException wrapper.
      IOUtils.rethrowAlways(ee.getCause());
    }
  }

  // [3] merge the segment value and weight arrays into one array respectively and update the
  // total weights and valid value length
  for (SegmentTask task : tasks) {
    // Every task is created with a non-null output above. An overflow inside a segment is
    // reported by the task as an exception, which has already been rethrown in step [2].
    SegmentOutput curSegmentOutput = task.segmentOutput;
    assert curSegmentOutput.values.length == curSegmentOutput.weights.length;

    try {
      totalWeight = Math.addExact(curSegmentOutput.segmentTotalWeight, totalWeight);
    } catch (ArithmeticException ae) {
      throw new IllegalArgumentException(
          "weight field \"" + weightFieldName + "\": long totalWeight value out of bounds", ae);
    }

    // segmentIdx is the number of entries the task actually filled in; skipped docs leave the
    // tail of the segment arrays unused.
    int currSegmentLen = curSegmentOutput.segmentIdx;
    System.arraycopy(curSegmentOutput.values, 0, values, overallLength, currSegmentLen);
    System.arraycopy(curSegmentOutput.weights, 0, weights, overallLength, currSegmentLen);
    overallLength += currSegmentLen;
  }
  return computeDynamicNumericRanges(values, weights, overallLength, totalWeight, topN);
}
/**
 * Work item for a single segment: iterates over the segment's matching docs and stores the value
 * and weight of every usable doc in the {@code SegmentOutput} handed to the constructor.
 */
private static class SegmentTask implements Callable<Void> {

  private final FacetsCollector.MatchingDocs matchingDocs;
  private final DocIdSetIterator matchingParentDocsItr;
  private final LongValuesSource fieldValueSource;
  private final LongValuesSource weightValueSource;
  private SegmentOutput segmentOutput;

  /**
   * @param matchingDocs the hits of one segment
   * @param fieldValueSource source of the values to bin
   * @param weightValueSource source of the weights
   * @param segmentOutput holder that {@link #call()} fills in
   * @throws IOException if the iterator over the matching docs cannot be created
   */
  SegmentTask(
      FacetsCollector.MatchingDocs matchingDocs,
      LongValuesSource fieldValueSource,
      LongValuesSource weightValueSource,
      SegmentOutput segmentOutput)
      throws IOException {
    this.matchingDocs = matchingDocs;
    this.fieldValueSource = fieldValueSource;
    this.weightValueSource = weightValueSource;
    this.segmentOutput = segmentOutput;
    // The iterator is created once here and consumed by call().
    this.matchingParentDocsItr = matchingDocs.bits.iterator();
  }

  /**
   * Collects values and weights for the segment.
   *
   * @return always {@code null}
   * @throws IllegalArgumentException if the sum of the segment's weights overflows a long
   */
  @Override
  public Void call() throws Exception {
    LongValues fieldValue = fieldValueSource.getValues(matchingDocs.context, null);
    LongValues weightValue = weightValueSource.getValues(matchingDocs.context, null);

    int doc;
    while ((doc = matchingParentDocsItr.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      // A doc takes part only if it has both a value and a weight.
      boolean hasValueAndWeight = fieldValue.advanceExact(doc) && weightValue.advanceExact(doc);
      if (hasValueAndWeight == false) {
        continue;
      }

      long value = fieldValue.longValue();
      long weight = weightValue.longValue();

      // Zero weights are left out, otherwise they can skew the ranges.
      // With nothing but zero weights, any set of ranges would be valid.
      if (weight != 0) {
        int slot = segmentOutput.segmentIdx;
        segmentOutput.values[slot] = value;
        segmentOutput.weights[slot] = weight;
        try {
          segmentOutput.segmentTotalWeight =
              Math.addExact(segmentOutput.segmentTotalWeight, weight);
        } catch (ArithmeticException ae) {
          throw new IllegalArgumentException("segment long totalWeight value out of bounds", ae);
        }
        // Advance only after the weight has been added successfully.
        segmentOutput.segmentIdx = slot + 1;
      }
    }
    return null;
  }
}
/**
 * Per-segment result holder: the collected values and weights, the sum of the collected weights,
 * and the number of array slots that have been filled so far.
 */
private static final class SegmentOutput {
  /** Collected field values; only the first {@code segmentIdx} entries are valid. */
  private final long[] values;

  /** Collected weights, parallel to {@code values}. */
  private final long[] weights;

  /** Running sum of the collected weights. */
  private long segmentTotalWeight;

  /** Next free slot in both arrays, i.e. the count of collected entries. */
  private int segmentIdx;

  /**
   * @param hitsLength capacity of the value and weight arrays
   */
  public SegmentOutput(int hitsLength) {
    values = new long[hitsLength];
    weights = new long[hitsLength];
    segmentTotalWeight = 0;
    segmentIdx = 0;
  }
}
/**
 * Compute dynamic numeric ranges using weights.
 *
 * <p>The first {@code len} entries of {@code values} and {@code weights} are sorted in place, by
 * value and then by weight. Entries are then accumulated in that order, and a range is closed as
 * soon as its accumulated weight reaches {@code totalWeight / min(topN, len)}. Whatever is left
 * after the last closed range forms a final range.
 *
 * @param values an array that contains the values of matching documents
 * @param weights an array that contains the weights of matching documents
 * @param len actual length of values and weights
 * @param totalWeight the sum of weight values
 * @param topN the requested top-n parameter
 * @return A list of DynamicRangeInfo that contains count, weight, min, max, and centroid values
 *     for each range. The size of dynamic ranges may not be exactly equal to top-N. top-N is used
 *     to compute the equi-weight per bin.
 */
public static List<DynamicRangeInfo> computeDynamicNumericRanges(
    long[] values, long[] weights, int len, long totalWeight, int topN) {
  assert values.length == weights.length && len <= values.length && len >= 0;
  assert topN >= 0;

  List<DynamicRangeInfo> ranges = new ArrayList<>();
  if (len == 0 || topN == 0) {
    return ranges;
  }

  InPlaceMergeSorter byValueThenWeight =
      new InPlaceMergeSorter() {
        @Override
        protected int compare(int i, int j) {
          int byValue = Long.compare(values[i], values[j]);
          // Ties on the value are broken by weight. Any weight order would be correct; this one
          // just keeps the result deterministic.
          return byValue != 0 ? byValue : Long.compare(weights[i], weights[j]);
        }

        @Override
        protected void swap(int i, int j) {
          // The two arrays are parallel, so both are swapped together.
          long heldValue = values[i];
          values[i] = values[j];
          values[j] = heldValue;

          long heldWeight = weights[i];
          weights[i] = weights[j];
          weights[j] = heldWeight;
        }
      };
  byValueThenWeight.sort(0, len);

  final double weightPerRange = (double) totalWeight / Math.min(topN, len);

  int rangeStart = 0;
  int rangeCount = 0;
  long rangeWeight = 0;
  // NOTE(review): plain long sum of the values, not overflow-checked.
  long rangeValueSum = 0;

  for (int i = 0; i < len; i++) {
    rangeWeight += weights[i];
    rangeValueSum += values[i];
    rangeCount++;

    if (rangeWeight >= weightPerRange) {
      // Target reached: close the range at i and start a fresh one at i + 1.
      double centroid = (double) rangeValueSum / rangeCount;
      ranges.add(
          new DynamicRangeInfo(rangeCount, rangeWeight, values[rangeStart], values[i], centroid));
      rangeStart = i + 1;
      rangeCount = 0;
      rangeWeight = 0;
      rangeValueSum = 0;
    }
  }

  // capture the remaining values to create the last range
  if (rangeStart < len) {
    double centroid = (double) rangeValueSum / rangeCount;
    ranges.add(
        new DynamicRangeInfo(
            rangeCount, rangeWeight, values[rangeStart], values[len - 1], centroid));
  }
  return ranges;
}
/**
 * Holds parameters of a dynamic numeric range.
 *
 * <p>When produced by {@code computeDynamicNumericRanges}, {@code min} and {@code max} are the
 * smallest and largest values that fell into the range, and {@code centroid} is the plain
 * (unweighted) arithmetic mean of those values.
 *
 * @param count the number of items in the range
 * @param weight the summed weight of the items in the range
 * @param min the lower bound of the range (inclusive)
 * @param max the upper bound of the range (inclusive)
 * @param centroid the average value in the range
 */
public record DynamicRangeInfo(int count, long weight, long min, long max, double centroid) {}
}

View File

@ -0,0 +1,103 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.facet.range;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.tests.util.LuceneTestCase;
/** Tests for {@link DynamicRangeUtil#computeDynamicNumericRanges}. */
public class TestDynamicRangeUtil extends LuceneTestCase {

  /** Distinct increasing values with increasing weights, split into four ranges. */
  public void testComputeDynamicNumericRangesBasic() {
    List<DynamicRangeUtil.DynamicRangeInfo> expectedRangeInfoList = new ArrayList<>();
    long[] values = new long[1000];
    long[] weights = new long[1000];

    long totalWeight = 0;
    for (int i = 0; i < 1000; i++) {
      values[i] = i + 1;
      weights[i] = i;
      totalWeight += i;
    }

    expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(501, 125250L, 1L, 501L, 251D));
    expectedRangeInfoList.add(
        new DynamicRangeUtil.DynamicRangeInfo(207, 125028L, 502L, 708L, 605D));
    expectedRangeInfoList.add(
        new DynamicRangeUtil.DynamicRangeInfo(159, 125133L, 709L, 867L, 788D));
    expectedRangeInfoList.add(
        new DynamicRangeUtil.DynamicRangeInfo(133, 124089L, 868L, 1000L, 934D));
    assertDynamicNumericRangeResults(values, weights, 4, totalWeight, expectedRangeInfoList);
  }

  /** All values identical: the ranges differ only in count and weight. */
  public void testComputeDynamicNumericRangesWithSameValues() {
    List<DynamicRangeUtil.DynamicRangeInfo> expectedRangeInfoList = new ArrayList<>();
    long totalWeight = 0;
    long[] values = new long[100];
    long[] weights = new long[100];
    for (int i = 0; i < 100; i++) {
      values[i] = 50;
      weights[i] = i;
      totalWeight += i;
    }

    expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(51, 1275L, 50L, 50L, 50D));
    expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(21, 1281L, 50L, 50L, 50D));
    expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(16, 1272L, 50L, 50L, 50D));
    expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(12, 1122L, 50L, 50L, 50D));
    assertDynamicNumericRangeResults(values, weights, 4, totalWeight, expectedRangeInfoList);
  }

  /** A single value yields a single range even though four were requested. */
  public void testComputeDynamicNumericRangesWithOneValue() {
    long[] values = new long[] {50};
    long[] weights = new long[] {1};
    List<DynamicRangeUtil.DynamicRangeInfo> expectedRangeInfoList = new ArrayList<>();

    expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(1, 1L, 50L, 50L, 50D));
    assertDynamicNumericRangeResults(values, weights, 4, 1, expectedRangeInfoList);
  }

  /** One dominant weight yields fewer ranges than requested. */
  public void testComputeDynamicNumericRangesWithOneLargeWeight() {
    List<DynamicRangeUtil.DynamicRangeInfo> expectedRangeInfoList = new ArrayList<>();
    long[] values = new long[] {45, 32, 52, 14, 455, 342, 53};
    long[] weights = new long[] {143, 23, 1, 52343, 53, 12, 2534};

    // value 14 has its own bin since its weight is large, and the rest of the values fall into the
    // other bin
    expectedRangeInfoList.add(new DynamicRangeUtil.DynamicRangeInfo(1, 52343, 14L, 14L, 14D));
    expectedRangeInfoList.add(
        new DynamicRangeUtil.DynamicRangeInfo(6, 2766, 32L, 455L, 163.16666666666666D));
    assertDynamicNumericRangeResults(values, weights, 4, 55109, expectedRangeInfoList);
  }

  /**
   * Computes the ranges over the whole of {@code values} / {@code weights} and checks them against
   * the expected list.
   *
   * <p>The lists are compared with {@code assertEquals}, so the ranges must match in content and
   * in order, and a failure message shows both lists.
   */
  private static void assertDynamicNumericRangeResults(
      long[] values,
      long[] weights,
      int topN,
      long totalWeight,
      List<DynamicRangeUtil.DynamicRangeInfo> expectedDynamicRangeResult) {
    List<DynamicRangeUtil.DynamicRangeInfo> actualDynamicRangeResult =
        DynamicRangeUtil.computeDynamicNumericRanges(
            values, weights, values.length, totalWeight, topN);
    assertEquals(expectedDynamicRangeResult, actualDynamicRangeResult);
  }
}