From 0d579338eee8ef29ef8a222e87d987ebb8cd7577 Mon Sep 17 00:00:00 2001 From: Stefan Vodita <41467371+stefanvodita@users.noreply.github.com> Date: Tue, 10 Sep 2024 17:24:28 +0100 Subject: [PATCH] Add dynamic range facets (#13689) --- lucene/CHANGES.txt | 7 + .../demo/facet/DynamicRangeFacetsExample.java | 158 ++++++++++ .../lucene/demo/facet/package-info.java | 6 + lucene/demo/src/java/overview.html | 13 +- .../facet/TestDynamicRangeFacetsExample.java | 34 +++ .../lucene/facet/range/DynamicRangeUtil.java | 277 ++++++++++++++++++ .../facet/range/TestDynamicRangeUtil.java | 103 +++++++ 7 files changed, 597 insertions(+), 1 deletion(-) create mode 100644 lucene/demo/src/java/org/apache/lucene/demo/facet/DynamicRangeFacetsExample.java create mode 100644 lucene/demo/src/test/org/apache/lucene/demo/facet/TestDynamicRangeFacetsExample.java create mode 100644 lucene/facet/src/java/org/apache/lucene/facet/range/DynamicRangeUtil.java create mode 100644 lucene/facet/src/test/org/apache/lucene/facet/range/TestDynamicRangeUtil.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 12f7b960ccb..016ac462976 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -312,6 +312,13 @@ New Features * GITHUB#13678: Add support JDK 23 to the Panama Vectorization Provider. (Chris Hegarty) +* GITHUB#13689: Add a new faceting feature, dynamic range facets, which automatically picks a balanced set of numeric + ranges based on the distribution of values that occur across all hits. For use cases that have a highly variable + numeric doc values field, such as "price" in an e-commerce application, this facet method is powerful as it allows the + presented ranges to adapt depending on what hits the query actually matches. This is in contrast to existing range + faceting that requires the application to provide the specific fixed ranges up front. 
(Yuting Gan, Greg Miller, + Stefan Vodita) + Improvements --------------------- diff --git a/lucene/demo/src/java/org/apache/lucene/demo/facet/DynamicRangeFacetsExample.java b/lucene/demo/src/java/org/apache/lucene/demo/facet/DynamicRangeFacetsExample.java new file mode 100644 index 00000000000..5b188d42927 --- /dev/null +++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/DynamicRangeFacetsExample.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.demo.facet; + +import java.io.IOException; +import java.util.List; +import java.util.Locale; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsCollectorManager; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.range.DynamicRangeUtil; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.LongValuesSource; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.NamedThreadFactory; + +/** + * Demo dynamic range faceting. + * + *
<p>The results look like so: min: 63 max: 75 centroid: 69.000000 count: 2 weight: 137 min: 79
+ * max: 96 centroid: 86.000000 count: 3 weight: 83
+ *
+ * <p>We've computed dynamic ranges over popularity weighted by number of books. We can read the
+ * results as follows: There are 137 books written by authors in the 63 to 75 popularity range.
+ *
+ * <p>How it works: We collect all the values (popularity) and their weights (book counts). We sort
+ * the values and find the approximate weight per range. In this case the total weight is 220 (total
+ * books by all authors) and we want 2 ranges, so we're aiming for 110 books in each range. We add
+ * Chesterton to the first range, since he is the least popular author. He's written 90 books, so
+ * the range's weight is now 90. We add Tolstoy to the first range, since he is next in line of
+ * popularity. He's written another 47 books, which brings the total weight to 137. We're over the
+ * 110 target weight, so we stop and add everyone left to the second range.
+ */
+public class DynamicRangeFacetsExample {
+
+ private final Directory indexDir = new ByteBuffersDirectory(); // directory that index() opens its IndexWriter on
+ private final FacetsConfig config = new FacetsConfig(); // no customization visible here; index() passes each document through config.build(doc)
+
+ /**
+ * Empty constructor; {@code indexDir} and {@code config} are initialized by their field
+ * declarations.
+ */
+ public DynamicRangeFacetsExample() {}
+
+ /**
+ * Build the example index: five documents, one per author, each with an unstored "Author"
+ * string field plus "Popularity" and "Books" numeric doc values.
+ *
+ * @throws IOException if building the index fails
+ */
+ private void index() throws IOException {
+ IndexWriter indexWriter =
+ new IndexWriter(
+ indexDir,
+ new IndexWriterConfig(new WhitespaceAnalyzer())
+ .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
+
+ Document doc = new Document();
+ doc.add(new StringField("Author", "J. R. R. Tolkien", Field.Store.NO));
+ doc.add(new NumericDocValuesField("Popularity", 96)); // the value the dynamic ranges are computed over
+ doc.add(new NumericDocValuesField("Books", 24)); // the weight attached to that value (book count)
+ indexWriter.addDocument(config.build(doc));
+
+ doc = new Document();
+ doc.add(new StringField("Author", "C. S. Lewis", Field.Store.NO));
+ doc.add(new NumericDocValuesField("Popularity", 83));
+ doc.add(new NumericDocValuesField("Books", 48));
+ indexWriter.addDocument(config.build(doc));
+
+ doc = new Document();
+ doc.add(new StringField("Author", "G. K. Chesterton", Field.Store.NO));
+ doc.add(new NumericDocValuesField("Popularity", 63));
+ doc.add(new NumericDocValuesField("Books", 90));
+ indexWriter.addDocument(config.build(doc));
+ indexWriter.commit(); // NOTE(review): commit after three of the five documents; presumably to give the example index more than one segment - confirm
+
+ doc = new Document();
+ doc.add(new StringField("Author", "Fyodor Dostoevsky", Field.Store.NO));
+ doc.add(new NumericDocValuesField("Popularity", 79));
+ doc.add(new NumericDocValuesField("Books", 11));
+ indexWriter.addDocument(config.build(doc));
+
+ doc = new Document();
+ doc.add(new StringField("Author", "Leo Tolstoy", Field.Store.NO));
+ doc.add(new NumericDocValuesField("Popularity", 75));
+ doc.add(new NumericDocValuesField("Books", 47));
+ indexWriter.addDocument(config.build(doc));
+
+ indexWriter.close(); // NOTE(review): not reached if an earlier call throws; try-with-resources would close the writer on that path too
+ }
+
+ /** User runs a query and counts facets. */
+ private List Sampling support is implemented in {@link
* org.apache.lucene.facet.RandomSamplingFacetsCollector}.
*
+ * Sometimes, indexing is done once, and when the index is fully prepared, searching starts.
diff --git a/lucene/demo/src/java/overview.html b/lucene/demo/src/java/overview.html
index fd62e2b45f7..f904aca8380 100644
--- a/lucene/demo/src/java/overview.html
+++ b/lucene/demo/src/java/overview.html
@@ -215,6 +215,17 @@ by score (i.e. relevance).Dynamic Range Facets
+ *
+ * We can build ranges over numeric fields and count the number of values falling in each range. The
+ * values can be weighted and the number of desired ranges can be specified. To see an example,
+ * check {@link org.apache.lucene.demo.facet.DynamicRangeFacetsExample}.
+ *
* Concurrent Indexing and Search
*
*
Lucene also provides aggregation capabilities over the index, e.g. counting results across a category + (SimpleFacetsExample), + computing expressions ( + ExpressionAggregationFacetsExample), dynamic ranges (DynamicRangeFacetsExample). + For more details, see the dedicated + faceting guide. +
+