diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9b9f6d37d2f..ff2713ba6a5 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -93,6 +93,10 @@ New Features * LUCENE-10629: Support match set filtering with a query in MatchingFacetSetCounts. (Stefan Vodita, Shai Erera) +* LUCENE-10633: SortField#setOptimizeSortWithIndexedData and + SortField#getOptimizeSortWithIndexedData were introduced to provide + an option to disable sort optimization for various sort fields. (Mayya Sharipova) + Improvements --------------------- * LUCENE-10592: Build HNSW Graph on indexing. (Mayya Sharipova, Adrien Grand, Julie Tibshirani) diff --git a/lucene/core/src/java/org/apache/lucene/search/SortField.java b/lucene/core/src/java/org/apache/lucene/search/SortField.java index 0240dc32d45..953e6ff7740 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SortField.java +++ b/lucene/core/src/java/org/apache/lucene/search/SortField.java @@ -45,6 +45,10 @@ import org.apache.lucene.util.NumericUtils; * optimization to skip non-competitive documents. This optimization relies on the assumption that * the same data is stored in these points and doc values. * + *

Sorting on a SORTED(_SET) field that is indexed with both doc values and term index may use an + * optimization to skip non-competitive documents. This optimization relies on the assumption that + * the same data is stored in the term index and doc values. + * *

Created: Feb 11, 2004 1:25:29 PM * * @since lucene 1.4 @@ -131,8 +135,8 @@ public class SortField { // Used for 'sortMissingFirst/Last' protected Object missingValue = null; - // Indicates if numeric sort should be optimized with Points index. Set to true by default. - @Deprecated private boolean optimizeSortWithPoints = true; + // Indicates if sort should be optimized with indexed data. Set to true by default. + @Deprecated private boolean optimizeSortWithIndexedData = true; /** * Creates a sort by terms in the given field with the type of term values explicitly given. @@ -537,7 +541,10 @@ public class SortField { break; case STRING: - return new TermOrdValComparator(numHits, field, missingValue == STRING_LAST, reverse); + fieldComparator = + new TermOrdValComparator( + numHits, field, missingValue == STRING_LAST, reverse, enableSkipping); + break; case STRING_VAL: fieldComparator = @@ -551,7 +558,7 @@ public class SortField { default: throw new IllegalStateException("Illegal sort type: " + type); } - if (getOptimizeSortWithPoints() == false) { + if (getOptimizeSortWithIndexedData() == false) { fieldComparator.disableSkipping(); } return fieldComparator; @@ -626,6 +633,39 @@ public class SortField { } } + /** + * Enables/disables sort optimization to use the indexed data. + * + *

Enabled by default. By default, sorting on a numeric field activates point sort optimization + * that can efficiently skip over non-competitive hits. Sort optimization has a number of + * requirements, one of which is that SortField.Type matches the Point type with which the field + * was indexed (e.g. sort on IntPoint field should use SortField.Type.INT). Another requirement is + * that the same data is indexed with points and doc values for the field. + * + *

By default, sorting on a SORTED(_SET) field activates sort optimization that can efficiently + * skip over non-competitive hits. Sort optimization requires that the same data is indexed with + * term index and doc values for the field. + * + * @param optimizeSortWithIndexedData providing {@code false} disables the optimization, in cases + * where these requirements can't be met. + * @deprecated should only be used for compatibility with 8.x indices that got created with + * inconsistent data across fields, or the wrong sort configuration in the index sort + */ + @Deprecated // Remove in Lucene 10 + public void setOptimizeSortWithIndexedData(boolean optimizeSortWithIndexedData) { + this.optimizeSortWithIndexedData = optimizeSortWithIndexedData; + } + + /** + * Returns whether sort should be optimized with indexed data + * + * @return whether sort should be optimized with indexed data + */ + @Deprecated // Remove in Lucene 10 + public boolean getOptimizeSortWithIndexedData() { + return optimizeSortWithIndexedData; + } + /** * Enables/disables numeric sort optimization to use the Points index. * @@ -638,20 +678,22 @@ public class SortField { * @param optimizeSortWithPoints providing {@code false} disables the optimization, in cases where * these requirements can't be met. * @deprecated should only be used for compatibility with 8.x indices that got created with - * inconsistent data across fields, or the wrong sort configuration in the index sort + * inconsistent data across fields, or the wrong sort configuration in the index sort. This is + * a duplicate method for {@code SortField#setOptimizeSortWithIndexedData}. 
*/ @Deprecated // Remove in Lucene 10 public void setOptimizeSortWithPoints(boolean optimizeSortWithPoints) { - this.optimizeSortWithPoints = optimizeSortWithPoints; + setOptimizeSortWithIndexedData(optimizeSortWithPoints); } /** * Returns whether sort optimization should be optimized with points index * * @return whether sort optimization should be optimized with points index + * @deprecated This is a duplicate method for {@code SortField#getOptimizeSortWithIndexedData}. */ @Deprecated // Remove in Lucene 10 public boolean getOptimizeSortWithPoints() { - return optimizeSortWithPoints; + return getOptimizeSortWithIndexedData(); } } diff --git a/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java b/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java index 38de65414eb..069ecdccd50 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java +++ b/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java @@ -333,7 +333,7 @@ public class SortedNumericSortField extends SortField { default: throw new AssertionError(); } - if (getOptimizeSortWithPoints() == false) { + if (getOptimizeSortWithIndexedData() == false) { fieldComparator.disableSkipping(); } return fieldComparator; diff --git a/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java b/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java index 80db1ebecc9..5d1736a8ca4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java +++ b/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java @@ -179,7 +179,9 @@ public class SortedSetSortField extends SortField { @Override public FieldComparator getComparator(int numHits, boolean enableSkipping) { - return new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, reverse) { + boolean finalEnableSkipping = enableSkipping && getOptimizeSortWithIndexedData(); + return new TermOrdValComparator( + 
numHits, getField(), missingValue == STRING_LAST, reverse, finalEnableSkipping) { @Override protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java index cff3f846162..b0f2260148c 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java @@ -97,7 +97,9 @@ public class TermOrdValComparator extends FieldComparator { * Creates this, with control over how missing values are sorted. Pass sortMissingLast=true to put * missing values at the end. */ - public TermOrdValComparator(int numHits, String field, boolean sortMissingLast, boolean reverse) { + public TermOrdValComparator( + int numHits, String field, boolean sortMissingLast, boolean reverse, boolean enableSkipping) { + canSkipDocuments = enableSkipping; ords = new int[numHits]; values = new BytesRef[numHits]; tempBRs = new BytesRefBuilder[numHits]; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java index 9ec87be9019..d300d5e5e03 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java @@ -679,7 +679,7 @@ public class TestSortOptimization extends LuceneTestCase { IllegalArgumentException.class, () -> searcher.search(new MatchAllDocsQuery(), 1, new Sort(longSortOnIntField))); // assert that when sort optimization is disabled we can use LONG sort on int field - longSortOnIntField.setOptimizeSortWithPoints(false); + longSortOnIntField.setOptimizeSortWithIndexedData(false); searcher.search(new MatchAllDocsQuery(), 1, new Sort(longSortOnIntField)); 
SortField intSortOnLongField = new SortField("longField", SortField.Type.INT); @@ -687,7 +687,7 @@ public class TestSortOptimization extends LuceneTestCase { IllegalArgumentException.class, () -> searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnLongField))); // assert that when sort optimization is disabled we can use INT sort on long field - intSortOnLongField.setOptimizeSortWithPoints(false); + intSortOnLongField.setOptimizeSortWithIndexedData(false); searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnLongField)); SortField intSortOnIntRangeField = new SortField("intRange", SortField.Type.INT); @@ -695,7 +695,7 @@ public class TestSortOptimization extends LuceneTestCase { IllegalArgumentException.class, () -> searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnIntRangeField))); // assert that when sort optimization is disabled we can use INT sort on intRange field - intSortOnIntRangeField.setOptimizeSortWithPoints(false); + intSortOnIntRangeField.setOptimizeSortWithIndexedData(false); searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnIntRangeField)); reader.close(); @@ -823,7 +823,7 @@ public class TestSortOptimization extends LuceneTestCase { boolean reverse = random().nextBoolean(); final SortField sortField = new SortedNumericSortField("my_field", SortField.Type.LONG, reverse, type); - sortField.setOptimizeSortWithPoints(false); + sortField.setOptimizeSortWithIndexedData(false); final Sort sort = new Sort(sortField); // sort without sort optimization final SortField sortField2 = new SortedNumericSortField("my_field", SortField.Type.LONG, reverse, type); @@ -901,6 +901,7 @@ public class TestSortOptimization extends LuceneTestCase { final DirectoryReader reader = DirectoryReader.open(writer); writer.close(); doTestStringSortOptimization(reader); + doTestStringSortOptimizationDisabled(reader); reader.close(); dir.close(); } @@ -1025,6 +1026,27 @@ public class TestSortOptimization extends LuceneTestCase { } } + public 
void doTestStringSortOptimizationDisabled(DirectoryReader reader) throws IOException { + SortField sortField = + random().nextBoolean() + ? new SortedSetSortField("my_field", false) + : new SortField("my_field", SortField.Type.STRING); + sortField.setMissingValue(SortField.STRING_LAST); + sortField.setOptimizeSortWithIndexedData(false); + + Sort sort = new Sort(sortField); + final int numDocs = reader.numDocs(); + final int numHits = 5; + final int totalHitsThreshold = 5; + + CollectorManager manager = + TopFieldCollector.createSharedManager(sort, numHits, null, totalHitsThreshold); + IndexSearcher searcher = + newSearcher(reader, random().nextBoolean(), random().nextBoolean(), false); + TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), manager); + assertEquals(numDocs, topDocs.totalHits.value); + } + private TopDocs assertSort(DirectoryReader reader, Sort sort, int n, FieldDoc after) throws IOException { TopDocs topDocs = assertSearchHits(reader, sort, n, after); diff --git a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java index a29294c1cb8..b3749ee724c 100644 --- a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java +++ b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java @@ -135,25 +135,22 @@ public class ToParentBlockJoinSortField extends SortField { } private FieldComparator getStringComparator(int numHits) { - FieldComparator cmp = - new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, getReverse()) { - - @Override - protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field) - throws IOException { - SortedSetDocValues sortedSet = DocValues.getSortedSet(context.reader(), field); - final BlockJoinSelector.Type type = - order ? 
BlockJoinSelector.Type.MAX : BlockJoinSelector.Type.MIN; - final BitSet parents = parentFilter.getBitSet(context); - final BitSet children = childFilter.getBitSet(context); - if (children == null) { - return DocValues.emptySorted(); - } - return BlockJoinSelector.wrap(sortedSet, type, parents, toIter(children)); - } - }; - cmp.disableSkipping(); - return cmp; + return new TermOrdValComparator( + numHits, getField(), missingValue == STRING_LAST, getReverse(), false) { + @Override + protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field) + throws IOException { + SortedSetDocValues sortedSet = DocValues.getSortedSet(context.reader(), field); + final BlockJoinSelector.Type type = + order ? BlockJoinSelector.Type.MAX : BlockJoinSelector.Type.MIN; + final BitSet parents = parentFilter.getBitSet(context); + final BitSet children = childFilter.getBitSet(context); + if (children == null) { + return DocValues.emptySorted(); + } + return BlockJoinSelector.wrap(sortedSet, type, parents, toIter(children)); + } + }; } private FieldComparator getIntComparator(int numHits) {