LUCENE-10633 Disable sort optimization for SortedSetSortField (#3125)

Add ability to SortedSetSortField to disable sort optimization
This commit is contained in:
Mayya Sharipova 2022-08-30 16:52:28 -04:00 committed by GitHub
parent 61ef031f7f
commit 554fabf682
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 102 additions and 33 deletions

View File

@ -93,6 +93,10 @@ New Features
* LUCENE-10629: Support match set filtering with a query in MatchingFacetSetCounts. (Stefan Vodita, Shai Erera)
* LUCENE-10633: SortField#setOptimizeSortWithIndexedData and
SortField#getOptimizeSortWithIndexedData were introduced to provide
an option to disable sort optimization for various sort fields. (Mayya Sharipova)
Improvements
---------------------
* LUCENE-10592: Build HNSW Graph on indexing. (Mayya Sharipova, Adrien Grand, Julie Tibshirani)

View File

@ -45,6 +45,10 @@ import org.apache.lucene.util.NumericUtils;
* optimization to skip non-competitive documents. This optimization relies on the assumption that
* the same data is stored in these points and doc values.
*
* <p>Sorting on a SORTED(_SET) field that is indexed with both doc values and a term index may use
* an optimization to skip non-competitive documents. This optimization relies on the assumption
* that the same data is stored in the term index and doc values.
*
* <p>Created: Feb 11, 2004 1:25:29 PM
*
* @since lucene 1.4
@ -131,8 +135,8 @@ public class SortField {
// Used for 'sortMissingFirst/Last'
protected Object missingValue = null;
// Indicates if numeric sort should be optimized with Points index. Set to true by default.
@Deprecated private boolean optimizeSortWithPoints = true;
// Indicates if sort should be optimized with indexed data. Set to true by default.
@Deprecated private boolean optimizeSortWithIndexedData = true;
/**
* Creates a sort by terms in the given field with the type of term values explicitly given.
@ -537,7 +541,10 @@ public class SortField {
break;
case STRING:
return new TermOrdValComparator(numHits, field, missingValue == STRING_LAST, reverse);
fieldComparator =
new TermOrdValComparator(
numHits, field, missingValue == STRING_LAST, reverse, enableSkipping);
break;
case STRING_VAL:
fieldComparator =
@ -551,7 +558,7 @@ public class SortField {
default:
throw new IllegalStateException("Illegal sort type: " + type);
}
if (getOptimizeSortWithPoints() == false) {
if (getOptimizeSortWithIndexedData() == false) {
fieldComparator.disableSkipping();
}
return fieldComparator;
@ -626,6 +633,39 @@ public class SortField {
}
}
/**
* Enables/disables sort optimization that uses the indexed data (points index for numeric fields,
* term index for SORTED(_SET) fields).
*
* <p>Enabled by default. By default, sorting on a numeric field activates point sort optimization
* that can efficiently skip over non-competitive hits. Sort optimization has a number of
* requirements, one of which is that SortField.Type matches the Point type with which the field
* was indexed (e.g. sort on IntPoint field should use SortField.Type.INT). Another requirement is
* that the same data is indexed with points and doc values for the field.
*
* <p>By default, sorting on a SORTED(_SET) field activates sort optimization that can efficiently
* skip over non-competitive hits. Sort optimization requires that the same data is indexed with
* the term index and doc values for the field.
*
* @param optimizeSortWithIndexedData providing {@code false} disables the optimization, in cases
* where these requirements can't be met.
* @deprecated should only be used for compatibility with 8.x indices that got created with
* inconsistent data across fields, or the wrong sort configuration in the index sort
*/
@Deprecated // Remove in Lucene 10
public void setOptimizeSortWithIndexedData(boolean optimizeSortWithIndexedData) {
this.optimizeSortWithIndexedData = optimizeSortWithIndexedData;
}
/**
* Returns whether sort optimization with indexed data is enabled for this sort field.
*
* @return {@code true} if sorting may be optimized with indexed data, {@code false} if the
* optimization has been disabled
* @deprecated marked for removal in Lucene 10, like {@link
* #setOptimizeSortWithIndexedData(boolean)}
*/
@Deprecated // Remove in Lucene 10
public boolean getOptimizeSortWithIndexedData() {
return optimizeSortWithIndexedData;
}
/**
* Enables/disables numeric sort optimization to use the Points index.
*
@ -638,20 +678,22 @@ public class SortField {
* @param optimizeSortWithPoints providing {@code false} disables the optimization, in cases where
* these requirements can't be met.
* @deprecated should only be used for compatibility with 8.x indices that got created with
* inconsistent data across fields, or the wrong sort configuration in the index sort
* inconsistent data across fields, or the wrong sort configuration in the index sort. This is
* a duplicate method for {@code SortField#setOptimizeSortWithIndexedData}.
*/
@Deprecated // Remove in Lucene 10
public void setOptimizeSortWithPoints(boolean optimizeSortWithPoints) {
this.optimizeSortWithPoints = optimizeSortWithPoints;
setOptimizeSortWithIndexedData(optimizeSortWithPoints);
}
/**
* Returns whether sort optimization should be optimized with points index
*
* @return whether sort optimization should be optimized with points index
* @deprecated This is a duplicate method for {@code SortField#getOptimizeSortWithIndexedData}.
*/
@Deprecated // Remove in Lucene 10
public boolean getOptimizeSortWithPoints() {
return optimizeSortWithPoints;
return getOptimizeSortWithIndexedData();
}
}

View File

@ -333,7 +333,7 @@ public class SortedNumericSortField extends SortField {
default:
throw new AssertionError();
}
if (getOptimizeSortWithPoints() == false) {
if (getOptimizeSortWithIndexedData() == false) {
fieldComparator.disableSkipping();
}
return fieldComparator;

View File

@ -179,7 +179,9 @@ public class SortedSetSortField extends SortField {
@Override
public FieldComparator<?> getComparator(int numHits, boolean enableSkipping) {
return new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, reverse) {
boolean finalEnableSkipping = enableSkipping && getOptimizeSortWithIndexedData();
return new TermOrdValComparator(
numHits, getField(), missingValue == STRING_LAST, reverse, finalEnableSkipping) {
@Override
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
throws IOException {

View File

@ -97,7 +97,9 @@ public class TermOrdValComparator extends FieldComparator<BytesRef> {
* Creates this, with control over how missing values are sorted. Pass sortMissingLast=true to put
* missing values at the end.
*/
public TermOrdValComparator(int numHits, String field, boolean sortMissingLast, boolean reverse) {
public TermOrdValComparator(
int numHits, String field, boolean sortMissingLast, boolean reverse, boolean enableSkipping) {
canSkipDocuments = enableSkipping;
ords = new int[numHits];
values = new BytesRef[numHits];
tempBRs = new BytesRefBuilder[numHits];

View File

@ -679,7 +679,7 @@ public class TestSortOptimization extends LuceneTestCase {
IllegalArgumentException.class,
() -> searcher.search(new MatchAllDocsQuery(), 1, new Sort(longSortOnIntField)));
// assert that when sort optimization is disabled we can use LONG sort on int field
longSortOnIntField.setOptimizeSortWithPoints(false);
longSortOnIntField.setOptimizeSortWithIndexedData(false);
searcher.search(new MatchAllDocsQuery(), 1, new Sort(longSortOnIntField));
SortField intSortOnLongField = new SortField("longField", SortField.Type.INT);
@ -687,7 +687,7 @@ public class TestSortOptimization extends LuceneTestCase {
IllegalArgumentException.class,
() -> searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnLongField)));
// assert that when sort optimization is disabled we can use INT sort on long field
intSortOnLongField.setOptimizeSortWithPoints(false);
intSortOnLongField.setOptimizeSortWithIndexedData(false);
searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnLongField));
SortField intSortOnIntRangeField = new SortField("intRange", SortField.Type.INT);
@ -695,7 +695,7 @@ public class TestSortOptimization extends LuceneTestCase {
IllegalArgumentException.class,
() -> searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnIntRangeField)));
// assert that when sort optimization is disabled we can use INT sort on intRange field
intSortOnIntRangeField.setOptimizeSortWithPoints(false);
intSortOnIntRangeField.setOptimizeSortWithIndexedData(false);
searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnIntRangeField));
reader.close();
@ -823,7 +823,7 @@ public class TestSortOptimization extends LuceneTestCase {
boolean reverse = random().nextBoolean();
final SortField sortField =
new SortedNumericSortField("my_field", SortField.Type.LONG, reverse, type);
sortField.setOptimizeSortWithPoints(false);
sortField.setOptimizeSortWithIndexedData(false);
final Sort sort = new Sort(sortField); // sort without sort optimization
final SortField sortField2 =
new SortedNumericSortField("my_field", SortField.Type.LONG, reverse, type);
@ -901,6 +901,7 @@ public class TestSortOptimization extends LuceneTestCase {
final DirectoryReader reader = DirectoryReader.open(writer);
writer.close();
doTestStringSortOptimization(reader);
doTestStringSortOptimizationDisabled(reader);
reader.close();
dir.close();
}
@ -1025,6 +1026,27 @@ public class TestSortOptimization extends LuceneTestCase {
}
}
/**
 * Runs a match-all search sorted on {@code my_field} with the indexed-data sort optimization
 * switched off, and checks that the reported total hit count equals the number of documents in
 * the reader even though only 5 hits are requested with a total-hits threshold of 5.
 *
 * @param reader reader over the index to search
 * @throws IOException if the search fails
 */
public void doTestStringSortOptimizationDisabled(DirectoryReader reader) throws IOException {
  // Randomly exercise either the SortedSetSortField or the plain STRING SortField code path.
  final SortField stringSort;
  if (random().nextBoolean()) {
    stringSort = new SortedSetSortField("my_field", false);
  } else {
    stringSort = new SortField("my_field", SortField.Type.STRING);
  }
  stringSort.setMissingValue(SortField.STRING_LAST);
  stringSort.setOptimizeSortWithIndexedData(false);

  final int expectedTotalHits = reader.numDocs();
  final int topN = 5;
  final int hitsThreshold = 5;

  CollectorManager<TopFieldCollector, TopFieldDocs> collectorManager =
      TopFieldCollector.createSharedManager(new Sort(stringSort), topN, null, hitsThreshold);
  IndexSearcher indexSearcher =
      newSearcher(reader, random().nextBoolean(), random().nextBoolean(), false);
  TopDocs result = indexSearcher.search(new MatchAllDocsQuery(), collectorManager);

  // Every document is expected to be counted when the optimization is disabled.
  assertEquals(expectedTotalHits, result.totalHits.value);
}
private TopDocs assertSort(DirectoryReader reader, Sort sort, int n, FieldDoc after)
throws IOException {
TopDocs topDocs = assertSearchHits(reader, sort, n, after);

View File

@ -135,25 +135,22 @@ public class ToParentBlockJoinSortField extends SortField {
}
private FieldComparator<?> getStringComparator(int numHits) {
FieldComparator<?> cmp =
new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, getReverse()) {
@Override
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
throws IOException {
SortedSetDocValues sortedSet = DocValues.getSortedSet(context.reader(), field);
final BlockJoinSelector.Type type =
order ? BlockJoinSelector.Type.MAX : BlockJoinSelector.Type.MIN;
final BitSet parents = parentFilter.getBitSet(context);
final BitSet children = childFilter.getBitSet(context);
if (children == null) {
return DocValues.emptySorted();
}
return BlockJoinSelector.wrap(sortedSet, type, parents, toIter(children));
}
};
cmp.disableSkipping();
return cmp;
return new TermOrdValComparator(
numHits, getField(), missingValue == STRING_LAST, getReverse(), false) {
@Override
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
throws IOException {
SortedSetDocValues sortedSet = DocValues.getSortedSet(context.reader(), field);
final BlockJoinSelector.Type type =
order ? BlockJoinSelector.Type.MAX : BlockJoinSelector.Type.MIN;
final BitSet parents = parentFilter.getBitSet(context);
final BitSet children = childFilter.getBitSet(context);
if (children == null) {
return DocValues.emptySorted();
}
return BlockJoinSelector.wrap(sortedSet, type, parents, toIter(children));
}
};
}
private FieldComparator<?> getIntComparator(int numHits) {