LUCENE-10633 Disable sort optimization for SortedSetSortField (#3125)

Add ability to SortedSetSortField to disable sort optimization
2022-08-30 16:52:28 -04:00 · 2022-08-30 16:52:28 -04:00 · 554fabf682
parent 61ef031f7f
commit 554fabf682
7 changed files with 102 additions and 33 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -93,6 +93,10 @@ New Features

 * LUCENE-10629: Support match set filtering with a query in MatchingFacetSetCounts. (Stefan Vodita, Shai Erera)

+* LUCENE-10633: SortField#setOptimizeSortWithIndexedData and
+  SortField#getOptimizeSortWithIndexedData were introduced to provide
+  an option to disable sort optimization for various sort fields. (Mayya Sharipova)
+
 Improvements
 ---------------------
 * LUCENE-10592: Build HNSW Graph on indexing. (Mayya Sharipova, Adrien Grand, Julie Tibshirani)
--- a/lucene/core/src/java/org/apache/lucene/search/SortField.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SortField.java
@ -45,6 +45,10 @@ import org.apache.lucene.util.NumericUtils;
 * optimization to skip non-competitive documents. This optimization relies on the assumption that
 * the same data is stored in these points and doc values.
 *
+ * <p>Sorting on a SORTED(_SET) field that is indexed with both doc values and term index may use an
+ * optimization to skip non-competitive documents. This optimization relies on the assumption that
+ * the same data is stored in the term index and doc values.
+ *
 * <p>Created: Feb 11, 2004 1:25:29 PM
 *
 * @since lucene 1.4
@ -131,8 +135,8 @@ public class SortField {
  // Used for 'sortMissingFirst/Last'
  protected Object missingValue = null;

-  // Indicates if numeric sort should be optimized with Points index. Set to true by default.
-  @Deprecated private boolean optimizeSortWithPoints = true;
+  // Indicates if sort should be optimized with indexed data. Set to true by default.
+  @Deprecated private boolean optimizeSortWithIndexedData = true;

  /**
   * Creates a sort by terms in the given field with the type of term values explicitly given.
@ -537,7 +541,10 @@ public class SortField {
        break;

      case STRING:
-        return new TermOrdValComparator(numHits, field, missingValue == STRING_LAST, reverse);
+        fieldComparator =
+            new TermOrdValComparator(
+                numHits, field, missingValue == STRING_LAST, reverse, enableSkipping);
+        break;

      case STRING_VAL:
        fieldComparator =
@ -551,7 +558,7 @@ public class SortField {
      default:
        throw new IllegalStateException("Illegal sort type: " + type);
    }
-    if (getOptimizeSortWithPoints() == false) {
+    if (getOptimizeSortWithIndexedData() == false) {
      fieldComparator.disableSkipping();
    }
    return fieldComparator;
@ -626,6 +633,39 @@ public class SortField {
    }
  }

+  /**
+   * Enables/disables sort optimization to use the indexed data.
+   *
+   * <p>Enabled by default. By default, sorting on a numeric field activates point sort optimization
+   * that can efficiently skip over non-competitive hits. Sort optimization has a number of
+   * requirements, one of which is that SortField.Type matches the Point type with which the field
+   * was indexed (e.g. sort on IntPoint field should use SortField.Type.INT). Another requirement is
+   * that the same data is indexed with points and doc values for the field.
+   *
+   * <p>By default, sorting on a SORTED(_SET) field activates sort optimization that can efficiently
+   * skip over non-competitive hits. Sort optimization requires that the same data is indexed with
+   * term index and doc values for the field.
+   *
+   * @param optimizeSortWithIndexedData providing {@code false} disables the optimization, in cases
+   *     where these requirements can't be met.
+   * @deprecated should only be used for compatibility with 8.x indices that got created with
+   *     inconsistent data across fields, or the wrong sort configuration in the index sort
+   */
+  @Deprecated // Remove in Lucene 10
+  public void setOptimizeSortWithIndexedData(boolean optimizeSortWithIndexedData) {
+    this.optimizeSortWithIndexedData = optimizeSortWithIndexedData;
+  }
+
+  /**
+   * Returns whether sort should be optimized with indexed data.
+   *
+   * @return whether sort should be optimized with indexed data
+   */
+  @Deprecated // Remove in Lucene 10
+  public boolean getOptimizeSortWithIndexedData() {
+    return optimizeSortWithIndexedData;
+  }
+
  /**
   * Enables/disables numeric sort optimization to use the Points index.
   *
@ -638,20 +678,22 @@ public class SortField {
   * @param optimizeSortWithPoints providing {@code false} disables the optimization, in cases where
   *     these requirements can't be met.
   * @deprecated should only be used for compatibility with 8.x indices that got created with
-   *     inconsistent data across fields, or the wrong sort configuration in the index sort
+   *     inconsistent data across fields, or the wrong sort configuration in the index sort. This is
+   *     a duplicate method for {@code SortField#setOptimizeSortWithIndexedData}.
   */
  @Deprecated // Remove in Lucene 10
  public void setOptimizeSortWithPoints(boolean optimizeSortWithPoints) {
-    this.optimizeSortWithPoints = optimizeSortWithPoints;
+    setOptimizeSortWithIndexedData(optimizeSortWithPoints);
  }

  /**
   * Returns whether sort optimization should be optimized with points index
   *
   * @return whether sort optimization should be optimized with points index
+   * @deprecated This is a duplicate method for {@code SortField#getOptimizeSortWithIndexedData}.
   */
  @Deprecated // Remove in Lucene 10
  public boolean getOptimizeSortWithPoints() {
-    return optimizeSortWithPoints;
+    return getOptimizeSortWithIndexedData();
  }
 }
--- a/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SortedNumericSortField.java
@ -333,7 +333,7 @@ public class SortedNumericSortField extends SortField {
      default:
        throw new AssertionError();
    }
-    if (getOptimizeSortWithPoints() == false) {
+    if (getOptimizeSortWithIndexedData() == false) {
      fieldComparator.disableSkipping();
    }
    return fieldComparator;
--- a/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java
@ -179,7 +179,9 @@ public class SortedSetSortField extends SortField {

  @Override
  public FieldComparator<?> getComparator(int numHits, boolean enableSkipping) {
-    return new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, reverse) {
+    boolean finalEnableSkipping = enableSkipping && getOptimizeSortWithIndexedData();
+    return new TermOrdValComparator(
+        numHits, getField(), missingValue == STRING_LAST, reverse, finalEnableSkipping) {
      @Override
      protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
          throws IOException {
--- a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java
@ -97,7 +97,9 @@ public class TermOrdValComparator extends FieldComparator<BytesRef> {
   * Creates this, with control over how missing values are sorted. Pass sortMissingLast=true to put
   * missing values at the end.
   */
-  public TermOrdValComparator(int numHits, String field, boolean sortMissingLast, boolean reverse) {
+  public TermOrdValComparator(
+      int numHits, String field, boolean sortMissingLast, boolean reverse, boolean enableSkipping) {
+    canSkipDocuments = enableSkipping;
    ords = new int[numHits];
    values = new BytesRef[numHits];
    tempBRs = new BytesRefBuilder[numHits];
--- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
@ -679,7 +679,7 @@ public class TestSortOptimization extends LuceneTestCase {
        IllegalArgumentException.class,
        () -> searcher.search(new MatchAllDocsQuery(), 1, new Sort(longSortOnIntField)));
    // assert that when sort optimization is disabled we can use LONG sort on int field
-    longSortOnIntField.setOptimizeSortWithPoints(false);
+    longSortOnIntField.setOptimizeSortWithIndexedData(false);
    searcher.search(new MatchAllDocsQuery(), 1, new Sort(longSortOnIntField));

    SortField intSortOnLongField = new SortField("longField", SortField.Type.INT);
@ -687,7 +687,7 @@ public class TestSortOptimization extends LuceneTestCase {
        IllegalArgumentException.class,
        () -> searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnLongField)));
    // assert that when sort optimization is disabled we can use INT sort on long field
-    intSortOnLongField.setOptimizeSortWithPoints(false);
+    intSortOnLongField.setOptimizeSortWithIndexedData(false);
    searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnLongField));

    SortField intSortOnIntRangeField = new SortField("intRange", SortField.Type.INT);
@ -695,7 +695,7 @@ public class TestSortOptimization extends LuceneTestCase {
        IllegalArgumentException.class,
        () -> searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnIntRangeField)));
    // assert that when sort optimization is disabled we can use INT sort on intRange field
-    intSortOnIntRangeField.setOptimizeSortWithPoints(false);
+    intSortOnIntRangeField.setOptimizeSortWithIndexedData(false);
    searcher.search(new MatchAllDocsQuery(), 1, new Sort(intSortOnIntRangeField));

    reader.close();
@ -823,7 +823,7 @@ public class TestSortOptimization extends LuceneTestCase {
    boolean reverse = random().nextBoolean();
    final SortField sortField =
        new SortedNumericSortField("my_field", SortField.Type.LONG, reverse, type);
-    sortField.setOptimizeSortWithPoints(false);
+    sortField.setOptimizeSortWithIndexedData(false);
    final Sort sort = new Sort(sortField); // sort without sort optimization
    final SortField sortField2 =
        new SortedNumericSortField("my_field", SortField.Type.LONG, reverse, type);
@ -901,6 +901,7 @@ public class TestSortOptimization extends LuceneTestCase {
    final DirectoryReader reader = DirectoryReader.open(writer);
    writer.close();
    doTestStringSortOptimization(reader);
+    doTestStringSortOptimizationDisabled(reader);
    reader.close();
    dir.close();
  }
@ -1025,6 +1026,27 @@ public class TestSortOptimization extends LuceneTestCase {
    }
  }

+  public void doTestStringSortOptimizationDisabled(DirectoryReader reader) throws IOException {
+    SortField sortField =
+        random().nextBoolean()
+            ? new SortedSetSortField("my_field", false)
+            : new SortField("my_field", SortField.Type.STRING);
+    sortField.setMissingValue(SortField.STRING_LAST);
+    sortField.setOptimizeSortWithIndexedData(false);
+
+    Sort sort = new Sort(sortField);
+    final int numDocs = reader.numDocs();
+    final int numHits = 5;
+    final int totalHitsThreshold = 5;
+
+    CollectorManager<TopFieldCollector, TopFieldDocs> manager =
+        TopFieldCollector.createSharedManager(sort, numHits, null, totalHitsThreshold);
+    IndexSearcher searcher =
+        newSearcher(reader, random().nextBoolean(), random().nextBoolean(), false);
+    TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), manager);
+    assertEquals(numDocs, topDocs.totalHits.value);
+  }
+
  private TopDocs assertSort(DirectoryReader reader, Sort sort, int n, FieldDoc after)
      throws IOException {
    TopDocs topDocs = assertSearchHits(reader, sort, n, after);
--- a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java
@ -135,25 +135,22 @@ public class ToParentBlockJoinSortField extends SortField {
  }

  private FieldComparator<?> getStringComparator(int numHits) {
-    FieldComparator<?> cmp =
-        new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, getReverse()) {
-
-          @Override
-          protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
-              throws IOException {
-            SortedSetDocValues sortedSet = DocValues.getSortedSet(context.reader(), field);
-            final BlockJoinSelector.Type type =
-                order ? BlockJoinSelector.Type.MAX : BlockJoinSelector.Type.MIN;
-            final BitSet parents = parentFilter.getBitSet(context);
-            final BitSet children = childFilter.getBitSet(context);
-            if (children == null) {
-              return DocValues.emptySorted();
-            }
-            return BlockJoinSelector.wrap(sortedSet, type, parents, toIter(children));
-          }
-        };
-    cmp.disableSkipping();
-    return cmp;
+    return new TermOrdValComparator(
+        numHits, getField(), missingValue == STRING_LAST, getReverse(), false) {
+      @Override
+      protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
+          throws IOException {
+        SortedSetDocValues sortedSet = DocValues.getSortedSet(context.reader(), field);
+        final BlockJoinSelector.Type type =
+            order ? BlockJoinSelector.Type.MAX : BlockJoinSelector.Type.MIN;
+        final BitSet parents = parentFilter.getBitSet(context);
+        final BitSet children = childFilter.getBitSet(context);
+        if (children == null) {
+          return DocValues.emptySorted();
+        }
+        return BlockJoinSelector.wrap(sortedSet, type, parents, toIter(children));
+      }
+    };
  }

  private FieldComparator<?> getIntComparator(int numHits) {