LUCENE-10633: Dynamic pruning for sorting on SORTED(_SET) fields. (#1023)

This commit enables dynamic pruning for queries sorted on SORTED(_SET) fields by using postings to filter competitive documents.
2022-07-29 11:12:32 +02:00 · 2022-07-29 11:12:32 +02:00 · eb7b7791ba
parent e1d2005df4
commit eb7b7791ba
7 changed files with 900 additions and 297 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -102,6 +102,10 @@ Optimizations

 * GITHUB#1020: Support #scoreSupplier and small optimizations to DocValuesRewriteMethod. (Greg Miller)

+* LUCENE-10633: Added support for dynamic pruning to queries sorted by a string
+  field that is indexed with terms and SORTED or SORTED_SET doc values.
+  (Adrien Grand)
+
 Bug Fixes
 ---------------------
 * LUCENE-10663: Fix KnnVectorQuery explain with multiple segments. (Shiming Li)
--- a/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
@ -20,7 +20,6 @@ import java.io.IOException;
 import org.apache.lucene.index.BinaryDocValues;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;

@ -211,282 +210,6 @@ public abstract class FieldComparator<T> {
    }
  }

-  /**
-   * Sorts by field's natural Term sort order, using ordinals. This is functionally equivalent to
-   * {@link org.apache.lucene.search.FieldComparator.TermValComparator}, but it first resolves the
-   * string to their relative ordinal positions (using the index returned by {@link
-   * org.apache.lucene.index.LeafReader#getSortedDocValues(String)}), and does most comparisons
-   * using the ordinals. For medium to large results, this comparator will be much faster than
-   * {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets
-   * it may be slower.
-   */
-  public static class TermOrdValComparator extends FieldComparator<BytesRef>
-      implements LeafFieldComparator {
-    /* Ords for each slot.
-    @lucene.internal */
-    final int[] ords;
-
-    /* Values for each slot.
-    @lucene.internal */
-    final BytesRef[] values;
-    private final BytesRefBuilder[] tempBRs;
-
-    /* Which reader last copied a value into the slot. When
-    we compare two slots, we just compare-by-ord if the
-    readerGen is the same; else we must compare the
-    values (slower).
-    @lucene.internal */
-    final int[] readerGen;
-
-    /* Gen of current reader we are on.
-    @lucene.internal */
-    int currentReaderGen = -1;
-
-    /* Current reader's doc ord/values.
-    @lucene.internal */
-    SortedDocValues termsIndex;
-
-    private final String field;
-
-    /* Bottom slot, or -1 if queue isn't full yet
-    @lucene.internal */
-    int bottomSlot = -1;
-
-    /* Bottom ord (same as ords[bottomSlot] once bottomSlot
-    is set).  Cached for faster compares.
-    @lucene.internal */
-    int bottomOrd;
-
-    /* True if current bottom slot matches the current
-    reader.
-    @lucene.internal */
-    boolean bottomSameReader;
-
-    /* Bottom value (same as values[bottomSlot] once
-     bottomSlot is set).  Cached for faster compares.
-    @lucene.internal */
-    BytesRef bottomValue;
-
-    /** Set by setTopValue. */
-    BytesRef topValue;
-
-    boolean topSameReader;
-    int topOrd;
-
-    /** -1 if missing values are sorted first, 1 if they are sorted last */
-    final int missingSortCmp;
-
-    /** Which ordinal to use for a missing value. */
-    final int missingOrd;
-
-    /** Creates this, sorting missing values first. */
-    public TermOrdValComparator(int numHits, String field) {
-      this(numHits, field, false);
-    }
-
-    /**
-     * Creates this, with control over how missing values are sorted. Pass sortMissingLast=true to
-     * put missing values at the end.
-     */
-    public TermOrdValComparator(int numHits, String field, boolean sortMissingLast) {
-      ords = new int[numHits];
-      values = new BytesRef[numHits];
-      tempBRs = new BytesRefBuilder[numHits];
-      readerGen = new int[numHits];
-      this.field = field;
-      if (sortMissingLast) {
-        missingSortCmp = 1;
-        missingOrd = Integer.MAX_VALUE;
-      } else {
-        missingSortCmp = -1;
-        missingOrd = -1;
-      }
-    }
-
-    private int getOrdForDoc(int doc) throws IOException {
-      if (termsIndex.advanceExact(doc)) {
-        return termsIndex.ordValue();
-      } else {
-        return -1;
-      }
-    }
-
-    @Override
-    public int compare(int slot1, int slot2) {
-      if (readerGen[slot1] == readerGen[slot2]) {
-        return ords[slot1] - ords[slot2];
-      }
-
-      final BytesRef val1 = values[slot1];
-      final BytesRef val2 = values[slot2];
-      if (val1 == null) {
-        if (val2 == null) {
-          return 0;
-        }
-        return missingSortCmp;
-      } else if (val2 == null) {
-        return -missingSortCmp;
-      }
-      return val1.compareTo(val2);
-    }
-
-    @Override
-    public int compareBottom(int doc) throws IOException {
-      assert bottomSlot != -1;
-      int docOrd = getOrdForDoc(doc);
-      if (docOrd == -1) {
-        docOrd = missingOrd;
-      }
-      if (bottomSameReader) {
-        // ord is precisely comparable, even in the equal case
-        return bottomOrd - docOrd;
-      } else if (bottomOrd >= docOrd) {
-        // the equals case always means bottom is > doc
-        // (because we set bottomOrd to the lower bound in
-        // setBottom):
-        return 1;
-      } else {
-        return -1;
-      }
-    }
-
-    @Override
-    public void copy(int slot, int doc) throws IOException {
-      int ord = getOrdForDoc(doc);
-      if (ord == -1) {
-        ord = missingOrd;
-        values[slot] = null;
-      } else {
-        assert ord >= 0;
-        if (tempBRs[slot] == null) {
-          tempBRs[slot] = new BytesRefBuilder();
-        }
-        tempBRs[slot].copyBytes(termsIndex.lookupOrd(ord));
-        values[slot] = tempBRs[slot].get();
-      }
-      ords[slot] = ord;
-      readerGen[slot] = currentReaderGen;
-    }
-
-    /** Retrieves the SortedDocValues for the field in this segment */
-    protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
-        throws IOException {
-      return DocValues.getSorted(context.reader(), field);
-    }
-
-    @Override
-    public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
-      termsIndex = getSortedDocValues(context, field);
-      currentReaderGen++;
-
-      if (topValue != null) {
-        // Recompute topOrd/SameReader
-        int ord = termsIndex.lookupTerm(topValue);
-        if (ord >= 0) {
-          topSameReader = true;
-          topOrd = ord;
-        } else {
-          topSameReader = false;
-          topOrd = -ord - 2;
-        }
-      } else {
-        topOrd = missingOrd;
-        topSameReader = true;
-      }
-      // System.out.println("  getLeafComparator topOrd=" + topOrd + " topSameReader=" +
-      // topSameReader);
-
-      if (bottomSlot != -1) {
-        // Recompute bottomOrd/SameReader
-        setBottom(bottomSlot);
-      }
-
-      return this;
-    }
-
-    @Override
-    public void setBottom(final int bottom) throws IOException {
-      bottomSlot = bottom;
-
-      bottomValue = values[bottomSlot];
-      if (currentReaderGen == readerGen[bottomSlot]) {
-        bottomOrd = ords[bottomSlot];
-        bottomSameReader = true;
-      } else {
-        if (bottomValue == null) {
-          // missingOrd is null for all segments
-          assert ords[bottomSlot] == missingOrd;
-          bottomOrd = missingOrd;
-          bottomSameReader = true;
-          readerGen[bottomSlot] = currentReaderGen;
-        } else {
-          final int ord = termsIndex.lookupTerm(bottomValue);
-          if (ord < 0) {
-            bottomOrd = -ord - 2;
-            bottomSameReader = false;
-          } else {
-            bottomOrd = ord;
-            // exact value match
-            bottomSameReader = true;
-            readerGen[bottomSlot] = currentReaderGen;
-            ords[bottomSlot] = bottomOrd;
-          }
-        }
-      }
-    }
-
-    @Override
-    public void setTopValue(BytesRef value) {
-      // null is fine: it means the last doc of the prior
-      // search was missing this value
-      topValue = value;
-      // System.out.println("setTopValue " + topValue);
-    }
-
-    @Override
-    public BytesRef value(int slot) {
-      return values[slot];
-    }
-
-    @Override
-    public int compareTop(int doc) throws IOException {
-
-      int ord = getOrdForDoc(doc);
-      if (ord == -1) {
-        ord = missingOrd;
-      }
-
-      if (topSameReader) {
-        // ord is precisely comparable, even in the equal
-        // case
-        // System.out.println("compareTop doc=" + doc + " ord=" + ord + " ret=" + (topOrd-ord));
-        return topOrd - ord;
-      } else if (ord <= topOrd) {
-        // the equals case always means doc is < value
-        // (because we set lastOrd to the lower bound)
-        return 1;
-      } else {
-        return -1;
-      }
-    }
-
-    @Override
-    public int compareValues(BytesRef val1, BytesRef val2) {
-      if (val1 == null) {
-        if (val2 == null) {
-          return 0;
-        }
-        return missingSortCmp;
-      } else if (val2 == null) {
-        return -missingSortCmp;
-      }
-      return val1.compareTo(val2);
-    }
-
-    @Override
-    public void setScorer(Scorable scorer) {}
-  }
-
  /**
   * Sorts by field's natural Term sort order. All comparisons are done using BytesRef.compareTo,
   * which is slow for medium to large result sets but possibly very fast for very small results
--- a/lucene/core/src/java/org/apache/lucene/search/SortField.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SortField.java
@ -31,6 +31,7 @@ import org.apache.lucene.search.comparators.DoubleComparator;
 import org.apache.lucene.search.comparators.FloatComparator;
 import org.apache.lucene.search.comparators.IntComparator;
 import org.apache.lucene.search.comparators.LongComparator;
+import org.apache.lucene.search.comparators.TermOrdValComparator;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.BytesRef;
@ -536,8 +537,7 @@ public class SortField {
        break;

      case STRING:
-        return new FieldComparator.TermOrdValComparator(
-            numHits, field, missingValue == STRING_LAST);
+        return new TermOrdValComparator(numHits, field, missingValue == STRING_LAST, reverse);

      case STRING_VAL:
        fieldComparator =
--- a/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java
+++ b/lucene/core/src/java/org/apache/lucene/search/SortedSetSortField.java
@ -24,6 +24,7 @@ import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.SortFieldProvider;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.search.comparators.TermOrdValComparator;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;

@ -178,8 +179,7 @@ public class SortedSetSortField extends SortField {

  @Override
  public FieldComparator<?> getComparator(int numHits, boolean enableSkipping) {
-    return new FieldComparator.TermOrdValComparator(
-        numHits, getField(), missingValue == STRING_LAST) {
+    return new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, reverse) {
      @Override
      protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
          throws IOException {
--- a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java
+++ b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java
@ -0,0 +1,608 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.comparators;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.FieldComparator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.LeafFieldComparator;
+import org.apache.lucene.search.Scorable;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Sorts by field's natural Term sort order, using ordinals. This is functionally equivalent to
+ * {@link org.apache.lucene.search.FieldComparator.TermValComparator}, but it first resolves the
+ * strings to their relative ordinal positions (using the index returned by {@link
+ * org.apache.lucene.index.LeafReader#getSortedDocValues(String)}), and does most comparisons using
+ * the ordinals. For medium to large results, this comparator will be much faster than {@link
+ * org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets it may be
+ * slower.
+ */
+public class TermOrdValComparator extends FieldComparator<BytesRef> {
+
+  /* Ords for each slot.
+  @lucene.internal */
+  final int[] ords;
+
+  /* Values for each slot.
+  @lucene.internal */
+  final BytesRef[] values;
+  private final BytesRefBuilder[] tempBRs;
+
+  /* Which reader last copied a value into the slot. When
+  we compare two slots, we just compare-by-ord if the
+  readerGen is the same; else we must compare the
+  values (slower).
+  @lucene.internal */
+  final int[] readerGen;
+
+  /* Gen of current reader we are on.
+  @lucene.internal */
+  int currentReaderGen = -1;
+
+  private final String field;
+  private final boolean reverse;
+  private final boolean sortMissingLast;
+
+  /* Bottom value (same as values[bottomSlot] once
+   bottomSlot is set).  Cached for faster compares.
+  @lucene.internal */
+  BytesRef bottomValue;
+
+  /* Bottom slot, or -1 if queue isn't full yet */
+  int bottomSlot = -1;
+
+  /** Set by setTopValue. */
+  BytesRef topValue;
+
+  /** -1 if missing values are sorted first, 1 if they are sorted last */
+  final int missingSortCmp;
+
+  /** Whether this is the only comparator. */
+  private boolean singleSort;
+
+  /** Whether this comparator is allowed to skip documents. */
+  private boolean canSkipDocuments = true;
+
+  /** Whether the collector is done with counting hits so that we can start skipping documents. */
+  private boolean hitsThresholdReached = false;
+
+  /**
+   * Creates a comparator over {@code field} with room for {@code numHits} slots.
+   *
+   * @param numHits number of slots to allocate for ordinals, values and reader generations
+   * @param field name of the field whose sorted doc values are compared
+   * @param sortMissingLast {@code true} to put documents without a value at the end, {@code
+   *     false} to put them first
+   * @param reverse whether the sort is reversed; only consulted when deciding which documents
+   *     may be skipped
+   */
+  public TermOrdValComparator(int numHits, String field, boolean sortMissingLast, boolean reverse) {
+    this.field = field;
+    this.reverse = reverse;
+    this.sortMissingLast = sortMissingLast;
+    this.missingSortCmp = sortMissingLast ? 1 : -1;
+    this.ords = new int[numHits];
+    this.readerGen = new int[numHits];
+    this.values = new BytesRef[numHits];
+    this.tempBRs = new BytesRefBuilder[numHits];
+  }
+
+  /**
+   * Forbids document skipping: leaf comparators created after this call do not build a
+   * competitive iterator.
+   */
+  @Override
+  public void disableSkipping() {
+    canSkipDocuments = false;
+  }
+
+  /**
+   * Records that this is the only comparator of the sort, which lets the pruning logic also
+   * exclude documents that merely tie with the bottom value.
+   */
+  @Override
+  public void setSingleSort() {
+    singleSort = true;
+  }
+
+  /**
+   * Compares two slots, by ordinal when both were filled from the same reader generation and by
+   * value otherwise.
+   */
+  @Override
+  public int compare(int slot1, int slot2) {
+    if (readerGen[slot1] != readerGen[slot2]) {
+      // Ordinals from different generations are not comparable: compare the materialized
+      // values instead, where null stands for a missing value.
+      final BytesRef left = values[slot1];
+      final BytesRef right = values[slot2];
+      if (left != null && right != null) {
+        return left.compareTo(right);
+      }
+      if (left == right) {
+        // both are missing
+        return 0;
+      }
+      return left == null ? missingSortCmp : -missingSortCmp;
+    }
+    return ords[slot1] - ords[slot2];
+  }
+
+  /**
+   * Retrieves the {@link SortedDocValues} of {@code field} in this segment. Subclasses may
+   * override this to supply a different per-segment view of the field.
+   */
+  protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
+      throws IOException {
+    return DocValues.getSorted(context.reader(), field);
+  }
+
+  /**
+   * Starts a new reader generation and returns a leaf comparator bound to {@code context}.
+   *
+   * <p>The generation counter is incremented before the leaf comparator is built because its
+   * constructor may call {@code setBottom}, which compares slot generations with the current one.
+   */
+  @Override
+  public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
+    currentReaderGen++;
+    return new TermOrdValLeafComparator(context, getSortedDocValues(context, field));
+  }
+
+  /**
+   * Stores the top value; leaf comparators resolve it to an ordinal when they are constructed.
+   *
+   * @param value the top value, may be {@code null}
+   */
+  @Override
+  public void setTopValue(BytesRef value) {
+    // null is fine: it means the last doc of the prior
+    // search was missing this value
+    topValue = value;
+  }
+
+  /**
+   * Returns the value held by {@code slot}, or {@code null} if the slot holds no value (for
+   * instance because the copied document was missing one).
+   */
+  @Override
+  public BytesRef value(int slot) {
+    return values[slot];
+  }
+
+  /**
+   * Compares two values, treating {@code null} as a missing value that sorts first or last
+   * depending on {@code missingSortCmp}.
+   */
+  @Override
+  public int compareValues(BytesRef val1, BytesRef val2) {
+    if (val1 != null && val2 != null) {
+      return val1.compareTo(val2);
+    }
+    if (val1 == val2) {
+      // both are missing
+      return 0;
+    }
+    return val1 == null ? missingSortCmp : -missingSortCmp;
+  }
+
+  private class TermOrdValLeafComparator implements LeafFieldComparator {
+
+    /* Current reader's doc ord/values. */
+    final SortedDocValues termsIndex;
+
+    /* True if current bottom slot matches the current reader. */
+    boolean bottomSameReader;
+
+    /* Bottom ord (same as ords[bottomSlot] once bottomSlot is set).  Cached for faster compares. */
+    int bottomOrd;
+
+    final boolean topSameReader;
+    final int topOrd;
+
+    /** Which ordinal to use for a missing value. */
+    final int missingOrd;
+
+    private final CompetitiveIterator competitiveIterator;
+
+    private final boolean dense;
+
+    /**
+     * Binds this leaf comparator to one segment: resolves the top value and the current bottom
+     * slot against the segment's ordinals, then decides whether documents may be skipped.
+     *
+     * @param context the segment being collected
+     * @param values the sorted doc values of the sort field in that segment
+     * @throws IllegalStateException if the doc values have terms but the field is absent from the
+     *     segment's field infos
+     */
+    TermOrdValLeafComparator(LeafReaderContext context, SortedDocValues values) throws IOException {
+      termsIndex = values;
+
+      if (sortMissingLast) {
+        missingOrd = Integer.MAX_VALUE;
+      } else {
+        missingOrd = -1;
+      }
+
+      if (topValue != null) {
+        // Recompute topOrd/SameReader
+        int ord = termsIndex.lookupTerm(topValue);
+        if (ord >= 0) {
+          topSameReader = true;
+          topOrd = ord;
+        } else {
+          // The top value is absent from this segment: keep the ordinal just before its
+          // insertion point as a lower bound.
+          topSameReader = false;
+          topOrd = -ord - 2;
+        }
+      } else {
+        topOrd = missingOrd;
+        topSameReader = true;
+      }
+
+      if (bottomSlot != -1) {
+        // Recompute bottomOrd/SameReader. The competitive iterator is not assigned yet, so the
+        // update that setBottom triggers is a no-op; the call at the end of this constructor
+        // applies the bounds.
+        setBottom(bottomSlot);
+      }
+
+      final boolean enableSkipping;
+      if (canSkipDocuments == false) {
+        dense = false;
+        enableSkipping = false;
+      } else {
+        FieldInfo fieldInfo = context.reader().getFieldInfos().fieldInfo(field);
+        if (fieldInfo == null) {
+          if (termsIndex.getValueCount() != 0) {
+            throw new IllegalStateException("Field [" + field + "] cannot be found in field infos");
+          }
+          dense = false;
+          enableSkipping = true;
+        } else if (fieldInfo.getIndexOptions() == IndexOptions.NONE) {
+          // No terms index
+          dense = false;
+          enableSkipping = false;
+        } else {
+          // terms(field) returns null when the segment has no postings for the field even
+          // though the field is declared as indexed; such a segment cannot be dense.
+          Terms terms = context.reader().terms(field);
+          dense = terms != null && terms.getDocCount() == context.reader().maxDoc();
+          if (dense || topValue != null) {
+            enableSkipping = true;
+          } else if (reverse == sortMissingLast) {
+            // Missing values are always competitive, we can never skip
+            enableSkipping = false;
+          } else {
+            enableSkipping = true;
+          }
+        }
+      }
+      if (enableSkipping) {
+        competitiveIterator = new CompetitiveIterator(context, field, dense, values.termsEnum());
+      } else {
+        competitiveIterator = null;
+      }
+      updateCompetitiveIterator();
+    }
+
+    /** Returns the ordinal of {@code doc} in this segment, or -1 if the document has no value. */
+    private int getOrdForDoc(int doc) throws IOException {
+      return termsIndex.advanceExact(doc) ? termsIndex.ordValue() : -1;
+    }
+
+    /**
+     * Records that the hits threshold has been reached and refreshes the competitive iterator,
+     * which is only ever updated once this flag is set.
+     */
+    @Override
+    public void setHitsThresholdReached() throws IOException {
+      hitsThresholdReached = true;
+      updateCompetitiveIterator();
+    }
+
+    /** Compares the bottom of the queue with {@code doc}, using ordinals of this segment. */
+    @Override
+    public int compareBottom(int doc) throws IOException {
+      assert bottomSlot != -1;
+      final int ord = getOrdForDoc(doc);
+      final int docOrd = ord == -1 ? missingOrd : ord;
+      if (bottomSameReader == false) {
+        // bottomOrd is only the lower bound computed in setBottom, so the equals case
+        // always means bottom is > doc.
+        return bottomOrd >= docOrd ? 1 : -1;
+      }
+      // ord is precisely comparable, even in the equal case
+      return bottomOrd - docOrd;
+    }
+
+    /**
+     * Copies the ordinal and value of {@code doc} into {@code slot} and tags the slot with the
+     * current reader generation.
+     */
+    @Override
+    public void copy(int slot, int doc) throws IOException {
+      final int docOrd = getOrdForDoc(doc);
+      if (docOrd != -1) {
+        assert docOrd >= 0;
+        // Reuse the slot's scratch builder across copies.
+        BytesRefBuilder scratch = tempBRs[slot];
+        if (scratch == null) {
+          scratch = new BytesRefBuilder();
+          tempBRs[slot] = scratch;
+        }
+        scratch.copyBytes(termsIndex.lookupOrd(docOrd));
+        values[slot] = scratch.get();
+        ords[slot] = docOrd;
+      } else {
+        // Missing value: no bytes to keep, only the sentinel ordinal.
+        values[slot] = null;
+        ords[slot] = missingOrd;
+      }
+      readerGen[slot] = currentReaderGen;
+    }
+
+    /**
+     * Sets the bottom slot and translates it into an ordinal of the current segment, then
+     * refreshes the competitive iterator.
+     *
+     * <p>When the bottom value does not exist in this segment, {@code bottomOrd} is set to the
+     * ordinal just before its insertion point and {@code bottomSameReader} is {@code false}.
+     */
+    @Override
+    public void setBottom(final int bottom) throws IOException {
+      bottomSlot = bottom;
+
+      bottomValue = values[bottomSlot];
+      if (currentReaderGen == readerGen[bottomSlot]) {
+        bottomOrd = ords[bottomSlot];
+        bottomSameReader = true;
+      } else {
+        if (bottomValue == null) {
+          // A missing value is represented by missingOrd in every segment, so the slot can be
+          // treated as if it had been filled from the current reader.
+          assert ords[bottomSlot] == missingOrd;
+          bottomOrd = missingOrd;
+          bottomSameReader = true;
+          readerGen[bottomSlot] = currentReaderGen;
+        } else {
+          final int ord = termsIndex.lookupTerm(bottomValue);
+          if (ord < 0) {
+            // Not found: use the ordinal just before the insertion point as a lower bound.
+            bottomOrd = -ord - 2;
+            bottomSameReader = false;
+          } else {
+            bottomOrd = ord;
+            // exact value match
+            bottomSameReader = true;
+            readerGen[bottomSlot] = currentReaderGen;
+            ords[bottomSlot] = bottomOrd;
+          }
+        }
+      }
+
+      updateCompetitiveIterator();
+    }
+
+    /** Compares the top value with {@code doc}, using ordinals of this segment. */
+    @Override
+    public int compareTop(int doc) throws IOException {
+      final int rawOrd = getOrdForDoc(doc);
+      final int docOrd = rawOrd == -1 ? missingOrd : rawOrd;
+      if (topSameReader == false) {
+        // topOrd is only a lower bound here, so the equals case always means doc is < value.
+        return docOrd <= topOrd ? 1 : -1;
+      }
+      // ord is precisely comparable, even in the equal case
+      return topOrd - docOrd;
+    }
+
+    /** No-op: this comparator does not use the scorer. */
+    @Override
+    public void setScorer(Scorable scorer) {}
+
+    /**
+     * Recomputes the inclusive range {@code [minOrd, maxOrd]} of ordinals that can still be
+     * competitive and hands it to the competitive iterator.
+     *
+     * <p>Does nothing while skipping is disabled for this segment, the hits threshold has not been
+     * reached, the queue has no bottom yet, or missing values are still competitive.
+     */
+    private void updateCompetitiveIterator() throws IOException {
+      if (competitiveIterator == null || hitsThresholdReached == false || bottomSlot == -1) {
+        return;
+      }
+      // This logic to figure out min and max ords is quite complex and verbose, can it be made
+      // simpler?
+      final int minOrd;
+      final int maxOrd;
+      if (reverse == false) {
+
+        // Ascending order: the top value bounds the range from below, the bottom from above.
+        if (topValue != null) {
+          if (topSameReader) {
+            minOrd = topOrd;
+          } else {
+            // In the case when the top value doesn't exist in the segment, topOrd is set as the
+            // previous ord, and we are only interested in values that compare strictly greater than
+            // this.
+            minOrd = topOrd + 1;
+          }
+        } else if (sortMissingLast || dense) {
+          minOrd = 0;
+        } else {
+          // Missing values are still competitive.
+          minOrd = -1;
+        }
+
+        if (bottomOrd == Integer.MAX_VALUE) {
+          // The queue still contains missing values.
+          if (singleSort) {
+            // If there is no tie breaker, we can start ignoring missing values from now on.
+            maxOrd = termsIndex.getValueCount() - 1;
+          } else {
+            maxOrd = Integer.MAX_VALUE;
+          }
+        } else if (bottomSameReader) {
+          // If there is no tie breaker, we can start ignoring values that compare equal to the
+          // current bottom value too.
+          maxOrd = singleSort ? bottomOrd - 1 : bottomOrd;
+        } else {
+          maxOrd = bottomOrd;
+        }
+
+      } else {
+
+        // Descending order: the bottom value bounds the range from below, the top from above.
+        if (bottomOrd == -1) {
+          // The queue still contains missing values.
+          if (singleSort) {
+            // If there is no tie breaker, we can start ignoring missing values from now on.
+            minOrd = 0;
+          } else {
+            minOrd = -1;
+          }
+        } else if (bottomSameReader) {
+          // If there is no tie breaker, we can start ignoring values that compare equal to the
+          // current bottom value too.
+          minOrd = singleSort ? bottomOrd + 1 : bottomOrd;
+        } else {
+          minOrd = bottomOrd + 1;
+        }
+
+        if (topValue != null) {
+          maxOrd = topOrd;
+        } else if (sortMissingLast == false || dense) {
+          maxOrd = termsIndex.getValueCount() - 1;
+        } else {
+          maxOrd = Integer.MAX_VALUE;
+        }
+      }
+
+      if (minOrd == -1 || maxOrd == Integer.MAX_VALUE) {
+        // Missing values are still competitive, we can't skip yet.
+        return;
+      }
+      assert minOrd >= 0;
+      assert maxOrd < termsIndex.getValueCount();
+      competitiveIterator.update(minOrd, maxOrd);
+    }
+
+    /**
+     * Returns the iterator over documents that may still be competitive, or {@code null} when
+     * skipping is not enabled for this segment.
+     */
+    @Override
+    public DocIdSetIterator competitiveIterator() {
+      return competitiveIterator;
+    }
+  }
+
+  /** Pairs the postings of one term with the doc-values ordinal of that term. */
+  private static class PostingsEnumAndOrd {
+    // Postings of the term in the inverted index.
+    private final PostingsEnum postings;
+    // Ordinal of the same term in the segment's sorted doc values.
+    private final int ord;
+
+    PostingsEnumAndOrd(PostingsEnum postings, int ord) {
+      this.postings = postings;
+      this.ord = ord;
+    }
+  }
+
+  private class CompetitiveIterator extends DocIdSetIterator {
+
+    private static final int MAX_TERMS = 128;
+
+    private final LeafReaderContext context;
+    private final int maxDoc;
+    private final String field;
+    private final boolean dense;
+    private final TermsEnum docValuesTerms;
+    private int doc = -1;
+    private ArrayDeque<PostingsEnumAndOrd> postings;
+    private DocIdSetIterator docsWithField;
+    private PriorityQueue<PostingsEnumAndOrd> disjunction;
+
+    /**
+     * Creates an iterator that initially matches every document of the segment; it only starts
+     * filtering once {@code update} has been called.
+     *
+     * @param context the segment to iterate over
+     * @param field the sort field
+     * @param dense whether every document of the segment has a term for the field
+     * @param docValuesTerms terms enum over the doc values, used to resolve ordinals to terms
+     */
+    CompetitiveIterator(
+        LeafReaderContext context, String field, boolean dense, TermsEnum docValuesTerms) {
+      this.context = context;
+      this.maxDoc = context.reader().maxDoc();
+      this.field = field;
+      this.dense = dense;
+      this.docValuesTerms = docValuesTerms;
+    }
+
+    /** Returns the current document, -1 before the first call to {@code advance}. */
+    @Override
+    public int docID() {
+      return doc;
+    }
+
+    /** Moves to the next potentially competitive document by advancing past the current one. */
+    @Override
+    public int nextDoc() throws IOException {
+      return advance(docID() + 1);
+    }
+
+    /** Advances to the first potentially competitive document at or after {@code target}. */
+    @Override
+    public int advance(int target) throws IOException {
+      if (target >= maxDoc) {
+        return doc = NO_MORE_DOCS;
+      }
+      if (disjunction != null) {
+        PostingsEnumAndOrd head = disjunction.top();
+        if (head == null) {
+          // priority queue is empty, none of the remaining documents are competitive
+          return doc = NO_MORE_DOCS;
+        }
+        // Move every postings enum that is behind target, restoring heap order after each move.
+        for (; head.postings.docID() < target; head = disjunction.updateTop()) {
+          head.postings.advance(target);
+        }
+        return doc = head.postings.docID();
+      }
+      if (docsWithField == null) {
+        // We haven't started skipping yet
+        return doc = target;
+      }
+      // The field is sparse and we're only interested in documents that have a value.
+      assert dense == false;
+      return doc = docsWithField.advance(target);
+    }
+
+    /** Returns the segment's {@code maxDoc}, captured when this iterator was created. */
+    @Override
+    public long cost() {
+      return maxDoc;
+    }
+
+    /**
+     * Update this iterator to only match postings whose term has an ordinal between {@code minOrd}
+     * included and {@code maxOrd} included.
+     *
+     * <p>While the range holds more terms than the allowed maximum, no disjunction is built; a
+     * sparse field is then only restricted to documents that have a value.
+     */
+    private void update(int minOrd, int maxOrd) throws IOException {
+      final int maxTerms = Math.min(MAX_TERMS, IndexSearcher.getMaxClauseCount());
+      final int size = Math.max(0, maxOrd - minOrd + 1);
+      if (size > maxTerms) {
+        // Too many terms for a disjunction of postings.
+        if (dense == false && docsWithField == null) {
+          docsWithField = getSortedDocValues(context, field);
+        }
+      } else if (postings == null) {
+        // First time the range is small enough: pull postings for every ordinal in it.
+        init(minOrd, maxOrd);
+      } else if (size < postings.size()) {
+        // One or more ords got removed
+        assert postings.isEmpty() || postings.getFirst().ord <= minOrd;
+        while (postings.isEmpty() == false && postings.getFirst().ord < minOrd) {
+          postings.removeFirst();
+        }
+        assert postings.isEmpty() || postings.getLast().ord >= maxOrd;
+        while (postings.isEmpty() == false && postings.getLast().ord > maxOrd) {
+          postings.removeLast();
+        }
+        // Rebuild the heap from the postings that remain in range.
+        disjunction.clear();
+        disjunction.addAll(postings);
+      }
+    }
+
+    /**
+     * For the first time, this iterator is allowed to skip documents. It needs to pull {@link
+     * PostingsEnum}s from the terms dictionary of the inverted index and create a priority queue
+     * out of them.
+     *
+     * @param minOrd smallest competitive ordinal, inclusive
+     * @param maxOrd largest competitive ordinal, inclusive; an empty range is allowed
+     * @throws IllegalStateException if the terms index does not contain the terms that the doc
+     *     values expose for the requested ordinals
+     */
+    private void init(int minOrd, int maxOrd) throws IOException {
+      final int size = Math.max(0, maxOrd - minOrd + 1);
+      postings = new ArrayDeque<>(size);
+      if (size > 0) {
+        docValuesTerms.seekExact(minOrd);
+        BytesRef minTerm = docValuesTerms.term();
+        // terms(field) returns null when the segment has no postings for the field; report the
+        // inconsistency explicitly instead of failing with a NullPointerException.
+        Terms indexedTerms = context.reader().terms(field);
+        if (indexedTerms == null) {
+          throw new IllegalStateException(
+              "Field [" + field + "] has terms in doc values but no terms index");
+        }
+        TermsEnum terms = indexedTerms.iterator();
+        if (terms.seekExact(minTerm) == false) {
+          throw new IllegalStateException(
+              "Term " + minTerm + " exists in doc values but not in the terms index");
+        }
+        postings.add(new PostingsEnumAndOrd(terms.postings(null, PostingsEnum.NONE), minOrd));
+        // Both dictionaries are sorted the same way, so walking the terms index forward visits
+        // the following ordinals in order.
+        for (int ord = minOrd + 1; ord <= maxOrd; ++ord) {
+          BytesRef next = terms.next();
+          if (next == null) {
+            throw new IllegalStateException(
+                "Terms index is exhausted before ordinal "
+                    + ord
+                    + " while doc values have at least "
+                    + (maxOrd + 1)
+                    + " terms");
+          }
+          assert docValuesTerms.seekExact(next) && docValuesTerms.ord() == ord;
+          postings.add(new PostingsEnumAndOrd(terms.postings(null, PostingsEnum.NONE), ord));
+        }
+      }
+      disjunction =
+          new PriorityQueue<PostingsEnumAndOrd>(size) {
+            @Override
+            protected boolean lessThan(PostingsEnumAndOrd a, PostingsEnumAndOrd b) {
+              return a.postings.docID() < b.postings.docID();
+            }
+          };
+      disjunction.addAll(postings);
+    }
+  }
+}
--- a/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestSortOptimization.java
@ -26,24 +26,39 @@ import java.util.Collections;
 import java.util.List;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.FloatDocValuesField;
 import org.apache.lucene.document.FloatPoint;
 import org.apache.lucene.document.IntPoint;
 import org.apache.lucene.document.IntRange;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedDocValuesField;
 import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.FilterDirectoryReader;
+import org.apache.lucene.index.FilterLeafReader;
+import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.PointValues;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.search.SortField.Type;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.index.RandomIndexWriter;
+import org.apache.lucene.tests.search.CheckHits;
 import org.apache.lucene.tests.util.LuceneTestCase;
 import org.apache.lucene.tests.util.TestUtil;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;

 public class TestSortOptimization extends LuceneTestCase {
@ -869,4 +884,254 @@ public class TestSortOptimization extends LuceneTestCase {
              + numDocs);
    }
  }
+
+  /**
+   * Indexes random values of {@code my_field} both as terms and as SORTED doc values, flushing
+   * periodically to get multiple segments, then runs the shared string sort checks.
+   */
+  public void testStringSortOptimization() throws IOException {
+    final Directory directory = newDirectory();
+    final IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig());
+    final int docCount = atLeast(10000);
+    int docId = 0;
+    while (docId < docCount) {
+      // The same value goes to the terms index and to doc values.
+      final BytesRef term = new BytesRef(Integer.toString(random().nextInt(1000)));
+      final Document document = new Document();
+      document.add(new StringField("my_field", term, Store.NO));
+      document.add(new SortedDocValuesField("my_field", term));
+      indexWriter.addDocument(document);
+      if (docId % 2000 == 0) {
+        indexWriter.flush(); // multiple segments
+      }
+      docId++;
+    }
+    final DirectoryReader indexReader = DirectoryReader.open(indexWriter);
+    indexWriter.close();
+    doTestStringSortOptimization(indexReader);
+    indexReader.close();
+    directory.close();
+  }
+
+  /**
+   * Same as {@link #testStringSortOptimization()} except that about half of the documents have no
+   * value for {@code my_field}, and that the first and the last documents, which have no value
+   * either, are isolated from the other documents by flushes.
+   */
+  public void testStringSortOptimizationWithMissingValues() throws IOException {
+    final Directory directory = newDirectory();
+    final IndexWriterConfig config = new IndexWriterConfig();
+    config.setMergePolicy(newLogMergePolicy());
+    final IndexWriter indexWriter = new IndexWriter(directory, config);
+    final int docCount = atLeast(10000);
+    // one segment with all values missing to start with
+    indexWriter.addDocument(new Document());
+    final int middleDocs = docCount - 2;
+    for (int docId = 0; docId < middleDocs; docId++) {
+      if (docId % 2000 == 0) {
+        indexWriter.flush(); // multiple segments
+      }
+      final Document document = new Document();
+      final boolean hasValue = random().nextInt(2) == 0;
+      if (hasValue) {
+        final BytesRef term = new BytesRef(Integer.toString(random().nextInt(1000)));
+        document.add(new StringField("my_field", term, Store.NO));
+        document.add(new SortedDocValuesField("my_field", term));
+      }
+      indexWriter.addDocument(document);
+    }
+    indexWriter.flush();
+    // And one empty segment with all values missing to finish with
+    indexWriter.addDocument(new Document());
+    final DirectoryReader indexReader = DirectoryReader.open(indexWriter);
+    indexWriter.close();
+    doTestStringSortOptimization(indexReader);
+    indexReader.close();
+    directory.close();
+  }
+
+  /**
+   * Runs string sorts on {@code my_field} in several configurations (ascending/descending,
+   * missing values first/last, with and without {@code after}, combined with a sort on score).
+   * Every configuration goes through {@link #assertSort}, which compares the hits with those of a
+   * reader whose inverted index is hidden. Configurations that call {@code
+   * assertNonCompetitiveHitsAreSkipped} additionally check the number of collected hits against
+   * the number of documents.
+   *
+   * @param reader the reader to search
+   */
+  private void doTestStringSortOptimization(DirectoryReader reader) throws IOException {
+    final int numDocs = reader.numDocs();
+    final int numHits = 5;
+
+    { // simple ascending sort
+      Sort sort = new Sort(stringSortField(false, SortField.STRING_LAST));
+      TopDocs topDocs = assertSort(reader, sort, numHits, null);
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
+    }
+
+    { // simple descending sort
+      Sort sort = new Sort(stringSortField(true, SortField.STRING_FIRST));
+      TopDocs topDocs = assertSort(reader, sort, numHits, null);
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
+    }
+
+    { // ascending sort that returns missing values first
+      Sort sort = new Sort(stringSortField(false, SortField.STRING_FIRST));
+      assertSort(reader, sort, numHits, null);
+    }
+
+    { // descending sort that returns missing values last
+      Sort sort = new Sort(stringSortField(true, SortField.STRING_LAST));
+      assertSort(reader, sort, numHits, null);
+    }
+
+    { // paging ascending sort with after
+      Sort sort = new Sort(stringSortField(false, SortField.STRING_LAST));
+      BytesRef afterValue = new BytesRef(random().nextBoolean() ? "23" : "230000000");
+      FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue});
+      TopDocs topDocs = assertSort(reader, sort, numHits, after);
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
+    }
+
+    { // paging descending sort with after
+      Sort sort = new Sort(stringSortField(true, SortField.STRING_FIRST));
+      BytesRef afterValue = new BytesRef(random().nextBoolean() ? "17" : "170000000");
+      FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue});
+      TopDocs topDocs = assertSort(reader, sort, numHits, after);
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
+    }
+
+    { // paging ascending sort with after that returns missing values first
+      Sort sort = new Sort(stringSortField(false, SortField.STRING_FIRST));
+      BytesRef afterValue = new BytesRef(random().nextBoolean() ? "23" : "230000000");
+      FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue});
+      TopDocs topDocs = assertSort(reader, sort, numHits, after);
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
+    }
+
+    { // paging descending sort with after that returns missing values last
+      Sort sort = new Sort(stringSortField(true, SortField.STRING_LAST));
+      BytesRef afterValue = new BytesRef(random().nextBoolean() ? "17" : "170000000");
+      FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue});
+      TopDocs topDocs = assertSort(reader, sort, numHits, after);
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
+    }
+
+    { // test that if there is the secondary sort on _score, hits are still skipped
+      Sort sort = new Sort(stringSortField(false, SortField.STRING_LAST), FIELD_SCORE);
+      TopDocs topDocs = assertSort(reader, sort, numHits, null);
+      assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
+    }
+
+    { // test that if string field is a secondary sort, no optimization is run
+      Sort sort = new Sort(FIELD_SCORE, stringSortField(false, SortField.STRING_LAST));
+      TopDocs topDocs = assertSort(reader, sort, numHits, null);
+      // all documents were collected => optimization was not run
+      assertEquals(numDocs, topDocs.totalHits.value);
+    }
+  }
+
+  /**
+   * Creates a {@link SortField.Type#STRING} sort field on {@code my_field}.
+   *
+   * @param reverse whether the sort order is reversed (descending)
+   * @param missingValue {@link SortField#STRING_FIRST} or {@link SortField#STRING_LAST}
+   */
+  private static SortField stringSortField(boolean reverse, Object missingValue) {
+    SortField sortField = new SortField("my_field", SortField.Type.STRING, reverse);
+    sortField.setMissingValue(missingValue);
+    return sortField;
+  }
+
+  /**
+   * Checks the hits for {@code sort}, then checks them again with an extra trailing sort on
+   * reverse doc ID appended to {@code sort}.
+   *
+   * @param reader the reader to search
+   * @param sort the sort under test
+   * @param n the number of hits to collect
+   * @param after the hit to page after, or {@code null} for the first page
+   * @return the top docs of the search that used {@code sort} as given
+   */
+  private TopDocs assertSort(DirectoryReader reader, Sort sort, int n, FieldDoc after)
+      throws IOException {
+    final TopDocs result = assertSearchHits(reader, sort, n, after);
+
+    // A secondary sort on reverse doc ID is the best way to catch bugs if the comparator filters
+    // too aggressively
+    final SortField[] givenFields = sort.getSort();
+    final SortField[] extendedFields = ArrayUtil.growExact(givenFields, givenFields.length + 1);
+    extendedFields[givenFields.length] = new SortField(null, Type.DOC, true);
+
+    final FieldDoc extendedAfter;
+    if (after == null) {
+      extendedAfter = null;
+    } else {
+      // The "after" hit needs one more value, for the appended doc ID sort.
+      final Object[] values = ArrayUtil.growExact(after.fields, after.fields.length + 1);
+      values[after.fields.length] = Integer.MAX_VALUE;
+      extendedAfter = new FieldDoc(after.doc, after.score, values);
+    }
+    assertSearchHits(reader, new Sort(extendedFields), n, extendedAfter);
+    return result;
+  }
+
+  /**
+   * Runs a match-all search sorted by {@code sort} against {@code reader}, and the same search
+   * against {@code reader} wrapped in a {@link NoIndexDirectoryReader}, and checks that both
+   * produce equal hits.
+   *
+   * @return the top docs of the search against the unwrapped {@code reader}
+   */
+  private TopDocs assertSearchHits(DirectoryReader reader, Sort sort, int n, FieldDoc after)
+      throws IOException {
+    // single threaded so totalhits is deterministic
+    final IndexSearcher indexedSearcher = newSearcher(reader, true, true, false);
+    final Query matchAll = new MatchAllDocsQuery();
+    final CollectorManager<TopFieldCollector, TopFieldDocs> collectorManager =
+        TopFieldCollector.createSharedManager(sort, n, after, n);
+    final TopDocs indexedHits = indexedSearcher.search(matchAll, collectorManager);
+
+    // Same search, but through a reader that exposes neither terms nor points.
+    final DirectoryReader noIndexReader = new NoIndexDirectoryReader(reader);
+    final IndexSearcher noIndexSearcher = newSearcher(noIndexReader, true, true, false);
+    final TopDocs noIndexHits = noIndexSearcher.search(matchAll, collectorManager);
+
+    CheckHits.checkEqual(matchAll, noIndexHits.scoreDocs, indexedHits.scoreDocs);
+    return indexedHits;
+  }
+
+  /** A directory reader whose leaves are all wrapped in {@link NoIndexLeafReader}. */
+  private static final class NoIndexDirectoryReader extends FilterDirectoryReader {
+
+    /** Wraps every sub reader in a {@link NoIndexLeafReader}. */
+    private static final class NoIndexSubReaderWrapper extends SubReaderWrapper {
+      @Override
+      public LeafReader wrap(LeafReader leaf) {
+        return new NoIndexLeafReader(leaf);
+      }
+    }
+
+    public NoIndexDirectoryReader(DirectoryReader in) throws IOException {
+      super(in, new NoIndexSubReaderWrapper());
+    }
+
+    /** Not supported by this test-only reader. */
+    @Override
+    protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    // NOTE(review): returning null presumably opts this reader out of caching; confirm
+    @Override
+    public CacheHelper getReaderCacheHelper() {
+      return null;
+    }
+  }
+
+  /**
+   * A leaf reader that returns {@code null} for terms and point values, and whose field infos
+   * report {@link IndexOptions#NONE} for every field while keeping the doc-values metadata.
+   */
+  private static final class NoIndexLeafReader extends FilterLeafReader {
+
+    NoIndexLeafReader(LeafReader in) {
+      super(in);
+    }
+
+    // NOTE(review): the null cache helpers presumably opt this reader out of caching; confirm
+    @Override
+    public CacheHelper getCoreCacheHelper() {
+      return null;
+    }
+
+    @Override
+    public CacheHelper getReaderCacheHelper() {
+      return null;
+    }
+
+    /** Always returns {@code null}, whatever the field. */
+    @Override
+    public Terms terms(String field) throws IOException {
+      return null;
+    }
+
+    /** Always returns {@code null}, whatever the field. */
+    @Override
+    public PointValues getPointValues(String field) throws IOException {
+      return null;
+    }
+
+    /**
+     * Returns a copy of the wrapped reader's field infos where each field keeps its name, number,
+     * doc-values type, doc-values generation, attributes and soft-deletes flag, and has
+     * everything else reset ({@code false} flags, {@link IndexOptions#NONE}, zeroed numeric
+     * settings).
+     */
+    @Override
+    public FieldInfos getFieldInfos() {
+      final FieldInfos wrappedInfos = super.getFieldInfos();
+      final FieldInfo[] strippedInfos = new FieldInfo[wrappedInfos.size()];
+      int slot = 0;
+      for (FieldInfo wrapped : wrappedInfos) {
+        strippedInfos[slot++] =
+            new FieldInfo(
+                wrapped.name,
+                wrapped.number,
+                false,
+                false,
+                false,
+                IndexOptions.NONE,
+                wrapped.getDocValuesType(),
+                wrapped.getDocValuesGen(),
+                wrapped.attributes(),
+                0,
+                0,
+                0,
+                0,
+                VectorSimilarityFunction.DOT_PRODUCT,
+                wrapped.isSoftDeletesField());
+      }
+      return new FieldInfos(strippedInfos);
+    }
+  }
 }
--- a/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java
+++ b/lucene/join/src/java/org/apache/lucene/search/join/ToParentBlockJoinSortField.java
@ -33,6 +33,7 @@ import org.apache.lucene.search.comparators.DoubleComparator;
 import org.apache.lucene.search.comparators.FloatComparator;
 import org.apache.lucene.search.comparators.IntComparator;
 import org.apache.lucene.search.comparators.LongComparator;
+import org.apache.lucene.search.comparators.TermOrdValComparator;
 import org.apache.lucene.util.BitSet;
 import org.apache.lucene.util.NumericUtils;

@ -134,23 +135,25 @@ public class ToParentBlockJoinSortField extends SortField {
  }

  private FieldComparator<?> getStringComparator(int numHits) {
-    return new FieldComparator.TermOrdValComparator(
-        numHits, getField(), missingValue == STRING_LAST) {
+    FieldComparator<?> cmp = // ordinal-based comparator whose doc values are overridden below
+        new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, getReverse()) {

-      @Override
-      protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
-          throws IOException {
-        SortedSetDocValues sortedSet = DocValues.getSortedSet(context.reader(), field);
-        final BlockJoinSelector.Type type =
-            order ? BlockJoinSelector.Type.MAX : BlockJoinSelector.Type.MIN;
-        final BitSet parents = parentFilter.getBitSet(context);
-        final BitSet children = childFilter.getBitSet(context);
-        if (children == null) {
-          return DocValues.emptySorted();
-        }
-        return BlockJoinSelector.wrap(sortedSet, type, parents, toIter(children));
-      }
-    };
+          @Override
+          protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
+              throws IOException {
+            SortedSetDocValues sortedSet = DocValues.getSortedSet(context.reader(), field);
+            final BlockJoinSelector.Type type = // MAX when `order` is set, MIN otherwise
+                order ? BlockJoinSelector.Type.MAX : BlockJoinSelector.Type.MIN;
+            final BitSet parents = parentFilter.getBitSet(context);
+            final BitSet children = childFilter.getBitSet(context);
+            if (children == null) { // no child bit set for this segment: use empty doc values
+              return DocValues.emptySorted();
+            }
+            return BlockJoinSelector.wrap(sortedSet, type, parents, toIter(children));
+          }
+        };
+    cmp.disableSkipping(); // NOTE(review): presumably opts out of postings-based pruning; confirm
+    return cmp;
  }

  private FieldComparator<?> getIntComparator(int numHits) {