LUCENE-10633: Dynamic pruning for sorting on SORTED(_SET) fields. (#1023)

This commit enables dynamic pruning for queries sorted on SORTED(_SET) fields
by using postings to filter competitive documents.
This commit is contained in:
Adrien Grand 2022-07-29 11:12:32 +02:00 committed by GitHub
parent e1d2005df4
commit eb7b7791ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 900 additions and 297 deletions

View File

@ -102,6 +102,10 @@ Optimizations
* GITHUB#1020: Support #scoreSupplier and small optimizations to DocValuesRewriteMethod. (Greg Miller)
* LUCENE-10633: Added support for dynamic pruning to queries sorted by a string
field that is indexed with terms and SORTED or SORTED_SET doc values.
(Adrien Grand)
Bug Fixes
---------------------
* LUCENE-10663: Fix KnnVectorQuery explain with multiple segments. (Shiming Li)

View File

@ -20,7 +20,6 @@ import java.io.IOException;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
@ -211,282 +210,6 @@ public abstract class FieldComparator<T> {
}
}
/**
* Sorts by field's natural Term sort order, using ordinals. This is functionally equivalent to
* {@link org.apache.lucene.search.FieldComparator.TermValComparator}, but it first resolves the
* string to their relative ordinal positions (using the index returned by {@link
* org.apache.lucene.index.LeafReader#getSortedDocValues(String)}), and does most comparisons
* using the ordinals. For medium to large results, this comparator will be much faster than
* {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets
* it may be slower.
*/
public static class TermOrdValComparator extends FieldComparator<BytesRef>
implements LeafFieldComparator {
/* Ords for each slot.
@lucene.internal */
final int[] ords;
/* Values for each slot.
@lucene.internal */
final BytesRef[] values;
private final BytesRefBuilder[] tempBRs;
/* Which reader last copied a value into the slot. When
we compare two slots, we just compare-by-ord if the
readerGen is the same; else we must compare the
values (slower).
@lucene.internal */
final int[] readerGen;
/* Gen of current reader we are on.
@lucene.internal */
int currentReaderGen = -1;
/* Current reader's doc ord/values.
@lucene.internal */
SortedDocValues termsIndex;
private final String field;
/* Bottom slot, or -1 if queue isn't full yet
@lucene.internal */
int bottomSlot = -1;
/* Bottom ord (same as ords[bottomSlot] once bottomSlot
is set). Cached for faster compares.
@lucene.internal */
int bottomOrd;
/* True if current bottom slot matches the current
reader.
@lucene.internal */
boolean bottomSameReader;
/* Bottom value (same as values[bottomSlot] once
bottomSlot is set). Cached for faster compares.
@lucene.internal */
BytesRef bottomValue;
/** Set by setTopValue. */
BytesRef topValue;
boolean topSameReader;
int topOrd;
/** -1 if missing values are sorted first, 1 if they are sorted last */
final int missingSortCmp;
/** Which ordinal to use for a missing value. */
final int missingOrd;
/** Creates this, sorting missing values first. */
public TermOrdValComparator(int numHits, String field) {
this(numHits, field, false);
}
/**
* Creates this, with control over how missing values are sorted. Pass sortMissingLast=true to
* put missing values at the end.
*/
public TermOrdValComparator(int numHits, String field, boolean sortMissingLast) {
ords = new int[numHits];
values = new BytesRef[numHits];
tempBRs = new BytesRefBuilder[numHits];
readerGen = new int[numHits];
this.field = field;
if (sortMissingLast) {
missingSortCmp = 1;
missingOrd = Integer.MAX_VALUE;
} else {
missingSortCmp = -1;
missingOrd = -1;
}
}
private int getOrdForDoc(int doc) throws IOException {
if (termsIndex.advanceExact(doc)) {
return termsIndex.ordValue();
} else {
return -1;
}
}
@Override
public int compare(int slot1, int slot2) {
if (readerGen[slot1] == readerGen[slot2]) {
return ords[slot1] - ords[slot2];
}
final BytesRef val1 = values[slot1];
final BytesRef val2 = values[slot2];
if (val1 == null) {
if (val2 == null) {
return 0;
}
return missingSortCmp;
} else if (val2 == null) {
return -missingSortCmp;
}
return val1.compareTo(val2);
}
@Override
public int compareBottom(int doc) throws IOException {
assert bottomSlot != -1;
int docOrd = getOrdForDoc(doc);
if (docOrd == -1) {
docOrd = missingOrd;
}
if (bottomSameReader) {
// ord is precisely comparable, even in the equal case
return bottomOrd - docOrd;
} else if (bottomOrd >= docOrd) {
// the equals case always means bottom is > doc
// (because we set bottomOrd to the lower bound in
// setBottom):
return 1;
} else {
return -1;
}
}
@Override
public void copy(int slot, int doc) throws IOException {
int ord = getOrdForDoc(doc);
if (ord == -1) {
ord = missingOrd;
values[slot] = null;
} else {
assert ord >= 0;
if (tempBRs[slot] == null) {
tempBRs[slot] = new BytesRefBuilder();
}
tempBRs[slot].copyBytes(termsIndex.lookupOrd(ord));
values[slot] = tempBRs[slot].get();
}
ords[slot] = ord;
readerGen[slot] = currentReaderGen;
}
/** Retrieves the SortedDocValues for the field in this segment */
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
throws IOException {
return DocValues.getSorted(context.reader(), field);
}
@Override
public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
termsIndex = getSortedDocValues(context, field);
currentReaderGen++;
if (topValue != null) {
// Recompute topOrd/SameReader
int ord = termsIndex.lookupTerm(topValue);
if (ord >= 0) {
topSameReader = true;
topOrd = ord;
} else {
topSameReader = false;
topOrd = -ord - 2;
}
} else {
topOrd = missingOrd;
topSameReader = true;
}
// System.out.println(" getLeafComparator topOrd=" + topOrd + " topSameReader=" +
// topSameReader);
if (bottomSlot != -1) {
// Recompute bottomOrd/SameReader
setBottom(bottomSlot);
}
return this;
}
@Override
public void setBottom(final int bottom) throws IOException {
bottomSlot = bottom;
bottomValue = values[bottomSlot];
if (currentReaderGen == readerGen[bottomSlot]) {
bottomOrd = ords[bottomSlot];
bottomSameReader = true;
} else {
if (bottomValue == null) {
// missingOrd is null for all segments
assert ords[bottomSlot] == missingOrd;
bottomOrd = missingOrd;
bottomSameReader = true;
readerGen[bottomSlot] = currentReaderGen;
} else {
final int ord = termsIndex.lookupTerm(bottomValue);
if (ord < 0) {
bottomOrd = -ord - 2;
bottomSameReader = false;
} else {
bottomOrd = ord;
// exact value match
bottomSameReader = true;
readerGen[bottomSlot] = currentReaderGen;
ords[bottomSlot] = bottomOrd;
}
}
}
}
@Override
public void setTopValue(BytesRef value) {
// null is fine: it means the last doc of the prior
// search was missing this value
topValue = value;
// System.out.println("setTopValue " + topValue);
}
@Override
public BytesRef value(int slot) {
return values[slot];
}
@Override
public int compareTop(int doc) throws IOException {
int ord = getOrdForDoc(doc);
if (ord == -1) {
ord = missingOrd;
}
if (topSameReader) {
// ord is precisely comparable, even in the equal
// case
// System.out.println("compareTop doc=" + doc + " ord=" + ord + " ret=" + (topOrd-ord));
return topOrd - ord;
} else if (ord <= topOrd) {
// the equals case always means doc is < value
// (because we set lastOrd to the lower bound)
return 1;
} else {
return -1;
}
}
@Override
public int compareValues(BytesRef val1, BytesRef val2) {
if (val1 == null) {
if (val2 == null) {
return 0;
}
return missingSortCmp;
} else if (val2 == null) {
return -missingSortCmp;
}
return val1.compareTo(val2);
}
@Override
public void setScorer(Scorable scorer) {}
}
/**
* Sorts by field's natural Term sort order. All comparisons are done using BytesRef.compareTo,
* which is slow for medium to large result sets but possibly very fast for very small results

View File

@ -31,6 +31,7 @@ import org.apache.lucene.search.comparators.DoubleComparator;
import org.apache.lucene.search.comparators.FloatComparator;
import org.apache.lucene.search.comparators.IntComparator;
import org.apache.lucene.search.comparators.LongComparator;
import org.apache.lucene.search.comparators.TermOrdValComparator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
@ -536,8 +537,7 @@ public class SortField {
break;
case STRING:
return new FieldComparator.TermOrdValComparator(
numHits, field, missingValue == STRING_LAST);
return new TermOrdValComparator(numHits, field, missingValue == STRING_LAST, reverse);
case STRING_VAL:
fieldComparator =

View File

@ -24,6 +24,7 @@ import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortFieldProvider;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.search.comparators.TermOrdValComparator;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
@ -178,8 +179,7 @@ public class SortedSetSortField extends SortField {
@Override
public FieldComparator<?> getComparator(int numHits, boolean enableSkipping) {
return new FieldComparator.TermOrdValComparator(
numHits, getField(), missingValue == STRING_LAST) {
return new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, reverse) {
@Override
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
throws IOException {

View File

@ -0,0 +1,608 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.comparators;
import java.io.IOException;
import java.util.ArrayDeque;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldComparator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafFieldComparator;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.PriorityQueue;
/**
* Sorts by field's natural Term sort order, using ordinals. This is functionally equivalent to
* {@link org.apache.lucene.search.FieldComparator.TermValComparator}, but it first resolves the
* string to their relative ordinal positions (using the index returned by {@link
* org.apache.lucene.index.LeafReader#getSortedDocValues(String)}), and does most comparisons using
* the ordinals. For medium to large results, this comparator will be much faster than {@link
* org.apache.lucene.search.FieldComparator.TermValComparator}. For very small result sets it may be
* slower.
*/
public class TermOrdValComparator extends FieldComparator<BytesRef> {
/* Ords for each slot.
@lucene.internal */
final int[] ords;
/* Values for each slot.
@lucene.internal */
final BytesRef[] values;
private final BytesRefBuilder[] tempBRs;
/* Which reader last copied a value into the slot. When
we compare two slots, we just compare-by-ord if the
readerGen is the same; else we must compare the
values (slower).
@lucene.internal */
final int[] readerGen;
/* Gen of current reader we are on.
@lucene.internal */
int currentReaderGen = -1;
private final String field;
private final boolean reverse;
private final boolean sortMissingLast;
/* Bottom value (same as values[bottomSlot] once
bottomSlot is set). Cached for faster compares.
@lucene.internal */
BytesRef bottomValue;
/* Bottom slot, or -1 if queue isn't full yet */
int bottomSlot = -1;
/** Set by setTopValue. */
BytesRef topValue;
/** -1 if missing values are sorted first, 1 if they are sorted last */
final int missingSortCmp;
/** Whether this is the only comparator. */
private boolean singleSort;
/** Whether this comparator is allowed to skip documents. */
private boolean canSkipDocuments = true;
/** Whether the collector is done with counting hits so that we can start skipping documents. */
private boolean hitsThresholdReached = false;
/**
* Creates this, with control over how missing values are sorted. Pass sortMissingLast=true to put
* missing values at the end.
*/
public TermOrdValComparator(int numHits, String field, boolean sortMissingLast, boolean reverse) {
ords = new int[numHits];
values = new BytesRef[numHits];
tempBRs = new BytesRefBuilder[numHits];
readerGen = new int[numHits];
this.field = field;
this.reverse = reverse;
this.sortMissingLast = sortMissingLast;
if (sortMissingLast) {
missingSortCmp = 1;
} else {
missingSortCmp = -1;
}
}
@Override
public void disableSkipping() {
canSkipDocuments = false;
}
@Override
public void setSingleSort() {
singleSort = true;
}
@Override
public int compare(int slot1, int slot2) {
if (readerGen[slot1] == readerGen[slot2]) {
return ords[slot1] - ords[slot2];
}
final BytesRef val1 = values[slot1];
final BytesRef val2 = values[slot2];
if (val1 == null) {
if (val2 == null) {
return 0;
}
return missingSortCmp;
} else if (val2 == null) {
return -missingSortCmp;
}
return val1.compareTo(val2);
}
/** Retrieves the SortedDocValues for the field in this segment */
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
throws IOException {
return DocValues.getSorted(context.reader(), field);
}
@Override
public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
currentReaderGen++;
return new TermOrdValLeafComparator(context, getSortedDocValues(context, field));
}
@Override
public void setTopValue(BytesRef value) {
// null is fine: it means the last doc of the prior
// search was missing this value
topValue = value;
// System.out.println("setTopValue " + topValue);
}
@Override
public BytesRef value(int slot) {
return values[slot];
}
@Override
public int compareValues(BytesRef val1, BytesRef val2) {
if (val1 == null) {
if (val2 == null) {
return 0;
}
return missingSortCmp;
} else if (val2 == null) {
return -missingSortCmp;
}
return val1.compareTo(val2);
}
private class TermOrdValLeafComparator implements LeafFieldComparator {
/* Current reader's doc ord/values. */
final SortedDocValues termsIndex;
/* True if current bottom slot matches the current reader. */
boolean bottomSameReader;
/* Bottom ord (same as ords[bottomSlot] once bottomSlot is set). Cached for faster compares. */
int bottomOrd;
final boolean topSameReader;
final int topOrd;
/** Which ordinal to use for a missing value. */
final int missingOrd;
private final CompetitiveIterator competitiveIterator;
private final boolean dense;
TermOrdValLeafComparator(LeafReaderContext context, SortedDocValues values) throws IOException {
termsIndex = values;
if (sortMissingLast) {
missingOrd = Integer.MAX_VALUE;
} else {
missingOrd = -1;
}
if (topValue != null) {
// Recompute topOrd/SameReader
int ord = termsIndex.lookupTerm(topValue);
if (ord >= 0) {
topSameReader = true;
topOrd = ord;
} else {
topSameReader = false;
topOrd = -ord - 2;
}
} else {
topOrd = missingOrd;
topSameReader = true;
}
// System.out.println(" getLeafComparator topOrd=" + topOrd + " topSameReader=" +
// topSameReader);
if (bottomSlot != -1) {
// Recompute bottomOrd/SameReader
setBottom(bottomSlot);
}
final boolean enableSkipping;
if (canSkipDocuments == false) {
dense = false;
enableSkipping = false;
} else {
FieldInfo fieldInfo = context.reader().getFieldInfos().fieldInfo(field);
if (fieldInfo == null) {
if (termsIndex.getValueCount() != 0) {
throw new IllegalStateException("Field [" + field + "] cannot be found in field infos");
}
dense = false;
enableSkipping = true;
} else if (fieldInfo.getIndexOptions() == IndexOptions.NONE) {
// No terms index
dense = false;
enableSkipping = false;
} else {
Terms terms = context.reader().terms(field);
dense = terms.getDocCount() == context.reader().maxDoc();
if (dense || topValue != null) {
enableSkipping = true;
} else if (reverse == sortMissingLast) {
// Missing values are always competitive, we can never skip
enableSkipping = false;
} else {
enableSkipping = true;
}
}
}
if (enableSkipping) {
competitiveIterator = new CompetitiveIterator(context, field, dense, values.termsEnum());
} else {
competitiveIterator = null;
}
updateCompetitiveIterator();
}
private int getOrdForDoc(int doc) throws IOException {
if (termsIndex.advanceExact(doc)) {
return termsIndex.ordValue();
} else {
return -1;
}
}
@Override
public void setHitsThresholdReached() throws IOException {
hitsThresholdReached = true;
updateCompetitiveIterator();
}
@Override
public int compareBottom(int doc) throws IOException {
assert bottomSlot != -1;
int docOrd = getOrdForDoc(doc);
if (docOrd == -1) {
docOrd = missingOrd;
}
if (bottomSameReader) {
// ord is precisely comparable, even in the equal case
return bottomOrd - docOrd;
} else if (bottomOrd >= docOrd) {
// the equals case always means bottom is > doc
// (because we set bottomOrd to the lower bound in
// setBottom):
return 1;
} else {
return -1;
}
}
@Override
public void copy(int slot, int doc) throws IOException {
int ord = getOrdForDoc(doc);
if (ord == -1) {
ord = missingOrd;
values[slot] = null;
} else {
assert ord >= 0;
if (tempBRs[slot] == null) {
tempBRs[slot] = new BytesRefBuilder();
}
tempBRs[slot].copyBytes(termsIndex.lookupOrd(ord));
values[slot] = tempBRs[slot].get();
}
ords[slot] = ord;
readerGen[slot] = currentReaderGen;
}
@Override
public void setBottom(final int bottom) throws IOException {
bottomSlot = bottom;
bottomValue = values[bottomSlot];
if (currentReaderGen == readerGen[bottomSlot]) {
bottomOrd = ords[bottomSlot];
bottomSameReader = true;
} else {
if (bottomValue == null) {
// missingOrd is null for all segments
assert ords[bottomSlot] == missingOrd;
bottomOrd = missingOrd;
bottomSameReader = true;
readerGen[bottomSlot] = currentReaderGen;
} else {
final int ord = termsIndex.lookupTerm(bottomValue);
if (ord < 0) {
bottomOrd = -ord - 2;
bottomSameReader = false;
} else {
bottomOrd = ord;
// exact value match
bottomSameReader = true;
readerGen[bottomSlot] = currentReaderGen;
ords[bottomSlot] = bottomOrd;
}
}
}
updateCompetitiveIterator();
}
@Override
public int compareTop(int doc) throws IOException {
int ord = getOrdForDoc(doc);
if (ord == -1) {
ord = missingOrd;
}
if (topSameReader) {
// ord is precisely comparable, even in the equal case
// System.out.println("compareTop doc=" + doc + " ord=" + ord + " ret=" + (topOrd-ord));
return topOrd - ord;
} else if (ord <= topOrd) {
// the equals case always means doc is < value
// (because we set topOrd to the lower bound)
return 1;
} else {
return -1;
}
}
@Override
public void setScorer(Scorable scorer) {}
private void updateCompetitiveIterator() throws IOException {
if (competitiveIterator == null || hitsThresholdReached == false || bottomSlot == -1) {
return;
}
// This logic to figure out min and max ords is quite complex and verbose, can it be made
// simpler?
final int minOrd;
final int maxOrd;
if (reverse == false) {
if (topValue != null) {
if (topSameReader) {
minOrd = topOrd;
} else {
// In the case when the top value doesn't exist in the segment, topOrd is set as the
// previous ord, and we are only interested in values that compare strictly greater than
// this.
minOrd = topOrd + 1;
}
} else if (sortMissingLast || dense) {
minOrd = 0;
} else {
// Missing values are still competitive.
minOrd = -1;
}
if (bottomOrd == Integer.MAX_VALUE) {
// The queue still contains missing values.
if (singleSort) {
// If there is no tie breaker, we can start ignoring missing values from now on.
maxOrd = termsIndex.getValueCount() - 1;
} else {
maxOrd = Integer.MAX_VALUE;
}
} else if (bottomSameReader) {
// If there is no tie breaker, we can start ignoring values that compare equal to the
// current top value too.
maxOrd = singleSort ? bottomOrd - 1 : bottomOrd;
} else {
maxOrd = bottomOrd;
}
} else {
if (bottomOrd == -1) {
// The queue still contains missing values.
if (singleSort) {
// If there is no tie breaker, we can start ignoring missing values from now on.
minOrd = 0;
} else {
minOrd = -1;
}
} else if (bottomSameReader) {
// If there is no tie breaker, we can start ignoring values that compare equal to the
// current top value too.
minOrd = singleSort ? bottomOrd + 1 : bottomOrd;
} else {
minOrd = bottomOrd + 1;
}
if (topValue != null) {
maxOrd = topOrd;
} else if (sortMissingLast == false || dense) {
maxOrd = termsIndex.getValueCount() - 1;
} else {
maxOrd = Integer.MAX_VALUE;
}
}
if (minOrd == -1 || maxOrd == Integer.MAX_VALUE) {
// Missing values are still competitive, we can't skip yet.
return;
}
assert minOrd >= 0;
assert maxOrd < termsIndex.getValueCount();
competitiveIterator.update(minOrd, maxOrd);
}
@Override
public DocIdSetIterator competitiveIterator() {
return competitiveIterator;
}
}
private static class PostingsEnumAndOrd {
private final PostingsEnum postings;
private final int ord;
PostingsEnumAndOrd(PostingsEnum postings, int ord) {
this.postings = postings;
this.ord = ord;
}
}
private class CompetitiveIterator extends DocIdSetIterator {
private static final int MAX_TERMS = 128;
private final LeafReaderContext context;
private final int maxDoc;
private final String field;
private final boolean dense;
private final TermsEnum docValuesTerms;
private int doc = -1;
private ArrayDeque<PostingsEnumAndOrd> postings;
private DocIdSetIterator docsWithField;
private PriorityQueue<PostingsEnumAndOrd> disjunction;
CompetitiveIterator(
LeafReaderContext context, String field, boolean dense, TermsEnum docValuesTerms) {
this.context = context;
this.maxDoc = context.reader().maxDoc();
this.field = field;
this.dense = dense;
this.docValuesTerms = docValuesTerms;
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
return advance(docID() + 1);
}
@Override
public int advance(int target) throws IOException {
if (target >= maxDoc) {
return doc = NO_MORE_DOCS;
} else if (disjunction == null) {
if (docsWithField != null) {
// The field is sparse and we're only interested in documents that have a value.
assert dense == false;
return doc = docsWithField.advance(target);
} else {
// We haven't started skipping yet
return doc = target;
}
} else {
PostingsEnumAndOrd top = disjunction.top();
if (top == null) {
// priority queue is empty, none of the remaining documents are competitive
return doc = NO_MORE_DOCS;
}
while (top.postings.docID() < target) {
top.postings.advance(target);
top = disjunction.updateTop();
}
return doc = top.postings.docID();
}
}
@Override
public long cost() {
return context.reader().maxDoc();
}
/**
* Update this iterator to only match postings whose term has an ordinal between {@code minOrd}
* included and {@code maxOrd} included.
*/
private void update(int minOrd, int maxOrd) throws IOException {
final int maxTerms = Math.min(MAX_TERMS, IndexSearcher.getMaxClauseCount());
final int size = Math.max(0, maxOrd - minOrd + 1);
if (size > maxTerms) {
if (dense == false && docsWithField == null) {
docsWithField = getSortedDocValues(context, field);
}
} else if (postings == null) {
init(minOrd, maxOrd);
} else if (size < postings.size()) {
// One or more ords got removed
assert postings.isEmpty() || postings.getFirst().ord <= minOrd;
while (postings.isEmpty() == false && postings.getFirst().ord < minOrd) {
postings.removeFirst();
}
assert postings.isEmpty() || postings.getLast().ord >= maxOrd;
while (postings.isEmpty() == false && postings.getLast().ord > maxOrd) {
postings.removeLast();
}
disjunction.clear();
disjunction.addAll(postings);
}
}
/**
* For the first time, this iterator is allowed to skip documents. It needs to pull {@link
* PostingsEnum}s from the terms dictionary of the inverted index and create a priority queue
* out of them.
*/
private void init(int minOrd, int maxOrd) throws IOException {
final int size = Math.max(0, maxOrd - minOrd + 1);
postings = new ArrayDeque<>(size);
if (size > 0) {
docValuesTerms.seekExact(minOrd);
BytesRef minTerm = docValuesTerms.term();
TermsEnum terms = context.reader().terms(field).iterator();
if (terms.seekExact(minTerm) == false) {
throw new IllegalStateException(
"Term " + minTerm + " exists in doc values but not in the terms index");
}
postings.add(new PostingsEnumAndOrd(terms.postings(null, PostingsEnum.NONE), minOrd));
for (int ord = minOrd + 1; ord <= maxOrd; ++ord) {
BytesRef next = terms.next();
if (next == null) {
throw new IllegalStateException(
"Terms have more than "
+ ord
+ " unique terms while doc values have exactly "
+ ord
+ " terms");
}
assert docValuesTerms.seekExact(next) && docValuesTerms.ord() == ord;
postings.add(new PostingsEnumAndOrd(terms.postings(null, PostingsEnum.NONE), ord));
}
}
disjunction =
new PriorityQueue<PostingsEnumAndOrd>(size) {
@Override
protected boolean lessThan(PostingsEnumAndOrd a, PostingsEnumAndOrd b) {
return a.postings.docID() < b.postings.docID();
}
};
disjunction.addAll(postings);
}
}
}

View File

@ -26,24 +26,39 @@ import java.util.Collections;
import java.util.List;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.FloatDocValuesField;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.document.IntRange;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedNumericDocValuesField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FilterDirectoryReader;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.SortField.Type;
import org.apache.lucene.store.Directory;
import org.apache.lucene.tests.index.RandomIndexWriter;
import org.apache.lucene.tests.search.CheckHits;
import org.apache.lucene.tests.util.LuceneTestCase;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
public class TestSortOptimization extends LuceneTestCase {
@ -869,4 +884,254 @@ public class TestSortOptimization extends LuceneTestCase {
+ numDocs);
}
}
public void testStringSortOptimization() throws IOException {
final Directory dir = newDirectory();
final IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig());
final int numDocs = atLeast(10000);
for (int i = 0; i < numDocs; ++i) {
final Document doc = new Document();
final BytesRef value = new BytesRef(Integer.toString(random().nextInt(1000)));
doc.add(new StringField("my_field", value, Store.NO));
doc.add(new SortedDocValuesField("my_field", value));
writer.addDocument(doc);
if (i % 2000 == 0) writer.flush(); // multiple segments
}
final DirectoryReader reader = DirectoryReader.open(writer);
writer.close();
doTestStringSortOptimization(reader);
reader.close();
dir.close();
}
public void testStringSortOptimizationWithMissingValues() throws IOException {
final Directory dir = newDirectory();
final IndexWriter writer =
new IndexWriter(dir, new IndexWriterConfig().setMergePolicy(newLogMergePolicy()));
final int numDocs = atLeast(10000);
// one segment with all values missing to start with
writer.addDocument(new Document());
for (int i = 0; i < numDocs - 2; ++i) {
if (i % 2000 == 0) writer.flush(); // multiple segments
final Document doc = new Document();
if (random().nextInt(2) == 0) {
final BytesRef value = new BytesRef(Integer.toString(random().nextInt(1000)));
doc.add(new StringField("my_field", value, Store.NO));
doc.add(new SortedDocValuesField("my_field", value));
}
writer.addDocument(doc);
}
writer.flush();
// And one empty segment with all values missing to finish with
writer.addDocument(new Document());
final DirectoryReader reader = DirectoryReader.open(writer);
writer.close();
doTestStringSortOptimization(reader);
reader.close();
dir.close();
}
private void doTestStringSortOptimization(DirectoryReader reader) throws IOException {
final int numDocs = reader.numDocs();
final int numHits = 5;
{ // simple ascending sort
SortField sortField = new SortField("my_field", SortField.Type.STRING);
sortField.setMissingValue(SortField.STRING_LAST);
Sort sort = new Sort(sortField);
TopDocs topDocs = assertSort(reader, sort, numHits, null);
assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
}
{ // simple descending sort
SortField sortField = new SortField("my_field", SortField.Type.STRING, true);
sortField.setMissingValue(SortField.STRING_FIRST);
Sort sort = new Sort(sortField);
TopDocs topDocs = assertSort(reader, sort, numHits, null);
assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
}
{ // ascending sort that returns missing values first
SortField sortField = new SortField("my_field", SortField.Type.STRING);
sortField.setMissingValue(SortField.STRING_FIRST);
Sort sort = new Sort(sortField);
assertSort(reader, sort, numHits, null);
}
{ // descending sort that returns missing values last
SortField sortField = new SortField("my_field", SortField.Type.STRING, true);
sortField.setMissingValue(SortField.STRING_LAST);
Sort sort = new Sort(sortField);
assertSort(reader, sort, numHits, null);
}
{ // paging ascending sort with after
SortField sortField = new SortField("my_field", SortField.Type.STRING);
sortField.setMissingValue(SortField.STRING_LAST);
Sort sort = new Sort(sortField);
BytesRef afterValue = new BytesRef(random().nextBoolean() ? "23" : "230000000");
FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue});
TopDocs topDocs = assertSort(reader, sort, numHits, after);
assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
}
{ // paging descending sort with after
SortField sortField = new SortField("my_field", SortField.Type.STRING, true);
sortField.setMissingValue(SortField.STRING_FIRST);
Sort sort = new Sort(sortField);
BytesRef afterValue = new BytesRef(random().nextBoolean() ? "17" : "170000000");
FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue});
TopDocs topDocs = assertSort(reader, sort, numHits, after);
assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
}
{ // paging ascending sort with after that returns missing values first
SortField sortField = new SortField("my_field", SortField.Type.STRING);
sortField.setMissingValue(SortField.STRING_FIRST);
Sort sort = new Sort(sortField);
BytesRef afterValue = new BytesRef(random().nextBoolean() ? "23" : "230000000");
FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue});
TopDocs topDocs = assertSort(reader, sort, numHits, after);
assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
}
{ // paging descending sort with after that returns missing values first
SortField sortField = new SortField("my_field", SortField.Type.STRING, true);
sortField.setMissingValue(SortField.STRING_LAST);
Sort sort = new Sort(sortField);
BytesRef afterValue = new BytesRef(random().nextBoolean() ? "17" : "170000000");
FieldDoc after = new FieldDoc(2, Float.NaN, new Object[] {afterValue});
TopDocs topDocs = assertSort(reader, sort, numHits, after);
assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
}
{ // test that if there is the secondary sort on _score, hits are still skipped
SortField sortField = new SortField("my_field", SortField.Type.STRING);
sortField.setMissingValue(SortField.STRING_LAST);
Sort sort = new Sort(sortField, FIELD_SCORE);
TopDocs topDocs = assertSort(reader, sort, numHits, null);
assertNonCompetitiveHitsAreSkipped(topDocs.totalHits.value, numDocs);
}
{ // test that if string field is a secondary sort, no optimization is run
SortField sortField = new SortField("my_field", SortField.Type.STRING);
sortField.setMissingValue(SortField.STRING_LAST);
Sort sort = new Sort(FIELD_SCORE, sortField);
TopDocs topDocs = assertSort(reader, sort, numHits, null);
assertEquals(
topDocs.totalHits.value,
numDocs); // assert that all documents were collected => optimization was not run
}
}
private TopDocs assertSort(DirectoryReader reader, Sort sort, int n, FieldDoc after)
throws IOException {
TopDocs topDocs = assertSearchHits(reader, sort, n, after);
SortField[] sortField2 = ArrayUtil.growExact(sort.getSort(), sort.getSort().length + 1);
// A secondary sort on reverse doc ID is the best way to catch bugs if the comparator filters
// too aggressively
sortField2[sortField2.length - 1] = new SortField(null, Type.DOC, true);
FieldDoc after2 = null;
if (after != null) {
Object[] afterFields2 = ArrayUtil.growExact(after.fields, after.fields.length + 1);
afterFields2[afterFields2.length - 1] = Integer.MAX_VALUE;
after2 = new FieldDoc(after.doc, after.score, afterFields2);
}
assertSearchHits(reader, new Sort(sortField2), n, after2);
return topDocs;
}
private TopDocs assertSearchHits(DirectoryReader reader, Sort sort, int n, FieldDoc after)
throws IOException {
// single threaded so totalhits is deterministic
IndexSearcher searcher = newSearcher(reader, true, true, false);
Query query = new MatchAllDocsQuery();
CollectorManager<TopFieldCollector, TopFieldDocs> manager =
TopFieldCollector.createSharedManager(sort, n, after, n);
TopDocs topDocs = searcher.search(query, manager);
IndexSearcher unoptimizedSearcher =
newSearcher(new NoIndexDirectoryReader(reader), true, true, false);
TopDocs unoptimizedTopDocs = unoptimizedSearcher.search(query, manager);
CheckHits.checkEqual(query, unoptimizedTopDocs.scoreDocs, topDocs.scoreDocs);
return topDocs;
}
private static final class NoIndexDirectoryReader extends FilterDirectoryReader {
public NoIndexDirectoryReader(DirectoryReader in) throws IOException {
super(
in,
new SubReaderWrapper() {
@Override
public LeafReader wrap(LeafReader reader) {
return new NoIndexLeafReader(reader);
}
});
}
@Override
protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
}
private static final class NoIndexLeafReader extends FilterLeafReader {
NoIndexLeafReader(LeafReader in) {
super(in);
}
@Override
public CacheHelper getCoreCacheHelper() {
return null;
}
@Override
public CacheHelper getReaderCacheHelper() {
return null;
}
@Override
public Terms terms(String field) throws IOException {
return null;
}
@Override
public PointValues getPointValues(String field) throws IOException {
return null;
}
@Override
public FieldInfos getFieldInfos() {
FieldInfo[] newInfos = new FieldInfo[super.getFieldInfos().size()];
int i = 0;
for (FieldInfo fi : super.getFieldInfos()) {
FieldInfo noIndexFI =
new FieldInfo(
fi.name,
fi.number,
false,
false,
false,
IndexOptions.NONE,
fi.getDocValuesType(),
fi.getDocValuesGen(),
fi.attributes(),
0,
0,
0,
0,
VectorSimilarityFunction.DOT_PRODUCT,
fi.isSoftDeletesField());
newInfos[i] = noIndexFI;
i++;
}
return new FieldInfos(newInfos);
}
}
}

View File

@ -33,6 +33,7 @@ import org.apache.lucene.search.comparators.DoubleComparator;
import org.apache.lucene.search.comparators.FloatComparator;
import org.apache.lucene.search.comparators.IntComparator;
import org.apache.lucene.search.comparators.LongComparator;
import org.apache.lucene.search.comparators.TermOrdValComparator;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.NumericUtils;
@ -134,23 +135,25 @@ public class ToParentBlockJoinSortField extends SortField {
}
private FieldComparator<?> getStringComparator(int numHits) {
return new FieldComparator.TermOrdValComparator(
numHits, getField(), missingValue == STRING_LAST) {
FieldComparator<?> cmp =
new TermOrdValComparator(numHits, getField(), missingValue == STRING_LAST, getReverse()) {
@Override
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
throws IOException {
SortedSetDocValues sortedSet = DocValues.getSortedSet(context.reader(), field);
final BlockJoinSelector.Type type =
order ? BlockJoinSelector.Type.MAX : BlockJoinSelector.Type.MIN;
final BitSet parents = parentFilter.getBitSet(context);
final BitSet children = childFilter.getBitSet(context);
if (children == null) {
return DocValues.emptySorted();
}
return BlockJoinSelector.wrap(sortedSet, type, parents, toIter(children));
}
};
@Override
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field)
throws IOException {
SortedSetDocValues sortedSet = DocValues.getSortedSet(context.reader(), field);
final BlockJoinSelector.Type type =
order ? BlockJoinSelector.Type.MAX : BlockJoinSelector.Type.MIN;
final BitSet parents = parentFilter.getBitSet(context);
final BitSet children = childFilter.getBitSet(context);
if (children == null) {
return DocValues.emptySorted();
}
return BlockJoinSelector.wrap(sortedSet, type, parents, toIter(children));
}
};
cmp.disableSkipping();
return cmp;
}
private FieldComparator<?> getIntComparator(int numHits) {