Use radix sort to speed up the sorting of terms in TermInSetQuery (#12587)

This commit is contained in:
gf2121 2023-10-17 17:31:29 +08:00 committed by GitHub
parent 6dac2f7afc
commit c6e76d3e01
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 30 additions and 6 deletions

View File

@ -185,6 +185,8 @@ Optimizations
* GITHUB#12591: Use stable radix sort to speed up the sorting of update terms. (Guo Feng)
* GITHUB#12587: Use radix sort to speed up the sorting of terms in TermInSetQuery. (Guo Feng)
* GITHUB#12604: Estimate the block size of FST BytesStore in BlockTreeTermsWriter
to reduce GC load during indexing. (Guo Feng)

View File

@ -29,11 +29,12 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefComparator;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringSorter;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
@ -112,7 +113,23 @@ public class TermInSetQuery extends MultiTermQuery implements Accountable {
boolean sorted =
terms instanceof SortedSet && ((SortedSet<BytesRef>) terms).comparator() == null;
if (sorted == false) {
ArrayUtil.timSort(sortedTerms);
new StringSorter(BytesRefComparator.NATURAL) {
@Override
protected void get(BytesRefBuilder builder, BytesRef result, int i) {
  // Point result at the i-th term's existing byte range rather than copying it;
  // the builder argument is not used here.
  final BytesRef source = sortedTerms[i];
  result.bytes = source.bytes;
  result.offset = source.offset;
  result.length = source.length;
}
@Override
protected void swap(int i, int j) {
  // Exchange the two term references in place; the underlying bytes are not touched.
  final BytesRef displaced = sortedTerms[j];
  sortedTerms[j] = sortedTerms[i];
  sortedTerms[i] = displaced;
}
}.sort(0, sortedTerms.length);
}
PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
BytesRefBuilder previous = null;

View File

@ -20,8 +20,7 @@ import java.util.Arrays;
import java.util.Comparator;
/**
* Specialized {@link BytesRef} comparator that {@link
* FixedLengthBytesRefArray#iterator(Comparator)} has optimizations for.
* Specialized {@link BytesRef} comparator that {@link StringSorter} has optimizations for.
*
* @lucene.internal
*/

View File

@ -19,7 +19,13 @@ package org.apache.lucene.util;
import java.util.Comparator;
abstract class StringSorter extends Sorter {
/**
* A {@link BytesRef} sorter that tries to use an efficient radix sorter if {@link StringSorter#cmp}
* is a {@link BytesRefComparator}, and otherwise falls back to {@link StringSorter#fallbackSorter}.
*
* @lucene.internal
*/
public abstract class StringSorter extends Sorter {
private final Comparator<BytesRef> cmp;
protected final BytesRefBuilder scratch1 = new BytesRefBuilder();
@ -29,7 +35,7 @@ abstract class StringSorter extends Sorter {
protected final BytesRef scratchBytes2 = new BytesRef();
protected final BytesRef pivot = new BytesRef();
StringSorter(Comparator<BytesRef> cmp) {
protected StringSorter(Comparator<BytesRef> cmp) {
this.cmp = cmp;
}