LUCENE-9068: Build FuzzyQuery automata up-front (#1042)

FuzzyTermsEnum can now either take an array of compiled automata and
an AttributeSource, to be used across multiple segments (eg during
FuzzyQuery rewrite); or it can take a term, edit distance, prefix length and
transpositions boolean, and build the automata itself if it is only being used
once (eg for fuzzy nearest-neighbour calculations).

Rather than interacting via attribute sources and specialized attributes, users of
FuzzyTermsEnum can now get the boost of the current term and set the maximum
non-competitive boost directly on the enum.
Alan Woodward 2020-01-15 14:58:11 +00:00 committed by GitHub
parent 087b2e1c0d
commit 9d72bfc1af
7 changed files with 147 additions and 194 deletions
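
For illustration, here is a minimal sketch of the simpler single-use form described in
the commit message, loosely following the spell-checker call sites changed below. The
field name, term text, and edit-distance values are made up for the example.

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.util.BytesRef;

public class FuzzyEnumSketch {
  // Finds the best-scoring variant of a (hypothetical) term using the new API:
  // the enum builds its own automata, and boost handling no longer goes through
  // BoostAttribute / MaxNonCompetitiveBoostAttribute.
  static BytesRef bestVariant(IndexReader reader) throws IOException {
    Term startTerm = new Term("body", "lucene");                  // illustrative field/term
    Terms terms = MultiTerms.getTerms(reader, startTerm.field());
    if (terms == null) {
      return null;
    }
    // Single-use constructor: Terms, term, maxEdits, prefixLength, transpositions.
    FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, startTerm, 2, 0, true);
    BytesRef candidate, best = null;
    float bestScore = Float.NEGATIVE_INFINITY;
    while ((candidate = fe.next()) != null) {
      float score = fe.getBoost();                                 // boost read directly off the enum
      if (score > bestScore) {
        bestScore = score;
        best = BytesRef.deepCopyOf(candidate);
        // Anything scoring at or below the current best is no longer competitive,
        // which lets the enum switch to a cheaper (lower edit-distance) automaton:
        fe.setMaxNonCompetitiveBoost(bestScore);
      }
    }
    return best;
  }
}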


@ -138,6 +138,8 @@ Other
* LUCENE-9096: Simplification of CompressingTermVectorsWriter#flushOffsets.
(kkewwei via Adrien Grand)
* LUCENE-9068: FuzzyQuery builds its Automaton up-front (Alan Woodward, Mike Drob)
======================= Lucene 8.4.1 =======================
Bug Fixes


@ -34,14 +34,11 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@ -155,27 +152,22 @@ public class NearestFuzzyQuery extends Query {
ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore = 0;
Term startTerm = new Term(f.fieldName, term);
AttributeSource atts = new AttributeSource();
MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, atts, startTerm, f.maxEdits, f.prefixLength, true);
FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, startTerm, f.maxEdits, f.prefixLength, true);
//store the df so all variants use same idf
int df = reader.docFreq(startTerm);
int numVariants = 0;
int totalVariantDocFreqs = 0;
BytesRef possibleMatch;
BoostAttribute boostAtt =
fe.attributes().addAttribute(BoostAttribute.class);
while ((possibleMatch = fe.next()) != null) {
numVariants++;
totalVariantDocFreqs += fe.docFreq();
float score = boostAtt.getBoost();
float score = fe.getBoost();
if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
variantsQ.insertWithOverflow(st);
minScore = variantsQ.top().score; // maintain minScore
}
maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
fe.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
}
if (numVariants > 0) {


@ -23,12 +23,11 @@ import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
/** Implements the fuzzy search query. The similarity measurement
* is based on the Damerau-Levenshtein (optimal string alignment) algorithm,
@ -54,7 +53,9 @@ import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
* not match an indexed term "ab", and FuzzyQuery on term "a" with maxEdits=2 will not
* match an indexed term "abc".
*/
public class FuzzyQuery extends MultiTermQuery {
public class FuzzyQuery extends MultiTermQuery implements Accountable {
private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(AutomatonQuery.class);
public final static int defaultMaxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
public final static int defaultPrefixLength = 0;
@ -66,6 +67,10 @@ public class FuzzyQuery extends MultiTermQuery {
private final boolean transpositions;
private final int prefixLength;
private final Term term;
private final int termLength;
private final CompiledAutomaton[] automata;
private final long ramBytesUsed;
/**
* Create a new FuzzyQuery that will match terms with an edit distance
@ -101,7 +106,22 @@ public class FuzzyQuery extends MultiTermQuery {
this.prefixLength = prefixLength;
this.transpositions = transpositions;
this.maxExpansions = maxExpansions;
int[] codePoints = FuzzyTermsEnum.stringToUTF32(term.text());
this.termLength = codePoints.length;
this.automata = FuzzyTermsEnum.buildAutomata(term.text(), codePoints, prefixLength, transpositions, maxEdits);
setRewriteMethod(new MultiTermQuery.TopTermsBlendedFreqScoringRewrite(maxExpansions));
this.ramBytesUsed = calculateRamBytesUsed(term, this.automata);
}
private static long calculateRamBytesUsed(Term term, CompiledAutomaton[] automata) {
long bytes = BASE_RAM_BYTES + term.ramBytesUsed();
for (CompiledAutomaton a : automata) {
bytes += a.ramBytesUsed();
}
bytes += 4 * Integer.BYTES;
bytes += Long.BYTES;
bytes += 1;
return bytes;
}
/**
@ -151,10 +171,10 @@ public class FuzzyQuery extends MultiTermQuery {
}
/**
* Expert: Constructs an equivalent Automaton accepting terms matched by this query
* Returns the compiled automata used to match terms
*/
public Automaton toAutomaton() {
return FuzzyTermsEnum.buildAutomaton(term.text(), prefixLength, transpositions, maxEdits);
public CompiledAutomaton[] getAutomata() {
return automata;
}
@Override
@ -163,13 +183,7 @@ public class FuzzyQuery extends MultiTermQuery {
if (maxEdits == 0 || prefixLength >= term.text().length()) {
visitor.consumeTerms(this, term);
} else {
// Note: we're rebuilding the automaton here, so this can be expensive
try {
visitor.consumeTermsMatching(this, field,
new ByteRunAutomaton(toAutomaton(), false, Operations.DEFAULT_MAX_DETERMINIZED_STATES));
} catch (TooComplexToDeterminizeException e) {
throw new FuzzyTermsEnum.FuzzyTermsException(term.text(), e);
}
automata[automata.length - 1].visit(visitor, this, field);
}
}
}
@ -179,7 +193,7 @@ public class FuzzyQuery extends MultiTermQuery {
if (maxEdits == 0 || prefixLength >= term.text().length()) { // can only match if it's exact
return new SingleTermsEnum(terms.iterator(), term.bytes());
}
return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
return new FuzzyTermsEnum(terms, atts, getTerm(), termLength, maxEdits, automata);
}
/**
@ -198,7 +212,7 @@ public class FuzzyQuery extends MultiTermQuery {
}
buffer.append(term.text());
buffer.append('~');
buffer.append(Integer.toString(maxEdits));
buffer.append(maxEdits);
return buffer.toString();
}
@ -223,6 +237,8 @@ public class FuzzyQuery extends MultiTermQuery {
if (getClass() != obj.getClass())
return false;
FuzzyQuery other = (FuzzyQuery) obj;
// Note that we don't need to compare termLength or automata because they
// are entirely determined by the other fields
if (maxEdits != other.maxEdits)
return false;
if (prefixLength != other.prefixLength)
@ -265,4 +281,9 @@ public class FuzzyQuery extends MultiTermQuery {
LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
}
}
@Override
public long ramBytesUsed() {
return ramBytesUsed;
}
}
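
With the automata now built in the constructor, FuzzyQuery can also report its retained
memory. A hedged sketch of the new surface follows; the field, term and parameter values
are illustrative only.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.util.automaton.CompiledAutomaton;

public class FuzzyQueryRamSketch {
  public static void main(String[] args) {
    // maxEdits=2, prefixLength=1, maxExpansions=50, transpositions=true
    FuzzyQuery q = new FuzzyQuery(new Term("title", "lucene"), 2, 1, 50, true);

    // One compiled Levenshtein automaton per edit distance 0..maxEdits, built up-front:
    CompiledAutomaton[] automata = q.getAutomata();
    System.out.println("automata: " + automata.length);           // 3 for maxEdits=2

    // FuzzyQuery now implements Accountable:
    System.out.println("ramBytesUsed: " + q.ramBytesUsed());
  }
}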


@ -18,18 +18,13 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Attribute;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.util.AttributeReflector;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
@ -46,10 +41,12 @@ import org.apache.lucene.util.automaton.TooComplexToDeterminizeException;
* {@link BytesRef#compareTo}. Each term in the enumeration is
* greater than all that precede it.</p>
*/
public final class FuzzyTermsEnum extends BaseTermsEnum {
public final class FuzzyTermsEnum extends TermsEnum {
// NOTE: we can't subclass FilteredTermsEnum here because we need to sometimes change actualEnum:
private TermsEnum actualEnum;
private final AttributeSource atts;
// We use this to communicate the score (boost) of the current matched term we are on back to
// MultiTermQuery.TopTermsBlendedFreqScoringRewrite that is collecting the best (default 50) matched terms:
@ -59,30 +56,46 @@ public final class FuzzyTermsEnum extends BaseTermsEnum {
// which we use to know when we can reduce the automaton from ed=2 to ed=1, or ed=0 if only single top term is collected:
private final MaxNonCompetitiveBoostAttribute maxBoostAtt;
// We use this to share the pre-built (once for the query) Levenshtein automata across segments:
private final LevenshteinAutomataAttribute dfaAtt;
private final CompiledAutomaton[] automata;
private float bottom;
private BytesRef bottomTerm;
private final CompiledAutomaton automata[];
private BytesRef queuedBottom;
final int termLength;
private final int termLength;
// Maximum number of edits we will accept. This is either 2 or 1 (or, degenerately, 0) passed by the user originally,
// but as we collect terms, we can lower this (e.g. from 2 to 1) if we detect that the term queue is full, and all
// collected terms are ed=1:
private int maxEdits;
final Terms terms;
final Term term;
final int termText[];
final int realPrefixLength;
private final Terms terms;
private final Term term;
/**
* Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
* length <code>prefixLength</code> with <code>term</code> and which have at most {@code maxEdits} edits.
* <p>
* After calling the constructor the enumeration is already pointing to the first
* valid term if such a term exists.
*
* @param terms Delivers terms.
* @param term Pattern term.
* @param maxEdits Maximum edit distance.
* @param prefixLength the length of the required common prefix
* @param transpositions whether transpositions should count as a single edit
* @throws IOException if there is a low-level IO error
*/
public FuzzyTermsEnum(Terms terms, Term term, int maxEdits, int prefixLength, boolean transpositions) throws IOException {
this(terms, term, stringToUTF32(term.text()), maxEdits, prefixLength, transpositions);
}
private FuzzyTermsEnum(Terms terms, Term term, int[] codePoints, int maxEdits, int prefixLength, boolean transpositions) throws IOException {
this(terms, new AttributeSource(), term, codePoints.length, maxEdits,
buildAutomata(term.text(), codePoints, prefixLength, transpositions, maxEdits));
}
// True (the default, in FuzzyQuery) if a transposition should count as a single edit:
final boolean transpositions;
/**
* Constructor for enumeration of all terms from specified <code>reader</code> which share a prefix of
* length <code>prefixLength</code> with <code>term</code> and which have at most {@code maxEdits} edits.
@ -92,76 +105,62 @@ public final class FuzzyTermsEnum extends BaseTermsEnum {
*
* @param terms Delivers terms.
* @param atts {@link AttributeSource} created by the rewrite method of {@link MultiTermQuery}
* thats contains information about competitive boosts during rewrite. It is also used
* to cache DFAs between segment transitions.
* that contains information about competitive boosts during rewrite
* @param term Pattern term.
* @param maxEdits Maximum edit distance.
* @param prefixLength Length of required common prefix. Default value is 0.
* @param automata An array of levenshtein automata to match against terms,
* see {@link #buildAutomata(String, int[], int, boolean, int)}
* @throws IOException if there is a low-level IO error
*/
public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term,
final int maxEdits, final int prefixLength, boolean transpositions) throws IOException {
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("max edits must be 0.." + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got: " + maxEdits);
}
if (prefixLength < 0) {
throw new IllegalArgumentException("prefixLength cannot be less than 0");
}
public FuzzyTermsEnum(Terms terms, AttributeSource atts, Term term, int termLength,
final int maxEdits, CompiledAutomaton[] automata) throws IOException {
this.maxEdits = maxEdits;
this.terms = terms;
this.term = term;
// convert the string into a utf32 int[] representation for fast comparisons
this.termText = stringToUTF32(term.text());
this.termLength = termText.length;
this.atts = atts;
this.termLength = termLength;
this.dfaAtt = atts.addAttribute(LevenshteinAutomataAttribute.class);
this.maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
this.boostAtt = atts.addAttribute(BoostAttribute.class);
// NOTE: boostAtt must be pulled from attributes(), not from atts! This is because TopTermsRewrite looks for boostAtt from this TermsEnum's
// private attributes() and not the global atts passed to us from MultiTermQuery:
this.boostAtt = attributes().addAttribute(BoostAttribute.class);
this.automata = automata;
//The prefix could be longer than the word.
//It's kind of silly though. It means we must match the entire word.
this.realPrefixLength = prefixLength > termLength ? termLength : prefixLength;
this.transpositions = transpositions;
CompiledAutomaton[] prevAutomata = dfaAtt.automata();
if (prevAutomata == null) {
prevAutomata = new CompiledAutomaton[maxEdits+1];
Automaton[] automata = buildAutomata(termText, prefixLength, transpositions, maxEdits);
for (int i = 0; i <= maxEdits; i++) {
try {
prevAutomata[i] = new CompiledAutomaton(automata[i], true, false);
} catch (TooComplexToDeterminizeException e) {
throw new FuzzyTermsException(term.text(), e);
}
}
// first segment computes the automata, and we share with subsequent segments via this Attribute:
dfaAtt.setAutomata(prevAutomata);
}
this.automata = prevAutomata;
bottom = maxBoostAtt.getMaxNonCompetitiveBoost();
bottomTerm = maxBoostAtt.getCompetitiveTerm();
bottomChanged(null);
}
/**
* Builds a binary Automaton to match a fuzzy term
* @param text the term to match
* @param prefixLength length of a required common prefix
* @param transpositions {@code true} if transpositions should count as a single edit
* @param maxEdits the maximum edit distance of matching terms
* Sets the maximum non-competitive boost, which may allow switching to a
* lower max-edit automaton at run time
*/
public static Automaton buildAutomaton(String text, int prefixLength, boolean transpositions, int maxEdits) {
int[] termText = stringToUTF32(text);
Automaton[] automata = buildAutomata(termText, prefixLength, transpositions, maxEdits);
return automata[automata.length - 1];
public void setMaxNonCompetitiveBoost(float boost) {
this.maxBoostAtt.setMaxNonCompetitiveBoost(boost);
}
private static int[] stringToUTF32(String text) {
/**
* Gets the boost of the current term
*/
public float getBoost() {
return boostAtt.getBoost();
}
static CompiledAutomaton[] buildAutomata(String text, int[] termText, int prefixLength, boolean transpositions, int maxEdits) {
CompiledAutomaton[] compiled = new CompiledAutomaton[maxEdits + 1];
Automaton[] automata = buildAutomata(termText, prefixLength, transpositions, maxEdits);
for (int i = 0; i <= maxEdits; i++) {
try {
compiled[i] = new CompiledAutomaton(automata[i], true, false);
}
catch (TooComplexToDeterminizeException e) {
throw new FuzzyTermsException(text, e);
}
}
return compiled;
}
static int[] stringToUTF32(String text) {
int[] termText = new int[text.codePointCount(0, text.length())];
for (int cp, i = 0, j = 0; i < text.length(); i += Character.charCount(cp)) {
termText[j++] = cp = text.codePointAt(i);
@ -328,7 +327,12 @@ public final class FuzzyTermsEnum extends BaseTermsEnum {
public long ord() throws IOException {
return actualEnum.ord();
}
@Override
public AttributeSource attributes() {
return atts;
}
@Override
public boolean seekExact(BytesRef text) throws IOException {
return actualEnum.seekExact(text);
@ -349,70 +353,6 @@ public final class FuzzyTermsEnum extends BaseTermsEnum {
return actualEnum.term();
}
/**
* reuses compiled automata across different segments,
* because they are independent of the index
* @lucene.internal */
public static interface LevenshteinAutomataAttribute extends Attribute {
public CompiledAutomaton[] automata();
public void setAutomata(CompiledAutomaton[] automata);
}
/**
* Stores compiled automata as a list (indexed by edit distance)
* @lucene.internal */
public static final class LevenshteinAutomataAttributeImpl extends AttributeImpl implements LevenshteinAutomataAttribute {
private CompiledAutomaton[] automata;
@Override
public CompiledAutomaton[] automata() {
return automata;
}
@Override
public void setAutomata(CompiledAutomaton[] automata) {
this.automata = automata;
}
@Override
public void clear() {
automata = null;
}
@Override
public int hashCode() {
if (automata == null) {
return 0;
} else {
return automata.hashCode();
}
}
@Override
public boolean equals(Object other) {
if (this == other)
return true;
if (!(other instanceof LevenshteinAutomataAttributeImpl))
return false;
return Arrays.equals(automata, ((LevenshteinAutomataAttributeImpl) other).automata);
}
@Override
public void copyTo(AttributeImpl _target) {
LevenshteinAutomataAttribute target = (LevenshteinAutomataAttribute) _target;
if (automata == null) {
target.setAutomata(null);
} else {
target.setAutomata(automata);
}
}
@Override
public void reflectWith(AttributeReflector reflector) {
reflector.reflect(LevenshteinAutomataAttribute.class, "automata", automata);
}
}
/**
* Thrown to indicate that there was an issue creating a fuzzy query for a given term.
* Typically occurs with terms longer than 220 UTF-8 characters,
@ -423,4 +363,5 @@ public final class FuzzyTermsEnum extends BaseTermsEnum {
super("Term too complex: " + term, cause);
}
}
}
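
As an aside, the stringToUTF32 helper kept (and made package-private) above is the
standard code-point expansion; a self-contained sketch of the equivalent conversion
using only the JDK:

public class CodePointsSketch {
  public static void main(String[] args) {
    // "schön" plus an emoji: 7 UTF-16 chars (the emoji is a surrogate pair), 6 code points.
    String text = "schön\uD83D\uDE00";
    int[] codePoints = text.codePoints().toArray();
    System.out.println(text.length() + " chars -> " + codePoints.length + " code points");
  }
}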


@ -17,9 +17,6 @@
package org.apache.lucene.util;
import static org.apache.lucene.util.RamUsageEstimator.*;
import static org.apache.lucene.util.RamUsageTester.sizeOf;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@ -31,9 +28,22 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import static org.apache.lucene.util.RamUsageEstimator.COMPRESSED_REFS_ENABLED;
import static org.apache.lucene.util.RamUsageEstimator.HOTSPOT_BEAN_CLASS;
import static org.apache.lucene.util.RamUsageEstimator.JVM_IS_HOTSPOT_64BIT;
import static org.apache.lucene.util.RamUsageEstimator.LONG_SIZE;
import static org.apache.lucene.util.RamUsageEstimator.MANAGEMENT_FACTORY_CLASS;
import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_ARRAY_HEADER;
import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_ALIGNMENT;
import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_HEADER;
import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF;
import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOf;
import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance;
import static org.apache.lucene.util.RamUsageTester.sizeOf;
public class TestRamUsageEstimator extends LuceneTestCase {
static final String[] strings = new String[] {
@ -161,7 +171,7 @@ public class TestRamUsageEstimator extends LuceneTestCase {
Arrays.asList(new TermQuery(new Term("foo1", "bar1")), new TermQuery(new Term("baz1", "bam1"))), 1.0f);
BooleanQuery bq = new BooleanQuery.Builder()
.add(new TermQuery(new Term("foo2", "bar2")), BooleanClause.Occur.SHOULD)
.add(new FuzzyQuery(new Term("foo3", "baz3")), BooleanClause.Occur.MUST_NOT)
.add(new PhraseQuery.Builder().add(new Term("foo3", "baz3")).build(), BooleanClause.Occur.MUST_NOT)
.add(dismax, BooleanClause.Occur.MUST)
.build();
long actual = sizeOf(bq);


@ -39,13 +39,11 @@ import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
@ -206,10 +204,7 @@ public class FuzzyLikeThisQuery extends Query
ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore = 0;
Term startTerm = new Term(f.fieldName, term);
AttributeSource atts = new AttributeSource();
MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, atts, startTerm, f.maxEdits, f.prefixLength, true);
FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, startTerm, f.maxEdits, f.prefixLength, true);
//store the df so all variants use same idf
int df = reader.docFreq(startTerm);
int numVariants = 0;
@ -226,7 +221,7 @@ public class FuzzyLikeThisQuery extends Query
variantsQ.insertWithOverflow(st);
minScore = variantsQ.top().score; // maintain minScore
}
maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
fe.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
}
if (numVariants > 0) {


@ -16,19 +16,6 @@
*/
package org.apache.lucene.search.spell;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
@ -37,6 +24,16 @@ import java.util.HashSet;
import java.util.Locale;
import java.util.PriorityQueue;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
/**
* Simple automaton-based spellchecker.
* <p>
@ -420,25 +417,20 @@ public class DirectSpellChecker {
*/
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance,
float accuracy, final CharsRefBuilder spare) throws IOException {
AttributeSource atts = new AttributeSource();
MaxNonCompetitiveBoostAttribute maxBoostAtt =
atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
Terms terms = MultiTerms.getTerms(ir, term.field());
if (terms == null) {
return Collections.emptyList();
}
FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance-1), true);
FuzzyTermsEnum e = new FuzzyTermsEnum(terms, term, editDistance, Math.max(minPrefix, editDistance - 1), true);
final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
BytesRef queryTerm = new BytesRef(term.text());
BytesRef candidateTerm;
ScoreTerm st = new ScoreTerm();
BoostAttribute boostAtt =
e.attributes().addAttribute(BoostAttribute.class);
while ((candidateTerm = e.next()) != null) {
// For FuzzyQuery, boost is the score:
float score = boostAtt.getBoost();
float score = e.getBoost();
// ignore uncompetitive hits
if (stQueue.size() >= numSug && score <= stQueue.peek().boost) {
continue;
@ -479,7 +471,7 @@ public class DirectSpellChecker {
stQueue.offer(st);
// possibly drop entries from queue
st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
e.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
}
return stQueue;