Fix threshold frequency computation in Suggesters (#34312)
The `term` and `phrase` suggesters have different options to filter candidates based on their frequencies. The `popular` mode, for instance, filters out candidate terms that occur in fewer documents than the original term. However, when we compute this threshold we use the total term frequency of a term instead of its document frequency. This is not in line with the actual filtering, which is always based on the document frequency. This change fixes the discrepancy and clarifies the meaning of the different frequencies in use in the suggesters. It also ensures that the threshold doesn't overflow the maximum allowed value (Integer.MAX_VALUE). Closes #34282
parent c1c447a4cf
commit 7b49beb9b0
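In short, the fix derives the candidate-frequency threshold from the document frequency of the input term and clamps it so it can never exceed Integer.MAX_VALUE. Below is a minimal, self-contained sketch of that computation; the wrapper class is illustrative only, and the plateau is fixed at 0 (its value when no threshold frequency is configured), while the method body mirrors DirectCandidateGenerator#thresholdTermFrequency in the diff below.

import static java.lang.Math.log10;
import static java.lang.Math.max;
import static java.lang.Math.min;
import static java.lang.Math.round;

// Illustrative wrapper class, not part of the commit; the method body
// mirrors the new DirectCandidateGenerator#thresholdTermFrequency.
final class ThresholdSketch {
    private static final double LOG_BASE = 5;
    private final long frequencyPlateau = 0; // assumption: no threshold frequency configured

    int thresholdTermFrequency(int docFreq) {
        if (docFreq > 0) {
            // Derive the threshold from the *document* frequency (what
            // DirectSpellChecker filters on) and clamp the long result so a
            // very large docFreq cannot overflow the int threshold.
            return (int) min(
                max(0, round(docFreq * (log10(docFreq - frequencyPlateau) * (1.0 / log10(LOG_BASE))) + 1)),
                Integer.MAX_VALUE);
        }
        return 0;
    }
}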
@@ -372,9 +372,6 @@
     <suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]context[/\\]ContextMapping.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]context[/\\]GeoContextMapping.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]context[/\\]GeoQueryContext.java" checks="LineLength" />
-    <suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]CandidateScorer.java" checks="LineLength" />
-    <suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]NoisyChannelSpellChecker.java" checks="LineLength" />
-    <suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]WordScorer.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]RestoreService.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]SnapshotShardFailure.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]main[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]SnapshotShardsService.java" checks="LineLength" />
@@ -564,7 +561,6 @@
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]store[/\\]CorruptedTranslogIT.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]store[/\\]IndexStoreTests.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]store[/\\]StoreTests.java" checks="LineLength" />
-    <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]suggest[/\\]stats[/\\]SuggestStatsIT.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]index[/\\]translog[/\\]TranslogTests.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]indexing[/\\]IndexActionIT.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]indexlifecycle[/\\]IndexLifecycleActionIT.java" checks="LineLength" />
@@ -644,7 +640,6 @@
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]ContextCompletionSuggestSearchIT.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]CategoryContextMappingTests.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]completion[/\\]GeoContextMappingTests.java" checks="LineLength" />
-    <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]search[/\\]suggest[/\\]phrase[/\\]NoisyChannelSpellCheckerTests.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]similarity[/\\]SimilarityIT.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]AbstractSnapshotIntegTestCase.java" checks="LineLength" />
     <suppress files="server[/\\]src[/\\]test[/\\]java[/\\]org[/\\]elasticsearch[/\\]snapshots[/\\]DedicatedClusterSnapshotRestoreIT.java" checks="LineLength" />
@@ -78,6 +78,13 @@ removed.
 * `levenstein` - replaced by `levenshtein`
 * `jarowinkler` - replaced by `jaro_winkler`
 
+[float]
+==== `popular` mode for Suggesters
+
+The `popular` mode for Suggesters (`term` and `phrase`) now uses the doc frequency
+(instead of the total term frequency) of the input terms to compute the frequency
+threshold for candidate suggestions.
+
 [float]
 ==== Limiting the number of terms that can be used in a Terms Query request
 
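To make the behavioural difference concrete (hypothetical numbers): a term that occurs 10 times inside a single document has a doc frequency of 1 but a total term frequency of 10. Previously the `popular` threshold was derived from the 10, so candidates could be filtered far more aggressively than the doc-frequency-based filtering that Lucene's DirectSpellChecker actually applies; deriving it from the 1 keeps the threshold and the filtering consistent.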
@@ -18,6 +18,7 @@
  */
 package org.elasticsearch.search.suggest.phrase;
 
+import org.apache.lucene.codecs.TermStats;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;
@@ -29,7 +30,7 @@ public abstract class CandidateGenerator {
 
     public abstract boolean isKnownWord(BytesRef term) throws IOException;
 
-    public abstract long frequency(BytesRef term) throws IOException;
+    public abstract TermStats termStats(BytesRef term) throws IOException;
 
     public CandidateSet drawCandidates(BytesRef term) throws IOException {
         CandidateSet set = new CandidateSet(Candidate.EMPTY, createCandidate(term, true));
@@ -37,14 +38,14 @@ public abstract class CandidateGenerator {
     }
 
     public Candidate createCandidate(BytesRef term, boolean userInput) throws IOException {
-        return createCandidate(term, frequency(term), 1.0, userInput);
+        return createCandidate(term, termStats(term), 1.0, userInput);
     }
-    public Candidate createCandidate(BytesRef term, long frequency, double channelScore) throws IOException {
-        return createCandidate(term, frequency, channelScore, false);
+    public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore) throws IOException {
+        return createCandidate(term, termStats, channelScore, false);
     }
 
-    public abstract Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException;
+    public abstract Candidate createCandidate(BytesRef term, TermStats termStats,
+                                              double channelScore, boolean userInput) throws IOException;
 
     public abstract CandidateSet drawCandidates(CandidateSet set) throws IOException;
 
 }
@@ -77,21 +77,24 @@ final class CandidateScorer {
         } else {
             if (numMissspellingsLeft > 0) {
                 path[ord] = current.originalTerm;
-                findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
+                findCandidates(candidates, path, ord + 1, numMissspellingsLeft, corrections, cutoffScore,
+                    pathScore + scorer.score(path, candidates, ord, gramSize));
                 for (int i = 0; i < current.candidates.length; i++) {
                     path[ord] = current.candidates[i];
-                    findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
+                    findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore,
+                        pathScore + scorer.score(path, candidates, ord, gramSize));
                 }
             } else {
                 path[ord] = current.originalTerm;
-                findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore, pathScore + scorer.score(path, candidates, ord, gramSize));
+                findCandidates(candidates, path, ord + 1, 0, corrections, cutoffScore,
+                    pathScore + scorer.score(path, candidates, ord, gramSize));
             }
         }
 
     }
 
-    private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue<Correction> corrections, double cutoffScore, double score)
-            throws IOException {
+    private void updateTop(CandidateSet[] candidates, Candidate[] path,
+                           PriorityQueue<Correction> corrections, double cutoffScore, double score) throws IOException {
         score = Math.exp(score);
         assert Math.abs(score - score(path, candidates)) < 0.00001 : "cur_score=" + score + ", path_score=" + score(path,candidates);
         if (score > cutoffScore) {
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.codecs.TermStats;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Term;
@@ -48,6 +49,7 @@ import java.util.Set;
 
 import static java.lang.Math.log10;
 import static java.lang.Math.max;
+import static java.lang.Math.min;
 import static java.lang.Math.round;
 
 public final class DirectCandidateGenerator extends CandidateGenerator {
@@ -57,20 +59,20 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
     private final SuggestMode suggestMode;
     private final TermsEnum termsEnum;
     private final IndexReader reader;
-    private final long dictSize;
+    private final long sumTotalTermFreq;
     private static final double LOG_BASE = 5;
     private final long frequencyPlateau;
     private final Analyzer preFilter;
     private final Analyzer postFilter;
     private final double nonErrorLikelihood;
-    private final boolean useTotalTermFrequency;
     private final CharsRefBuilder spare = new CharsRefBuilder();
     private final BytesRefBuilder byteSpare = new BytesRefBuilder();
     private final int numCandidates;
 
     public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader,
                                     double nonErrorLikelihood, int numCandidates) throws IOException {
-        this(spellchecker, field, suggestMode, reader, nonErrorLikelihood, numCandidates, null, null, MultiFields.getTerms(reader, field));
+        this(spellchecker, field, suggestMode, reader, nonErrorLikelihood,
+            numCandidates, null, null, MultiFields.getTerms(reader, field));
     }
 
     public DirectCandidateGenerator(DirectSpellChecker spellchecker, String field, SuggestMode suggestMode, IndexReader reader,
@@ -83,14 +85,12 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
         this.numCandidates = numCandidates;
         this.suggestMode = suggestMode;
         this.reader = reader;
-        final long dictSize = terms.getSumTotalTermFreq();
-        this.useTotalTermFrequency = dictSize != -1;
-        this.dictSize = dictSize == -1 ? reader.maxDoc() : dictSize;
+        this.sumTotalTermFreq = terms.getSumTotalTermFreq() == -1 ? reader.maxDoc() : terms.getSumTotalTermFreq();
         this.preFilter = preFilter;
         this.postFilter = postFilter;
         this.nonErrorLikelihood = nonErrorLikelihood;
         float thresholdFrequency = spellchecker.getThresholdFrequency();
-        this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency: (int)(dictSize * thresholdFrequency);
+        this.frequencyPlateau = thresholdFrequency >= 1.0f ? (int) thresholdFrequency: (int) (reader.maxDoc() * thresholdFrequency);
         termsEnum = terms.iterator();
     }
@@ -99,24 +99,29 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
      */
     @Override
     public boolean isKnownWord(BytesRef term) throws IOException {
-        return frequency(term) > 0;
+        return termStats(term).docFreq > 0;
     }
 
     /* (non-Javadoc)
      * @see org.elasticsearch.search.suggest.phrase.CandidateGenerator#frequency(org.apache.lucene.util.BytesRef)
      */
     @Override
-    public long frequency(BytesRef term) throws IOException {
+    public TermStats termStats(BytesRef term) throws IOException {
         term = preFilter(term, spare, byteSpare);
-        return internalFrequency(term);
+        return internalTermStats(term);
     }
 
 
-    public long internalFrequency(BytesRef term) throws IOException {
+    public TermStats internalTermStats(BytesRef term) throws IOException {
         if (termsEnum.seekExact(term)) {
-            return useTotalTermFrequency ? termsEnum.totalTermFreq() : termsEnum.docFreq();
+            return new TermStats(termsEnum.docFreq(),
+                /**
+                 * We use the {@link TermsEnum#docFreq()} for fields that don't
+                 * record the {@link TermsEnum#totalTermFreq()}.
+                 */
+                termsEnum.totalTermFreq() == -1 ? termsEnum.docFreq() : termsEnum.totalTermFreq());
         }
-        return 0;
+        return new TermStats(0, 0);
     }
 
     public String getField() {
@@ -127,15 +132,28 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
     public CandidateSet drawCandidates(CandidateSet set) throws IOException {
         Candidate original = set.originalTerm;
         BytesRef term = preFilter(original.term, spare, byteSpare);
-        final long frequency = original.frequency;
-        spellchecker.setThresholdFrequency(this.suggestMode == SuggestMode.SUGGEST_ALWAYS ? 0 : thresholdFrequency(frequency, dictSize));
+        if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
+            /**
+             * We use the {@link TermStats#docFreq} to compute the frequency threshold
+             * because that's what {@link DirectSpellChecker#suggestSimilar} expects
+             * when filtering terms.
+             */
+            int threshold = thresholdTermFrequency(original.termStats.docFreq);
+            if (threshold == Integer.MAX_VALUE) {
+                // the threshold is the max possible frequency so we can skip the search
+                return set;
+            }
+            spellchecker.setThresholdFrequency(threshold);
+        }
 
         SuggestWord[] suggestSimilar = spellchecker.suggestSimilar(new Term(field, term), numCandidates, reader, this.suggestMode);
         List<Candidate> candidates = new ArrayList<>(suggestSimilar.length);
         for (int i = 0; i < suggestSimilar.length; i++) {
             SuggestWord suggestWord = suggestSimilar[i];
             BytesRef candidate = new BytesRef(suggestWord.string);
-            postFilter(new Candidate(candidate, internalFrequency(candidate), suggestWord.score,
-                score(suggestWord.freq, suggestWord.score, dictSize), false), spare, byteSpare, candidates);
+            TermStats termStats = internalTermStats(candidate);
+            postFilter(new Candidate(candidate, termStats,
+                suggestWord.score, score(termStats, suggestWord.score, sumTotalTermFreq), false), spare, byteSpare, candidates);
         }
         set.addCandidates(candidates);
         return set;
@@ -171,28 +189,30 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
                     BytesRef term = result.toBytesRef();
                     // We should not use frequency(term) here because it will analyze the term again
                     // If preFilter and postFilter are the same analyzer it would fail.
-                    long freq = internalFrequency(term);
-                    candidates.add(new Candidate(result.toBytesRef(), freq, candidate.stringDistance,
-                        score(candidate.frequency, candidate.stringDistance, dictSize), false));
+                    TermStats termStats = internalTermStats(term);
+                    candidates.add(new Candidate(result.toBytesRef(), termStats, candidate.stringDistance,
+                        score(candidate.termStats, candidate.stringDistance, sumTotalTermFreq), false));
                 } else {
-                    candidates.add(new Candidate(result.toBytesRef(), candidate.frequency, nonErrorLikelihood,
-                        score(candidate.frequency, candidate.stringDistance, dictSize), false));
+                    candidates.add(new Candidate(result.toBytesRef(), candidate.termStats, nonErrorLikelihood,
+                        score(candidate.termStats, candidate.stringDistance, sumTotalTermFreq), false));
                 }
             }
         }, spare);
     }
 }
 
-    private double score(long frequency, double errorScore, long dictionarySize) {
-        return errorScore * (((double)frequency + 1) / ((double)dictionarySize +1));
+    private double score(TermStats termStats, double errorScore, long dictionarySize) {
+        return errorScore * (((double)termStats.totalTermFreq + 1) / ((double)dictionarySize +1));
     }
 
-    protected long thresholdFrequency(long termFrequency, long dictionarySize) {
-        if (termFrequency > 0) {
-            return max(0, round(termFrequency * (log10(termFrequency - frequencyPlateau) * (1.0 / log10(LOG_BASE))) + 1));
+    // package protected for test
+    int thresholdTermFrequency(int docFreq) {
+        if (docFreq > 0) {
+            return (int) min(
+                max(0, round(docFreq * (log10(docFreq - frequencyPlateau) * (1.0 / log10(LOG_BASE))) + 1)), Integer.MAX_VALUE
+            );
         }
         return 0;
 
     }
 
     public abstract static class TokenConsumer {
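A worked example of the new thresholdTermFrequency (assuming a frequencyPlateau of 0, which is what an unset threshold frequency produces): for docFreq = 100 the threshold is round(100 * log10(100) / log10(5) + 1) = 287, so in `popular` mode candidate suggestions roughly need a document frequency above that to pass the filter. For docFreq = Integer.MAX_VALUE the raw value (about 2.9e10) exceeds Integer.MAX_VALUE, so it is clamped and drawCandidates returns early rather than running a search that could never match.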
@@ -249,12 +269,12 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
         public static final Candidate[] EMPTY = new Candidate[0];
         public final BytesRef term;
         public final double stringDistance;
-        public final long frequency;
+        public final TermStats termStats;
         public final double score;
         public final boolean userInput;
 
-        public Candidate(BytesRef term, long frequency, double stringDistance, double score, boolean userInput) {
-            this.frequency = frequency;
+        public Candidate(BytesRef term, TermStats termStats, double stringDistance, double score, boolean userInput) {
+            this.termStats = termStats;
             this.term = term;
             this.stringDistance = stringDistance;
             this.score = score;
@@ -266,7 +286,7 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
             return "Candidate [term=" + term.utf8ToString()
                 + ", stringDistance=" + stringDistance
                 + ", score=" + score
-                + ", frequency=" + frequency
+                + ", termStats=" + termStats
                 + (userInput ? ", userInput" : "") + "]";
         }
 
@@ -305,8 +325,8 @@ public final class DirectCandidateGenerator extends CandidateGenerator {
     }
 
     @Override
-    public Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException {
-        return new Candidate(term, frequency, channelScore, score(frequency, channelScore, dictSize), userInput);
+    public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore, boolean userInput) throws IOException {
+        return new Candidate(term, termStats, channelScore, score(termStats, channelScore, sumTotalTermFreq), userInput);
     }
 
     public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare)
@@ -46,7 +46,7 @@ final class LaplaceScorer extends WordScorer {
     @Override
     protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
         join(separator, spare, w_1.term, word.term);
-        return (alpha + frequency(spare.get())) / (w_1.frequency + alpha * numTerms);
+        return (alpha + frequency(spare.get())) / (w_1.termStats.totalTermFreq + alpha * numTerms);
     }
 
     @Override
@@ -60,7 +60,7 @@ public final class LinearInterpolatingScorer extends WordScorer {
         if (count < 1) {
             return unigramLambda * scoreUnigram(word);
         }
-        return bigramLambda * (count / (0.5d + w_1.frequency)) + unigramLambda * scoreUnigram(word);
+        return bigramLambda * (count / (0.5d + w_1.termStats.totalTermFreq)) + unigramLambda * scoreUnigram(word);
     }
 
     @Override
@@ -18,6 +18,7 @@
  */
 package org.elasticsearch.search.suggest.phrase;
 
+import org.apache.lucene.codecs.TermStats;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;
@@ -41,8 +42,8 @@ public final class MultiCandidateGeneratorWrapper extends CandidateGenerator {
     }
 
     @Override
-    public long frequency(BytesRef term) throws IOException {
-        return candidateGenerator[0].frequency(term);
+    public TermStats termStats(BytesRef term) throws IOException {
+        return candidateGenerator[0].termStats(term);
     }
 
     @Override
@@ -65,8 +66,8 @@ public final class MultiCandidateGeneratorWrapper extends CandidateGenerator {
         return set;
     }
     @Override
-    public Candidate createCandidate(BytesRef term, long frequency, double channelScore, boolean userInput) throws IOException {
-        return candidateGenerator[0].createCandidate(term, frequency, channelScore, userInput);
+    public Candidate createCandidate(BytesRef term, TermStats termStats, double channelScore, boolean userInput) throws IOException {
+        return candidateGenerator[0].createCandidate(term, termStats, channelScore, userInput);
     }
 
 }
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.codecs.TermStats;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
@@ -84,9 +85,9 @@ public final class NoisyChannelSpellChecker {
                     anyUnigram = true;
                     if (posIncAttr.getPositionIncrement() == 0 && typeAttribute.type() == SynonymFilter.TYPE_SYNONYM) {
                         assert currentSet != null;
-                        long freq = 0;
-                        if ((freq = generator.frequency(term)) > 0) {
-                            currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
+                        TermStats termStats = generator.termStats(term);
+                        if (termStats.docFreq > 0) {
+                            currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), termStats, realWordLikelihood));
                         }
                     } else {
                         if (currentSet != null) {
@@ -131,9 +132,11 @@ public final class NoisyChannelSpellChecker {
     }
 
     public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
-            float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException {
+            float maxErrors, int numCorrections, IndexReader reader, String analysisField,
+            WordScorer scorer, float confidence, int gramSize) throws IOException {
 
-        return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, numCorrections, scorer, confidence, gramSize);
+        return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors,
+            numCorrections, scorer, confidence, gramSize);
 
     }
 
@@ -28,8 +28,8 @@ import java.io.IOException;
 class StupidBackoffScorer extends WordScorer {
     private final double discount;
 
-    StupidBackoffScorer(IndexReader reader, Terms terms,String field, double realWordLikelyhood, BytesRef separator, double discount)
-            throws IOException {
+    StupidBackoffScorer(IndexReader reader, Terms terms,String field,
+                        double realWordLikelyhood, BytesRef separator, double discount) throws IOException {
         super(reader, terms, field, realWordLikelyhood, separator);
         this.discount = discount;
     }
@@ -45,7 +45,7 @@ class StupidBackoffScorer extends WordScorer {
         if (count < 1) {
             return discount * scoreUnigram(word);
         }
-        return count / (w_1.frequency + 0.00000000001d);
+        return count / (w_1.termStats.totalTermFreq + 0.00000000001d);
     }
 
     @Override
@@ -60,7 +60,7 @@ class StupidBackoffScorer extends WordScorer {
         join(separator, spare, w_2.term, w_1.term, w.term);
         long trigramCount = frequency(spare.get());
         if (trigramCount < 1) {
-            return discount * (bigramCount / (w_1.frequency + 0.00000000001d));
+            return discount * (bigramCount / (w_1.termStats.totalTermFreq + 0.00000000001d));
         }
         return trigramCount / (bigramCount + 0.00000000001d);
     }
@@ -62,7 +62,8 @@ public abstract class WordScorer {
         // division by zero, by scoreUnigram.
         final long nTerms = terms.size();
         this.numTerms = nTerms == -1 ? reader.maxDoc() : nTerms;
-        this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now
+        this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null,
+            BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now
         this.reader = reader;
         this.realWordLikelyhood = realWordLikelyHood;
         this.separator = separator;
@@ -106,9 +106,12 @@ public class SuggestStatsIT extends ESIntegTestCase {
         assertThat(suggest.getSuggestCurrent(), equalTo(0L));
 
         // check suggest count
-        assertThat(suggest.getSuggestCount(), equalTo((long) (suggestAllIdx * totalShards + suggestIdx1 * shardsIdx1 + suggestIdx2 * shardsIdx2)));
-        assertThat(indicesStats.getIndices().get("test1").getTotal().getSearch().getTotal().getSuggestCount(), equalTo((long) ((suggestAllIdx + suggestIdx1) * shardsIdx1)));
-        assertThat(indicesStats.getIndices().get("test2").getTotal().getSearch().getTotal().getSuggestCount(), equalTo((long) ((suggestAllIdx + suggestIdx2) * shardsIdx2)));
+        assertThat(suggest.getSuggestCount(),
+            equalTo((long) (suggestAllIdx * totalShards + suggestIdx1 * shardsIdx1 + suggestIdx2 * shardsIdx2)));
+        assertThat(indicesStats.getIndices().get("test1").getTotal().getSearch().getTotal().getSuggestCount(),
+            equalTo((long) ((suggestAllIdx + suggestIdx1) * shardsIdx1)));
+        assertThat(indicesStats.getIndices().get("test2").getTotal().getSearch().getTotal().getSuggestCount(),
+            equalTo((long) ((suggestAllIdx + suggestIdx2) * shardsIdx2)));
 
         logger.info("iter {}, iter1 {}, iter2 {}, {}", suggestAllIdx, suggestIdx1, suggestIdx2, endTime - startTime);
         // check suggest time
@@ -19,11 +19,20 @@
 
 package org.elasticsearch.search.suggest.phrase;
 
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.search.spell.DirectSpellChecker;
 import org.apache.lucene.search.spell.JaroWinklerDistance;
 import org.apache.lucene.search.spell.LevenshteinDistance;
 import org.apache.lucene.search.spell.LuceneLevenshteinDistance;
 import org.apache.lucene.search.spell.NGramDistance;
+import org.apache.lucene.search.spell.SuggestMode;
+import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
 import org.elasticsearch.common.xcontent.ToXContent;
 import org.elasticsearch.common.xcontent.XContentBuilder;
@@ -32,7 +41,6 @@ import org.elasticsearch.common.xcontent.XContentParseException;
 import org.elasticsearch.common.xcontent.XContentParser;
 import org.elasticsearch.common.xcontent.XContentType;
 import org.elasticsearch.common.xcontent.json.JsonXContent;
-import org.elasticsearch.search.suggest.phrase.PhraseSuggestionContext.DirectCandidateGenerator;
 import org.elasticsearch.test.ESTestCase;
 
 import java.io.IOException;
@@ -133,7 +141,8 @@ public class DirectCandidateGeneratorTests extends ESTestCase {
         }
     }
 
-    public static void assertEqualGenerators(DirectCandidateGenerator first, DirectCandidateGenerator second) {
+    public static void assertEqualGenerators(PhraseSuggestionContext.DirectCandidateGenerator first,
+                                             PhraseSuggestionContext.DirectCandidateGenerator second) {
         assertEquals(first.field(), second.field());
         assertEquals(first.accuracy(), second.accuracy(), Float.MIN_VALUE);
         assertEquals(first.maxTermFreq(), second.maxTermFreq(), Float.MIN_VALUE);
@@ -176,6 +185,66 @@ public class DirectCandidateGeneratorTests extends ESTestCase {
             "[direct_generator] size doesn't support values of type: START_ARRAY");
     }
 
+    public void testFrequencyThreshold() throws Exception {
+        try (Directory dir = newDirectory()) {
+            IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig());
+            int numDocs = randomIntBetween(10, 20);
+            for (int i = 0; i < numDocs; i++) {
+                Document doc = new Document();
+                if (i == 0) {
+                    for (int j = 0; j < numDocs; j++) {
+                        doc.add(new TextField("field", "fooz", Field.Store.NO));
+                    }
+                } else {
+                    doc.add(new TextField("field", "foo", Field.Store.NO));
+                }
+                writer.addDocument(doc);
+            }
+            try (IndexReader reader = DirectoryReader.open(writer)) {
+                writer.close();
+                DirectSpellChecker spellchecker = new DirectSpellChecker();
+                DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_MORE_POPULAR,
+                    reader, 0f, 10);
+                DirectCandidateGenerator.CandidateSet candidateSet =
+                    generator.drawCandidates(new DirectCandidateGenerator.CandidateSet(DirectCandidateGenerator.Candidate.EMPTY,
+                        generator.createCandidate(new BytesRef("fooz"), false)));
+                assertThat(candidateSet.candidates.length, equalTo(1));
+                assertThat(candidateSet.candidates[0].termStats.docFreq, equalTo(numDocs - 1));
+                assertThat(candidateSet.candidates[0].termStats.totalTermFreq, equalTo((long) numDocs - 1));
+
+                // test that it doesn't overflow
+                assertThat(generator.thresholdTermFrequency(Integer.MAX_VALUE), equalTo(Integer.MAX_VALUE));
+
+                spellchecker = new DirectSpellChecker();
+                spellchecker.setThresholdFrequency(0.5f);
+                generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_MORE_POPULAR,
+                    reader, 0f, 10);
+                candidateSet =
+                    generator.drawCandidates(new DirectCandidateGenerator.CandidateSet(DirectCandidateGenerator.Candidate.EMPTY,
+                        generator.createCandidate(new BytesRef("fooz"), false)));
+                assertThat(candidateSet.candidates.length, equalTo(1));
+                assertThat(candidateSet.candidates[0].termStats.docFreq, equalTo(numDocs - 1));
+                assertThat(candidateSet.candidates[0].termStats.totalTermFreq, equalTo((long) numDocs - 1));
+
+                // test that it doesn't overflow
+                assertThat(generator.thresholdTermFrequency(Integer.MAX_VALUE), equalTo(Integer.MAX_VALUE));
+
+                spellchecker = new DirectSpellChecker();
+                spellchecker.setThresholdFrequency(0.5f);
+                generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_ALWAYS,
+                    reader, 0f, 10);
+                candidateSet =
+                    generator.drawCandidates(new DirectCandidateGenerator.CandidateSet(DirectCandidateGenerator.Candidate.EMPTY,
+                        generator.createCandidate(new BytesRef("fooz"), false)));
+                assertThat(candidateSet.candidates.length, equalTo(1));
+
+                // test that it doesn't overflow
+                assertThat(generator.thresholdTermFrequency(Integer.MAX_VALUE), equalTo(Integer.MAX_VALUE));
+            }
+        }
+    }
+
     private void assertIllegalXContent(String directGenerator, Class<? extends Exception> exceptionClass, String exceptionMsg)
             throws IOException {
         try (XContentParser parser = createParser(JsonXContent.jsonXContent, directGenerator)) {
@@ -110,20 +110,24 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         }
 
         DirectoryReader ir = DirectoryReader.open(writer);
-        WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f);
+        WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d,
+            new BytesRef(" "), 0.5f);
 
         NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
         DirectSpellChecker spellchecker = new DirectSpellChecker();
         spellchecker.setMinQueryLength(1);
-        DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
-        Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2);
+        DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR,
+            ir, 0.95, 5);
+        Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
+            ir, "body", wordScorer, 1, 2);
         Correction[] corrections = result.corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));
         assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american <em>ace</em>"));
         assertThat(result.cutoffScore, greaterThan(0d));
 
-        result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1);
+        result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
+            ir, "body", wordScorer, 0, 1);
         corrections = result.corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame"));
@@ -131,8 +135,10 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE));
 
         suggester = new NoisyChannelSpellChecker(0.85);
-        wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
+        wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
+            new BytesRef(" "), 0.5f);
+        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4,
+            ir, "body", wordScorer, 0, 2).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
         assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
@@ -143,7 +149,8 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn</em> the <em>god</em> jewel"));
         assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr</em> the got jewel"));
 
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2).corrections;
+        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f,
+            4, ir, "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
         assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
@@ -152,8 +159,10 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
 
         // Test some of the highlighting corner cases
         suggester = new NoisyChannelSpellChecker(0.85);
-        wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2).corrections;
+        wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
+            new BytesRef(" "), 0.5f);
+        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4,
+            ir, "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
         assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
@@ -187,19 +196,25 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         spellchecker.setMinPrefix(1);
         spellchecker.setMinQueryLength(1);
         suggester = new NoisyChannelSpellChecker(0.85);
-        wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
-        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
+        wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
+            new BytesRef(" "), 0.5f);
+        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4,
+            ir, "body", wordScorer, 1, 2).corrections;
         assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));
         assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));
 
-        generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
-        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
+        generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85,
+            10, null, analyzer, MultiFields.getTerms(ir, "body"));
+        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4,
+            ir, "body", wordScorer, 1, 2).corrections;
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
         assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));
 
         // Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter
-        generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
-        corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
+        generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85,
+            10, null, analyzer, MultiFields.getTerms(ir, "body"));
+        corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir,
+            "body", wordScorer, 1, 2).corrections;
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
         assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain <em>america</em>"));
     }
@@ -265,47 +280,58 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         }
 
         DirectoryReader ir = DirectoryReader.open(writer);
-        LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f);
+        LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d,
+            new BytesRef(" "), 0.5f);
         NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
         DirectSpellChecker spellchecker = new DirectSpellChecker();
         spellchecker.setMinQueryLength(1);
-        DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10);
-        DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse"));
+        DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir,
+            0.95, 10);
+        DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir,
+            0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse"));
         CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);
 
-        Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
+        Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1,
+            ir, "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
 
         generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
-        corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
+        corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir,
+            "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
 
-        corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
+        corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir,
+            "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(0)); // only use forward with constant prefix
 
-        corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections;
+        corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir,
+            "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
 
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
+        corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir,
+            "body", wordScorer, 0, 2).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
         assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
         assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));
 
 
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
+        corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir,
+            "body", wordScorer, 1.5f, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
 
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
+        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir,
+            "body", wordScorer, 1.5f, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
 
         // Test a special case where one of the suggest term is unchanged by the postFilter, 'II' here is unchanged by the reverse analyzer.
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
+        corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir,
+            "body", wordScorer, 1, 2).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("quasar ii"));
     }
@@ -362,22 +388,28 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         }
 
         DirectoryReader ir = DirectoryReader.open(writer);
-        WordScorer wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1);
+        WordScorer wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
+            new BytesRef(" "), 0.5, 0.4, 0.1);
 
         NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
         DirectSpellChecker spellchecker = new DirectSpellChecker();
         spellchecker.setMinQueryLength(1);
-        DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
-        Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3).corrections;
+        DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir,
+            0.95, 5);
+        Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
+            ir, "body", wordScorer, 1, 3).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
 
-        corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1).corrections;
+        corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1,
+            ir, "body", wordScorer, 1, 1).corrections;
         assertThat(corrections.length, equalTo(0));
         // assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape"));
 
-        wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1);
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3).corrections;
+        wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
+            new BytesRef(" "), 0.5, 0.4, 0.1);
+        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4,
+            ir, "body", wordScorer, 0, 3).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
         assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
@@ -387,7 +419,8 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
 
 
 
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3).corrections;
+        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4,
+            ir, "body", wordScorer, 1, 3).corrections;
         assertThat(corrections.length, equalTo(4));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
         assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
@@ -395,7 +428,8 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));
 
 
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3).corrections;
+        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1,
+            ir, "body", wordScorer, 100, 3).corrections;
         assertThat(corrections.length, equalTo(1));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
 
@@ -423,17 +457,23 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         spellchecker.setMinPrefix(1);
         spellchecker.setMinQueryLength(1);
         suggester = new NoisyChannelSpellChecker(0.95);
-        wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1);
-        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections;
+        wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d,
+            new BytesRef(" "), 0.5, 0.4, 0.1);
+        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4,
+            ir, "body", wordScorer, 1, 3).corrections;
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
 
-        generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
-        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections;
+        generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95,
+            10, null, analyzer, MultiFields.getTerms(ir, "body"));
+        corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4,
+            ir, "body", wordScorer, 1, 3).corrections;
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
 
 
-        wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.4);
-        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3).corrections;
+        wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d,
+            new BytesRef(" "), 0.4);
+        corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2,
+            ir, "body", wordScorer, 0, 3).corrections;
         assertThat(corrections.length, equalTo(2));
         assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
         assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
@@ -452,11 +492,14 @@ public class NoisyChannelSpellCheckerTests extends ESTestCase {
         }
 
         try (DirectoryReader ir = DirectoryReader.open(dir)) {
-            WordScorer wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.95d, new BytesRef(" "), 0.4f);
+            WordScorer wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.95d,
+                new BytesRef(" "), 0.4f);
             NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
             DirectSpellChecker spellchecker = new DirectSpellChecker();
-            DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
-            Result result = suggester.getCorrections(new StandardAnalyzer(), new BytesRef("valeu"), generator, 1, 1, ir, "field", wordScorer, 1, 2);
+            DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "field",
+                SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
+            Result result = suggester.getCorrections(new StandardAnalyzer(), new BytesRef("valeu"), generator, 1, 1,
+                ir, "field", wordScorer, 1, 2);
             assertThat(result.corrections.length, equalTo(1));
             assertThat(result.corrections[0].join(space).utf8ToString(), equalTo("value"));
         }