Move analysis helper methods to DirectCandidateGenerator
This commit is contained in:
parent d6e16b6e74
commit cdc77648a1
SuggestUtils.java

@@ -18,7 +18,6 @@
  */
 package org.elasticsearch.search.suggest;
 
-import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -29,11 +28,7 @@ import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
 import org.apache.lucene.search.spell.SuggestWordQueue;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.CharsRefBuilder;
-import org.apache.lucene.util.IOUtils;
 import org.elasticsearch.common.ParseField;
-import org.elasticsearch.common.io.FastCharArrayReader;
 
 import java.io.IOException;
 import java.util.Comparator;
@@ -103,44 +98,6 @@ public final class SuggestUtils {
         public void end() {}
     }
 
-    public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) throws IOException {
-        spare.copyUTF8Bytes(toAnalyze);
-        return analyze(analyzer, spare.get(), field, consumer);
-    }
-
-    public static int analyze(Analyzer analyzer, CharsRef toAnalyze, String field, TokenConsumer consumer) throws IOException {
-        try (TokenStream ts = analyzer.tokenStream(
-                field, new FastCharArrayReader(toAnalyze.chars, toAnalyze.offset, toAnalyze.length))) {
-            return analyze(ts, consumer);
-        }
-    }
-
-    /** NOTE: this method closes the TokenStream, even on exception, which is awkward
-     *  because really the caller who called {@link Analyzer#tokenStream} should close it,
-     *  but when trying that there are recursion issues when we try to use the same
-     *  TokenStream twice in the same recursion... */
-    public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
-        int numTokens = 0;
-        boolean success = false;
-        try {
-            stream.reset();
-            consumer.reset(stream);
-            while (stream.incrementToken()) {
-                consumer.nextToken();
-                numTokens++;
-            }
-            consumer.end();
-            success = true;
-        } finally {
-            if (success) {
-                stream.close();
-            } else {
-                IOUtils.closeWhileHandlingException(stream);
-            }
-        }
-        return numTokens;
-    }
-
     public static class Fields {
         public static final ParseField STRING_DISTANCE = new ParseField("string_distance");
         public static final ParseField SUGGEST_MODE = new ParseField("suggest_mode");
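For orientation on what is being removed here (and re-added in DirectCandidateGenerator below): the analyze helpers implement a small push-style loop in which a TokenConsumer is handed every token the stream emits, and the helper itself closes the stream, as the NOTE explains. A minimal, self-contained sketch of that loop against plain Lucene follows; the AnalyzeSketch class, the list-collecting consumer, and the WhitespaceAnalyzer are illustrative stand-ins, not code from this commit.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzeSketch {

    /** Drives the stream and pushes each term into `out`; a stand-in for TokenConsumer. */
    static int analyze(TokenStream stream, List<String> out) throws IOException {
        int numTokens = 0;
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        try {
            stream.reset();                     // required before the first incrementToken()
            while (stream.incrementToken()) {   // one iteration per emitted token
                out.add(termAtt.toString());
                numTokens++;
            }
            stream.end();
        } finally {
            stream.close();                     // the helper owns the stream, as in the NOTE
        }
        return numTokens;
    }

    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new WhitespaceAnalyzer()) {
            List<String> tokens = new ArrayList<>();
            int n = analyze(analyzer.tokenStream("field", "quick brown fox"), tokens);
            System.out.println(n + " tokens: " + tokens);  // 3 tokens: [quick, brown, fox]
        }
    }
}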
DirectCandidateGenerator.java

@@ -19,6 +19,7 @@
 package org.elasticsearch.search.suggest.phrase;
 
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Term;
@@ -29,8 +30,12 @@ import org.apache.lucene.search.spell.SuggestMode;
 import org.apache.lucene.search.spell.SuggestWord;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.CharsRefBuilder;
+import org.apache.lucene.util.IOUtils;
+import org.elasticsearch.common.io.FastCharArrayReader;
 import org.elasticsearch.search.suggest.SuggestUtils;
+import org.elasticsearch.search.suggest.SuggestUtils.TokenConsumer;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -44,7 +49,7 @@ import static java.lang.Math.log10;
 import static java.lang.Math.max;
 import static java.lang.Math.round;
 
-final class DirectCandidateGenerator extends CandidateGenerator {
+public final class DirectCandidateGenerator extends CandidateGenerator {
 
     private final DirectSpellChecker spellchecker;
     private final String field;
@@ -140,7 +145,7 @@ final class DirectCandidateGenerator extends CandidateGenerator {
            return term;
        }
        final BytesRefBuilder result = byteSpare;
-       SuggestUtils.analyze(preFilter, term, field, new SuggestUtils.TokenConsumer() {
+       analyze(preFilter, term, field, new SuggestUtils.TokenConsumer() {
 
            @Override
            public void nextToken() throws IOException {
@@ -156,7 +161,7 @@ final class DirectCandidateGenerator extends CandidateGenerator {
                candidates.add(candidate);
            } else {
                final BytesRefBuilder result = byteSpare;
-               SuggestUtils.analyze(postFilter, candidate.term, field, new SuggestUtils.TokenConsumer() {
+               analyze(postFilter, candidate.term, field, new SuggestUtils.TokenConsumer() {
                    @Override
                    public void nextToken() throws IOException {
                        this.fillBytesRef(result);
@@ -283,4 +288,39 @@ final class DirectCandidateGenerator extends CandidateGenerator {
        return new Candidate(term, frequency, channelScore, score(frequency, channelScore, dictSize), userInput);
    }
 
+   public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) throws IOException {
+       spare.copyUTF8Bytes(toAnalyze);
+       CharsRef charsRef = spare.get();
+       try (TokenStream ts = analyzer.tokenStream(
+               field, new FastCharArrayReader(charsRef.chars, charsRef.offset, charsRef.length))) {
+           return analyze(ts, consumer);
+       }
+   }
+
+   /** NOTE: this method closes the TokenStream, even on exception, which is awkward
+    *  because really the caller who called {@link Analyzer#tokenStream} should close it,
+    *  but when trying that there are recursion issues when we try to use the same
+    *  TokenStream twice in the same recursion... */
+   public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
+       int numTokens = 0;
+       boolean success = false;
+       try {
+           stream.reset();
+           consumer.reset(stream);
+           while (stream.incrementToken()) {
+               consumer.nextToken();
+               numTokens++;
+           }
+           consumer.end();
+           success = true;
+       } finally {
+           if (success) {
+               stream.close();
+           } else {
+               IOUtils.closeWhileHandlingException(stream);
+           }
+       }
+       return numTokens;
+   }
+
 }
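The five-argument analyze overload added above first decodes the term's UTF-8 bytes into a reusable char buffer, then hands a reader over that buffer to the analyzer. Below is a hedged sketch of the same conversion chain, substituting the JDK's CharArrayReader for Elasticsearch's internal FastCharArrayReader; analyzeTerm, the SimpleAnalyzer, and the keep-the-last-token policy are assumptions for illustration, not this class's exact behavior.

import java.io.CharArrayReader;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

public class PreFilterSketch {

    /** Mirrors the moved overload's shape: UTF-8 bytes -> chars -> token stream. */
    static BytesRef analyzeTerm(Analyzer analyzer, BytesRef toAnalyze, String field,
                                CharsRefBuilder spare) throws IOException {
        spare.copyUTF8Bytes(toAnalyze);          // decode the term once into the spare buffer
        CharsRef chars = spare.get();
        BytesRefBuilder result = new BytesRefBuilder();
        try (TokenStream ts = analyzer.tokenStream(field,
                new CharArrayReader(chars.chars, chars.offset, chars.length))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                result.copyChars(termAtt);       // keep the last token the analyzer emits
            }
            ts.end();
        }
        return result.get();
    }

    public static void main(String[] args) throws IOException {
        try (Analyzer pre = new SimpleAnalyzer()) {
            BytesRef filtered = analyzeTerm(pre, new BytesRef("Foo"), "body", new CharsRefBuilder());
            System.out.println(filtered.utf8ToString()); // "foo": SimpleAnalyzer lowercases
        }
    }
}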
NoisyChannelSpellChecker.java

@@ -51,19 +51,19 @@ public final class NoisyChannelSpellChecker {
     public NoisyChannelSpellChecker(double nonErrorLikelihood) {
         this(nonErrorLikelihood, true, DEFAULT_TOKEN_LIMIT);
     }
 
     public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) {
         this.realWordLikelihood = nonErrorLikelihood;
         this.requireUnigram = requireUnigram;
         this.tokenLimit = tokenLimit;
 
     }
 
     public Result getCorrections(TokenStream stream, final CandidateGenerator generator,
             float maxErrors, int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {
 
         final List<CandidateSet> candidateSetsList = new ArrayList<>();
-        SuggestUtils.analyze(stream, new SuggestUtils.TokenConsumer() {
+        DirectCandidateGenerator.analyze(stream, new SuggestUtils.TokenConsumer() {
             CandidateSet currentSet = null;
             private TypeAttribute typeAttribute;
             private final BytesRefBuilder termsRef = new BytesRefBuilder();
@@ -74,7 +74,7 @@ public final class NoisyChannelSpellChecker {
                 super.reset(stream);
                 typeAttribute = stream.addAttribute(TypeAttribute.class);
             }
 
             @Override
             public void nextToken() throws IOException {
                 anyTokens = true;
@@ -96,7 +96,7 @@ public final class NoisyChannelSpellChecker {
                         currentSet = new CandidateSet(Candidate.EMPTY, generator.createCandidate(BytesRef.deepCopyOf(term), true));
                     }
                 }
 
             @Override
             public void end() {
                 if (currentSet != null) {
@@ -107,11 +107,11 @@ public final class NoisyChannelSpellChecker {
             }
         });
 
         if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
             return Result.EMPTY;
         }
 
         for (CandidateSet candidateSet : candidateSetsList) {
             generator.drawCandidates(candidateSet);
         }
@@ -127,13 +127,13 @@ public final class NoisyChannelSpellChecker {
             cutoffScore = inputPhraseScore * confidence;
         }
         Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
 
         return new Result(bestCandidates, cutoffScore);
     }
 
     public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
             float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException {
 
         return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, numCorrections, scorer, confidence, gramSize);
 
     }
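Context for the spell-checker hunks above: getCorrections collects one CandidateSet per analyzed token, scores the user's input phrase, and derives cutoffScore = inputPhraseScore * confidence, so only candidate corrections scoring above that cutoff survive. A toy, self-contained illustration of the cutoff arithmetic follows, with all numbers invented for the example:

public class CutoffSketch {
    public static void main(String[] args) {
        double inputPhraseScore = 1.2e-4;    // language-model score of the phrase as typed
        float confidence = 1.0f;             // 1.0 means a correction must beat the input
        double cutoffScore = inputPhraseScore * confidence;

        // Noisy-channel style score of one candidate correction (invented values):
        double channelScore = 0.85;          // how plausible the typo is given the candidate
        double languageModelScore = 2.3e-4;  // how plausible the corrected phrase is
        double candidateScore = channelScore * languageModelScore; // 1.955e-4

        System.out.println(candidateScore > cutoffScore ? "keep candidate" : "drop candidate");
        // prints "keep candidate", since 1.955e-4 > 1.2e-4
    }
}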
TermSuggester.java

@@ -34,6 +34,7 @@ import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.Suggester;
 import org.elasticsearch.search.suggest.SuggestionBuilder;
 import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
+import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -73,7 +74,7 @@ public final class TermSuggester extends Suggester<TermSuggestionContext> {
     private List<Token> queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException {
         final List<Token> result = new ArrayList<>();
         final String field = suggestion.getField();
-        SuggestUtils.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, new SuggestUtils.TokenConsumer() {
+        DirectCandidateGenerator.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, new SuggestUtils.TokenConsumer() {
             @Override
             public void nextToken() {
                 Term term = new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder())));
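The queryTerms change above swaps only the helper's home class; the flow is unchanged: analyze the suggestion text and wrap each emitted token in a Lucene Term, deep-copying because the token's bytes are reused across iterations. A self-contained stand-in using plain Lucene; QueryTermsSketch and the StandardAnalyzer are illustrative choices, not this class's actual wiring.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.Term;

public class QueryTermsSketch {

    /** Stand-in for TermSuggester.queryTerms: one Term per analyzed token. */
    static List<Term> queryTerms(Analyzer analyzer, String field, String text) throws IOException {
        List<Term> result = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream(field, text)) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // Term copies the text, much like BytesRef.deepCopyOf in the diff
                result.add(new Term(field, termAtt.toString()));
            }
            ts.end();
        }
        return result;
    }

    public static void main(String[] args) throws IOException {
        try (Analyzer a = new StandardAnalyzer()) {
            System.out.println(queryTerms(a, "body", "quick brown fox"));
            // [body:quick, body:brown, body:fox]
        }
    }
}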