Move analysis helper methods to DirectCandidateGenerator

Christoph Büscher 2016-08-10 13:00:30 +02:00
parent d6e16b6e74
commit cdc77648a1
4 changed files with 55 additions and 57 deletions

org/elasticsearch/search/suggest/SuggestUtils.java

@@ -18,7 +18,6 @@
*/
package org.elasticsearch.search.suggest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -29,11 +28,7 @@ import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.FastCharArrayReader;
import java.io.IOException;
import java.util.Comparator;
@@ -103,44 +98,6 @@ public final class SuggestUtils {
public void end() {}
}
    public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) throws IOException {
        spare.copyUTF8Bytes(toAnalyze);
        return analyze(analyzer, spare.get(), field, consumer);
    }

    public static int analyze(Analyzer analyzer, CharsRef toAnalyze, String field, TokenConsumer consumer) throws IOException {
        try (TokenStream ts = analyzer.tokenStream(
                field, new FastCharArrayReader(toAnalyze.chars, toAnalyze.offset, toAnalyze.length))) {
            return analyze(ts, consumer);
        }
    }

    /** NOTE: this method closes the TokenStream, even on exception, which is awkward
     *  because really the caller who called {@link Analyzer#tokenStream} should close it,
     *  but when trying that there are recursion issues when we try to use the same
     *  TokenStream twice in the same recursion... */
    public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
        int numTokens = 0;
        boolean success = false;
        try {
            stream.reset();
            consumer.reset(stream);
            while (stream.incrementToken()) {
                consumer.nextToken();
                numTokens++;
            }
            consumer.end();
            success = true;
        } finally {
            if (success) {
                stream.close();
            } else {
                IOUtils.closeWhileHandlingException(stream);
            }
        }
        return numTokens;
    }
public static class Fields {
public static final ParseField STRING_DISTANCE = new ParseField("string_distance");
public static final ParseField SUGGEST_MODE = new ParseField("suggest_mode");

org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java

@@ -19,6 +19,7 @@
package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
@@ -29,8 +30,12 @@ import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.io.FastCharArrayReader;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.SuggestUtils.TokenConsumer;
import java.io.IOException;
import java.util.ArrayList;
@@ -44,7 +49,7 @@ import static java.lang.Math.log10;
import static java.lang.Math.max;
import static java.lang.Math.round;
final class DirectCandidateGenerator extends CandidateGenerator {
public final class DirectCandidateGenerator extends CandidateGenerator {
private final DirectSpellChecker spellchecker;
private final String field;
@@ -140,7 +145,7 @@ final class DirectCandidateGenerator extends CandidateGenerator {
return term;
}
final BytesRefBuilder result = byteSpare;
SuggestUtils.analyze(preFilter, term, field, new SuggestUtils.TokenConsumer() {
analyze(preFilter, term, field, new SuggestUtils.TokenConsumer() {
@Override
public void nextToken() throws IOException {
@@ -156,7 +161,7 @@ final class DirectCandidateGenerator extends CandidateGenerator {
candidates.add(candidate);
} else {
final BytesRefBuilder result = byteSpare;
SuggestUtils.analyze(postFilter, candidate.term, field, new SuggestUtils.TokenConsumer() {
analyze(postFilter, candidate.term, field, new SuggestUtils.TokenConsumer() {
@Override
public void nextToken() throws IOException {
this.fillBytesRef(result);
@@ -283,4 +288,39 @@ final class DirectCandidateGenerator extends CandidateGenerator {
return new Candidate(term, frequency, channelScore, score(frequency, channelScore, dictSize), userInput);
}
    public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) throws IOException {
        spare.copyUTF8Bytes(toAnalyze);
        CharsRef charsRef = spare.get();
        try (TokenStream ts = analyzer.tokenStream(
                field, new FastCharArrayReader(charsRef.chars, charsRef.offset, charsRef.length))) {
            return analyze(ts, consumer);
        }
    }

    /** NOTE: this method closes the TokenStream, even on exception, which is awkward
     *  because really the caller who called {@link Analyzer#tokenStream} should close it,
     *  but when trying that there are recursion issues when we try to use the same
     *  TokenStream twice in the same recursion... */
    public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
        int numTokens = 0;
        boolean success = false;
        try {
            stream.reset();
            consumer.reset(stream);
            while (stream.incrementToken()) {
                consumer.nextToken();
                numTokens++;
            }
            consumer.end();
            success = true;
        } finally {
            if (success) {
                stream.close();
            } else {
                IOUtils.closeWhileHandlingException(stream);
            }
        }
        return numTokens;
    }
}

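For orientation, here is a minimal caller-side sketch of the helper in its new home. It is not part of this commit; the StandardAnalyzer, the "body" field, and the input text are illustrative assumptions.

    // Hypothetical usage: collect the analyzed terms of some input text.
    Analyzer analyzer = new StandardAnalyzer();        // assumed analyzer, not from this commit
    CharsRefBuilder spare = new CharsRefBuilder();
    List<BytesRef> terms = new ArrayList<>();
    int numTokens = DirectCandidateGenerator.analyze(analyzer, new BytesRef("quick brwn fox"), "body",
            new SuggestUtils.TokenConsumer() {
                @Override
                public void nextToken() throws IOException {
                    // fillBytesRef copies the current term attribute into the builder
                    terms.add(BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder())));
                }
            }, spare);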
org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java

@@ -51,19 +51,19 @@ public final class NoisyChannelSpellChecker {
public NoisyChannelSpellChecker(double nonErrorLikelihood) {
this(nonErrorLikelihood, true, DEFAULT_TOKEN_LIMIT);
}
public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) {
this.realWordLikelihood = nonErrorLikelihood;
this.requireUnigram = requireUnigram;
this.tokenLimit = tokenLimit;
}
public Result getCorrections(TokenStream stream, final CandidateGenerator generator,
float maxErrors, int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {
final List<CandidateSet> candidateSetsList = new ArrayList<>();
SuggestUtils.analyze(stream, new SuggestUtils.TokenConsumer() {
DirectCandidateGenerator.analyze(stream, new SuggestUtils.TokenConsumer() {
CandidateSet currentSet = null;
private TypeAttribute typeAttribute;
private final BytesRefBuilder termsRef = new BytesRefBuilder();
@@ -74,7 +74,7 @@ public final class NoisyChannelSpellChecker {
super.reset(stream);
typeAttribute = stream.addAttribute(TypeAttribute.class);
}
@Override
public void nextToken() throws IOException {
anyTokens = true;
@@ -96,7 +96,7 @@ public final class NoisyChannelSpellChecker {
currentSet = new CandidateSet(Candidate.EMPTY, generator.createCandidate(BytesRef.deepCopyOf(term), true));
}
}
@Override
public void end() {
if (currentSet != null) {
@@ -107,11 +107,11 @@
}
}
});
if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
return Result.EMPTY;
}
for (CandidateSet candidateSet : candidateSetsList) {
generator.drawCandidates(candidateSet);
}
@@ -127,13 +127,13 @@
cutoffScore = inputPhraseScore * confidence;
}
Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
return new Result(bestCandidates, cutoffScore);
}
public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException {
return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, numCorrections, scorer, confidence, gramSize);
}

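As the NOTE on the TokenStream overload explains, analyze(TokenStream, TokenConsumer) always closes the stream itself, even on exception. A short sketch of the resulting caller contract, with illustrative names only:

    // Do NOT wrap the stream in try-with-resources here: analyze() closes it in all cases.
    TokenStream stream = analyzer.tokenStream("body", "some query text");
    int numTokens = DirectCandidateGenerator.analyze(stream, consumer);  // stream is closed on return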
org/elasticsearch/search/suggest/term/TermSuggester.java

@@ -34,6 +34,7 @@ import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.Suggester;
import org.elasticsearch.search.suggest.SuggestionBuilder;
import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator;
import java.io.IOException;
import java.util.ArrayList;
@@ -73,7 +74,7 @@ public final class TermSuggester extends Suggester<TermSuggestionContext> {
private List<Token> queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException {
final List<Token> result = new ArrayList<>();
final String field = suggestion.getField();
SuggestUtils.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, new SuggestUtils.TokenConsumer() {
DirectCandidateGenerator.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, new SuggestUtils.TokenConsumer() {
@Override
public void nextToken() {
Term term = new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder())));