Merge pull request #19920 from cbuescher/remove-SuggestUtil

Remove SuggestUtils helper class
Christoph Büscher · 2016-08-10 19:22:22 +02:00 · committed by GitHub
commit 563bf0154c
15 changed files with 192 additions and 254 deletions
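
Background: SuggestUtils bundled a DirectSpellChecker factory, a BytesRef join() helper, TokenStream analysis helpers with their TokenConsumer callback, and a set of shared ParseField constants. This commit dissolves the grab bag and moves each piece next to its users: spell checker creation moves onto DirectSpellcheckerSettings, join() moves to WordScorer, analyze() and TokenConsumer move to DirectCandidateGenerator (which becomes public), and parsers switch to plain ParseField constants (made public on DirectCandidateGeneratorBuilder). A minimal before/after sketch of the two most common call sites, using only names that appear in the diff below:

// before: static helpers on the SuggestUtils grab bag
DirectSpellChecker checker = SuggestUtils.getDirectSpellChecker(settings);
BytesRef bigram = SuggestUtils.join(separator, spare, w_1.term, word.term);

// after: the same behavior on the owning classes
DirectSpellChecker checker = settings.createDirectSpellChecker();
BytesRef bigram = WordScorer.join(separator, spare, w_1.term, word.term);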

DirectSpellcheckerSettings.java

@@ -21,8 +21,13 @@ package org.elasticsearch.search.suggest;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import java.util.Comparator;
public class DirectSpellcheckerSettings {
// NB: If this changes, make sure to change the default in TermSuggestionBuilder
@@ -49,6 +54,9 @@ public class DirectSpellcheckerSettings {
private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
private float minDocFreq = DEFAULT_MIN_DOC_FREQ;
private static final Comparator<SuggestWord> LUCENE_FREQUENCY = new SuggestWordFrequencyComparator();
private static final Comparator<SuggestWord> SCORE_COMPARATOR = SuggestWordQueue.DEFAULT_COMPARATOR;
public SuggestMode suggestMode() {
return suggestMode;
}
@@ -129,6 +137,33 @@ public class DirectSpellcheckerSettings {
this.minDocFreq = minDocFreq;
}
public DirectSpellChecker createDirectSpellChecker() {
DirectSpellChecker directSpellChecker = new DirectSpellChecker();
directSpellChecker.setAccuracy(accuracy());
Comparator<SuggestWord> comparator;
switch (sort()) {
case SCORE:
comparator = SCORE_COMPARATOR;
break;
case FREQUENCY:
comparator = LUCENE_FREQUENCY;
break;
default:
throw new IllegalArgumentException("Illegal suggest sort: " + sort());
}
directSpellChecker.setComparator(comparator);
directSpellChecker.setDistance(stringDistance());
directSpellChecker.setMaxEdits(maxEdits());
directSpellChecker.setMaxInspections(maxInspections());
directSpellChecker.setMaxQueryFrequency(maxTermFreq());
directSpellChecker.setMinPrefix(prefixLength());
directSpellChecker.setMinQueryLength(minWordLength());
directSpellChecker.setThresholdFrequency(minDocFreq());
directSpellChecker.setLowerCaseTerms(false);
return directSpellChecker;
}
@Override
public String toString() {
return "[" +

SuggestUtils.java (deleted)

@@ -1,162 +0,0 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.search.suggest;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.io.FastCharArrayReader;
import java.io.IOException;
import java.util.Comparator;
public final class SuggestUtils {
private static final Comparator<SuggestWord> LUCENE_FREQUENCY = new SuggestWordFrequencyComparator();
private static final Comparator<SuggestWord> SCORE_COMPARATOR = SuggestWordQueue.DEFAULT_COMPARATOR;
private SuggestUtils() {
// utils!!
}
public static DirectSpellChecker getDirectSpellChecker(DirectSpellcheckerSettings suggestion) {
DirectSpellChecker directSpellChecker = new DirectSpellChecker();
directSpellChecker.setAccuracy(suggestion.accuracy());
Comparator<SuggestWord> comparator;
switch (suggestion.sort()) {
case SCORE:
comparator = SCORE_COMPARATOR;
break;
case FREQUENCY:
comparator = LUCENE_FREQUENCY;
break;
default:
throw new IllegalArgumentException("Illegal suggest sort: " + suggestion.sort());
}
directSpellChecker.setComparator(comparator);
directSpellChecker.setDistance(suggestion.stringDistance());
directSpellChecker.setMaxEdits(suggestion.maxEdits());
directSpellChecker.setMaxInspections(suggestion.maxInspections());
directSpellChecker.setMaxQueryFrequency(suggestion.maxTermFreq());
directSpellChecker.setMinPrefix(suggestion.prefixLength());
directSpellChecker.setMinQueryLength(suggestion.minWordLength());
directSpellChecker.setThresholdFrequency(suggestion.minDocFreq());
directSpellChecker.setLowerCaseTerms(false);
return directSpellChecker;
}
public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
result.clear();
for (int i = 0; i < toJoin.length - 1; i++) {
result.append(toJoin[i]);
result.append(separator);
}
result.append(toJoin[toJoin.length-1]);
return result.get();
}
public abstract static class TokenConsumer {
protected CharTermAttribute charTermAttr;
protected PositionIncrementAttribute posIncAttr;
protected OffsetAttribute offsetAttr;
public void reset(TokenStream stream) {
charTermAttr = stream.addAttribute(CharTermAttribute.class);
posIncAttr = stream.addAttribute(PositionIncrementAttribute.class);
offsetAttr = stream.addAttribute(OffsetAttribute.class);
}
protected BytesRef fillBytesRef(BytesRefBuilder spare) {
spare.copyChars(charTermAttr);
return spare.get();
}
public abstract void nextToken() throws IOException;
public void end() {}
}
public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) throws IOException {
spare.copyUTF8Bytes(toAnalyze);
return analyze(analyzer, spare.get(), field, consumer);
}
public static int analyze(Analyzer analyzer, CharsRef toAnalyze, String field, TokenConsumer consumer) throws IOException {
try (TokenStream ts = analyzer.tokenStream(
field, new FastCharArrayReader(toAnalyze.chars, toAnalyze.offset, toAnalyze.length))) {
return analyze(ts, consumer);
}
}
/** NOTE: this method closes the TokenStream, even on exception, which is awkward
* because really the caller who called {@link Analyzer#tokenStream} should close it,
* but when trying that there are recursion issues when we try to use the same
* TokenStream twice in the same recursion... */
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
int numTokens = 0;
boolean success = false;
try {
stream.reset();
consumer.reset(stream);
while (stream.incrementToken()) {
consumer.nextToken();
numTokens++;
}
consumer.end();
success = true;
} finally {
if (success) {
stream.close();
} else {
IOUtils.closeWhileHandlingException(stream);
}
}
return numTokens;
}
public static class Fields {
public static final ParseField STRING_DISTANCE = new ParseField("string_distance");
public static final ParseField SUGGEST_MODE = new ParseField("suggest_mode");
public static final ParseField MAX_EDITS = new ParseField("max_edits");
public static final ParseField MAX_INSPECTIONS = new ParseField("max_inspections");
// TODO some of these constants are the same as MLT constants and
// could be moved to a shared class for consistency
public static final ParseField MAX_TERM_FREQ = new ParseField("max_term_freq");
public static final ParseField PREFIX_LENGTH = new ParseField("prefix_length", "prefix_len");
public static final ParseField MIN_WORD_LENGTH = new ParseField("min_word_length", "min_word_len");
public static final ParseField MIN_DOC_FREQ = new ParseField("min_doc_freq");
public static final ParseField SHARD_SIZE = new ParseField("shard_size");
public static final ParseField ANALYZER = new ParseField("analyzer");
public static final ParseField FIELD = new ParseField("field");
public static final ParseField SIZE = new ParseField("size");
public static final ParseField SORT = new ParseField("sort");
public static final ParseField ACCURACY = new ParseField("accuracy");
}
}

CompletionSuggestionBuilder.java

@@ -37,7 +37,6 @@ import org.elasticsearch.index.mapper.MappedFieldType;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.query.QueryParseContext;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.SuggestionBuilder;
import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
import org.elasticsearch.search.suggest.completion.context.ContextMapping;
@@ -48,7 +47,6 @@ import org.elasticsearch.search.suggest.completion2x.context.GeolocationContextM
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -90,10 +88,10 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug
TLP_PARSER.declareField((parser, completionSuggestionContext, context) ->
completionSuggestionContext.regexOptions = RegexOptions.parse(parser, context),
RegexOptions.REGEX_OPTIONS, ObjectParser.ValueType.OBJECT);
TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::field, SuggestUtils.Fields.FIELD);
TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::analyzer, SuggestUtils.Fields.ANALYZER);
TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::size, SuggestUtils.Fields.SIZE);
TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::shardSize, SuggestUtils.Fields.SHARD_SIZE);
TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::field, FIELDNAME_FIELD);
TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::analyzer, ANALYZER_FIELD);
TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::size, SIZE_FIELD);
TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::shardSize, SHARDSIZE_FIELD);
TLP_PARSER.declareField((p, v, c) -> {
// Copy the current structure. We will parse, once the mapping is provided
XContentBuilder builder = XContentFactory.contentBuilder(XContentType.JSON);
@@ -353,7 +351,7 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug
// now we should have field name, check and copy fields over to the suggestion builder we return
if (field == null) {
throw new ElasticsearchParseException(
"the required field option [" + SuggestUtils.Fields.FIELD.getPreferredName() + "] is missing");
"the required field option [" + FIELDNAME_FIELD.getPreferredName() + "] is missing");
}
return new CompletionSuggestionBuilder(field, builder);
}

Correction.java

@@ -20,7 +20,6 @@ package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import java.util.Arrays;
@@ -73,7 +72,7 @@ public final class Correction implements Comparable<Correction> {
len += toJoin[i].length;
}
result.grow(len);
return SuggestUtils.join(separator, result, toJoin);
return WordScorer.join(separator, result, toJoin);
}
/** Lower scores sort first; if scores are equal,

DirectCandidateGenerator.java

@@ -19,6 +19,10 @@
package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
@@ -29,8 +33,10 @@ import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.common.io.FastCharArrayReader;
import java.io.IOException;
import java.util.ArrayList;
@@ -44,7 +50,7 @@ import static java.lang.Math.log10;
import static java.lang.Math.max;
import static java.lang.Math.round;
final class DirectCandidateGenerator extends CandidateGenerator {
public final class DirectCandidateGenerator extends CandidateGenerator {
private final DirectSpellChecker spellchecker;
private final String field;
@@ -140,7 +146,7 @@ final class DirectCandidateGenerator extends CandidateGenerator {
return term;
}
final BytesRefBuilder result = byteSpare;
SuggestUtils.analyze(preFilter, term, field, new SuggestUtils.TokenConsumer() {
analyze(preFilter, term, field, new TokenConsumer() {
@Override
public void nextToken() throws IOException {
@@ -156,7 +162,7 @@ final class DirectCandidateGenerator extends CandidateGenerator {
candidates.add(candidate);
} else {
final BytesRefBuilder result = byteSpare;
SuggestUtils.analyze(postFilter, candidate.term, field, new SuggestUtils.TokenConsumer() {
analyze(postFilter, candidate.term, field, new TokenConsumer() {
@Override
public void nextToken() throws IOException {
this.fillBytesRef(result);
@@ -189,6 +195,27 @@ final class DirectCandidateGenerator extends CandidateGenerator {
}
public abstract static class TokenConsumer {
protected CharTermAttribute charTermAttr;
protected PositionIncrementAttribute posIncAttr;
protected OffsetAttribute offsetAttr;
public void reset(TokenStream stream) {
charTermAttr = stream.addAttribute(CharTermAttribute.class);
posIncAttr = stream.addAttribute(PositionIncrementAttribute.class);
offsetAttr = stream.addAttribute(OffsetAttribute.class);
}
protected BytesRef fillBytesRef(BytesRefBuilder spare) {
spare.copyChars(charTermAttr);
return spare.get();
}
public abstract void nextToken() throws IOException;
public void end() {}
}
public static class CandidateSet {
public Candidate[] candidates;
public final Candidate originalTerm;
@@ -283,4 +310,40 @@ final class DirectCandidateGenerator extends CandidateGenerator {
return new Candidate(term, frequency, channelScore, score(frequency, channelScore, dictSize), userInput);
}
public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare)
throws IOException {
spare.copyUTF8Bytes(toAnalyze);
CharsRef charsRef = spare.get();
try (TokenStream ts = analyzer.tokenStream(
field, new FastCharArrayReader(charsRef.chars, charsRef.offset, charsRef.length))) {
return analyze(ts, consumer);
}
}
/** NOTE: this method closes the TokenStream, even on exception, which is awkward
* because really the caller who called {@link Analyzer#tokenStream} should close it,
* but when trying that there are recursion issues when we try to use the same
* TokenStream twice in the same recursion... */
public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
int numTokens = 0;
boolean success = false;
try {
stream.reset();
consumer.reset(stream);
while (stream.incrementToken()) {
consumer.nextToken();
numTokens++;
}
consumer.end();
success = true;
} finally {
if (success) {
stream.close();
} else {
IOUtils.closeWhileHandlingException(stream);
}
}
return numTokens;
}
}
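
Since DirectCandidateGenerator is now public, the relocated analysis helpers stay reachable from the other suggesters (see TermSuggester and NoisyChannelSpellChecker below). A minimal sketch of the analyze()/TokenConsumer contract; the StandardAnalyzer, the "body" field name, and the surrounding imports are illustrative assumptions, not part of this commit:

// Collect the tokens an analyzer produces for some input text
// (call this from a method that declares "throws IOException").
Analyzer analyzer = new StandardAnalyzer();
List<BytesRef> tokens = new ArrayList<>();
BytesRefBuilder spare = new BytesRefBuilder();
int numTokens = DirectCandidateGenerator.analyze(analyzer, new BytesRef("some text"), "body",
        new DirectCandidateGenerator.TokenConsumer() {
            @Override
            public void nextToken() throws IOException {
                // fillBytesRef copies the current term attribute into the builder
                tokens.add(BytesRef.deepCopyOf(fillBytesRef(spare)));
            }
        }, new CharsRefBuilder());
// numTokens is the number of tokens consumed; tokens holds a copy of each term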

DirectCandidateGeneratorBuilder.java

@@ -51,21 +51,21 @@ public final class DirectCandidateGeneratorBuilder implements CandidateGenerator
private static final String TYPE = "direct_generator";
static final ParseField DIRECT_GENERATOR_FIELD = new ParseField(TYPE);
static final ParseField FIELDNAME_FIELD = new ParseField("field");
static final ParseField PREFILTER_FIELD = new ParseField("pre_filter");
static final ParseField POSTFILTER_FIELD = new ParseField("post_filter");
static final ParseField SUGGESTMODE_FIELD = new ParseField("suggest_mode");
static final ParseField MIN_DOC_FREQ_FIELD = new ParseField("min_doc_freq");
static final ParseField ACCURACY_FIELD = new ParseField("accuracy");
static final ParseField SIZE_FIELD = new ParseField("size");
static final ParseField SORT_FIELD = new ParseField("sort");
static final ParseField STRING_DISTANCE_FIELD = new ParseField("string_distance");
static final ParseField MAX_EDITS_FIELD = new ParseField("max_edits");
static final ParseField MAX_INSPECTIONS_FIELD = new ParseField("max_inspections");
static final ParseField MAX_TERM_FREQ_FIELD = new ParseField("max_term_freq");
static final ParseField PREFIX_LENGTH_FIELD = new ParseField("prefix_length");
static final ParseField MIN_WORD_LENGTH_FIELD = new ParseField("min_word_length");
public static final ParseField DIRECT_GENERATOR_FIELD = new ParseField(TYPE);
public static final ParseField FIELDNAME_FIELD = new ParseField("field");
public static final ParseField PREFILTER_FIELD = new ParseField("pre_filter");
public static final ParseField POSTFILTER_FIELD = new ParseField("post_filter");
public static final ParseField SUGGESTMODE_FIELD = new ParseField("suggest_mode");
public static final ParseField MIN_DOC_FREQ_FIELD = new ParseField("min_doc_freq");
public static final ParseField ACCURACY_FIELD = new ParseField("accuracy");
public static final ParseField SIZE_FIELD = new ParseField("size");
public static final ParseField SORT_FIELD = new ParseField("sort");
public static final ParseField STRING_DISTANCE_FIELD = new ParseField("string_distance");
public static final ParseField MAX_EDITS_FIELD = new ParseField("max_edits");
public static final ParseField MAX_INSPECTIONS_FIELD = new ParseField("max_inspections");
public static final ParseField MAX_TERM_FREQ_FIELD = new ParseField("max_term_freq");
public static final ParseField PREFIX_LENGTH_FIELD = new ParseField("prefix_length");
public static final ParseField MIN_WORD_LENGTH_FIELD = new ParseField("min_word_length");
private final String field;
private String preFilter;
@@ -449,7 +449,8 @@ public final class DirectCandidateGeneratorBuilder implements CandidateGenerator
return new LuceneLevenshteinDistance();
} else if ("levenstein".equals(distanceVal)) {
return new LevensteinDistance();
//TODO Jaro and Winkler are 2 people - so apply same naming logic as damerau_levenshtein
// TODO Jaro and Winkler are 2 people - so apply same naming logic
// as damerau_levenshtein
} else if ("jarowinkler".equals(distanceVal)) {
return new JaroWinklerDistance();
} else if ("ngram".equals(distanceVal)) {

LaplaceScorer.java

@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import java.io.IOException;
@@ -41,15 +40,15 @@ final class LaplaceScorer extends WordScorer {
@Override
protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
SuggestUtils.join(separator, spare, w_1.term, word.term);
join(separator, spare, w_1.term, word.term);
return (alpha + frequency(spare.get())) / (alpha + w_1.frequency + vocabluarySize);
}
@Override
protected double scoreTrigram(Candidate word, Candidate w_1, Candidate w_2) throws IOException {
SuggestUtils.join(separator, spare, w_2.term, w_1.term, word.term);
join(separator, spare, w_2.term, w_1.term, word.term);
long trigramCount = frequency(spare.get());
SuggestUtils.join(separator, spare, w_1.term, word.term);
join(separator, spare, w_1.term, word.term);
return (alpha + trigramCount) / (alpha + frequency(spare.get()) + vocabluarySize);
}
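
For reference, scoreBigram and scoreTrigram above implement plain additive (Laplace) smoothing. Writing \alpha for the alpha constant, c(\cdot) for the frequency of a joined n-gram, and |V| for the vocabluarySize field, the scores are

P(w_i \mid w_{i-1}) = \frac{\alpha + c(w_{i-1}\,w_i)}{\alpha + c(w_{i-1}) + |V|},
\qquad
P(w_i \mid w_{i-2}, w_{i-1}) = \frac{\alpha + c(w_{i-2}\,w_{i-1}\,w_i)}{\alpha + c(w_{i-1}\,w_i) + |V|}.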

LinearInterpolatingScorer.java

@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import java.io.IOException;
@@ -56,7 +55,7 @@ public final class LinearInterpolatingScorer extends WordScorer {
@Override
protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
SuggestUtils.join(separator, spare, w_1.term, word.term);
join(separator, spare, w_1.term, word.term);
final long count = frequency(spare.get());
if (count < 1) {
return unigramLambda * scoreUnigram(word);
@@ -66,12 +65,12 @@ public final class LinearInterpolatingScorer extends WordScorer {
@Override
protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
SuggestUtils.join(separator, spare, w.term, w_1.term, w_2.term);
join(separator, spare, w.term, w_1.term, w_2.term);
final long count = frequency(spare.get());
if (count < 1) {
return scoreBigram(w, w_1);
}
SuggestUtils.join(separator, spare, w.term, w_1.term);
join(separator, spare, w.term, w_1.term);
return trigramLambda * (count / (1.d + frequency(spare.get()))) + scoreBigram(w, w_1);
}

NoisyChannelSpellChecker.java

@@ -28,7 +28,6 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.CharsRefBuilder;
import org.elasticsearch.common.io.FastCharArrayReader;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;
@@ -51,19 +50,19 @@ public final class NoisyChannelSpellChecker {
public NoisyChannelSpellChecker(double nonErrorLikelihood) {
this(nonErrorLikelihood, true, DEFAULT_TOKEN_LIMIT);
}
public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) {
this.realWordLikelihood = nonErrorLikelihood;
this.requireUnigram = requireUnigram;
this.tokenLimit = tokenLimit;
}
public Result getCorrections(TokenStream stream, final CandidateGenerator generator,
float maxErrors, int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {
final List<CandidateSet> candidateSetsList = new ArrayList<>();
SuggestUtils.analyze(stream, new SuggestUtils.TokenConsumer() {
DirectCandidateGenerator.analyze(stream, new DirectCandidateGenerator.TokenConsumer() {
CandidateSet currentSet = null;
private TypeAttribute typeAttribute;
private final BytesRefBuilder termsRef = new BytesRefBuilder();
@@ -74,7 +73,7 @@ public final class NoisyChannelSpellChecker {
super.reset(stream);
typeAttribute = stream.addAttribute(TypeAttribute.class);
}
@Override
public void nextToken() throws IOException {
anyTokens = true;
@@ -96,7 +95,7 @@ public final class NoisyChannelSpellChecker {
currentSet = new CandidateSet(Candidate.EMPTY, generator.createCandidate(BytesRef.deepCopyOf(term), true));
}
}
@Override
public void end() {
if (currentSet != null) {
@@ -107,11 +106,11 @@
}
}
});
if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
return Result.EMPTY;
}
for (CandidateSet candidateSet : candidateSetsList) {
generator.drawCandidates(candidateSet);
}
@@ -127,13 +126,13 @@
cutoffScore = inputPhraseScore * confidence;
}
Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
return new Result(bestCandidates, cutoffScore);
}
public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException {
return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, numCorrections, scorer, confidence, gramSize);
}

PhraseSuggester.java

@@ -45,7 +45,6 @@ import org.elasticsearch.script.ScriptService;
import org.elasticsearch.search.suggest.Suggest.Suggestion;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry;
import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.Suggester;
import org.elasticsearch.search.suggest.SuggestionBuilder;
import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
@@ -84,7 +83,7 @@ public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
final List<CandidateGenerator> gens = new ArrayList<>(generators.size());
for (int i = 0; i < numGenerators; i++) {
PhraseSuggestionContext.DirectCandidateGenerator generator = generators.get(i);
DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(generator);
DirectSpellChecker directSpellChecker = generator.createDirectSpellChecker();
Terms terms = MultiFields.getTerms(indexReader, generator.field());
if (terms != null) {
gens.add(new DirectCandidateGenerator(directSpellChecker, generator.field(), generator.suggestMode(),

PhraseSuggestionBuilder.java

@@ -43,7 +43,6 @@ import org.elasticsearch.script.CompiledScript;
import org.elasticsearch.script.Script;
import org.elasticsearch.script.ScriptContext;
import org.elasticsearch.script.ScriptService;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.SuggestionBuilder;
import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
import org.elasticsearch.search.suggest.phrase.PhraseSuggestionContext.DirectCandidateGenerator;
@@ -596,7 +595,7 @@ public class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSuggestionB
// now we should have field name, check and copy fields over to the suggestion builder we return
if (fieldname == null) {
throw new ElasticsearchParseException(
"the required field option [" + SuggestUtils.Fields.FIELD.getPreferredName() + "] is missing");
"the required field option [" + FIELDNAME_FIELD.getPreferredName() + "] is missing");
}
return new PhraseSuggestionBuilder(fieldname, tmpSuggestion);
}

StupidBackoffScorer.java

@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
import java.io.IOException;
@@ -41,7 +40,7 @@ class StupidBackoffScorer extends WordScorer {
@Override
protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
SuggestUtils.join(separator, spare, w_1.term, word.term);
join(separator, spare, w_1.term, word.term);
final long count = frequency(spare.get());
if (count < 1) {
return discount * scoreUnigram(word);
@@ -53,12 +52,12 @@
protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
// First see if there are bigrams. If there aren't then skip looking up the trigram. This saves lookups
// when the bigrams and trigrams are rare and we need both anyway.
SuggestUtils.join(separator, spare, w_1.term, w.term);
join(separator, spare, w_1.term, w.term);
long bigramCount = frequency(spare.get());
if (bigramCount < 1) {
return discount * scoreUnigram(w);
}
SuggestUtils.join(separator, spare, w_2.term, w_1.term, w.term);
join(separator, spare, w_2.term, w_1.term, w.term);
long trigramCount = frequency(spare.get());
if (trigramCount < 1) {
return discount * (bigramCount / (w_1.frequency + 0.00000000001d));

WordScorer.java

@@ -100,6 +100,16 @@ public abstract class WordScorer {
return scoreBigram(word, w_1);
}
public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
result.clear();
for (int i = 0; i < toJoin.length - 1; i++) {
result.append(toJoin[i]);
result.append(separator);
}
result.append(toJoin[toJoin.length-1]);
return result.get();
}
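// Illustrative example (not part of this commit): joining three terms with a
// single-space separator reuses the builder's backing array and yields "foo bar baz":
//
//   BytesRefBuilder spare = new BytesRefBuilder();
//   BytesRef joined = join(new BytesRef(" "), spare,
//       new BytesRef("foo"), new BytesRef("bar"), new BytesRef("baz"));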
public interface WordScorerFactory {
WordScorer newScorer(IndexReader reader, Terms terms,
String field, double realWordLikelyhood, BytesRef separator) throws IOException;

TermSuggester.java

@@ -30,10 +30,10 @@ import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.index.query.QueryParseContext;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.Suggester;
import org.elasticsearch.search.suggest.SuggestionBuilder;
import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator;
import java.io.IOException;
import java.util.ArrayList;
@@ -48,7 +48,7 @@ public final class TermSuggester extends Suggester<TermSuggestionContext> {
@Override
public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexSearcher searcher, CharsRefBuilder spare)
throws IOException {
DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(suggestion.getDirectSpellCheckerSettings());
DirectSpellChecker directSpellChecker = suggestion.getDirectSpellCheckerSettings().createDirectSpellChecker();
final IndexReader indexReader = searcher.getIndexReader();
TermSuggestion response = new TermSuggestion(
name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort()
@@ -70,10 +70,11 @@ public final class TermSuggester extends Suggester<TermSuggestionContext> {
return response;
}
private List<Token> queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException {
private static List<Token> queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException {
final List<Token> result = new ArrayList<>();
final String field = suggestion.getField();
SuggestUtils.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, new SuggestUtils.TokenConsumer() {
DirectCandidateGenerator.analyze(suggestion.getAnalyzer(), suggestion.getText(), field,
new DirectCandidateGenerator.TokenConsumer() {
@Override
public void nextToken() {
Term term = new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder())));

TermSuggestionBuilder.java

@@ -37,7 +37,6 @@ import org.elasticsearch.index.query.QueryParseContext;
import org.elasticsearch.index.query.QueryShardContext;
import org.elasticsearch.search.suggest.DirectSpellcheckerSettings;
import org.elasticsearch.search.suggest.SortBy;
import org.elasticsearch.search.suggest.SuggestUtils;
import org.elasticsearch.search.suggest.SuggestionBuilder;
import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
@@ -52,16 +51,16 @@ import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAUL
import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_DOC_FREQ;
import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_WORD_LENGTH;
import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_PREFIX_LENGTH;
import static org.elasticsearch.search.suggest.SuggestUtils.Fields.ACCURACY;
import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MAX_EDITS;
import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MAX_INSPECTIONS;
import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MAX_TERM_FREQ;
import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MIN_DOC_FREQ;
import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MIN_WORD_LENGTH;
import static org.elasticsearch.search.suggest.SuggestUtils.Fields.PREFIX_LENGTH;
import static org.elasticsearch.search.suggest.SuggestUtils.Fields.SORT;
import static org.elasticsearch.search.suggest.SuggestUtils.Fields.STRING_DISTANCE;
import static org.elasticsearch.search.suggest.SuggestUtils.Fields.SUGGEST_MODE;
import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.ACCURACY_FIELD;
import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_EDITS_FIELD;
import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_INSPECTIONS_FIELD;
import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_TERM_FREQ_FIELD;
import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_DOC_FREQ_FIELD;
import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_WORD_LENGTH_FIELD;
import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.PREFIX_LENGTH_FIELD;
import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SORT_FIELD;
import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.STRING_DISTANCE_FIELD;
import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SUGGESTMODE_FIELD;
/**
* Defines the actual suggest command. Each command uses the global options
@@ -376,16 +375,16 @@ public class TermSuggestionBuilder extends SuggestionBuilder<TermSuggestionBuild
@Override
public XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
builder.field(SUGGEST_MODE.getPreferredName(), suggestMode);
builder.field(ACCURACY.getPreferredName(), accuracy);
builder.field(SORT.getPreferredName(), sort);
builder.field(STRING_DISTANCE.getPreferredName(), stringDistance);
builder.field(MAX_EDITS.getPreferredName(), maxEdits);
builder.field(MAX_INSPECTIONS.getPreferredName(), maxInspections);
builder.field(MAX_TERM_FREQ.getPreferredName(), maxTermFreq);
builder.field(PREFIX_LENGTH.getPreferredName(), prefixLength);
builder.field(MIN_WORD_LENGTH.getPreferredName(), minWordLength);
builder.field(MIN_DOC_FREQ.getPreferredName(), minDocFreq);
builder.field(SUGGESTMODE_FIELD.getPreferredName(), suggestMode);
builder.field(ACCURACY_FIELD.getPreferredName(), accuracy);
builder.field(SORT_FIELD.getPreferredName(), sort);
builder.field(STRING_DISTANCE_FIELD.getPreferredName(), stringDistance);
builder.field(MAX_EDITS_FIELD.getPreferredName(), maxEdits);
builder.field(MAX_INSPECTIONS_FIELD.getPreferredName(), maxInspections);
builder.field(MAX_TERM_FREQ_FIELD.getPreferredName(), maxTermFreq);
builder.field(PREFIX_LENGTH_FIELD.getPreferredName(), prefixLength);
builder.field(MIN_WORD_LENGTH_FIELD.getPreferredName(), minWordLength);
builder.field(MIN_DOC_FREQ_FIELD.getPreferredName(), minDocFreq);
return builder;
}
@@ -408,25 +407,25 @@ public class TermSuggestionBuilder extends SuggestionBuild
tmpSuggestion.size(parser.intValue());
} else if (parseFieldMatcher.match(currentFieldName, SuggestionBuilder.SHARDSIZE_FIELD)) {
tmpSuggestion.shardSize(parser.intValue());
} else if (parseFieldMatcher.match(currentFieldName, SUGGEST_MODE)) {
} else if (parseFieldMatcher.match(currentFieldName, SUGGESTMODE_FIELD)) {
tmpSuggestion.suggestMode(SuggestMode.resolve(parser.text()));
} else if (parseFieldMatcher.match(currentFieldName, ACCURACY)) {
} else if (parseFieldMatcher.match(currentFieldName, ACCURACY_FIELD)) {
tmpSuggestion.accuracy(parser.floatValue());
} else if (parseFieldMatcher.match(currentFieldName, SORT)) {
} else if (parseFieldMatcher.match(currentFieldName, SORT_FIELD)) {
tmpSuggestion.sort(SortBy.resolve(parser.text()));
} else if (parseFieldMatcher.match(currentFieldName, STRING_DISTANCE)) {
} else if (parseFieldMatcher.match(currentFieldName, STRING_DISTANCE_FIELD)) {
tmpSuggestion.stringDistance(StringDistanceImpl.resolve(parser.text()));
} else if (parseFieldMatcher.match(currentFieldName, MAX_EDITS)) {
} else if (parseFieldMatcher.match(currentFieldName, MAX_EDITS_FIELD)) {
tmpSuggestion.maxEdits(parser.intValue());
} else if (parseFieldMatcher.match(currentFieldName, MAX_INSPECTIONS)) {
} else if (parseFieldMatcher.match(currentFieldName, MAX_INSPECTIONS_FIELD)) {
tmpSuggestion.maxInspections(parser.intValue());
} else if (parseFieldMatcher.match(currentFieldName, MAX_TERM_FREQ)) {
} else if (parseFieldMatcher.match(currentFieldName, MAX_TERM_FREQ_FIELD)) {
tmpSuggestion.maxTermFreq(parser.floatValue());
} else if (parseFieldMatcher.match(currentFieldName, PREFIX_LENGTH)) {
} else if (parseFieldMatcher.match(currentFieldName, PREFIX_LENGTH_FIELD)) {
tmpSuggestion.prefixLength(parser.intValue());
} else if (parseFieldMatcher.match(currentFieldName, MIN_WORD_LENGTH)) {
} else if (parseFieldMatcher.match(currentFieldName, MIN_WORD_LENGTH_FIELD)) {
tmpSuggestion.minWordLength(parser.intValue());
} else if (parseFieldMatcher.match(currentFieldName, MIN_DOC_FREQ)) {
} else if (parseFieldMatcher.match(currentFieldName, MIN_DOC_FREQ_FIELD)) {
tmpSuggestion.minDocFreq(parser.floatValue());
} else {
throw new ParsingException(parser.getTokenLocation(),
@@ -440,7 +439,7 @@ public class TermSuggestionBuilder extends SuggestionBuild
// now we should have field name, check and copy fields over to the suggestion builder we return
if (fieldname == null) {
throw new ElasticsearchParseException(
"the required field option [" + SuggestUtils.Fields.FIELD.getPreferredName() + "] is missing");
"the required field option [" + FIELDNAME_FIELD.getPreferredName() + "] is missing");
}
return new TermSuggestionBuilder(fieldname, tmpSuggestion);
}
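
Programmatic callers are unaffected by the constant shuffle; for reference, a sketch of setting the same options through the builder, mirroring the parser above (the field name and values are illustrative, and the conventional fluent setter returns are assumed):

TermSuggestionBuilder suggestion = new TermSuggestionBuilder("body") // illustrative field name
        .suggestMode(SuggestMode.ALWAYS)  // suggest_mode
        .maxEdits(2)                      // max_edits
        .prefixLength(1)                  // prefix_length
        .minWordLength(4)                 // min_word_length
        .minDocFreq(0.0f);                // min_doc_freq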