Merge pull request #19920 from cbuescher/remove-SuggestUtil
Remove SuggestUtil helper class
commit 563bf0154c
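This PR dissolves the static SuggestUtils helper into the classes that use it: getDirectSpellChecker() becomes DirectSpellcheckerSettings.createDirectSpellChecker(), join() moves to WordScorer, analyze() and the TokenConsumer callback move to DirectCandidateGenerator (made public for that purpose), and the SuggestUtils.Fields ParseField constants give way to the now-public constants on DirectCandidateGeneratorBuilder. A minimal before/after sketch of the main call-site change (the settings variable is illustrative, not taken from the diff):

    // before: static helper class
    DirectSpellChecker checker = SuggestUtils.getDirectSpellChecker(settings);

    // after: instance method on DirectSpellcheckerSettings itself
    DirectSpellChecker checker = settings.createDirectSpellChecker();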
DirectSpellcheckerSettings.java:
@@ -21,8 +21,13 @@ package org.elasticsearch.search.suggest;
 import org.apache.lucene.search.spell.DirectSpellChecker;
 import org.apache.lucene.search.spell.StringDistance;
 import org.apache.lucene.search.spell.SuggestMode;
+import org.apache.lucene.search.spell.SuggestWord;
+import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
+import org.apache.lucene.search.spell.SuggestWordQueue;
 import org.apache.lucene.util.automaton.LevenshteinAutomata;
 
+import java.util.Comparator;
+
 public class DirectSpellcheckerSettings {
 
     // NB: If this changes, make sure to change the default in TermBuilderSuggester
@@ -49,6 +54,9 @@ public class DirectSpellcheckerSettings {
     private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
     private float minDocFreq = DEFAULT_MIN_DOC_FREQ;
 
+    private static final Comparator<SuggestWord> LUCENE_FREQUENCY = new SuggestWordFrequencyComparator();
+    private static final Comparator<SuggestWord> SCORE_COMPARATOR = SuggestWordQueue.DEFAULT_COMPARATOR;
+
     public SuggestMode suggestMode() {
         return suggestMode;
     }
@@ -129,6 +137,33 @@ public class DirectSpellcheckerSettings {
         this.minDocFreq = minDocFreq;
     }
 
+    public DirectSpellChecker createDirectSpellChecker() {
+
+        DirectSpellChecker directSpellChecker = new DirectSpellChecker();
+        directSpellChecker.setAccuracy(accuracy());
+        Comparator<SuggestWord> comparator;
+        switch (sort()) {
+            case SCORE:
+                comparator = SCORE_COMPARATOR;
+                break;
+            case FREQUENCY:
+                comparator = LUCENE_FREQUENCY;
+                break;
+            default:
+                throw new IllegalArgumentException("Illegal suggest sort: " + sort());
+        }
+        directSpellChecker.setComparator(comparator);
+        directSpellChecker.setDistance(stringDistance());
+        directSpellChecker.setMaxEdits(maxEdits());
+        directSpellChecker.setMaxInspections(maxInspections());
+        directSpellChecker.setMaxQueryFrequency(maxTermFreq());
+        directSpellChecker.setMinPrefix(prefixLength());
+        directSpellChecker.setMinQueryLength(minWordLength());
+        directSpellChecker.setThresholdFrequency(minDocFreq());
+        directSpellChecker.setLowerCaseTerms(false);
+        return directSpellChecker;
+    }
+
     @Override
     public String toString() {
         return "[" +
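createDirectSpellChecker() is a straight transfer of each configured value onto a fresh Lucene DirectSpellChecker, with result ordering chosen by sort(): SCORE uses SuggestWordQueue.DEFAULT_COMPARATOR, FREQUENCY uses SuggestWordFrequencyComparator. A hedged usage sketch, assuming a settings object already populated elsewhere (e.g. by the suggest parser):

    DirectSpellcheckerSettings settings = ...; // filled in by the term-suggest parser
    DirectSpellChecker checker = settings.createDirectSpellChecker();
    // checker now reflects accuracy(), maxEdits(), minDocFreq(), sort(), etc.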
SuggestUtils.java (deleted):
@@ -1,162 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.elasticsearch.search.suggest;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.search.spell.DirectSpellChecker;
-import org.apache.lucene.search.spell.SuggestWord;
-import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
-import org.apache.lucene.search.spell.SuggestWordQueue;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.CharsRefBuilder;
-import org.apache.lucene.util.IOUtils;
-import org.elasticsearch.common.ParseField;
-import org.elasticsearch.common.io.FastCharArrayReader;
-
-import java.io.IOException;
-import java.util.Comparator;
-
-public final class SuggestUtils {
-    private static final Comparator<SuggestWord> LUCENE_FREQUENCY = new SuggestWordFrequencyComparator();
-    private static final Comparator<SuggestWord> SCORE_COMPARATOR = SuggestWordQueue.DEFAULT_COMPARATOR;
-
-    private SuggestUtils() {
-        // utils!!
-    }
-
-    public static DirectSpellChecker getDirectSpellChecker(DirectSpellcheckerSettings suggestion) {
-        DirectSpellChecker directSpellChecker = new DirectSpellChecker();
-        directSpellChecker.setAccuracy(suggestion.accuracy());
-        Comparator<SuggestWord> comparator;
-        switch (suggestion.sort()) {
-            case SCORE:
-                comparator = SCORE_COMPARATOR;
-                break;
-            case FREQUENCY:
-                comparator = LUCENE_FREQUENCY;
-                break;
-            default:
-                throw new IllegalArgumentException("Illegal suggest sort: " + suggestion.sort());
-        }
-        directSpellChecker.setComparator(comparator);
-        directSpellChecker.setDistance(suggestion.stringDistance());
-        directSpellChecker.setMaxEdits(suggestion.maxEdits());
-        directSpellChecker.setMaxInspections(suggestion.maxInspections());
-        directSpellChecker.setMaxQueryFrequency(suggestion.maxTermFreq());
-        directSpellChecker.setMinPrefix(suggestion.prefixLength());
-        directSpellChecker.setMinQueryLength(suggestion.minWordLength());
-        directSpellChecker.setThresholdFrequency(suggestion.minDocFreq());
-        directSpellChecker.setLowerCaseTerms(false);
-        return directSpellChecker;
-    }
-
-    public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
-        result.clear();
-        for (int i = 0; i < toJoin.length - 1; i++) {
-            result.append(toJoin[i]);
-            result.append(separator);
-        }
-        result.append(toJoin[toJoin.length-1]);
-        return result.get();
-    }
-
-    public abstract static class TokenConsumer {
-        protected CharTermAttribute charTermAttr;
-        protected PositionIncrementAttribute posIncAttr;
-        protected OffsetAttribute offsetAttr;
-
-        public void reset(TokenStream stream) {
-            charTermAttr = stream.addAttribute(CharTermAttribute.class);
-            posIncAttr = stream.addAttribute(PositionIncrementAttribute.class);
-            offsetAttr = stream.addAttribute(OffsetAttribute.class);
-        }
-
-        protected BytesRef fillBytesRef(BytesRefBuilder spare) {
-            spare.copyChars(charTermAttr);
-            return spare.get();
-        }
-
-        public abstract void nextToken() throws IOException;
-
-        public void end() {}
-    }
-
-    public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare) throws IOException {
-        spare.copyUTF8Bytes(toAnalyze);
-        return analyze(analyzer, spare.get(), field, consumer);
-    }
-
-    public static int analyze(Analyzer analyzer, CharsRef toAnalyze, String field, TokenConsumer consumer) throws IOException {
-        try (TokenStream ts = analyzer.tokenStream(
-                     field, new FastCharArrayReader(toAnalyze.chars, toAnalyze.offset, toAnalyze.length))) {
-            return analyze(ts, consumer);
-        }
-    }
-
-    /** NOTE: this method closes the TokenStream, even on exception, which is awkward
-     *  because really the caller who called {@link Analyzer#tokenStream} should close it,
-     *  but when trying that there are recursion issues when we try to use the same
-     *  TokenStream twice in the same recursion... */
-    public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
-        int numTokens = 0;
-        boolean success = false;
-        try {
-            stream.reset();
-            consumer.reset(stream);
-            while (stream.incrementToken()) {
-                consumer.nextToken();
-                numTokens++;
-            }
-            consumer.end();
-            success = true;
-        } finally {
-            if (success) {
-                stream.close();
-            } else {
-                IOUtils.closeWhileHandlingException(stream);
-            }
-        }
-        return numTokens;
-    }
-
-    public static class Fields {
-        public static final ParseField STRING_DISTANCE = new ParseField("string_distance");
-        public static final ParseField SUGGEST_MODE = new ParseField("suggest_mode");
-        public static final ParseField MAX_EDITS = new ParseField("max_edits");
-        public static final ParseField MAX_INSPECTIONS = new ParseField("max_inspections");
-        // TODO some of these constants are the same as MLT constants and
-        // could be moved to a shared class for consistency
-        public static final ParseField MAX_TERM_FREQ = new ParseField("max_term_freq");
-        public static final ParseField PREFIX_LENGTH = new ParseField("prefix_length", "prefix_len");
-        public static final ParseField MIN_WORD_LENGTH = new ParseField("min_word_length", "min_word_len");
-        public static final ParseField MIN_DOC_FREQ = new ParseField("min_doc_freq");
-        public static final ParseField SHARD_SIZE = new ParseField("shard_size");
-        public static final ParseField ANALYZER = new ParseField("analyzer");
-        public static final ParseField FIELD = new ParseField("field");
-        public static final ParseField SIZE = new ParseField("size");
-        public static final ParseField SORT = new ParseField("sort");
-        public static final ParseField ACCURACY = new ParseField("accuracy");
-    }
-}
CompletionSuggestionBuilder.java:
@@ -37,7 +37,6 @@ import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.index.query.QueryParseContext;
 import org.elasticsearch.index.query.QueryShardContext;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.SuggestionBuilder;
 import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
 import org.elasticsearch.search.suggest.completion.context.ContextMapping;
@@ -48,7 +47,6 @@ import org.elasticsearch.search.suggest.completion2x.context.GeolocationContextM
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -90,10 +88,10 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug
         TLP_PARSER.declareField((parser, completionSuggestionContext, context) ->
                 completionSuggestionContext.regexOptions = RegexOptions.parse(parser, context),
             RegexOptions.REGEX_OPTIONS, ObjectParser.ValueType.OBJECT);
-        TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::field, SuggestUtils.Fields.FIELD);
-        TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::analyzer, SuggestUtils.Fields.ANALYZER);
-        TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::size, SuggestUtils.Fields.SIZE);
-        TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::shardSize, SuggestUtils.Fields.SHARD_SIZE);
+        TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::field, FIELDNAME_FIELD);
+        TLP_PARSER.declareString(CompletionSuggestionBuilder.InnerBuilder::analyzer, ANALYZER_FIELD);
+        TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::size, SIZE_FIELD);
+        TLP_PARSER.declareInt(CompletionSuggestionBuilder.InnerBuilder::shardSize, SHARDSIZE_FIELD);
         TLP_PARSER.declareField((p, v, c) -> {
             // Copy the current structure. We will parse, once the mapping is provided
             XContentBuilder builder = XContentFactory.contentBuilder(XContentType.JSON);
@@ -353,7 +351,7 @@ public class CompletionSuggestionBuilder extends SuggestionBuilder<CompletionSug
         // now we should have field name, check and copy fields over to the suggestion builder we return
         if (field == null) {
             throw new ElasticsearchParseException(
-                "the required field option [" + SuggestUtils.Fields.FIELD.getPreferredName() + "] is missing");
+                "the required field option [" + FIELDNAME_FIELD.getPreferredName() + "] is missing");
         }
         return new CompletionSuggestionBuilder(field, builder);
     }
Correction.java:
@@ -20,7 +20,6 @@ package org.elasticsearch.search.suggest.phrase;
 
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 
 import java.util.Arrays;
@@ -73,7 +72,7 @@ public final class Correction implements Comparable<Correction> {
             len += toJoin[i].length;
         }
         result.grow(len);
-        return SuggestUtils.join(separator, result, toJoin);
+        return WordScorer.join(separator, result, toJoin);
     }
 
     /** Lower scores sorts first; if scores are equal,
DirectCandidateGenerator.java:
@@ -19,6 +19,10 @@
 package org.elasticsearch.search.suggest.phrase;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Term;
@@ -29,8 +33,10 @@ import org.apache.lucene.search.spell.SuggestMode;
 import org.apache.lucene.search.spell.SuggestWord;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.CharsRefBuilder;
-import org.elasticsearch.search.suggest.SuggestUtils;
+import org.apache.lucene.util.IOUtils;
+import org.elasticsearch.common.io.FastCharArrayReader;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -44,7 +50,7 @@ import static java.lang.Math.log10;
 import static java.lang.Math.max;
 import static java.lang.Math.round;
 
-final class DirectCandidateGenerator extends CandidateGenerator {
+public final class DirectCandidateGenerator extends CandidateGenerator {
 
     private final DirectSpellChecker spellchecker;
     private final String field;
@@ -140,7 +146,7 @@
             return term;
         }
         final BytesRefBuilder result = byteSpare;
-        SuggestUtils.analyze(preFilter, term, field, new SuggestUtils.TokenConsumer() {
+        analyze(preFilter, term, field, new TokenConsumer() {
 
             @Override
             public void nextToken() throws IOException {
@@ -156,7 +162,7 @@
                 candidates.add(candidate);
             } else {
                 final BytesRefBuilder result = byteSpare;
-                SuggestUtils.analyze(postFilter, candidate.term, field, new SuggestUtils.TokenConsumer() {
+                analyze(postFilter, candidate.term, field, new TokenConsumer() {
                     @Override
                     public void nextToken() throws IOException {
                         this.fillBytesRef(result);
@@ -189,6 +195,27 @@
 
     }
 
+    public abstract static class TokenConsumer {
+        protected CharTermAttribute charTermAttr;
+        protected PositionIncrementAttribute posIncAttr;
+        protected OffsetAttribute offsetAttr;
+
+        public void reset(TokenStream stream) {
+            charTermAttr = stream.addAttribute(CharTermAttribute.class);
+            posIncAttr = stream.addAttribute(PositionIncrementAttribute.class);
+            offsetAttr = stream.addAttribute(OffsetAttribute.class);
+        }
+
+        protected BytesRef fillBytesRef(BytesRefBuilder spare) {
+            spare.copyChars(charTermAttr);
+            return spare.get();
+        }
+
+        public abstract void nextToken() throws IOException;
+
+        public void end() {}
+    }
+
     public static class CandidateSet {
         public Candidate[] candidates;
         public final Candidate originalTerm;
@@ -283,4 +310,40 @@
         return new Candidate(term, frequency, channelScore, score(frequency, channelScore, dictSize), userInput);
     }
 
+    public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer, CharsRefBuilder spare)
+            throws IOException {
+        spare.copyUTF8Bytes(toAnalyze);
+        CharsRef charsRef = spare.get();
+        try (TokenStream ts = analyzer.tokenStream(
+                     field, new FastCharArrayReader(charsRef.chars, charsRef.offset, charsRef.length))) {
+            return analyze(ts, consumer);
+        }
+    }
+
+    /** NOTE: this method closes the TokenStream, even on exception, which is awkward
+     *  because really the caller who called {@link Analyzer#tokenStream} should close it,
+     *  but when trying that there are recursion issues when we try to use the same
+     *  TokenStream twice in the same recursion... */
+    public static int analyze(TokenStream stream, TokenConsumer consumer) throws IOException {
+        int numTokens = 0;
+        boolean success = false;
+        try {
+            stream.reset();
+            consumer.reset(stream);
+            while (stream.incrementToken()) {
+                consumer.nextToken();
+                numTokens++;
+            }
+            consumer.end();
+            success = true;
+        } finally {
+            if (success) {
+                stream.close();
+            } else {
+                IOUtils.closeWhileHandlingException(stream);
+            }
+        }
+        return numTokens;
+    }
+
 }
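With analyze() and TokenConsumer now hosted here, the callers in this diff (TermSuggester, NoisyChannelSpellChecker) all follow the same pattern: hand an analyzer and a term to analyze() and collect tokens in an anonymous TokenConsumer. A minimal standalone sketch of that pattern, assuming a plain Lucene WhitespaceAnalyzer (not part of this change) and eliding the IOException handling a real caller needs:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefBuilder;
    import org.apache.lucene.util.CharsRefBuilder;

    Analyzer analyzer = new WhitespaceAnalyzer();
    BytesRefBuilder spare = new BytesRefBuilder();
    int numTokens = DirectCandidateGenerator.analyze(analyzer, new BytesRef("foo bar"), "body",
        new DirectCandidateGenerator.TokenConsumer() {
            @Override
            public void nextToken() {
                BytesRef term = fillBytesRef(spare); // current token, as UTF-8 bytes
            }
        }, new CharsRefBuilder());
    // numTokens == 2 for whitespace tokenization of "foo bar"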
DirectCandidateGeneratorBuilder.java:
@@ -51,21 +51,21 @@ public final class DirectCandidateGeneratorBuilder implements CandidateGenerator
 
     private static final String TYPE = "direct_generator";
 
-    static final ParseField DIRECT_GENERATOR_FIELD = new ParseField(TYPE);
-    static final ParseField FIELDNAME_FIELD = new ParseField("field");
-    static final ParseField PREFILTER_FIELD = new ParseField("pre_filter");
-    static final ParseField POSTFILTER_FIELD = new ParseField("post_filter");
-    static final ParseField SUGGESTMODE_FIELD = new ParseField("suggest_mode");
-    static final ParseField MIN_DOC_FREQ_FIELD = new ParseField("min_doc_freq");
-    static final ParseField ACCURACY_FIELD = new ParseField("accuracy");
-    static final ParseField SIZE_FIELD = new ParseField("size");
-    static final ParseField SORT_FIELD = new ParseField("sort");
-    static final ParseField STRING_DISTANCE_FIELD = new ParseField("string_distance");
-    static final ParseField MAX_EDITS_FIELD = new ParseField("max_edits");
-    static final ParseField MAX_INSPECTIONS_FIELD = new ParseField("max_inspections");
-    static final ParseField MAX_TERM_FREQ_FIELD = new ParseField("max_term_freq");
-    static final ParseField PREFIX_LENGTH_FIELD = new ParseField("prefix_length");
-    static final ParseField MIN_WORD_LENGTH_FIELD = new ParseField("min_word_length");
+    public static final ParseField DIRECT_GENERATOR_FIELD = new ParseField(TYPE);
+    public static final ParseField FIELDNAME_FIELD = new ParseField("field");
+    public static final ParseField PREFILTER_FIELD = new ParseField("pre_filter");
+    public static final ParseField POSTFILTER_FIELD = new ParseField("post_filter");
+    public static final ParseField SUGGESTMODE_FIELD = new ParseField("suggest_mode");
+    public static final ParseField MIN_DOC_FREQ_FIELD = new ParseField("min_doc_freq");
+    public static final ParseField ACCURACY_FIELD = new ParseField("accuracy");
+    public static final ParseField SIZE_FIELD = new ParseField("size");
+    public static final ParseField SORT_FIELD = new ParseField("sort");
+    public static final ParseField STRING_DISTANCE_FIELD = new ParseField("string_distance");
+    public static final ParseField MAX_EDITS_FIELD = new ParseField("max_edits");
+    public static final ParseField MAX_INSPECTIONS_FIELD = new ParseField("max_inspections");
+    public static final ParseField MAX_TERM_FREQ_FIELD = new ParseField("max_term_freq");
+    public static final ParseField PREFIX_LENGTH_FIELD = new ParseField("prefix_length");
+    public static final ParseField MIN_WORD_LENGTH_FIELD = new ParseField("min_word_length");
 
     private final String field;
     private String preFilter;
@@ -449,7 +449,8 @@ public final class DirectCandidateGeneratorBuilder implements CandidateGenerator
             return new LuceneLevenshteinDistance();
         } else if ("levenstein".equals(distanceVal)) {
             return new LevensteinDistance();
-            //TODO Jaro and Winkler are 2 people - so apply same naming logic as damerau_levenshtein
+            // TODO Jaro and Winkler are 2 people - so apply same naming logic
+            // as damerau_levenshtein
         } else if ("jarowinkler".equals(distanceVal)) {
             return new JaroWinklerDistance();
         } else if ("ngram".equals(distanceVal)) {
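Widening these constants from package-private to public is what lets the other suggesters drop SuggestUtils.Fields: TermSuggestionBuilder, further down in this diff, now statically imports them, for example:

    import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.ACCURACY_FIELD;

    builder.field(ACCURACY_FIELD.getPreferredName(), accuracy);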
LaplaceScorer.java:
@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 
 import java.io.IOException;
@@ -41,15 +40,15 @@ final class LaplaceScorer extends WordScorer {
 
     @Override
     protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
         return (alpha + frequency(spare.get())) / (alpha + w_1.frequency + vocabluarySize);
     }
 
     @Override
     protected double scoreTrigram(Candidate word, Candidate w_1, Candidate w_2) throws IOException {
-        SuggestUtils.join(separator, spare, w_2.term, w_1.term, word.term);
+        join(separator, spare, w_2.term, w_1.term, word.term);
         long trigramCount = frequency(spare.get());
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
         return (alpha + trigramCount) / (alpha + frequency(spare.get()) + vocabluarySize);
     }
 
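For reference, the bigram score above is add-alpha (Laplace) smoothing; in LaTeX form, with c(.) an observed n-gram frequency and |V| the vocabulary size held in the vocabluarySize field:

    \[ \mathrm{score}(w_i \mid w_{i-1}) = \frac{\alpha + c(w_{i-1}\,w_i)}{\alpha + c(w_{i-1}) + |V|} \]

The trigram branch is the same ratio one order higher, with the bigram count in the denominator.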
LinearInterpolatingScorer.java:
@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 
 import java.io.IOException;
@@ -56,7 +55,7 @@ public final class LinearInterpolatingScorer extends WordScorer {
 
     @Override
     protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
         final long count = frequency(spare.get());
         if (count < 1) {
             return unigramLambda * scoreUnigram(word);
@@ -66,12 +65,12 @@ public final class LinearInterpolatingScorer extends WordScorer {
 
     @Override
     protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
-        SuggestUtils.join(separator, spare, w.term, w_1.term, w_2.term);
+        join(separator, spare, w.term, w_1.term, w_2.term);
         final long count = frequency(spare.get());
         if (count < 1) {
             return scoreBigram(w, w_1);
         }
-        SuggestUtils.join(separator, spare, w.term, w_1.term);
+        join(separator, spare, w.term, w_1.term);
         return trigramLambda * (count / (1.d + frequency(spare.get()))) + scoreBigram(w, w_1);
     }
 
NoisyChannelSpellChecker.java:
@@ -28,7 +28,6 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.CharsRefBuilder;
 import org.elasticsearch.common.io.FastCharArrayReader;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet;
 
@@ -51,19 +50,19 @@ public final class NoisyChannelSpellChecker {
     public NoisyChannelSpellChecker(double nonErrorLikelihood) {
         this(nonErrorLikelihood, true, DEFAULT_TOKEN_LIMIT);
     }
 
     public NoisyChannelSpellChecker(double nonErrorLikelihood, boolean requireUnigram, int tokenLimit) {
         this.realWordLikelihood = nonErrorLikelihood;
         this.requireUnigram = requireUnigram;
         this.tokenLimit = tokenLimit;
 
     }
 
     public Result getCorrections(TokenStream stream, final CandidateGenerator generator,
             float maxErrors, int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {
 
         final List<CandidateSet> candidateSetsList = new ArrayList<>();
-        SuggestUtils.analyze(stream, new SuggestUtils.TokenConsumer() {
+        DirectCandidateGenerator.analyze(stream, new DirectCandidateGenerator.TokenConsumer() {
             CandidateSet currentSet = null;
             private TypeAttribute typeAttribute;
             private final BytesRefBuilder termsRef = new BytesRefBuilder();
@@ -74,7 +73,7 @@ public final class NoisyChannelSpellChecker {
                 super.reset(stream);
                 typeAttribute = stream.addAttribute(TypeAttribute.class);
             }
 
             @Override
             public void nextToken() throws IOException {
                 anyTokens = true;
@@ -96,7 +95,7 @@ public final class NoisyChannelSpellChecker {
                     currentSet = new CandidateSet(Candidate.EMPTY, generator.createCandidate(BytesRef.deepCopyOf(term), true));
                 }
             }
 
             @Override
             public void end() {
                 if (currentSet != null) {
@@ -107,11 +106,11 @@ public final class NoisyChannelSpellChecker {
                 }
             }
         });
 
         if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
             return Result.EMPTY;
         }
 
         for (CandidateSet candidateSet : candidateSetsList) {
             generator.drawCandidates(candidateSet);
         }
@@ -127,13 +126,13 @@ public final class NoisyChannelSpellChecker {
             cutoffScore = inputPhraseScore * confidence;
         }
         Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
 
         return new Result(bestCandidates, cutoffScore);
     }
 
     public Result getCorrections(Analyzer analyzer, BytesRef query, CandidateGenerator generator,
             float maxErrors, int numCorrections, IndexReader reader, String analysisField, WordScorer scorer, float confidence, int gramSize) throws IOException {
 
         return getCorrections(tokenStream(analyzer, query, new CharsRefBuilder(), analysisField), generator, maxErrors, numCorrections, scorer, confidence, gramSize);
 
     }
PhraseSuggester.java:
@@ -45,7 +45,6 @@ import org.elasticsearch.script.ScriptService;
 import org.elasticsearch.search.suggest.Suggest.Suggestion;
 import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry;
 import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.Suggester;
 import org.elasticsearch.search.suggest.SuggestionBuilder;
 import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
@@ -84,7 +83,7 @@ public final class PhraseSuggester extends Suggester<PhraseSuggestionContext> {
         final List<CandidateGenerator> gens = new ArrayList<>(generators.size());
         for (int i = 0; i < numGenerators; i++) {
             PhraseSuggestionContext.DirectCandidateGenerator generator = generators.get(i);
-            DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(generator);
+            DirectSpellChecker directSpellChecker = generator.createDirectSpellChecker();
             Terms terms = MultiFields.getTerms(indexReader, generator.field());
             if (terms != null) {
                 gens.add(new DirectCandidateGenerator(directSpellChecker, generator.field(), generator.suggestMode(),
PhraseSuggestionBuilder.java:
@@ -43,7 +43,6 @@ import org.elasticsearch.script.CompiledScript;
 import org.elasticsearch.script.Script;
 import org.elasticsearch.script.ScriptContext;
 import org.elasticsearch.script.ScriptService;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.SuggestionBuilder;
 import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
 import org.elasticsearch.search.suggest.phrase.PhraseSuggestionContext.DirectCandidateGenerator;
@@ -596,7 +595,7 @@ public class PhraseSuggestionBuilder extends SuggestionBuilder<PhraseSuggestionB
         // now we should have field name, check and copy fields over to the suggestion builder we return
         if (fieldname == null) {
             throw new ElasticsearchParseException(
-                "the required field option [" + SuggestUtils.Fields.FIELD.getPreferredName() + "] is missing");
+                "the required field option [" + FIELDNAME_FIELD.getPreferredName() + "] is missing");
         }
         return new PhraseSuggestionBuilder(fieldname, tmpSuggestion);
     }
StupidBackoffScorer.java:
@@ -21,7 +21,6 @@ package org.elasticsearch.search.suggest.phrase;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.util.BytesRef;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate;
 
 import java.io.IOException;
@@ -41,7 +40,7 @@ class StupidBackoffScorer extends WordScorer {
 
     @Override
     protected double scoreBigram(Candidate word, Candidate w_1) throws IOException {
-        SuggestUtils.join(separator, spare, w_1.term, word.term);
+        join(separator, spare, w_1.term, word.term);
         final long count = frequency(spare.get());
         if (count < 1) {
             return discount * scoreUnigram(word);
@@ -53,12 +52,12 @@ class StupidBackoffScorer extends WordScorer {
     protected double scoreTrigram(Candidate w, Candidate w_1, Candidate w_2) throws IOException {
         // First see if there are bigrams. If there aren't then skip looking up the trigram. This saves lookups
         // when the bigrams and trigrams are rare and we need both anyway.
-        SuggestUtils.join(separator, spare, w_1.term, w.term);
+        join(separator, spare, w_1.term, w.term);
         long bigramCount = frequency(spare.get());
         if (bigramCount < 1) {
             return discount * scoreUnigram(w);
         }
-        SuggestUtils.join(separator, spare, w_2.term, w_1.term, w.term);
+        join(separator, spare, w_2.term, w_1.term, w.term);
         long trigramCount = frequency(spare.get());
         if (trigramCount < 1) {
             return discount * (bigramCount / (w_1.frequency + 0.00000000001d));
WordScorer.java:
@@ -100,6 +100,16 @@ public abstract class WordScorer {
         return scoreBigram(word, w_1);
     }
 
+    public static BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef... toJoin) {
+        result.clear();
+        for (int i = 0; i < toJoin.length - 1; i++) {
+            result.append(toJoin[i]);
+            result.append(separator);
+        }
+        result.append(toJoin[toJoin.length-1]);
+        return result.get();
+    }
+
     public interface WordScorerFactory {
         WordScorer newScorer(IndexReader reader, Terms terms,
             String field, double realWordLikelyhood, BytesRef separator) throws IOException;
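join() is fully self-contained, so a small worked example pins down its behavior: it interleaves the separator between the pieces and returns the builder's view of the bytes. A sketch using stock Lucene types:

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefBuilder;

    BytesRefBuilder spare = new BytesRefBuilder();
    BytesRef joined = WordScorer.join(new BytesRef(" "), spare,
            new BytesRef("foo"), new BytesRef("bar"), new BytesRef("baz"));
    assert "foo bar baz".equals(joined.utf8ToString());

The scorers above use exactly this to build the bigram/trigram keys they look up with frequency(spare.get()).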
TermSuggester.java:
@@ -30,10 +30,10 @@ import org.elasticsearch.common.bytes.BytesArray;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.text.Text;
 import org.elasticsearch.index.query.QueryParseContext;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.Suggester;
 import org.elasticsearch.search.suggest.SuggestionBuilder;
 import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
+import org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -48,7 +48,7 @@ public final class TermSuggester extends Suggester<TermSuggestionContext> {
     @Override
     public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexSearcher searcher, CharsRefBuilder spare)
             throws IOException {
-        DirectSpellChecker directSpellChecker = SuggestUtils.getDirectSpellChecker(suggestion.getDirectSpellCheckerSettings());
+        DirectSpellChecker directSpellChecker = suggestion.getDirectSpellCheckerSettings().createDirectSpellChecker();
         final IndexReader indexReader = searcher.getIndexReader();
         TermSuggestion response = new TermSuggestion(
                 name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort()
@@ -70,10 +70,11 @@ public final class TermSuggester extends Suggester<TermSuggestionContext> {
         return response;
     }
 
-    private List<Token> queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException {
+    private static List<Token> queryTerms(SuggestionContext suggestion, CharsRefBuilder spare) throws IOException {
         final List<Token> result = new ArrayList<>();
         final String field = suggestion.getField();
-        SuggestUtils.analyze(suggestion.getAnalyzer(), suggestion.getText(), field, new SuggestUtils.TokenConsumer() {
+        DirectCandidateGenerator.analyze(suggestion.getAnalyzer(), suggestion.getText(), field,
+            new DirectCandidateGenerator.TokenConsumer() {
             @Override
             public void nextToken() {
                 Term term = new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder())));
TermSuggestionBuilder.java:
@@ -37,7 +37,6 @@ import org.elasticsearch.index.query.QueryParseContext;
 import org.elasticsearch.index.query.QueryShardContext;
 import org.elasticsearch.search.suggest.DirectSpellcheckerSettings;
 import org.elasticsearch.search.suggest.SortBy;
-import org.elasticsearch.search.suggest.SuggestUtils;
 import org.elasticsearch.search.suggest.SuggestionBuilder;
 import org.elasticsearch.search.suggest.SuggestionSearchContext.SuggestionContext;
 
@@ -52,16 +51,16 @@ import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAUL
 import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_DOC_FREQ;
 import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_MIN_WORD_LENGTH;
 import static org.elasticsearch.search.suggest.DirectSpellcheckerSettings.DEFAULT_PREFIX_LENGTH;
-import static org.elasticsearch.search.suggest.SuggestUtils.Fields.ACCURACY;
-import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MAX_EDITS;
-import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MAX_INSPECTIONS;
-import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MAX_TERM_FREQ;
-import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MIN_DOC_FREQ;
-import static org.elasticsearch.search.suggest.SuggestUtils.Fields.MIN_WORD_LENGTH;
-import static org.elasticsearch.search.suggest.SuggestUtils.Fields.PREFIX_LENGTH;
-import static org.elasticsearch.search.suggest.SuggestUtils.Fields.SORT;
-import static org.elasticsearch.search.suggest.SuggestUtils.Fields.STRING_DISTANCE;
-import static org.elasticsearch.search.suggest.SuggestUtils.Fields.SUGGEST_MODE;
+import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.ACCURACY_FIELD;
+import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_EDITS_FIELD;
+import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_INSPECTIONS_FIELD;
+import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MAX_TERM_FREQ_FIELD;
+import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_DOC_FREQ_FIELD;
+import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.MIN_WORD_LENGTH_FIELD;
+import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.PREFIX_LENGTH_FIELD;
+import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SORT_FIELD;
+import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.STRING_DISTANCE_FIELD;
+import static org.elasticsearch.search.suggest.phrase.DirectCandidateGeneratorBuilder.SUGGESTMODE_FIELD;
 
 /**
  * Defines the actual suggest command. Each command uses the global options
@@ -376,16 +375,16 @@ public class TermSuggestionBuilder extends SuggestionBuilder<TermSuggestionBuild
 
     @Override
     public XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException {
-        builder.field(SUGGEST_MODE.getPreferredName(), suggestMode);
-        builder.field(ACCURACY.getPreferredName(), accuracy);
-        builder.field(SORT.getPreferredName(), sort);
-        builder.field(STRING_DISTANCE.getPreferredName(), stringDistance);
-        builder.field(MAX_EDITS.getPreferredName(), maxEdits);
-        builder.field(MAX_INSPECTIONS.getPreferredName(), maxInspections);
-        builder.field(MAX_TERM_FREQ.getPreferredName(), maxTermFreq);
-        builder.field(PREFIX_LENGTH.getPreferredName(), prefixLength);
-        builder.field(MIN_WORD_LENGTH.getPreferredName(), minWordLength);
-        builder.field(MIN_DOC_FREQ.getPreferredName(), minDocFreq);
+        builder.field(SUGGESTMODE_FIELD.getPreferredName(), suggestMode);
+        builder.field(ACCURACY_FIELD.getPreferredName(), accuracy);
+        builder.field(SORT_FIELD.getPreferredName(), sort);
+        builder.field(STRING_DISTANCE_FIELD.getPreferredName(), stringDistance);
+        builder.field(MAX_EDITS_FIELD.getPreferredName(), maxEdits);
+        builder.field(MAX_INSPECTIONS_FIELD.getPreferredName(), maxInspections);
+        builder.field(MAX_TERM_FREQ_FIELD.getPreferredName(), maxTermFreq);
+        builder.field(PREFIX_LENGTH_FIELD.getPreferredName(), prefixLength);
+        builder.field(MIN_WORD_LENGTH_FIELD.getPreferredName(), minWordLength);
+        builder.field(MIN_DOC_FREQ_FIELD.getPreferredName(), minDocFreq);
         return builder;
     }
 
@@ -408,25 +407,25 @@ public class TermSuggestionBuilder extends SuggestionBuilder<TermSuggestionBuild
                     tmpSuggestion.size(parser.intValue());
                 } else if (parseFieldMatcher.match(currentFieldName, SuggestionBuilder.SHARDSIZE_FIELD)) {
                     tmpSuggestion.shardSize(parser.intValue());
-                } else if (parseFieldMatcher.match(currentFieldName, SUGGEST_MODE)) {
+                } else if (parseFieldMatcher.match(currentFieldName, SUGGESTMODE_FIELD)) {
                     tmpSuggestion.suggestMode(SuggestMode.resolve(parser.text()));
-                } else if (parseFieldMatcher.match(currentFieldName, ACCURACY)) {
+                } else if (parseFieldMatcher.match(currentFieldName, ACCURACY_FIELD)) {
                     tmpSuggestion.accuracy(parser.floatValue());
-                } else if (parseFieldMatcher.match(currentFieldName, SORT)) {
+                } else if (parseFieldMatcher.match(currentFieldName, SORT_FIELD)) {
                     tmpSuggestion.sort(SortBy.resolve(parser.text()));
-                } else if (parseFieldMatcher.match(currentFieldName, STRING_DISTANCE)) {
+                } else if (parseFieldMatcher.match(currentFieldName, STRING_DISTANCE_FIELD)) {
                     tmpSuggestion.stringDistance(StringDistanceImpl.resolve(parser.text()));
-                } else if (parseFieldMatcher.match(currentFieldName, MAX_EDITS)) {
+                } else if (parseFieldMatcher.match(currentFieldName, MAX_EDITS_FIELD)) {
                     tmpSuggestion.maxEdits(parser.intValue());
-                } else if (parseFieldMatcher.match(currentFieldName, MAX_INSPECTIONS)) {
+                } else if (parseFieldMatcher.match(currentFieldName, MAX_INSPECTIONS_FIELD)) {
                     tmpSuggestion.maxInspections(parser.intValue());
-                } else if (parseFieldMatcher.match(currentFieldName, MAX_TERM_FREQ)) {
+                } else if (parseFieldMatcher.match(currentFieldName, MAX_TERM_FREQ_FIELD)) {
                     tmpSuggestion.maxTermFreq(parser.floatValue());
-                } else if (parseFieldMatcher.match(currentFieldName, PREFIX_LENGTH)) {
+                } else if (parseFieldMatcher.match(currentFieldName, PREFIX_LENGTH_FIELD)) {
                     tmpSuggestion.prefixLength(parser.intValue());
-                } else if (parseFieldMatcher.match(currentFieldName, MIN_WORD_LENGTH)) {
+                } else if (parseFieldMatcher.match(currentFieldName, MIN_WORD_LENGTH_FIELD)) {
                     tmpSuggestion.minWordLength(parser.intValue());
-                } else if (parseFieldMatcher.match(currentFieldName, MIN_DOC_FREQ)) {
+                } else if (parseFieldMatcher.match(currentFieldName, MIN_DOC_FREQ_FIELD)) {
                     tmpSuggestion.minDocFreq(parser.floatValue());
                 } else {
                     throw new ParsingException(parser.getTokenLocation(),
@@ -440,7 +439,7 @@ public class TermSuggestionBuilder extends SuggestionBuilder<TermSuggestionBuild
         // now we should have field name, check and copy fields over to the suggestion builder we return
         if (fieldname == null) {
             throw new ElasticsearchParseException(
-                "the required field option [" + SuggestUtils.Fields.FIELD.getPreferredName() + "] is missing");
+                "the required field option [" + FIELDNAME_FIELD.getPreferredName() + "] is missing");
         }
         return new TermSuggestionBuilder(fieldname, tmpSuggestion);
     }
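Taken together, innerToXContent() above emits one key per renamed ParseField, so a term suggester's options serialize as the following shape (values elided; only the keys are taken from this diff):

    {
      "suggest_mode": ...,
      "accuracy": ...,
      "sort": ...,
      "string_distance": ...,
      "max_edits": ...,
      "max_inspections": ...,
      "max_term_freq": ...,
      "prefix_length": ...,
      "min_word_length": ...,
      "min_doc_freq": ...
    }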