mirror of
https://github.com/honeymoose/OpenSearch.git
synced 2025-02-17 18:35:25 +00:00
use a base ShingleFilterFactory to simplify default shingle detection
This commit is contained in:
parent
0f95499703
commit
1eb24d7efc
@ -32,48 +32,83 @@ import org.elasticsearch.index.settings.IndexSettings;
|
|||||||
*/
|
*/
|
||||||
public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
|
public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||||
|
|
||||||
private final int maxShingleSize;
|
private final Factory factory;
|
||||||
|
|
||||||
private final boolean outputUnigrams;
|
|
||||||
|
|
||||||
private final boolean outputUnigramsIfNoShingles;
|
|
||||||
|
|
||||||
private String tokenSeparator;
|
|
||||||
|
|
||||||
private int minShingleSize;
|
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public ShingleTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
public ShingleTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
||||||
super(index, indexSettings, name, settings);
|
super(index, indexSettings, name, settings);
|
||||||
maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||||
minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
|
Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
|
||||||
outputUnigrams = settings.getAsBoolean("output_unigrams", true);
|
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
|
||||||
outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
|
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
|
||||||
tokenSeparator = settings.get("token_separator", ShingleFilter.TOKEN_SEPARATOR);
|
String tokenSeparator = settings.get("token_separator", ShingleFilter.TOKEN_SEPARATOR);
|
||||||
|
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
|
return factory.create(tokenStream);
|
||||||
filter.setOutputUnigrams(outputUnigrams);
|
|
||||||
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
|
||||||
filter.setTokenSeparator(tokenSeparator);
|
|
||||||
return filter;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getMaxShingleSize() {
|
|
||||||
return maxShingleSize;
|
public Factory getInnerFactory() {
|
||||||
|
return this.factory;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getMinShingleSize() {
|
public static final class Factory implements TokenFilterFactory {
|
||||||
return minShingleSize;
|
private final int maxShingleSize;
|
||||||
}
|
|
||||||
|
|
||||||
public boolean getOutputUnigrams() {
|
private final boolean outputUnigrams;
|
||||||
return outputUnigrams;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean getOutputUnigramsIfNoShingles() {
|
private final boolean outputUnigramsIfNoShingles;
|
||||||
return outputUnigramsIfNoShingles;
|
|
||||||
|
private final String tokenSeparator;
|
||||||
|
|
||||||
|
private int minShingleSize;
|
||||||
|
|
||||||
|
private final String name;
|
||||||
|
|
||||||
|
public Factory(String name) {
|
||||||
|
this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.TOKEN_SEPARATOR);
|
||||||
|
}
|
||||||
|
|
||||||
|
Factory(String name, int minShingleSize, int maxShingleSize, boolean outputUnigrams, boolean outputUnigramsIfNoShingles, String tokenSeparator) {
|
||||||
|
this.maxShingleSize = maxShingleSize;
|
||||||
|
this.outputUnigrams = outputUnigrams;
|
||||||
|
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
||||||
|
this.tokenSeparator = tokenSeparator;
|
||||||
|
this.minShingleSize = minShingleSize;
|
||||||
|
this.name = name;
|
||||||
|
}
|
||||||
|
|
||||||
|
public TokenStream create(TokenStream tokenStream) {
|
||||||
|
ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
|
||||||
|
filter.setOutputUnigrams(outputUnigrams);
|
||||||
|
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||||
|
filter.setTokenSeparator(tokenSeparator);
|
||||||
|
return filter;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMaxShingleSize() {
|
||||||
|
return maxShingleSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getMinShingleSize() {
|
||||||
|
return minShingleSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean getOutputUnigrams() {
|
||||||
|
return outputUnigrams;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean getOutputUnigramsIfNoShingles() {
|
||||||
|
return outputUnigramsIfNoShingles;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String name() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -504,17 +504,7 @@ public class IndicesAnalysisService extends AbstractComponent {
|
|||||||
}
|
}
|
||||||
}));
|
}));
|
||||||
|
|
||||||
tokenFilterFactories.put("shingle", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
|
tokenFilterFactories.put("shingle", new PreBuiltTokenFilterFactoryFactory(new ShingleTokenFilterFactory.Factory("shingle")));
|
||||||
@Override
|
|
||||||
public String name() {
|
|
||||||
return "shingle";
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public TokenStream create(TokenStream tokenStream) {
|
|
||||||
return new ShingleFilter(tokenStream, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
|
||||||
}
|
|
||||||
}));
|
|
||||||
|
|
||||||
tokenFilterFactories.put("unique", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
|
tokenFilterFactories.put("unique", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
|
||||||
@Override
|
@Override
|
||||||
|
@ -23,7 +23,6 @@ import java.util.Comparator;
|
|||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.CustomAnalyzerWrapper;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
@ -276,21 +275,21 @@ public final class SuggestUtils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static ShingleTokenFilterFactory getShingleFilterFactory(Analyzer analyzer) {
|
public static ShingleTokenFilterFactory.Factory getShingleFilterFactory(Analyzer analyzer) {
|
||||||
if (analyzer instanceof NamedAnalyzer) {
|
if (analyzer instanceof NamedAnalyzer) {
|
||||||
analyzer = ((NamedAnalyzer)analyzer).analyzer();
|
analyzer = ((NamedAnalyzer)analyzer).analyzer();
|
||||||
}
|
}
|
||||||
if (analyzer instanceof CustomAnalyzer) {
|
if (analyzer instanceof CustomAnalyzer) {
|
||||||
CustomAnalyzer a = (CustomAnalyzer) analyzer;
|
final CustomAnalyzer a = (CustomAnalyzer) analyzer;
|
||||||
TokenFilterFactory[] tokenFilters = a.tokenFilters();
|
final TokenFilterFactory[] tokenFilters = a.tokenFilters();
|
||||||
for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
|
for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
|
||||||
if (tokenFilterFactory instanceof ShingleTokenFilterFactory) {
|
if (tokenFilterFactory instanceof ShingleTokenFilterFactory) {
|
||||||
return ((ShingleTokenFilterFactory) tokenFilterFactory);
|
return ((ShingleTokenFilterFactory)tokenFilterFactory).getInnerFactory();
|
||||||
|
} else if (tokenFilterFactory instanceof ShingleTokenFilterFactory.Factory) {
|
||||||
|
return (ShingleTokenFilterFactory.Factory) tokenFilterFactory;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -216,7 +216,7 @@ public final class PhraseSuggestParser implements SuggestContextParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!gramSizeSet || suggestion.generators().isEmpty()) {
|
if (!gramSizeSet || suggestion.generators().isEmpty()) {
|
||||||
final ShingleTokenFilterFactory shingleFilterFactory = SuggestUtils.getShingleFilterFactory(suggestion.getAnalyzer() == null ? context.mapperService().fieldSearchAnalyzer(suggestion.getField()) : suggestion.getAnalyzer()); ;
|
final ShingleTokenFilterFactory.Factory shingleFilterFactory = SuggestUtils.getShingleFilterFactory(suggestion.getAnalyzer() == null ? context.mapperService().fieldSearchAnalyzer(suggestion.getField()) : suggestion.getAnalyzer()); ;
|
||||||
if (!gramSizeSet) {
|
if (!gramSizeSet) {
|
||||||
// try to detect the shingle size
|
// try to detect the shingle size
|
||||||
if (shingleFilterFactory != null) {
|
if (shingleFilterFactory != null) {
|
||||||
|
@ -740,6 +740,8 @@ public class SuggestSearchTests extends AbstractNodesTests {
|
|||||||
builder.putArray("index.analysis.analyzer.bigram.filter", "my_shingle", "lowercase");
|
builder.putArray("index.analysis.analyzer.bigram.filter", "my_shingle", "lowercase");
|
||||||
builder.put("index.analysis.analyzer.ngram.tokenizer", "standard");
|
builder.put("index.analysis.analyzer.ngram.tokenizer", "standard");
|
||||||
builder.putArray("index.analysis.analyzer.ngram.filter", "my_shingle2", "lowercase");
|
builder.putArray("index.analysis.analyzer.ngram.filter", "my_shingle2", "lowercase");
|
||||||
|
builder.put("index.analysis.analyzer.myDefAnalyzer.tokenizer", "standard");
|
||||||
|
builder.putArray("index.analysis.analyzer.myDefAnalyzer.filter", "shingle", "lowercase");
|
||||||
builder.put("index.analysis.filter.my_shingle.type", "shingle");
|
builder.put("index.analysis.filter.my_shingle.type", "shingle");
|
||||||
builder.put("index.analysis.filter.my_shingle.output_unigrams", false);
|
builder.put("index.analysis.filter.my_shingle.output_unigrams", false);
|
||||||
builder.put("index.analysis.filter.my_shingle.min_shingle_size", 2);
|
builder.put("index.analysis.filter.my_shingle.min_shingle_size", 2);
|
||||||
@ -833,6 +835,23 @@ public class SuggestSearchTests extends AbstractNodesTests {
|
|||||||
SearchResponse search = client.prepareSearch()
|
SearchResponse search = client.prepareSearch()
|
||||||
.setSearchType(SearchType.COUNT)
|
.setSearchType(SearchType.COUNT)
|
||||||
.setSuggestText("Xor the Got-Jewel")
|
.setSuggestText("Xor the Got-Jewel")
|
||||||
|
.addSuggestion(
|
||||||
|
phraseSuggestion("simple_phrase").maxErrors(0.5f).field("ngram").analyzer("myDefAnalyzer")
|
||||||
|
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
|
||||||
|
.size(1)).execute().actionGet();
|
||||||
|
|
||||||
|
assertThat(Arrays.toString(search.getShardFailures()), search.getFailedShards(), equalTo(0));
|
||||||
|
assertThat(search.getSuggest(), notNullValue());
|
||||||
|
assertThat(search.getSuggest().size(), equalTo(1));
|
||||||
|
assertThat(search.getSuggest().getSuggestion("simple_phrase").getName(), equalTo("simple_phrase"));
|
||||||
|
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().size(), equalTo(1));
|
||||||
|
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(1));
|
||||||
|
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getText().string(), equalTo("Xor the Got-Jewel"));
|
||||||
|
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().get(0).getText().string(), equalTo("xorr the god jewel"));
|
||||||
|
|
||||||
|
search = client.prepareSearch()
|
||||||
|
.setSearchType(SearchType.COUNT)
|
||||||
|
.setSuggestText("Xor the Got-Jewel")
|
||||||
.addSuggestion(
|
.addSuggestion(
|
||||||
phraseSuggestion("simple_phrase").maxErrors(0.5f).field("ngram")
|
phraseSuggestion("simple_phrase").maxErrors(0.5f).field("ngram")
|
||||||
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
|
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
|
||||||
@ -846,6 +865,7 @@ public class SuggestSearchTests extends AbstractNodesTests {
|
|||||||
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(1));
|
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(1));
|
||||||
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getText().string(), equalTo("Xor the Got-Jewel"));
|
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getText().string(), equalTo("Xor the Got-Jewel"));
|
||||||
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().get(0).getText().string(), equalTo("xorr the god jewel"));
|
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().get(0).getText().string(), equalTo("xorr the god jewel"));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user