use a base ShingleFilterFactory to simplify default shingle detection
This commit is contained in:
parent
0f95499703
commit
1eb24d7efc
|
@ -32,48 +32,83 @@ import org.elasticsearch.index.settings.IndexSettings;
|
|||
*/
|
||||
public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
|
||||
|
||||
private final int maxShingleSize;
|
||||
|
||||
private final boolean outputUnigrams;
|
||||
|
||||
private final boolean outputUnigramsIfNoShingles;
|
||||
|
||||
private String tokenSeparator;
|
||||
|
||||
private int minShingleSize;
|
||||
private final Factory factory;
|
||||
|
||||
@Inject
|
||||
public ShingleTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
|
||||
super(index, indexSettings, name, settings);
|
||||
maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||
minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
|
||||
outputUnigrams = settings.getAsBoolean("output_unigrams", true);
|
||||
outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
|
||||
tokenSeparator = settings.get("token_separator", ShingleFilter.TOKEN_SEPARATOR);
|
||||
Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||
Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
|
||||
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
|
||||
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
|
||||
String tokenSeparator = settings.get("token_separator", ShingleFilter.TOKEN_SEPARATOR);
|
||||
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||
filter.setTokenSeparator(tokenSeparator);
|
||||
return filter;
|
||||
return factory.create(tokenStream);
|
||||
}
|
||||
|
||||
public int getMaxShingleSize() {
|
||||
return maxShingleSize;
|
||||
|
||||
public Factory getInnerFactory() {
|
||||
return this.factory;
|
||||
}
|
||||
|
||||
public int getMinShingleSize() {
|
||||
return minShingleSize;
|
||||
}
|
||||
|
||||
public boolean getOutputUnigrams() {
|
||||
return outputUnigrams;
|
||||
}
|
||||
|
||||
public boolean getOutputUnigramsIfNoShingles() {
|
||||
return outputUnigramsIfNoShingles;
|
||||
public static final class Factory implements TokenFilterFactory {
|
||||
private final int maxShingleSize;
|
||||
|
||||
private final boolean outputUnigrams;
|
||||
|
||||
private final boolean outputUnigramsIfNoShingles;
|
||||
|
||||
private final String tokenSeparator;
|
||||
|
||||
private int minShingleSize;
|
||||
|
||||
private final String name;
|
||||
|
||||
public Factory(String name) {
|
||||
this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.TOKEN_SEPARATOR);
|
||||
}
|
||||
|
||||
Factory(String name, int minShingleSize, int maxShingleSize, boolean outputUnigrams, boolean outputUnigramsIfNoShingles, String tokenSeparator) {
|
||||
this.maxShingleSize = maxShingleSize;
|
||||
this.outputUnigrams = outputUnigrams;
|
||||
this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
|
||||
this.tokenSeparator = tokenSeparator;
|
||||
this.minShingleSize = minShingleSize;
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
|
||||
filter.setOutputUnigrams(outputUnigrams);
|
||||
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
|
||||
filter.setTokenSeparator(tokenSeparator);
|
||||
return filter;
|
||||
}
|
||||
|
||||
public int getMaxShingleSize() {
|
||||
return maxShingleSize;
|
||||
}
|
||||
|
||||
public int getMinShingleSize() {
|
||||
return minShingleSize;
|
||||
}
|
||||
|
||||
public boolean getOutputUnigrams() {
|
||||
return outputUnigrams;
|
||||
}
|
||||
|
||||
public boolean getOutputUnigramsIfNoShingles() {
|
||||
return outputUnigramsIfNoShingles;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String name() {
|
||||
return name;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -504,17 +504,7 @@ public class IndicesAnalysisService extends AbstractComponent {
|
|||
}
|
||||
}));
|
||||
|
||||
tokenFilterFactories.put("shingle", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
|
||||
@Override
|
||||
public String name() {
|
||||
return "shingle";
|
||||
}
|
||||
|
||||
@Override
|
||||
public TokenStream create(TokenStream tokenStream) {
|
||||
return new ShingleFilter(tokenStream, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
|
||||
}
|
||||
}));
|
||||
tokenFilterFactories.put("shingle", new PreBuiltTokenFilterFactoryFactory(new ShingleTokenFilterFactory.Factory("shingle")));
|
||||
|
||||
tokenFilterFactories.put("unique", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
|
||||
@Override
|
||||
|
|
|
@ -23,7 +23,6 @@ import java.util.Comparator;
|
|||
import java.util.Locale;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CustomAnalyzerWrapper;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
|
@ -276,21 +275,21 @@ public final class SuggestUtils {
|
|||
}
|
||||
|
||||
|
||||
public static ShingleTokenFilterFactory getShingleFilterFactory(Analyzer analyzer) {
|
||||
public static ShingleTokenFilterFactory.Factory getShingleFilterFactory(Analyzer analyzer) {
|
||||
if (analyzer instanceof NamedAnalyzer) {
|
||||
analyzer = ((NamedAnalyzer)analyzer).analyzer();
|
||||
}
|
||||
if (analyzer instanceof CustomAnalyzer) {
|
||||
CustomAnalyzer a = (CustomAnalyzer) analyzer;
|
||||
TokenFilterFactory[] tokenFilters = a.tokenFilters();
|
||||
final CustomAnalyzer a = (CustomAnalyzer) analyzer;
|
||||
final TokenFilterFactory[] tokenFilters = a.tokenFilters();
|
||||
for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
|
||||
if (tokenFilterFactory instanceof ShingleTokenFilterFactory) {
|
||||
return ((ShingleTokenFilterFactory) tokenFilterFactory);
|
||||
return ((ShingleTokenFilterFactory)tokenFilterFactory).getInnerFactory();
|
||||
} else if (tokenFilterFactory instanceof ShingleTokenFilterFactory.Factory) {
|
||||
return (ShingleTokenFilterFactory.Factory) tokenFilterFactory;
|
||||
}
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -216,7 +216,7 @@ public final class PhraseSuggestParser implements SuggestContextParser {
|
|||
}
|
||||
|
||||
if (!gramSizeSet || suggestion.generators().isEmpty()) {
|
||||
final ShingleTokenFilterFactory shingleFilterFactory = SuggestUtils.getShingleFilterFactory(suggestion.getAnalyzer() == null ? context.mapperService().fieldSearchAnalyzer(suggestion.getField()) : suggestion.getAnalyzer()); ;
|
||||
final ShingleTokenFilterFactory.Factory shingleFilterFactory = SuggestUtils.getShingleFilterFactory(suggestion.getAnalyzer() == null ? context.mapperService().fieldSearchAnalyzer(suggestion.getField()) : suggestion.getAnalyzer()); ;
|
||||
if (!gramSizeSet) {
|
||||
// try to detect the shingle size
|
||||
if (shingleFilterFactory != null) {
|
||||
|
|
|
@ -740,6 +740,8 @@ public class SuggestSearchTests extends AbstractNodesTests {
|
|||
builder.putArray("index.analysis.analyzer.bigram.filter", "my_shingle", "lowercase");
|
||||
builder.put("index.analysis.analyzer.ngram.tokenizer", "standard");
|
||||
builder.putArray("index.analysis.analyzer.ngram.filter", "my_shingle2", "lowercase");
|
||||
builder.put("index.analysis.analyzer.myDefAnalyzer.tokenizer", "standard");
|
||||
builder.putArray("index.analysis.analyzer.myDefAnalyzer.filter", "shingle", "lowercase");
|
||||
builder.put("index.analysis.filter.my_shingle.type", "shingle");
|
||||
builder.put("index.analysis.filter.my_shingle.output_unigrams", false);
|
||||
builder.put("index.analysis.filter.my_shingle.min_shingle_size", 2);
|
||||
|
@ -833,6 +835,23 @@ public class SuggestSearchTests extends AbstractNodesTests {
|
|||
SearchResponse search = client.prepareSearch()
|
||||
.setSearchType(SearchType.COUNT)
|
||||
.setSuggestText("Xor the Got-Jewel")
|
||||
.addSuggestion(
|
||||
phraseSuggestion("simple_phrase").maxErrors(0.5f).field("ngram").analyzer("myDefAnalyzer")
|
||||
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
|
||||
.size(1)).execute().actionGet();
|
||||
|
||||
assertThat(Arrays.toString(search.getShardFailures()), search.getFailedShards(), equalTo(0));
|
||||
assertThat(search.getSuggest(), notNullValue());
|
||||
assertThat(search.getSuggest().size(), equalTo(1));
|
||||
assertThat(search.getSuggest().getSuggestion("simple_phrase").getName(), equalTo("simple_phrase"));
|
||||
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().size(), equalTo(1));
|
||||
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(1));
|
||||
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getText().string(), equalTo("Xor the Got-Jewel"));
|
||||
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().get(0).getText().string(), equalTo("xorr the god jewel"));
|
||||
|
||||
search = client.prepareSearch()
|
||||
.setSearchType(SearchType.COUNT)
|
||||
.setSuggestText("Xor the Got-Jewel")
|
||||
.addSuggestion(
|
||||
phraseSuggestion("simple_phrase").maxErrors(0.5f).field("ngram")
|
||||
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
|
||||
|
@ -846,6 +865,7 @@ public class SuggestSearchTests extends AbstractNodesTests {
|
|||
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(1));
|
||||
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getText().string(), equalTo("Xor the Got-Jewel"));
|
||||
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().get(0).getText().string(), equalTo("xorr the god jewel"));
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue