use a base ShingleFilterFactory to simplify default shingle detection

This commit is contained in:
Simon Willnauer 2013-03-05 12:32:50 +01:00
parent 0f95499703
commit 1eb24d7efc
5 changed files with 94 additions and 50 deletions

View File

@ -32,48 +32,83 @@ import org.elasticsearch.index.settings.IndexSettings;
*/
public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
private final int maxShingleSize;
private final boolean outputUnigrams;
private final boolean outputUnigramsIfNoShingles;
private String tokenSeparator;
private int minShingleSize;
private final Factory factory;
/**
 * Reads the shingle configuration from the filter {@code settings}.
 * Recognized keys are {@code max_shingle_size}, {@code min_shingle_size},
 * {@code output_unigrams} (default {@code true}),
 * {@code output_unigrams_if_no_shingles} (default {@code false}) and
 * {@code token_separator}; the size and separator defaults come from the
 * {@link ShingleFilter} constants.
 */
@Inject
public ShingleTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettings, name, settings);
maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
outputUnigrams = settings.getAsBoolean("output_unigrams", true);
outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
tokenSeparator = settings.get("token_separator", ShingleFilter.TOKEN_SEPARATOR);
Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
String tokenSeparator = settings.get("token_separator", ShingleFilter.TOKEN_SEPARATOR);
// The parsed values are handed to the inner Factory, which builds the actual filter.
factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator);
}
/**
 * Wraps {@code tokenStream} in a {@link ShingleFilter} configured with the
 * shingle sizes, unigram options and token separator of this factory.
 */
@Override
public TokenStream create(TokenStream tokenStream) {
ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
filter.setOutputUnigrams(outputUnigrams);
filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
filter.setTokenSeparator(tokenSeparator);
return filter;
return factory.create(tokenStream);
}
public int getMaxShingleSize() {
return maxShingleSize;
public Factory getInnerFactory() {
return this.factory;
}
public int getMinShingleSize() {
return minShingleSize;
}
public boolean getOutputUnigrams() {
return outputUnigrams;
}
public boolean getOutputUnigramsIfNoShingles() {
return outputUnigramsIfNoShingles;
/**
 * Plain holder of shingle parameters that builds {@link ShingleFilter}
 * instances. It does not depend on index settings, so it can be constructed
 * directly with the {@link ShingleFilter} defaults or with explicit values.
 * All state is final, so an instance never changes after construction.
 */
public static final class Factory implements TokenFilterFactory {
    private final int maxShingleSize;
    private final boolean outputUnigrams;
    private final boolean outputUnigramsIfNoShingles;
    private final String tokenSeparator;
    // Declared final like every other field: it is assigned once in the
    // constructor and never modified afterwards.
    private final int minShingleSize;
    private final String name;

    /**
     * Creates a factory using the {@link ShingleFilter} default shingle sizes
     * and token separator, emitting unigrams and not emitting
     * "unigrams if no shingles".
     *
     * @param name the name reported by {@link #name()}
     */
    public Factory(String name) {
        this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.TOKEN_SEPARATOR);
    }

    /**
     * Creates a factory with explicit shingle parameters.
     *
     * @param name                       the name reported by {@link #name()}
     * @param minShingleSize             minimum shingle size passed to the filter
     * @param maxShingleSize             maximum shingle size passed to the filter
     * @param outputUnigrams             value passed to {@code setOutputUnigrams}
     * @param outputUnigramsIfNoShingles value passed to {@code setOutputUnigramsIfNoShingles}
     * @param tokenSeparator             value passed to {@code setTokenSeparator}
     */
    Factory(String name, int minShingleSize, int maxShingleSize, boolean outputUnigrams, boolean outputUnigramsIfNoShingles, String tokenSeparator) {
        this.maxShingleSize = maxShingleSize;
        this.outputUnigrams = outputUnigrams;
        this.outputUnigramsIfNoShingles = outputUnigramsIfNoShingles;
        this.tokenSeparator = tokenSeparator;
        this.minShingleSize = minShingleSize;
        this.name = name;
    }

    /**
     * Wraps the given stream in a new {@link ShingleFilter} configured with
     * the parameters of this factory.
     *
     * @param tokenStream the stream to shingle
     * @return the configured shingle filter
     */
    @Override
    public TokenStream create(TokenStream tokenStream) {
        ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
        filter.setOutputUnigrams(outputUnigrams);
        filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
        filter.setTokenSeparator(tokenSeparator);
        return filter;
    }

    /** @return the maximum shingle size used when creating filters */
    public int getMaxShingleSize() {
        return maxShingleSize;
    }

    /** @return the minimum shingle size used when creating filters */
    public int getMinShingleSize() {
        return minShingleSize;
    }

    /** @return whether created filters are told to output unigrams */
    public boolean getOutputUnigrams() {
        return outputUnigrams;
    }

    /** @return whether created filters are told to output unigrams when no shingles are available */
    public boolean getOutputUnigramsIfNoShingles() {
        return outputUnigramsIfNoShingles;
    }

    /** @return the name given at construction time */
    @Override
    public String name() {
        return name;
    }
}
}

View File

@ -504,17 +504,7 @@ public class IndicesAnalysisService extends AbstractComponent {
}
}));
tokenFilterFactories.put("shingle", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override
public String name() {
return "shingle";
}
@Override
public TokenStream create(TokenStream tokenStream) {
return new ShingleFilter(tokenStream, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
}
}));
tokenFilterFactories.put("shingle", new PreBuiltTokenFilterFactoryFactory(new ShingleTokenFilterFactory.Factory("shingle")));
tokenFilterFactories.put("unique", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override

View File

@ -23,7 +23,6 @@ import java.util.Comparator;
import java.util.Locale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CustomAnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@ -276,21 +275,21 @@ public final class SuggestUtils {
}
/**
 * Searches the token filter chain of the given analyzer for a shingle filter.
 * A {@code NamedAnalyzer} is unwrapped first; only a {@code CustomAnalyzer}
 * is inspected, and the first matching filter wins.
 *
 * @param analyzer the analyzer to inspect
 * @return the shingle factory found in the filter chain, or {@code null} if
 *         the analyzer is not a {@code CustomAnalyzer} or has no shingle filter
 */
public static ShingleTokenFilterFactory getShingleFilterFactory(Analyzer analyzer) {
public static ShingleTokenFilterFactory.Factory getShingleFilterFactory(Analyzer analyzer) {
if (analyzer instanceof NamedAnalyzer) {
analyzer = ((NamedAnalyzer)analyzer).analyzer();
}
if (analyzer instanceof CustomAnalyzer) {
CustomAnalyzer a = (CustomAnalyzer) analyzer;
TokenFilterFactory[] tokenFilters = a.tokenFilters();
final CustomAnalyzer a = (CustomAnalyzer) analyzer;
final TokenFilterFactory[] tokenFilters = a.tokenFilters();
for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
// A settings-based shingle filter exposes its parameters through its inner Factory.
if (tokenFilterFactory instanceof ShingleTokenFilterFactory) {
return ((ShingleTokenFilterFactory) tokenFilterFactory);
return ((ShingleTokenFilterFactory)tokenFilterFactory).getInnerFactory();
// A bare Factory instance appears directly in the chain and is returned as is.
} else if (tokenFilterFactory instanceof ShingleTokenFilterFactory.Factory) {
return (ShingleTokenFilterFactory.Factory) tokenFilterFactory;
}
}
}
// No shingle filter could be detected.
return null;
}
}

View File

@ -216,7 +216,7 @@ public final class PhraseSuggestParser implements SuggestContextParser {
}
if (!gramSizeSet || suggestion.generators().isEmpty()) {
final ShingleTokenFilterFactory shingleFilterFactory = SuggestUtils.getShingleFilterFactory(suggestion.getAnalyzer() == null ? context.mapperService().fieldSearchAnalyzer(suggestion.getField()) : suggestion.getAnalyzer()); ;
final ShingleTokenFilterFactory.Factory shingleFilterFactory = SuggestUtils.getShingleFilterFactory(suggestion.getAnalyzer() == null ? context.mapperService().fieldSearchAnalyzer(suggestion.getField()) : suggestion.getAnalyzer()); ;
if (!gramSizeSet) {
// try to detect the shingle size
if (shingleFilterFactory != null) {

View File

@ -740,6 +740,8 @@ public class SuggestSearchTests extends AbstractNodesTests {
builder.putArray("index.analysis.analyzer.bigram.filter", "my_shingle", "lowercase");
builder.put("index.analysis.analyzer.ngram.tokenizer", "standard");
builder.putArray("index.analysis.analyzer.ngram.filter", "my_shingle2", "lowercase");
builder.put("index.analysis.analyzer.myDefAnalyzer.tokenizer", "standard");
builder.putArray("index.analysis.analyzer.myDefAnalyzer.filter", "shingle", "lowercase");
builder.put("index.analysis.filter.my_shingle.type", "shingle");
builder.put("index.analysis.filter.my_shingle.output_unigrams", false);
builder.put("index.analysis.filter.my_shingle.min_shingle_size", 2);
@ -833,6 +835,23 @@ public class SuggestSearchTests extends AbstractNodesTests {
SearchResponse search = client.prepareSearch()
.setSearchType(SearchType.COUNT)
.setSuggestText("Xor the Got-Jewel")
.addSuggestion(
phraseSuggestion("simple_phrase").maxErrors(0.5f).field("ngram").analyzer("myDefAnalyzer")
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
.size(1)).execute().actionGet();
assertThat(Arrays.toString(search.getShardFailures()), search.getFailedShards(), equalTo(0));
assertThat(search.getSuggest(), notNullValue());
assertThat(search.getSuggest().size(), equalTo(1));
assertThat(search.getSuggest().getSuggestion("simple_phrase").getName(), equalTo("simple_phrase"));
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().size(), equalTo(1));
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(1));
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getText().string(), equalTo("Xor the Got-Jewel"));
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().get(0).getText().string(), equalTo("xorr the god jewel"));
search = client.prepareSearch()
.setSearchType(SearchType.COUNT)
.setSuggestText("Xor the Got-Jewel")
.addSuggestion(
phraseSuggestion("simple_phrase").maxErrors(0.5f).field("ngram")
.addCandidateGenerator(PhraseSuggestionBuilder.candidateGenerator("body").minWordLength(1).suggestMode("always"))
@ -846,6 +865,7 @@ public class SuggestSearchTests extends AbstractNodesTests {
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().size(), equalTo(1));
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getText().string(), equalTo("Xor the Got-Jewel"));
assertThat(search.getSuggest().getSuggestion("simple_phrase").getEntries().get(0).getOptions().get(0).getText().string(), equalTo("xorr the god jewel"));
}