#2116 Expose all ShingleFilter settings via ShingleTokenFilterFactory

This commit is contained in:
Simon Willnauer 2012-07-31 21:40:05 +02:00 committed by Shay Banon
parent 0492d9b8cb
commit d13a7809d1
4 changed files with 172 additions and 1 deletion

View File

@ -36,17 +36,28 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
// Shingle options read from the filter settings in the constructor and applied to
// every ShingleFilter built by create().

// Value of "output_unigrams"; the constructor defaults it to true.
private final boolean outputUnigrams;
// Value of "output_unigrams_if_no_shingles"; the constructor defaults it to false.
private Boolean outputUnigramsIfNoShingles;
// Value of "token_separator"; the constructor defaults it to ShingleFilter.TOKEN_SEPARATOR.
private String tokenSeparator;
// Value of "min_shingle_size"; the constructor defaults it to ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE.
private int minShingleSize;
/**
 * Creates the factory and captures every shingle-related option from the
 * filter's own {@code settings}, falling back to a default when a key is absent.
 *
 * @param index         index this filter factory belongs to
 * @param indexSettings index-level settings, forwarded to the superclass
 * @param name          name of this token filter
 * @param settings      settings of this token filter; read for
 *                      {@code min_shingle_size}, {@code max_shingle_size},
 *                      {@code output_unigrams}, {@code output_unigrams_if_no_shingles}
 *                      and {@code token_separator}
 */
@Inject
public ShingleTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettings, name, settings);

    // Shingle size bounds.
    this.minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
    this.maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);

    // Unigram emission switches.
    this.outputUnigrams = settings.getAsBoolean("output_unigrams", true);
    this.outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);

    // String placed between the tokens of a shingle.
    this.tokenSeparator = settings.get("token_separator", ShingleFilter.TOKEN_SEPARATOR);
}
/**
 * Wraps {@code tokenStream} in a {@link ShingleFilter} configured with the
 * options captured by the constructor.
 *
 * @param tokenStream the stream to build shingles from
 * @return a new {@link ShingleFilter} reading from {@code tokenStream}
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    // Lucene's three-argument constructor is (input, minShingleSize, maxShingleSize);
    // passing max first only works by accident when both sizes are equal.
    ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
    filter.setOutputUnigrams(outputUnigrams);
    filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    filter.setTokenSeparator(tokenSeparator);
    return filter;
}
}

View File

@ -0,0 +1,73 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.index.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.inject.ModulesBuilder;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsModule;
import org.elasticsearch.env.Environment;
import org.elasticsearch.env.EnvironmentModule;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.IndexNameModule;
import org.elasticsearch.index.analysis.AnalysisModule;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.settings.IndexSettingsModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
import org.elasticsearch.indices.analysis.IndicesAnalysisService;
import org.testng.Assert;
/**
 * Static helpers shared by the analysis unit tests: building an
 * {@link AnalysisService} from a classpath settings resource and asserting
 * the terms produced by a {@link TokenStream}.
 */
public class AnalysisHelper {

    /**
     * Builds an {@link AnalysisService} for an index named {@code "test"} using
     * the settings loaded from the given classpath resource.
     *
     * @param resource classpath location of the settings file to load
     * @return the {@link AnalysisService} obtained from the index-level injector
     */
    public static AnalysisService createAnalysisServiceFromClassPath(String resource) {
        Settings settings = ImmutableSettings.settingsBuilder()
                .loadFromClasspath(resource).build();

        Index index = new Index("test");

        // Parent injector: settings, environment and the indices-level analysis module.
        Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
                new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();

        AnalysisModule analysisModule = new AnalysisModule(settings,
                parentInjector.getInstance(IndicesAnalysisService.class));

        // Child injector: index-scoped modules layered on top of the parent.
        Injector injector = new ModulesBuilder().add(new IndexSettingsModule(index, settings),
                new IndexNameModule(index), analysisModule).createChildInjector(parentInjector);

        return injector.getInstance(AnalysisService.class);
    }

    /**
     * Consumes {@code stream} and asserts that its terms equal {@code expected},
     * in order and in number. The stream is reset before consumption, ended once
     * exhausted and always closed afterwards, so it must not be consumed again
     * by the caller.
     *
     * @param stream   the token stream to consume
     * @param expected the expected term texts, in order
     * @throws IOException if the stream fails while being reset, read, ended or closed
     */
    public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
        try {
            stream.reset();
            CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
            Assert.assertNotNull(termAttr);
            int i = 0;
            while (stream.incrementToken()) {
                // Guard the index first so a surplus token fails with a readable message
                // instead of an ArrayIndexOutOfBoundsException.
                Assert.assertTrue(i < expected.length, "got extra term: " + termAttr.toString());
                Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
                i++;
            }
            // Complete the consumer workflow once the stream is exhausted.
            stream.end();
            Assert.assertEquals(i, expected.length, "not all tokens produced");
        } finally {
            // Release the underlying reader even when an assertion fails.
            stream.close();
        }
    }
}

View File

@ -0,0 +1,71 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.index.analysis;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.instanceOf;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.testng.annotations.Test;
/**
 * Tests for the token filters looked up under the names {@code "shingle"} and
 * {@code "shingle_inverse"} from the analysis settings in {@link #RESOURCE}.
 */
public class ShingleTokenFilterFactoryTest {

    private static final String RESOURCE = "org/elasticsearch/test/unit/index/analysis/shingle_analysis.json";

    /** The filter registered as {@code "shingle"} emits unigrams interleaved with bigrams. */
    @Test
    public void testDefault() throws IOException {
        TokenFilterFactory factory = lookupFilter("shingle");
        assertFilterOutput(factory, "the quick brown fox",
                "the", "the quick", "quick", "quick brown", "brown", "brown fox", "fox");
    }

    /** {@code "shingle_inverse"} emits only underscore-joined trigrams when enough tokens exist. */
    @Test
    public void testInverseMapping() throws IOException {
        TokenFilterFactory factory = lookupFilter("shingle_inverse");
        assertThat(factory, instanceOf(ShingleTokenFilterFactory.class));
        assertFilterOutput(factory, "the quick brown fox",
                "the_quick_brown", "quick_brown_fox");
    }

    /** {@code "shingle_inverse"} falls back to unigrams when the input is too short for a shingle. */
    @Test
    public void testInverseMappingNoShingles() throws IOException {
        TokenFilterFactory factory = lookupFilter("shingle_inverse");
        assertThat(factory, instanceOf(ShingleTokenFilterFactory.class));
        assertFilterOutput(factory, "the quick",
                "the", "quick");
    }

    /** Builds a fresh analysis service from {@link #RESOURCE} and returns the named token filter. */
    private static TokenFilterFactory lookupFilter(String filterName) {
        AnalysisService service = AnalysisHelper.createAnalysisServiceFromClassPath(RESOURCE);
        return service.tokenFilter(filterName);
    }

    /** Whitespace-tokenizes {@code text}, applies {@code factory} and checks the produced terms. */
    private static void assertFilterOutput(TokenFilterFactory factory, String text, String... expectedTerms)
            throws IOException {
        Tokenizer whitespace = new WhitespaceTokenizer(Version.LUCENE_36, new StringReader(text));
        AnalysisHelper.assertSimpleTSOutput(factory.create(whitespace), expectedTerms);
    }
}

View File

@ -0,0 +1,16 @@
{
"index":{
"analysis":{
"filter":{
"shingle_inverse":{
"type":"shingle",
"max_shingle_size" : 3,
"min_shingle_size" : 3,
"output_unigrams" : false,
"output_unigrams_if_no_shingles" : true,
"token_separator" : "_"
}
}
}
}
}