From d13a7809d15c0588743c8d0729dca7cbb6d46aa8 Mon Sep 17 00:00:00 2001
From: Simon Willnauer
Date: Tue, 31 Jul 2012 21:40:05 +0200
Subject: [PATCH] #2116 Expose all ShingleFilter settings via ShingleTokenFilterFactory

---
Review notes (text between the "---" line and the diffstat is not part of the
commit message):

 * ShingleTokenFilterFactory: the three new fields are now final, and
   outputUnigramsIfNoShingles is a primitive boolean like the existing
   outputUnigrams field. All of them are assigned exactly once, in the
   constructor.
 * AnalysisHelper.assertSimpleTSOutput: the stream is now ended and closed
   after it has been consumed. Javadoc was added to the class and to both
   helper methods.
 * The post-image blob ids on the "index" lines of the two amended files were
   not recomputed.

 .../analysis/ShingleTokenFilterFactory.java   | 13 ++-
 .../unit/index/analysis/AnalysisHelper.java   | 86 +++++++++++++++++++
 .../ShingleTokenFilterFactoryTest.java        | 71 +++++++++++++++
 .../unit/index/analysis/shingle_analysis.json | 16 ++++
 4 files changed, 185 insertions(+), 1 deletion(-)
 create mode 100644 src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisHelper.java
 create mode 100644 src/test/java/org/elasticsearch/test/unit/index/analysis/ShingleTokenFilterFactoryTest.java
 create mode 100644 src/test/java/org/elasticsearch/test/unit/index/analysis/shingle_analysis.json

diff --git a/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java
index 58ce9f7af44..40fceee68b7 100644
--- a/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java
@@ -36,17 +36,28 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory {
 
     private final boolean outputUnigrams;
 
+    private final boolean outputUnigramsIfNoShingles;
+
+    private final String tokenSeparator;
+
+    private final int minShingleSize;
+
     @Inject
     public ShingleTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, @Assisted String name, @Assisted Settings settings) {
         super(index, indexSettings, name, settings);
         maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
+        minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
         outputUnigrams = settings.getAsBoolean("output_unigrams", true);
+        outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
+        tokenSeparator = settings.get("token_separator", ShingleFilter.TOKEN_SEPARATOR);
     }
 
     @Override
     public TokenStream create(TokenStream tokenStream) {
-        ShingleFilter filter = new ShingleFilter(tokenStream, maxShingleSize);
+        ShingleFilter filter = new ShingleFilter(tokenStream, maxShingleSize, minShingleSize);
         filter.setOutputUnigrams(outputUnigrams);
+        filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
+        filter.setTokenSeparator(tokenSeparator);
         return filter;
     }
 }
\ No newline at end of file
diff --git a/src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisHelper.java b/src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisHelper.java
new file mode 100644
index 00000000000..1843baa3065
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisHelper.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.test.unit.index.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.elasticsearch.common.inject.Injector;
+import org.elasticsearch.common.inject.ModulesBuilder;
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.settings.SettingsModule;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.env.EnvironmentModule;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.IndexNameModule;
+import org.elasticsearch.index.analysis.AnalysisModule;
+import org.elasticsearch.index.analysis.AnalysisService;
+import org.elasticsearch.index.settings.IndexSettingsModule;
+import org.elasticsearch.indices.analysis.IndicesAnalysisModule;
+import org.elasticsearch.indices.analysis.IndicesAnalysisService;
+import org.testng.Assert;
+
+/**
+ * Static helpers shared by the analysis unit tests.
+ */
+public class AnalysisHelper {
+    /**
+     * Builds an {@link AnalysisService} for an index named "test", configured
+     * from the settings loaded from the given classpath resource.
+     */
+    public static AnalysisService createAnalysisServiceFromClassPath(String resource) {
+        Settings settings = ImmutableSettings.settingsBuilder()
+                .loadFromClasspath(resource).build();
+
+        Index index = new Index("test");
+
+        Injector parentInjector = new ModulesBuilder().add(new SettingsModule(settings),
+                new EnvironmentModule(new Environment(settings)), new IndicesAnalysisModule()).createInjector();
+
+        AnalysisModule analysisModule = new AnalysisModule(settings,
+                parentInjector.getInstance(IndicesAnalysisService.class));
+
+        Injector injector = new ModulesBuilder().add(new IndexSettingsModule(index, settings),
+                new IndexNameModule(index), analysisModule).createChildInjector(parentInjector);
+
+        return injector.getInstance(AnalysisService.class);
+    }
+
+    /**
+     * Consumes the stream and asserts that its terms equal {@code expected},
+     * in order. The stream is ended and closed once all tokens have been read.
+     */
+    public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
+        stream.reset();
+        CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
+        Assert.assertNotNull(termAttr);
+        int i = 0;
+        while (stream.incrementToken()) {
+            Assert.assertTrue(i < expected.length, "got extra term: " + termAttr.toString());
+            Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
+            i++;
+        }
+        stream.end();
+        stream.close();
+        Assert.assertEquals(i, expected.length, "not all tokens produced");
+    }
+}
diff --git a/src/test/java/org/elasticsearch/test/unit/index/analysis/ShingleTokenFilterFactoryTest.java b/src/test/java/org/elasticsearch/test/unit/index/analysis/ShingleTokenFilterFactoryTest.java
new file mode 100644
index 00000000000..582f9a21351
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/ShingleTokenFilterFactoryTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.test.unit.index.analysis;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.instanceOf;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
+import org.elasticsearch.index.analysis.AnalysisService;
+import org.elasticsearch.index.analysis.ShingleTokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.testng.annotations.Test;
+
+public class ShingleTokenFilterFactoryTest {
+    private static final String RESOURCE = "org/elasticsearch/test/unit/index/analysis/shingle_analysis.json";
+
+    @Test
+    public void testDefault() throws IOException {
+        AnalysisService analysisService = AnalysisHelper.createAnalysisServiceFromClassPath(RESOURCE);
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle");
+        String source = "the quick brown fox";
+        String[] expected = new String[] { "the", "the quick", "quick", "quick brown", "brown", "brown fox", "fox" };
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_36, new StringReader(source));
+        AnalysisHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testInverseMapping() throws IOException {
+        AnalysisService analysisService = AnalysisHelper.createAnalysisServiceFromClassPath(RESOURCE);
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle_inverse");
+        assertThat(tokenFilter, instanceOf(ShingleTokenFilterFactory.class));
+        String source = "the quick brown fox";
+        String[] expected = new String[] { "the_quick_brown", "quick_brown_fox" };
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_36, new StringReader(source));
+        AnalysisHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+    @Test
+    public void testInverseMappingNoShingles() throws IOException {
+        AnalysisService analysisService = AnalysisHelper.createAnalysisServiceFromClassPath(RESOURCE);
+        TokenFilterFactory tokenFilter = analysisService.tokenFilter("shingle_inverse");
+        assertThat(tokenFilter, instanceOf(ShingleTokenFilterFactory.class));
+        String source = "the quick";
+        String[] expected = new String[] { "the", "quick" };
+        Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_36, new StringReader(source));
+        AnalysisHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+    }
+
+}
diff --git a/src/test/java/org/elasticsearch/test/unit/index/analysis/shingle_analysis.json b/src/test/java/org/elasticsearch/test/unit/index/analysis/shingle_analysis.json
new file mode 100644
index 00000000000..c469a4a4dd0
--- /dev/null
+++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/shingle_analysis.json
@@ -0,0 +1,16 @@
+{
+    "index":{
+        "analysis":{
+            "filter":{
+                "shingle_inverse":{
+                    "type":"shingle",
+                    "max_shingle_size" : 3,
+                    "min_shingle_size" : 3,
+                    "output_unigrams" : false,
+                    "output_unigrams_if_no_shingles" : true,
+                    "token_separator" : "_"
+                }
+            }
+        }
+    }
+}
\ No newline at end of file