Mirror of https://github.com/honeymoose/OpenSearch.git (synced 2025-02-17 18:35:25 +00:00)
Add Lucene CommonGrams/CommonGramsQuery token filter
Both filters are merged into a single "common_grams" token filter. Closes #3202
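
To illustrate the two modes, here is a minimal Lucene-level sketch (not part of this commit; the class name is hypothetical and it assumes Lucene 4.3, the version the tests below pin). At index time CommonGramsFilter emits each original unigram plus a bigram wherever a token pair touches a common word; CommonGramsQueryFilter, which the factory applies when query_mode is true, keeps those bigrams and drops the common-word unigrams they cover:

import java.io.StringReader;
import java.util.Arrays;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class CommonGramsDemo {

    public static void main(String[] args) throws Exception {
        CharArraySet common = new CharArraySet(Version.LUCENE_43, Arrays.asList("the", "is"), true);

        // Index side: original unigrams plus bigrams anchored at common words.
        Tokenizer t1 = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader("the quick fox is brown"));
        dump(new CommonGramsFilter(Version.LUCENE_43, t1, common));
        // -> the, the_quick, quick, fox, fox_is, is, is_brown, brown

        // Query side (query_mode=true): bigrams replace the common-word unigrams.
        Tokenizer t2 = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader("the quick fox is brown"));
        dump(new CommonGramsQueryFilter(new CommonGramsFilter(Version.LUCENE_43, t2, common)));
        // -> the_quick, quick, fox_is, is_brown, brown
    }

    private static void dump(TokenStream stream) throws Exception {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
    }
}

Indexing both unigrams and bigrams keeps ordinary term queries working, while query-mode analysis lets queries match the indexed bigrams directly instead of the high-frequency unigrams.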
This commit is contained in:
parent 5aa0a8438f
commit 71849668e9
@@ -142,74 +142,51 @@ public class Analysis {
             .put("_turkish_", TurkishAnalyzer.getDefaultStopSet())
             .immutableMap();
 
-    public static CharArraySet parseArticles(Environment env, Settings settings, Version version) {
-        String value = settings.get("articles");
+    public static CharArraySet parseWords(Environment env, Settings settings, String name, CharArraySet defaultWords, ImmutableMap<String, Set<?>> namedWords, Version version, boolean ignoreCase) {
+        String value = settings.get(name);
         if (value != null) {
             if ("_none_".equals(value)) {
                 return CharArraySet.EMPTY_SET;
             } else {
-                return new CharArraySet(version, Strings.commaDelimitedListToSet(value), settings.getAsBoolean("articles_case", false));
+                return resolveNamedWords(Strings.commaDelimitedListToSet(value), namedWords, version, ignoreCase);
             }
         }
-        String[] articles = settings.getAsArray("articles", null);
-        if (articles != null) {
-            return new CharArraySet(version, Arrays.asList(articles), settings.getAsBoolean("articles_case", false));
-        }
-        CharArraySet pathLoadedArticles = getWordSet(env, settings, "articles", version);
-        if (pathLoadedArticles != null) {
-            return pathLoadedArticles;
+        List<String> pathLoadedWords = getWordList(env, settings, name);
+        if (pathLoadedWords != null) {
+            return resolveNamedWords(pathLoadedWords, namedWords, version, ignoreCase);
         }
-
-        return null;
+        return defaultWords;
+    }
+
+    public static CharArraySet parseCommonWords(Environment env, Settings settings, CharArraySet defaultCommonWords, Version version, boolean ignoreCase) {
+        return parseWords(env, settings, "common_words", defaultCommonWords, namedStopWords, version, ignoreCase);
+    }
+
+    public static CharArraySet parseArticles(Environment env, Settings settings, Version version) {
+        return parseWords(env, settings, "articles", null, null, version, settings.getAsBoolean("articles_case", false));
     }
 
     public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords, Version version) {
         return parseStopWords(env, settings, defaultStopWords, version, settings.getAsBoolean("stopwords_case", false));
     }
 
-    public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords, Version version, boolean ignore_case) {
-        String value = settings.get("stopwords");
-        if (value != null) {
-            if ("_none_".equals(value)) {
-                return CharArraySet.EMPTY_SET;
-            } else {
-                return resolveNamedStopWords(Strings.commaDelimitedListToSet(value), version, ignore_case);
-            }
-        }
-        String[] stopWords = settings.getAsArray("stopwords", null);
-        if (stopWords != null) {
-            return resolveNamedStopWords(stopWords, version, ignore_case);
-        }
-        List<String> pathLoadedStopWords = getWordList(env, settings, "stopwords");
-        if (pathLoadedStopWords != null) {
-            return resolveNamedStopWords(pathLoadedStopWords, version, ignore_case);
-        }
-
-        return defaultStopWords;
+    public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords, Version version, boolean ignoreCase) {
+        return parseWords(env, settings, "stopwords", defaultStopWords, namedStopWords, version, ignoreCase);
     }
 
-    private static CharArraySet resolveNamedStopWords(Collection<String> words, Version version, boolean ignore_case) {
-        CharArraySet setStopWords = new CharArraySet(version, words.size(), ignore_case);
-        for (String stopWord : words) {
-            if (namedStopWords.containsKey(stopWord)) {
-                setStopWords.addAll(namedStopWords.get(stopWord));
+    private static CharArraySet resolveNamedWords(Collection<String> words, ImmutableMap<String, Set<?>> namedWords, Version version, boolean ignoreCase) {
+        if (namedWords == null) {
+            return new CharArraySet(version, words, ignoreCase);
+        }
+        CharArraySet setWords = new CharArraySet(version, words.size(), ignoreCase);
+        for (String word : words) {
+            if (namedWords.containsKey(word)) {
+                setWords.addAll(namedWords.get(word));
             } else {
-                setStopWords.add(stopWord);
+                setWords.add(word);
             }
         }
-        return setStopWords;
-    }
-
-    private static CharArraySet resolveNamedStopWords(String[] words, Version version, boolean ignore_case) {
-        CharArraySet setStopWords = new CharArraySet(version, words.length, ignore_case);
-        for (String stopWord : words) {
-            if (namedStopWords.containsKey(stopWord)) {
-                setStopWords.addAll(namedStopWords.get(stopWord));
-            } else {
-                setStopWords.add(stopWord);
-            }
-        }
-        return setStopWords;
+        return setWords;
     }
 
     public static CharArraySet getWordSet(Environment env, Settings settings, String settingsPrefix, Version version) {
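
The hunk above collapses the separate article and stop-word parsing into a single parseWords path: resolveNamedWords expands any entry found in the supplied named-words map (for stop words, the class's namedStopWords map of names like "_english_" to Lucene's default sets) and adds everything else verbatim. A standalone sketch of that resolution step (hypothetical demo class; it restates the loop above against Lucene 4.3's CharArraySet rather than calling the Elasticsearch code):

import java.util.*;

import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

public class ResolveNamedWordsDemo {

    public static void main(String[] args) {
        // Stand-in for Analysis.namedStopWords: maps "_english_" to Lucene's default English stop set.
        Map<String, Set<?>> namedWords = new HashMap<>();
        namedWords.put("_english_", EnglishAnalyzer.getDefaultStopSet());

        // Mirrors resolveNamedWords: named entries expand, everything else is added literally.
        Collection<String> words = Arrays.asList("_english_", "chromosome");
        CharArraySet set = new CharArraySet(Version.LUCENE_43, words.size(), true);
        for (String word : words) {
            if (namedWords.containsKey(word)) {
                set.addAll(namedWords.get(word));
            } else {
                set.add(word);
            }
        }

        System.out.println(set.contains("the"));        // true, expanded from _english_
        System.out.println(set.contains("chromosome")); // true, added literally
    }
}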
@@ -438,6 +438,7 @@ public class AnalysisModule extends AbstractModule {
         tokenFiltersBindings.processTokenFilter("truncate", TruncateTokenFilterFactory.class);
         tokenFiltersBindings.processTokenFilter("trim", TrimTokenFilterFactory.class);
         tokenFiltersBindings.processTokenFilter("limit", LimitTokenCountFilterFactory.class);
+        tokenFiltersBindings.processTokenFilter("common_grams", CommonGramsTokenFilterFactory.class);
     }
 
     @Override
@@ -0,0 +1,68 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.util.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
+import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
+
+/**
+ *
+ */
+@AnalysisSettingsRequired
+public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {
+
+    private final CharArraySet words;
+
+    private final boolean ignoreCase;
+
+    private final boolean queryMode;
+
+    @Inject
+    public CommonGramsTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
+        this.queryMode = settings.getAsBoolean("query_mode", false);
+        this.words = Analysis.parseCommonWords(env, settings, null, version, ignoreCase);
+
+        if (this.words == null) {
+            throw new ElasticSearchIllegalArgumentException("missing or empty [common_words] or [common_words_path] configuration for common_grams token filter");
+        }
+    }
+
+    @Override
+    public TokenStream create(TokenStream tokenStream) {
+        CommonGramsFilter filter = new CommonGramsFilter(version, tokenStream, words);
+        if (queryMode) {
+            return new CommonGramsQueryFilter(filter);
+        } else {
+            return filter;
+        }
+    }
+}
@@ -31,6 +31,7 @@ import org.apache.lucene.analysis.ca.CatalanAnalyzer;
 import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
 import org.apache.lucene.analysis.cjk.CJKAnalyzer;
 import org.apache.lucene.analysis.cn.ChineseAnalyzer;
+import org.apache.lucene.analysis.commongrams.*;
 import org.apache.lucene.analysis.core.*;
 import org.apache.lucene.analysis.cz.CzechAnalyzer;
 import org.apache.lucene.analysis.cz.CzechStemFilter;
@@ -78,6 +79,7 @@ import org.apache.lucene.analysis.sv.SwedishAnalyzer;
 import org.apache.lucene.analysis.th.ThaiAnalyzer;
 import org.apache.lucene.analysis.tr.TurkishAnalyzer;
 import org.apache.lucene.analysis.util.ElisionFilter;
+import org.apache.lucene.analysis.util.CharArraySet;
 import org.elasticsearch.ElasticSearchIllegalStateException;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
@@ -393,6 +395,18 @@ public class IndicesAnalysisService extends AbstractComponent {
             }
         }));
 
+        tokenFilterFactories.put("common_grams", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override
+            public String name() {
+                return "common_grams";
+            }
+
+            @Override
+            public TokenStream create(TokenStream tokenStream) {
+                return new CommonGramsFilter(Lucene.ANALYZER_VERSION, tokenStream, CharArraySet.EMPTY_SET);
+            }
+        }));
+
         tokenFilterFactories.put("lowercase", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
             @Override
             public String name() {
@@ -0,0 +1,218 @@
+/*
+ * Licensed to ElasticSearch and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. ElasticSearch licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.test.unit.index.analysis.commongrams;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.util.Version;
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
+import org.elasticsearch.common.settings.ImmutableSettings;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.analysis.AnalysisService;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
+import org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper;
+import org.testng.annotations.Test;
+import org.testng.Assert;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.Matchers.instanceOf;
+
+public class CommonGramsTokenFilterFactoryTests {
+
+    @Test
+    public void testDefault() throws IOException {
+        Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_default.type", "common_grams").build();
+
+        try {
+            AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            Assert.fail("[common_words] or [common_words_path] is set");
+        } catch (Exception e) {
+            assertThat(e.getCause(), instanceOf(ElasticSearchIllegalArgumentException.class));
+        }
+    }
+
+    @Test
+    public void testWithoutCommonWordsMatch() throws IOException {
+        {
+            Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_default.type", "common_grams")
+                    .putArray("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
+                    .build();
+
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            {
+                TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_default");
+                String source = "the quick brown is a fox Or noT";
+                String[] expected = new String[] { "the", "quick", "brown", "is", "a", "fox", "Or", "noT" };
+                Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
+                AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+            }
+        }
+
+        {
+            Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_default.type", "common_grams")
+                    .put("index.analysis.filter.common_grams_default.query_mode", false)
+                    .putArray("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
+                    .build();
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            {
+                TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_default");
+                String source = "the quick brown is a fox Or noT";
+                String[] expected = new String[] { "the", "quick", "brown", "is", "a", "fox", "Or", "noT" };
+                Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
+                AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+            }
+        }
+    }
+
+    @Test
+    public void testSettings() throws IOException {
+        {
+            Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_1.type", "common_grams")
+                    .put("index.analysis.filter.common_grams_1.ignore_case", true)
+                    .putArray("index.analysis.filter.common_grams_1.common_words", "the", "Or", "Not", "a", "is", "an", "they", "are")
+                    .build();
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_1");
+            String source = "the quick brown is a fox or noT";
+            String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a", "a_fox", "fox", "fox_or", "or", "or_noT", "noT" };
+            Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
+            AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+        }
+        {
+            Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_2.type", "common_grams")
+                    .put("index.analysis.filter.common_grams_2.ignore_case", false)
+                    .putArray("index.analysis.filter.common_grams_2.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
+                    .build();
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_2");
+            String source = "the quick brown is a fox or why noT";
+            String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a", "a_fox", "fox", "or", "why", "why_noT", "noT" };
+            Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
+            AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+        }
+        {
+            Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_3.type", "common_grams")
+                    .putArray("index.analysis.filter.common_grams_3.common_words", "the", "or", "not", "a", "is", "an", "they", "are")
+                    .build();
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_3");
+            String source = "the quick brown is a fox Or noT";
+            String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a", "a_fox", "fox", "Or", "noT" };
+            Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
+            AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+        }
+    }
+
+    @Test
+    public void testCommonGramsAnalysis() throws IOException {
+        Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/test/unit/index/analysis/commongrams/commongrams.json").build();
+        {
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer").analyzer();
+            String source = "the quick brown is a fox or not";
+            String[] expected = new String[] { "the", "quick", "quick_brown", "brown", "brown_is", "is", "a", "a_fox", "fox", "fox_or", "or", "not" };
+            AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
+        }
+        {
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer_file").analyzer();
+            String source = "the quick brown is a fox or not";
+            String[] expected = new String[] { "the", "quick", "quick_brown", "brown", "brown_is", "is", "a", "a_fox", "fox", "fox_or", "or", "not" };
+            AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
+        }
+    }
+
+    @Test
+    public void testQueryModeSettings() throws IOException {
+        {
+            Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_1.type", "common_grams")
+                    .put("index.analysis.filter.common_grams_1.query_mode", true)
+                    .putArray("index.analysis.filter.common_grams_1.common_words", "the", "Or", "Not", "a", "is", "an", "they", "are")
+                    .put("index.analysis.filter.common_grams_1.ignore_case", true)
+                    .build();
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_1");
+            String source = "the quick brown is a fox or noT";
+            String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox_or", "or_noT" };
+            Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
+            AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+        }
+        {
+            Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_2.type", "common_grams")
+                    .put("index.analysis.filter.common_grams_2.query_mode", true)
+                    .putArray("index.analysis.filter.common_grams_2.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
+                    .put("index.analysis.filter.common_grams_2.ignore_case", false)
+                    .build();
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_2");
+            String source = "the quick brown is a fox or why noT";
+            String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox", "or", "why_noT" };
+            Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
+            AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+        }
+        {
+            Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_3.type", "common_grams")
+                    .put("index.analysis.filter.common_grams_3.query_mode", true)
+                    .putArray("index.analysis.filter.common_grams_3.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
+                    .build();
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_3");
+            String source = "the quick brown is a fox or why noT";
+            String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox", "or", "why_noT" };
+            Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
+            AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+        }
+        {
+            Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_4.type", "common_grams")
+                    .put("index.analysis.filter.common_grams_4.query_mode", true)
+                    .putArray("index.analysis.filter.common_grams_4.common_words", "the", "or", "not", "a", "is", "an", "they", "are")
+                    .build();
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_4");
+            String source = "the quick brown is a fox Or noT";
+            String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox", "Or", "noT" };
+            Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
+            AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
+        }
+    }
+
+    @Test
+    public void testQueryModeCommonGramsAnalysis() throws IOException {
+        Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/test/unit/index/analysis/commongrams/commongrams_query_mode.json").build();
+        {
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer").analyzer();
+            String source = "the quick brown is a fox or not";
+            String[] expected = new String[] { "the", "quick_brown", "brown_is", "is", "a_fox", "fox_or", "or", "not" };
+            AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
+        }
+        {
+            AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
+            Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer_file").analyzer();
+            String source = "the quick brown is a fox or not";
+            String[] expected = new String[] { "the", "quick_brown", "brown_is", "is", "a_fox", "fox_or", "or", "not" };
+            AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
+        }
+    }
+
+}
@@ -0,0 +1,2 @@
+brown
+fox
@@ -0,0 +1,29 @@
+{
+    "index":{
+        "analysis":{
+            "analyzer":{
+                "commongramsAnalyzer":{
+                    "tokenizer":"whitespace",
+                    "filter":[ "common_grams" ]
+                },
+                "commongramsAnalyzer_file":{
+                    "tokenizer":"whitespace",
+                    "filter":[ "common_grams_file" ]
+                }
+            },
+            "filter":{
+                "common_grams":{
+                    "type":"common_grams",
+                    "common_words":[
+                        "brown",
+                        "fox"
+                    ]
+                },
+                "common_grams_file":{
+                    "type":"common_grams",
+                    "common_words_path":"org/elasticsearch/test/unit/index/analysis/commongrams/common_words.txt"
+                }
+            }
+        }
+    }
+}
@@ -0,0 +1,31 @@
+{
+    "index":{
+        "analysis":{
+            "analyzer":{
+                "commongramsAnalyzer":{
+                    "tokenizer":"whitespace",
+                    "filter":[ "common_grams" ]
+                },
+                "commongramsAnalyzer_file":{
+                    "tokenizer":"whitespace",
+                    "filter":[ "common_grams_file" ]
+                }
+            },
+            "filter":{
+                "common_grams":{
+                    "type":"common_grams",
+                    "query_mode" : true,
+                    "common_words":[
+                        "brown",
+                        "fox"
+                    ]
+                },
+                "common_grams_file":{
+                    "type":"common_grams",
+                    "query_mode" : true,
+                    "common_words_path":"org/elasticsearch/test/unit/index/analysis/commongrams/common_words.txt"
+                }
+            }
+        }
+    }
+}