Add Lucene CommonGrams/CommonGramsQuery token filter

Both filters are merged into a single "common_grams" token filter.

Closes #3202
This commit is contained in:
Cédric HOURCADE 2013-06-18 22:10:45 +01:00
parent 5aa0a8438f
commit 71849668e9
8 changed files with 389 additions and 49 deletions

View File

@ -142,74 +142,51 @@ public class Analysis {
.put("_turkish_", TurkishAnalyzer.getDefaultStopSet())
.immutableMap();
public static CharArraySet parseArticles(Environment env, Settings settings, Version version) {
String value = settings.get("articles");
public static CharArraySet parseWords(Environment env, Settings settings, String name, CharArraySet defaultWords, ImmutableMap<String, Set<?>> namedWords, Version version, boolean ignoreCase) {
String value = settings.get(name);
if (value != null) {
if ("_none_".equals(value)) {
return CharArraySet.EMPTY_SET;
} else {
return new CharArraySet(version, Strings.commaDelimitedListToSet(value), settings.getAsBoolean("articles_case", false));
return resolveNamedWords(Strings.commaDelimitedListToSet(value), namedWords, version, ignoreCase);
}
}
String[] articles = settings.getAsArray("articles", null);
if (articles != null) {
return new CharArraySet(version, Arrays.asList(articles), settings.getAsBoolean("articles_case", false));
}
CharArraySet pathLoadedArticles = getWordSet(env, settings, "articles", version);
if (pathLoadedArticles != null) {
return pathLoadedArticles;
List<String> pathLoadedWords = getWordList(env, settings, name);
if (pathLoadedWords != null) {
return resolveNamedWords(pathLoadedWords, namedWords, version, ignoreCase);
}
return defaultWords;
}
return null;
/**
 * Resolves the {@code common_words} setting into a {@link CharArraySet},
 * sharing the named stop-word lists for name expansion.
 *
 * @param defaultCommonWords returned when no common words are configured; may be null
 */
public static CharArraySet parseCommonWords(Environment env, Settings settings, CharArraySet defaultCommonWords, Version version, boolean ignoreCase) {
    final String settingName = "common_words";
    return parseWords(env, settings, settingName, defaultCommonWords, namedStopWords, version, ignoreCase);
}
/**
 * Parses the {@code articles} word list. Case sensitivity is controlled by the
 * {@code articles_case} setting (defaults to case-sensitive). No named lists
 * and no default set apply to articles.
 */
public static CharArraySet parseArticles(Environment env, Settings settings, Version version) {
    boolean caseInsensitive = settings.getAsBoolean("articles_case", false);
    return parseWords(env, settings, "articles", null, null, version, caseInsensitive);
}
/**
 * Parses stop words, reading case sensitivity from the {@code stopwords_case}
 * setting (defaults to case-sensitive) before delegating to the explicit overload.
 */
public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords, Version version) {
    final boolean caseInsensitive = settings.getAsBoolean("stopwords_case", false);
    return parseStopWords(env, settings, defaultStopWords, version, caseInsensitive);
}
public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords, Version version, boolean ignore_case) {
String value = settings.get("stopwords");
if (value != null) {
if ("_none_".equals(value)) {
return CharArraySet.EMPTY_SET;
} else {
return resolveNamedStopWords(Strings.commaDelimitedListToSet(value), version, ignore_case);
}
}
String[] stopWords = settings.getAsArray("stopwords", null);
if (stopWords != null) {
return resolveNamedStopWords(stopWords, version, ignore_case);
}
List<String> pathLoadedStopWords = getWordList(env, settings, "stopwords");
if (pathLoadedStopWords != null) {
return resolveNamedStopWords(pathLoadedStopWords, version, ignore_case);
}
return defaultStopWords;
public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords, Version version, boolean ignoreCase) {
return parseWords(env, settings, "stopwords", defaultStopWords, namedStopWords, version, ignoreCase);
}
private static CharArraySet resolveNamedStopWords(Collection<String> words, Version version, boolean ignore_case) {
CharArraySet setStopWords = new CharArraySet(version, words.size(), ignore_case);
for (String stopWord : words) {
if (namedStopWords.containsKey(stopWord)) {
setStopWords.addAll(namedStopWords.get(stopWord));
private static CharArraySet resolveNamedWords(Collection<String> words, ImmutableMap<String, Set<?>> namedWords, Version version, boolean ignoreCase) {
if (namedWords == null) {
return new CharArraySet(version, words, ignoreCase);
}
CharArraySet setWords = new CharArraySet(version, words.size(), ignoreCase);
for (String word : words) {
if (namedWords.containsKey(word)) {
setWords.addAll(namedWords.get(word));
} else {
setStopWords.add(stopWord);
setWords.add(word);
}
}
return setStopWords;
}
private static CharArraySet resolveNamedStopWords(String[] words, Version version, boolean ignore_case) {
CharArraySet setStopWords = new CharArraySet(version, words.length, ignore_case);
for (String stopWord : words) {
if (namedStopWords.containsKey(stopWord)) {
setStopWords.addAll(namedStopWords.get(stopWord));
} else {
setStopWords.add(stopWord);
}
}
return setStopWords;
return setWords;
}
public static CharArraySet getWordSet(Environment env, Settings settings, String settingsPrefix, Version version) {

View File

@ -438,6 +438,7 @@ public class AnalysisModule extends AbstractModule {
tokenFiltersBindings.processTokenFilter("truncate", TruncateTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("trim", TrimTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("limit", LimitTokenCountFilterFactory.class);
tokenFiltersBindings.processTokenFilter("common_grams", CommonGramsTokenFilterFactory.class);
}
@Override

View File

@ -0,0 +1,68 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
/**
 * Factory for the "common_grams" token filter, wrapping Lucene's
 * {@link CommonGramsFilter} and, when {@code query_mode} is enabled,
 * {@link CommonGramsQueryFilter}.
 * <p>
 * Settings:
 * <ul>
 * <li>{@code common_words} / {@code common_words_path} — required word list</li>
 * <li>{@code ignore_case} — match common words case-insensitively (default: false)</li>
 * <li>{@code query_mode} — wrap in the query-time variant (default: false)</li>
 * </ul>
 */
@AnalysisSettingsRequired
public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {

    private final CharArraySet words;
    private final boolean ignoreCase;
    private final boolean queryMode;

    @Inject
    public CommonGramsTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        this.queryMode = settings.getAsBoolean("query_mode", false);
        // There is no sensible default word list, so null default + null result means
        // the user configured neither [common_words] nor [common_words_path].
        this.words = Analysis.parseCommonWords(env, settings, null, version, ignoreCase);

        if (this.words == null) {
            throw new ElasticSearchIllegalArgumentException("missing or empty [common_words] or [common_words_path] configuration for common_grams token filter");
        }
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        CommonGramsFilter filter = new CommonGramsFilter(version, tokenStream, words);
        if (queryMode) {
            // Query mode additionally drops unigrams covered by a gram (see CommonGramsQueryFilter).
            return new CommonGramsQueryFilter(filter);
        } else {
            return filter;
        }
    }
}

View File

@ -31,6 +31,7 @@ import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.commongrams.*;
import org.apache.lucene.analysis.core.*;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.cz.CzechStemFilter;
@ -78,6 +79,7 @@ import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.ElasticSearchIllegalStateException;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
@ -393,6 +395,18 @@ public class IndicesAnalysisService extends AbstractComponent {
}
}));
tokenFilterFactories.put("common_grams", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override
public String name() {
return "common_grams";
}
@Override
public TokenStream create(TokenStream tokenStream) {
return new CommonGramsFilter(Lucene.ANALYZER_VERSION, tokenStream, CharArraySet.EMPTY_SET);
}
}));
tokenFilterFactories.put("lowercase", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override
public String name() {

View File

@ -0,0 +1,218 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.index.analysis.commongrams;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper;
import org.testng.annotations.Test;
import org.testng.Assert;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.instanceOf;
/**
 * Tests for the "common_grams" token filter factory: required-setting validation,
 * index-time gram emission, query-mode gram emission, and the ignore_case setting,
 * both via inline settings and via JSON analyzer configurations.
 */
public class CommonGramsTokenFilterFactoryTests {
// Configuring the filter without [common_words]/[common_words_path] must fail
// with an ElasticSearchIllegalArgumentException (wrapped as the cause).
@Test
public void testDefault() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_default.type", "common_grams").build();
try {
AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
Assert.fail("[common_words] or [common_words_path] is set");
} catch (Exception e) {
assertThat(e.getCause(), instanceOf(ElasticSearchIllegalArgumentException.class));
}
}
// When no token matches the configured common words, the stream passes through
// unchanged — with query_mode left at its default and explicitly set to false.
@Test
public void testWithoutCommonWordsMatch() throws IOException {
{
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_default.type", "common_grams")
.putArray("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
{
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_default");
String source = "the quick brown is a fox Or noT";
String[] expected = new String[] { "the", "quick", "brown", "is", "a", "fox", "Or", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
}
{
// Same expectation with query_mode explicitly disabled.
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_default.type", "common_grams")
.put("index.analysis.filter.common_grams_default.query_mode", false)
.putArray("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
{
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_default");
String source = "the quick brown is a fox Or noT";
String[] expected = new String[] { "the", "quick", "brown", "is", "a", "fox", "Or", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
}
}
// Index-time behavior: common words are kept as unigrams AND joined with their
// neighbors into "_"-separated grams. Exercises ignore_case true/false/default.
@Test
public void testSettings() throws IOException {
{
// ignore_case=true: "Or"/"Not" in the list match lowercase input tokens.
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_1.type", "common_grams")
.put("index.analysis.filter.common_grams_1.ignore_case", true)
.putArray("index.analysis.filter.common_grams_1.common_words", "the", "Or", "Not", "a", "is", "an", "they", "are")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_1");
String source = "the quick brown is a fox or noT";
String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a", "a_fox", "fox", "fox_or", "or", "or_noT", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
{
// ignore_case=false: "Or" in the list does NOT match input "or"; "noT" matches exactly.
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_2.type", "common_grams")
.put("index.analysis.filter.common_grams_2.ignore_case", false)
.putArray("index.analysis.filter.common_grams_2.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_2");
String source = "the quick brown is a fox or why noT";
String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a", "a_fox", "fox", "or", "why", "why_noT", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
{
// Default case handling (case-sensitive): "Or"/"noT" input tokens don't match the lowercase list.
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_3.type", "common_grams")
.putArray("index.analysis.filter.common_grams_3.common_words", "the", "or", "not", "a", "is", "an", "they", "are")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_3");
String source = "the quick brown is a fox Or noT";
String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a", "a_fox", "fox", "Or", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
}
// Full analyzer wiring from commongrams.json: one analyzer with inline
// common_words, one loading them from common_words.txt — same expected output.
@Test
public void testCommonGramsAnalysis() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/test/unit/index/analysis/commongrams/commongrams.json").build();
{
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer").analyzer();
String source = "the quick brown is a fox or not";
String[] expected = new String[] { "the", "quick", "quick_brown", "brown", "brown_is", "is", "a", "a_fox", "fox", "fox_or", "or", "not" };
AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
}
{
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer_file").analyzer();
String source = "the quick brown is a fox or not";
String[] expected = new String[] { "the", "quick", "quick_brown", "brown", "brown_is", "is", "a", "a_fox", "fox", "fox_or", "or", "not" };
AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
}
}
// Query-mode behavior: unigrams that are covered by an emitted gram are dropped
// (see CommonGramsQueryFilter). Same ignore_case permutations as testSettings.
@Test
public void testQueryModeSettings() throws IOException {
{
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_1.type", "common_grams")
.put("index.analysis.filter.common_grams_1.query_mode", true)
.putArray("index.analysis.filter.common_grams_1.common_words", "the", "Or", "Not", "a", "is", "an", "they", "are")
.put("index.analysis.filter.common_grams_1.ignore_case", true)
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_1");
String source = "the quick brown is a fox or noT";
String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox_or", "or_noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
{
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_2.type", "common_grams")
.put("index.analysis.filter.common_grams_2.query_mode", true)
.putArray("index.analysis.filter.common_grams_2.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
.put("index.analysis.filter.common_grams_2.ignore_case", false)
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_2");
String source = "the quick brown is a fox or why noT";
String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox", "or", "why_noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
{
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_3.type", "common_grams")
.put("index.analysis.filter.common_grams_3.query_mode", true)
.putArray("index.analysis.filter.common_grams_3.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_3");
String source = "the quick brown is a fox or why noT";
String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox", "or", "why_noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
{
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_4.type", "common_grams")
.put("index.analysis.filter.common_grams_4.query_mode", true)
.putArray("index.analysis.filter.common_grams_4.common_words", "the", "or", "not", "a", "is", "an", "they", "are")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_4");
String source = "the quick brown is a fox Or noT";
String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox", "Or", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
}
// Query-mode analyzer wiring from commongrams_query_mode.json: inline and
// file-based word lists must produce identical query-time token streams.
@Test
public void testQueryModeCommonGramsAnalysis() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/test/unit/index/analysis/commongrams/commongrams_query_mode.json").build();
{
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer").analyzer();
String source = "the quick brown is a fox or not";
String[] expected = new String[] { "the", "quick_brown", "brown_is", "is", "a_fox", "fox_or", "or", "not" };
AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
}
{
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer_file").analyzer();
String source = "the quick brown is a fox or not";
String[] expected = new String[] { "the", "quick_brown", "brown_is", "is", "a_fox", "fox_or", "or", "not" };
AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
}
}
}

View File

@ -0,0 +1,29 @@
{
"index":{
"analysis":{
"analyzer":{
"commongramsAnalyzer":{
"tokenizer":"whitespace",
"filter":[ "common_grams" ]
},
"commongramsAnalyzer_file":{
"tokenizer":"whitespace",
"filter":[ "common_grams_file" ]
}
},
"filter":{
"common_grams":{
"type":"common_grams",
"common_words":[
"brown",
"fox"
]
},
"common_grams_file":{
"type":"common_grams",
"common_words_path":"org/elasticsearch/test/unit/index/analysis/commongrams/common_words.txt"
}
}
}
}
}

View File

@ -0,0 +1,31 @@
{
"index":{
"analysis":{
"analyzer":{
"commongramsAnalyzer":{
"tokenizer":"whitespace",
"filter":[ "common_grams" ]
},
"commongramsAnalyzer_file":{
"tokenizer":"whitespace",
"filter":[ "common_grams_file" ]
}
},
"filter":{
"common_grams":{
"type":"common_grams",
"query_mode" : true,
"common_words":[
"brown",
"fox"
]
},
"common_grams_file":{
"type":"common_grams",
"query_mode" : true,
"common_words_path":"org/elasticsearch/test/unit/index/analysis/commongrams/common_words.txt"
}
}
}
}
}