Add Lucene CommonGrams/CommonGramsQuery token filter

Both filters are merged into a single "common_grams" token filter.

Closes #3202
This commit is contained in:
Cédric HOURCADE 2013-06-18 22:10:45 +01:00
parent 5aa0a8438f
commit 71849668e9
8 changed files with 389 additions and 49 deletions

View File

@ -142,74 +142,51 @@ public class Analysis {
.put("_turkish_", TurkishAnalyzer.getDefaultStopSet())
.immutableMap();
public static CharArraySet parseArticles(Environment env, Settings settings, Version version) {
String value = settings.get("articles");
public static CharArraySet parseWords(Environment env, Settings settings, String name, CharArraySet defaultWords, ImmutableMap<String, Set<?>> namedWords, Version version, boolean ignoreCase) {
String value = settings.get(name);
if (value != null) {
if ("_none_".equals(value)) {
return CharArraySet.EMPTY_SET;
} else {
return new CharArraySet(version, Strings.commaDelimitedListToSet(value), settings.getAsBoolean("articles_case", false));
return resolveNamedWords(Strings.commaDelimitedListToSet(value), namedWords, version, ignoreCase);
}
}
String[] articles = settings.getAsArray("articles", null);
if (articles != null) {
return new CharArraySet(version, Arrays.asList(articles), settings.getAsBoolean("articles_case", false));
}
CharArraySet pathLoadedArticles = getWordSet(env, settings, "articles", version);
if (pathLoadedArticles != null) {
return pathLoadedArticles;
List<String> pathLoadedWords = getWordList(env, settings, name);
if (pathLoadedWords != null) {
return resolveNamedWords(pathLoadedWords, namedWords, version, ignoreCase);
}
return defaultWords;
}
return null;
/**
 * Resolves the {@code common_words} setting into a {@link CharArraySet},
 * sharing the named stop-word lists for name expansion.
 *
 * @param defaultCommonWords returned when no common words are configured; may be null
 */
public static CharArraySet parseCommonWords(Environment env, Settings settings, CharArraySet defaultCommonWords, Version version, boolean ignoreCase) {
    final String settingName = "common_words";
    return parseWords(env, settings, settingName, defaultCommonWords, namedStopWords, version, ignoreCase);
}
/**
 * Parses the {@code articles} word list. Case sensitivity is controlled by the
 * {@code articles_case} setting (defaults to case-sensitive). No named lists
 * and no default set apply to articles.
 */
public static CharArraySet parseArticles(Environment env, Settings settings, Version version) {
    boolean caseInsensitive = settings.getAsBoolean("articles_case", false);
    return parseWords(env, settings, "articles", null, null, version, caseInsensitive);
}
/**
 * Parses stop words, reading case sensitivity from the {@code stopwords_case}
 * setting (defaults to case-sensitive) before delegating to the explicit overload.
 */
public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords, Version version) {
    final boolean caseInsensitive = settings.getAsBoolean("stopwords_case", false);
    return parseStopWords(env, settings, defaultStopWords, version, caseInsensitive);
}
public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords, Version version, boolean ignore_case) {
String value = settings.get("stopwords");
if (value != null) {
if ("_none_".equals(value)) {
return CharArraySet.EMPTY_SET;
} else {
return resolveNamedStopWords(Strings.commaDelimitedListToSet(value), version, ignore_case);
}
}
String[] stopWords = settings.getAsArray("stopwords", null);
if (stopWords != null) {
return resolveNamedStopWords(stopWords, version, ignore_case);
}
List<String> pathLoadedStopWords = getWordList(env, settings, "stopwords");
if (pathLoadedStopWords != null) {
return resolveNamedStopWords(pathLoadedStopWords, version, ignore_case);
}
return defaultStopWords;
public static CharArraySet parseStopWords(Environment env, Settings settings, CharArraySet defaultStopWords, Version version, boolean ignoreCase) {
return parseWords(env, settings, "stopwords", defaultStopWords, namedStopWords, version, ignoreCase);
}
private static CharArraySet resolveNamedStopWords(Collection<String> words, Version version, boolean ignore_case) {
CharArraySet setStopWords = new CharArraySet(version, words.size(), ignore_case);
for (String stopWord : words) {
if (namedStopWords.containsKey(stopWord)) {
setStopWords.addAll(namedStopWords.get(stopWord));
private static CharArraySet resolveNamedWords(Collection<String> words, ImmutableMap<String, Set<?>> namedWords, Version version, boolean ignoreCase) {
if (namedWords == null) {
return new CharArraySet(version, words, ignoreCase);
}
CharArraySet setWords = new CharArraySet(version, words.size(), ignoreCase);
for (String word : words) {
if (namedWords.containsKey(word)) {
setWords.addAll(namedWords.get(word));
} else {
setStopWords.add(stopWord);
setWords.add(word);
}
}
return setStopWords;
}
private static CharArraySet resolveNamedStopWords(String[] words, Version version, boolean ignore_case) {
CharArraySet setStopWords = new CharArraySet(version, words.length, ignore_case);
for (String stopWord : words) {
if (namedStopWords.containsKey(stopWord)) {
setStopWords.addAll(namedStopWords.get(stopWord));
} else {
setStopWords.add(stopWord);
}
}
return setStopWords;
return setWords;
}
public static CharArraySet getWordSet(Environment env, Settings settings, String settingsPrefix, Version version) {

View File

@ -438,6 +438,7 @@ public class AnalysisModule extends AbstractModule {
tokenFiltersBindings.processTokenFilter("truncate", TruncateTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("trim", TrimTokenFilterFactory.class);
tokenFiltersBindings.processTokenFilter("limit", LimitTokenCountFilterFactory.class);
tokenFiltersBindings.processTokenFilter("common_grams", CommonGramsTokenFilterFactory.class);
}
@Override

View File

@ -0,0 +1,68 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettings;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
/**
 * Factory for the "common_grams" token filter, wrapping Lucene's
 * {@link CommonGramsFilter} and, when {@code query_mode} is enabled,
 * {@link CommonGramsQueryFilter}.
 * <p>
 * Settings:
 * <ul>
 * <li>{@code common_words} / {@code common_words_path} — required word list</li>
 * <li>{@code ignore_case} — match common words case-insensitively (default: false)</li>
 * <li>{@code query_mode} — wrap in the query-time variant (default: false)</li>
 * </ul>
 */
@AnalysisSettingsRequired
public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {

    private final CharArraySet words;
    private final boolean ignoreCase;
    private final boolean queryMode;

    @Inject
    public CommonGramsTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) {
        super(index, indexSettings, name, settings);
        this.ignoreCase = settings.getAsBoolean("ignore_case", false);
        this.queryMode = settings.getAsBoolean("query_mode", false);
        // There is no sensible default word list, so null default + null result means
        // the user configured neither [common_words] nor [common_words_path].
        this.words = Analysis.parseCommonWords(env, settings, null, version, ignoreCase);

        if (this.words == null) {
            throw new ElasticSearchIllegalArgumentException("missing or empty [common_words] or [common_words_path] configuration for common_grams token filter");
        }
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        CommonGramsFilter filter = new CommonGramsFilter(version, tokenStream, words);
        if (queryMode) {
            // Query mode additionally drops unigrams covered by a gram (see CommonGramsQueryFilter).
            return new CommonGramsQueryFilter(filter);
        } else {
            return filter;
        }
    }
}

View File

@ -31,6 +31,7 @@ import org.apache.lucene.analysis.ca.CatalanAnalyzer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.analysis.commongrams.*;
import org.apache.lucene.analysis.core.*;
import org.apache.lucene.analysis.cz.CzechAnalyzer;
import org.apache.lucene.analysis.cz.CzechStemFilter;
@ -78,6 +79,7 @@ import org.apache.lucene.analysis.sv.SwedishAnalyzer;
import org.apache.lucene.analysis.th.ThaiAnalyzer;
import org.apache.lucene.analysis.tr.TurkishAnalyzer;
import org.apache.lucene.analysis.util.ElisionFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.elasticsearch.ElasticSearchIllegalStateException;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
@ -393,6 +395,18 @@ public class IndicesAnalysisService extends AbstractComponent {
}
}));
tokenFilterFactories.put("common_grams", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override
public String name() {
return "common_grams";
}
@Override
public TokenStream create(TokenStream tokenStream) {
return new CommonGramsFilter(Lucene.ANALYZER_VERSION, tokenStream, CharArraySet.EMPTY_SET);
}
}));
tokenFilterFactories.put("lowercase", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
@Override
public String name() {

View File

@ -0,0 +1,218 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.index.analysis.commongrams;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.analysis.AnalysisService;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.unit.index.analysis.AnalysisTestsHelper;
import org.testng.annotations.Test;
import org.testng.Assert;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.instanceOf;
/**
 * Tests for the "common_grams" token filter factory: required-setting validation,
 * index-time gram emission, query-mode gram emission, and the ignore_case setting,
 * both via inline settings and via JSON analyzer configurations.
 */
public class CommonGramsTokenFilterFactoryTests {
// Configuring the filter without [common_words]/[common_words_path] must fail
// with an ElasticSearchIllegalArgumentException (wrapped as the cause).
@Test
public void testDefault() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_default.type", "common_grams").build();
try {
AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
Assert.fail("[common_words] or [common_words_path] is set");
} catch (Exception e) {
assertThat(e.getCause(), instanceOf(ElasticSearchIllegalArgumentException.class));
}
}
// When no token matches the configured common words, the stream passes through
// unchanged — with query_mode left at its default and explicitly set to false.
@Test
public void testWithoutCommonWordsMatch() throws IOException {
{
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_default.type", "common_grams")
.putArray("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
{
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_default");
String source = "the quick brown is a fox Or noT";
String[] expected = new String[] { "the", "quick", "brown", "is", "a", "fox", "Or", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
}
{
// Same expectation with query_mode explicitly disabled.
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_default.type", "common_grams")
.put("index.analysis.filter.common_grams_default.query_mode", false)
.putArray("index.analysis.filter.common_grams_default.common_words", "chromosome", "protein")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
{
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_default");
String source = "the quick brown is a fox Or noT";
String[] expected = new String[] { "the", "quick", "brown", "is", "a", "fox", "Or", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
}
}
// Index-time behavior: common words are kept as unigrams AND joined with their
// neighbors into "_"-separated grams. Exercises ignore_case true/false/default.
@Test
public void testSettings() throws IOException {
{
// ignore_case=true: "Or"/"Not" in the list match lowercase input tokens.
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_1.type", "common_grams")
.put("index.analysis.filter.common_grams_1.ignore_case", true)
.putArray("index.analysis.filter.common_grams_1.common_words", "the", "Or", "Not", "a", "is", "an", "they", "are")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_1");
String source = "the quick brown is a fox or noT";
String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a", "a_fox", "fox", "fox_or", "or", "or_noT", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
{
// ignore_case=false: "Or" in the list does NOT match input "or"; "noT" matches exactly.
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_2.type", "common_grams")
.put("index.analysis.filter.common_grams_2.ignore_case", false)
.putArray("index.analysis.filter.common_grams_2.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_2");
String source = "the quick brown is a fox or why noT";
String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a", "a_fox", "fox", "or", "why", "why_noT", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
{
// Default case handling (case-sensitive): "Or"/"noT" input tokens don't match the lowercase list.
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_3.type", "common_grams")
.putArray("index.analysis.filter.common_grams_3.common_words", "the", "or", "not", "a", "is", "an", "they", "are")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_3");
String source = "the quick brown is a fox Or noT";
String[] expected = new String[] { "the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a", "a_fox", "fox", "Or", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
}
// Full analyzer wiring from commongrams.json: one analyzer with inline
// common_words, one loading them from common_words.txt — same expected output.
@Test
public void testCommonGramsAnalysis() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/test/unit/index/analysis/commongrams/commongrams.json").build();
{
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer").analyzer();
String source = "the quick brown is a fox or not";
String[] expected = new String[] { "the", "quick", "quick_brown", "brown", "brown_is", "is", "a", "a_fox", "fox", "fox_or", "or", "not" };
AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
}
{
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer_file").analyzer();
String source = "the quick brown is a fox or not";
String[] expected = new String[] { "the", "quick", "quick_brown", "brown", "brown_is", "is", "a", "a_fox", "fox", "fox_or", "or", "not" };
AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
}
}
// Query-mode behavior: unigrams that are covered by an emitted gram are dropped
// (see CommonGramsQueryFilter). Same ignore_case permutations as testSettings.
@Test
public void testQueryModeSettings() throws IOException {
{
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_1.type", "common_grams")
.put("index.analysis.filter.common_grams_1.query_mode", true)
.putArray("index.analysis.filter.common_grams_1.common_words", "the", "Or", "Not", "a", "is", "an", "they", "are")
.put("index.analysis.filter.common_grams_1.ignore_case", true)
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_1");
String source = "the quick brown is a fox or noT";
String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox_or", "or_noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
{
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_2.type", "common_grams")
.put("index.analysis.filter.common_grams_2.query_mode", true)
.putArray("index.analysis.filter.common_grams_2.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
.put("index.analysis.filter.common_grams_2.ignore_case", false)
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_2");
String source = "the quick brown is a fox or why noT";
String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox", "or", "why_noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
{
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_3.type", "common_grams")
.put("index.analysis.filter.common_grams_3.query_mode", true)
.putArray("index.analysis.filter.common_grams_3.common_words", "the", "Or", "noT", "a", "is", "an", "they", "are")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_3");
String source = "the quick brown is a fox or why noT";
String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox", "or", "why_noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
{
Settings settings = ImmutableSettings.settingsBuilder().put("index.analysis.filter.common_grams_4.type", "common_grams")
.put("index.analysis.filter.common_grams_4.query_mode", true)
.putArray("index.analysis.filter.common_grams_4.common_words", "the", "or", "not", "a", "is", "an", "they", "are")
.build();
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
TokenFilterFactory tokenFilter = analysisService.tokenFilter("common_grams_4");
String source = "the quick brown is a fox Or noT";
String[] expected = new String[] { "the_quick", "quick", "brown_is", "is_a", "a_fox", "fox", "Or", "noT" };
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader(source));
AnalysisTestsHelper.assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
}
// Query-mode analyzer wiring from commongrams_query_mode.json: inline and
// file-based word lists must produce identical query-time token streams.
@Test
public void testQueryModeCommonGramsAnalysis() throws IOException {
Settings settings = ImmutableSettings.settingsBuilder().loadFromClasspath("org/elasticsearch/test/unit/index/analysis/commongrams/commongrams_query_mode.json").build();
{
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer").analyzer();
String source = "the quick brown is a fox or not";
String[] expected = new String[] { "the", "quick_brown", "brown_is", "is", "a_fox", "fox_or", "or", "not" };
AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
}
{
AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(settings);
Analyzer analyzer = analysisService.analyzer("commongramsAnalyzer_file").analyzer();
String source = "the quick brown is a fox or not";
String[] expected = new String[] { "the", "quick_brown", "brown_is", "is", "a_fox", "fox_or", "or", "not" };
AnalysisTestsHelper.assertSimpleTSOutput(analyzer.tokenStream("test", new StringReader(source)), expected);
}
}
}

View File

@ -0,0 +1,29 @@
{
"index":{
"analysis":{
"analyzer":{
"commongramsAnalyzer":{
"tokenizer":"whitespace",
"filter":[ "common_grams" ]
},
"commongramsAnalyzer_file":{
"tokenizer":"whitespace",
"filter":[ "common_grams_file" ]
}
},
"filter":{
"common_grams":{
"type":"common_grams",
"common_words":[
"brown",
"fox"
]
},
"common_grams_file":{
"type":"common_grams",
"common_words_path":"org/elasticsearch/test/unit/index/analysis/commongrams/common_words.txt"
}
}
}
}
}

View File

@ -0,0 +1,31 @@
{
"index":{
"analysis":{
"analyzer":{
"commongramsAnalyzer":{
"tokenizer":"whitespace",
"filter":[ "common_grams" ]
},
"commongramsAnalyzer_file":{
"tokenizer":"whitespace",
"filter":[ "common_grams_file" ]
}
},
"filter":{
"common_grams":{
"type":"common_grams",
"query_mode" : true,
"common_words":[
"brown",
"fox"
]
},
"common_grams_file":{
"type":"common_grams",
"query_mode" : true,
"common_words_path":"org/elasticsearch/test/unit/index/analysis/commongrams/common_words.txt"
}
}
}
}
}