diff --git a/src/main/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java b/src/main/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java new file mode 100644 index 00000000000..ec8ad4d12f3 --- /dev/null +++ b/src/main/java/org/apache/lucene/analysis/miscellaneous/KeywordRepeatFilter.java @@ -0,0 +1,71 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.lucene.analysis.miscellaneous; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.KeywordAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; + +import java.io.IOException; + + +/** + * This TokenFilter emits each incoming token twice, once as keyword and once as non-keyword; in other words, once with + * {@link KeywordAttribute#setKeyword(boolean)} set to true and once set to false. + * This is useful if used with a stem filter that respects the {@link KeywordAttribute} to index the stemmed and the + * un-stemmed version of a term into the same field. + */ +//LUCENE MONITOR - this will be included in Lucene 4.3. 
(it's a plain copy of the lucene version) + +public final class KeywordRepeatFilter extends TokenFilter { + private final KeywordAttribute keywordAttribute = addAttribute(KeywordAttribute.class); + private final PositionIncrementAttribute posIncAttr = addAttribute(PositionIncrementAttribute.class); + private State state; + + /** + * Construct a token stream filtering the given input. + */ + public KeywordRepeatFilter(TokenStream input) { + super(input); + } + + @Override + public boolean incrementToken() throws IOException { + if (state != null) { + restoreState(state); + posIncAttr.setPositionIncrement(0); + keywordAttribute.setKeyword(false); + state = null; + return true; + } + if (input.incrementToken()) { + state = captureState(); + keywordAttribute.setKeyword(true); + return true; + } + return false; + } + + @Override + public void reset() throws IOException { + super.reset(); + state = null; + } +} diff --git a/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java b/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java index 32f9c2f1b55..40ebfa9d16b 100644 --- a/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java +++ b/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java @@ -642,6 +642,17 @@ public class IndicesAnalysisService extends AbstractComponent { return new SnowballFilter(tokenStream, "Russian"); } })); + tokenFilterFactories.put("keyword_repeat", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override + public String name() { + return "keyword_repeat"; + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new KeywordRepeatFilter(tokenStream); + } + })); // Char Filter charFilterFactories.put("html_strip", new PreBuiltCharFilterFactoryFactory(new CharFilterFactory() { diff --git a/src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisModuleTests.java 
b/src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisModuleTests.java index 165d3748c7f..56820b36fed 100644 --- a/src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisModuleTests.java +++ b/src/test/java/org/elasticsearch/test/unit/index/analysis/AnalysisModuleTests.java @@ -20,7 +20,12 @@ package org.elasticsearch.test.unit.index.analysis; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.miscellaneous.KeywordRepeatFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.util.Version; import org.elasticsearch.common.inject.Injector; import org.elasticsearch.common.inject.ModulesBuilder; import org.elasticsearch.common.lucene.Lucene; @@ -42,6 +47,7 @@ import org.testng.annotations.Test; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; +import java.io.StringReader; import java.util.Set; import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; @@ -65,6 +71,15 @@ public class AnalysisModuleTests { Settings settings = settingsBuilder().loadFromClasspath("org/elasticsearch/test/unit/index/analysis/test1.yml").build(); testSimpleConfiguration(settings); } + + @Test + public void testDefaultFactory() { + AnalysisService analysisService = AnalysisTestsHelper.createAnalysisServiceFromSettings(ImmutableSettings.settingsBuilder().build()); + TokenFilterFactory tokenFilter = analysisService.tokenFilter("keyword_repeat"); + Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_36, new StringReader("foo bar")); + TokenStream stream = tokenFilter.create(tokenizer); + assertThat(stream, instanceOf(KeywordRepeatFilter.class)); + } private void testSimpleConfiguration(Settings settings) { Index index = new Index("test");