diff --git a/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java
new file mode 100644
index 00000000000..e6a3e350c7e
--- /dev/null
+++ b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilter.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import java.io.IOException;
+
+/**
+ * A token filter that truncates tokens: every token longer than {@code size} characters
+ * is cut down to its first {@code size} characters, shorter tokens pass through unchanged.
+ * The limit is compared against {@link CharTermAttribute#length()}.
+ */
+public class TruncateTokenFilter extends TokenFilter {
+
+    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+
+    private final int size;
+
+    /**
+     * @param in   the token stream whose tokens are truncated
+     * @param size the maximum number of characters kept per token, at least 1
+     * @throws IllegalArgumentException if {@code size} is smaller than 1
+     */
+    public TruncateTokenFilter(TokenStream in, int size) {
+        super(in);
+        if (size < 1) {
+            throw new IllegalArgumentException("size must be a positive number, got [" + size + "]");
+        }
+        this.size = size;
+    }
+
+    @Override public final boolean incrementToken() throws IOException {
+        if (!input.incrementToken()) {
+            return false;
+        }
+        // Only ever shrink the term; tokens within the limit are left untouched.
+        if (termAttribute.length() > size) {
+            termAttribute.setLength(size);
+        }
+        return true;
+    }
+}
diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
index e3b0ca755d5..dcd2bc133c5 100644
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java
@@ -338,6 +338,7 @@ public class AnalysisModule extends AbstractModule {
             tokenFiltersBindings.processTokenFilter("edge_ngram", EdgeNGramTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("shingle", ShingleTokenFilterFactory.class);
             tokenFiltersBindings.processTokenFilter("unique", UniqueTokenFilterFactory.class);
+            tokenFiltersBindings.processTokenFilter("truncate", TruncateTokenFilterFactory.class);
         }
 
         @Override public void processTokenizers(TokenizersBindings tokenizersBindings) {
diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/TruncateTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/TruncateTokenFilterFactory.java
new file mode 100644
index 00000000000..32f47a51906
--- /dev/null
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/TruncateTokenFilterFactory.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.index.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
+import org.elasticsearch.ElasticSearchIllegalArgumentException;
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.settings.IndexSettings;
+
+/**
+ * Creates {@link TruncateTokenFilter}s from analysis settings. The {@code length} setting
+ * is required and must be a positive number.
+ */
+@AnalysisSettingsRequired
+public class TruncateTokenFilterFactory extends AbstractTokenFilterFactory {
+
+    private final int length;
+
+    @Inject public TruncateTokenFilterFactory(Index index, @IndexSettings Settings indexSettings,
+                                              @Assisted String name, @Assisted Settings settings) {
+        super(index, indexSettings, name, settings);
+        // -1 stands for "not configured"; it is rejected together with any other non-positive value.
+        this.length = settings.getAsInt("length", -1);
+        if (length <= 0) {
+            throw new ElasticSearchIllegalArgumentException("length parameter must be provided and must be a positive number");
+        }
+    }
+
+    @Override public TokenStream create(TokenStream tokenStream) {
+        return new TruncateTokenFilter(tokenStream, length);
+    }
+}
diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java b/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java
index e944cf0fe56..4a87af70ba1 100644
--- a/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java
+++ b/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java
@@ -49,6 +49,7 @@ import org.apache.lucene.analysis.hy.ArmenianAnalyzer;
 import org.apache.lucene.analysis.id.IndonesianAnalyzer;
 import org.apache.lucene.analysis.it.ItalianAnalyzer;
 import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer;
+import org.apache.lucene.analysis.miscellaneous.TruncateTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
 import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
@@ -433,6 +434,16 @@ public class IndicesAnalysisService extends AbstractComponent {
             }
         }));
 
+        tokenFilterFactories.put("truncate", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
+            @Override public String name() {
+                return "truncate";
+            }
+
+            @Override public TokenStream create(TokenStream tokenStream) {
+                return new TruncateTokenFilter(tokenStream, 10);
+            }
+        }));
+
         // Extended Token Filters
         tokenFilterFactories.put("snowball", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
             @Override public String name() {
diff --git a/modules/elasticsearch/src/test/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterTests.java b/modules/elasticsearch/src/test/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterTests.java
new file mode 100644
index 00000000000..87dfa970644
--- /dev/null
+++ b/modules/elasticsearch/src/test/java/org/apache/lucene/analysis/miscellaneous/TruncateTokenFilterTests.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to Elastic Search and Shay Banon under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. Elastic Search licenses this
+ * file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.lucene.analysis.miscellaneous;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.ReusableAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.elasticsearch.common.lucene.Lucene;
+import org.testng.annotations.Test;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+
+import static org.hamcrest.MatcherAssert.*;
+import static org.hamcrest.Matchers.*;
+
+/**
+ * Tests for {@link TruncateTokenFilter}.
+ */
+@Test
+public class TruncateTokenFilterTests {
+
+    @Test public void simpleTest() throws IOException {
+        Analyzer analyzer = new ReusableAnalyzerBase() {
+            @Override
+            protected TokenStreamComponents createComponents(String fieldName,
+                                                             Reader reader) {
+                Tokenizer t = new WhitespaceTokenizer(Lucene.VERSION, reader);
+                return new TokenStreamComponents(t, new TruncateTokenFilter(t, 3));
+            }
+        };
+
+        // Tokens of up to 3 chars must survive unchanged, longer ones are cut to 3 chars.
+        TokenStream test = analyzer.reusableTokenStream("test", new StringReader("a bb ccc dddd eeeee"));
+        CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class);
+        assertThat(test.incrementToken(), equalTo(true));
+        assertThat(termAttribute.toString(), equalTo("a"));
+
+        assertThat(test.incrementToken(), equalTo(true));
+        assertThat(termAttribute.toString(), equalTo("bb"));
+
+        assertThat(test.incrementToken(), equalTo(true));
+        assertThat(termAttribute.toString(), equalTo("ccc"));
+
+        assertThat(test.incrementToken(), equalTo(true));
+        assertThat(termAttribute.toString(), equalTo("ddd"));
+
+        assertThat(test.incrementToken(), equalTo(true));
+        assertThat(termAttribute.toString(), equalTo("eee"));
+
+        assertThat(test.incrementToken(), equalTo(false));
+    }
+
+    @Test(expectedExceptions = IllegalArgumentException.class)
+    public void nonPositiveSizeIsRejected() {
+        new TruncateTokenFilter(new WhitespaceTokenizer(Lucene.VERSION, new StringReader("a")), 0);
+    }
+}