From f4889dd846e9956a93a6d76c6377c53e17e54592 Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Fri, 29 Jul 2011 22:45:26 +0300 Subject: [PATCH] Analysis: Unique token filter, closes #1185. --- .../miscellaneous/UniqueTokenFilter.java | 88 +++++++++++++++++++ .../index/analysis/AnalysisModule.java | 1 + .../analysis/UniqueTokenFilterFactory.java | 46 ++++++++++ .../analysis/IndicesAnalysisService.java | 11 +++ .../miscellaneous/UniqueTokenFilterTests.java | 66 ++++++++++++++ 5 files changed, 212 insertions(+) create mode 100644 modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java create mode 100644 modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/UniqueTokenFilterFactory.java create mode 100644 modules/elasticsearch/src/test/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilterTests.java diff --git a/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java new file mode 100644 index 00000000000..bdbbe8c27f5 --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilter.java @@ -0,0 +1,88 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import org.apache.lucene.analysis.CharArraySet; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.Version; + +import java.io.IOException; + +/** + * A token filter that emits each distinct term once and drops later repeats of it. With {@code onlyOnSamePosition} + * set, a repeat is dropped only when it occurs at the same position (position increment of 0). + */ +public class UniqueTokenFilter extends TokenFilter { + + private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posIncAttribute = addAttribute(PositionIncrementAttribute.class); + + // use a fixed version, as we don't care about case sensitivity (the set is created with ignoreCase = false). + private final CharArraySet previous = new CharArraySet(Version.LUCENE_31, 8, false); + private final boolean onlyOnSamePosition; + + public UniqueTokenFilter(TokenStream in) { + this(in, false); + } + + public UniqueTokenFilter(TokenStream in, boolean onlyOnSamePosition) { + super(in); + this.onlyOnSamePosition = onlyOnSamePosition; + } + + @Override public final boolean incrementToken() throws IOException { + while (input.incrementToken()) { + final char term[] = termAttribute.buffer(); + final int length = termAttribute.length(); + + boolean duplicate; + if (onlyOnSamePosition) { + final int posIncrement = posIncAttribute.getPositionIncrement(); + if (posIncrement > 0) { + previous.clear(); /* position advanced: forget the terms seen at the previous position */ + } + + duplicate = (posIncrement == 0 && previous.contains(term, 0, length)); + } else { + duplicate = previous.contains(term, 0, length); + } + + // clone the term (exactly length chars of the attribute buffer), and add to the set of seen terms; this happens for duplicates too. 
+ char saved[] = new char[length]; + System.arraycopy(term, 0, saved, 0, length); + previous.add(saved); + + if (!duplicate) { + return true; /* not a duplicate: emit the current token */ + } + } + return false; /* the wrapped input has no more tokens */ + } + + @Override public final void reset() throws IOException { + super.reset(); + previous.clear(); /* forget all terms seen so far */ + } +} + + diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java index da646261b93..e3b0ca755d5 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/AnalysisModule.java @@ -337,6 +337,7 @@ public class AnalysisModule extends AbstractModule { tokenFiltersBindings.processTokenFilter("edgeNGram", EdgeNGramTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("edge_ngram", EdgeNGramTokenFilterFactory.class); tokenFiltersBindings.processTokenFilter("shingle", ShingleTokenFilterFactory.class); + tokenFiltersBindings.processTokenFilter("unique", UniqueTokenFilterFactory.class); } @Override public void processTokenizers(TokenizersBindings tokenizersBindings) { diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/UniqueTokenFilterFactory.java b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/UniqueTokenFilterFactory.java new file mode 100644 index 00000000000..86590df2781 --- /dev/null +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/index/analysis/UniqueTokenFilterFactory.java @@ -0,0 +1,46 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.index.analysis; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; +import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.inject.assistedinject.Assisted; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.Index; +import org.elasticsearch.index.settings.IndexSettings; + +/** + * Wraps token streams in a {@link UniqueTokenFilter}, passing on the boolean {@code only_on_same_position} setting (default {@code false}). + */ +public class UniqueTokenFilterFactory extends AbstractTokenFilterFactory { + + private final boolean onlyOnSamePosition; + + @Inject public UniqueTokenFilterFactory(Index index, @IndexSettings Settings indexSettings, + @Assisted String name, @Assisted Settings settings) { + super(index, indexSettings, name, settings); + this.onlyOnSamePosition = settings.getAsBoolean("only_on_same_position", false); + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new UniqueTokenFilter(tokenStream, onlyOnSamePosition); + } +} diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java b/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java index 169217fb30f..e944cf0fe56 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/indices/analysis/IndicesAnalysisService.java @@ -49,6 +49,7 @@ import org.apache.lucene.analysis.hy.ArmenianAnalyzer; import 
org.apache.lucene.analysis.id.IndonesianAnalyzer; import org.apache.lucene.analysis.it.ItalianAnalyzer; import org.apache.lucene.analysis.miscellaneous.PatternAnalyzer; +import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter; import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator; import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; @@ -422,6 +423,16 @@ public class IndicesAnalysisService extends AbstractComponent { } })); + tokenFilterFactories.put("unique", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { + @Override public String name() { + return "unique"; + } + + @Override public TokenStream create(TokenStream tokenStream) { + return new UniqueTokenFilter(tokenStream); /* single-argument constructor, i.e. onlyOnSamePosition = false */ + } + })); + // Extended Token Filters tokenFilterFactories.put("snowball", new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() { @Override public String name() { diff --git a/modules/elasticsearch/src/test/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilterTests.java b/modules/elasticsearch/src/test/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilterTests.java new file mode 100644 index 00000000000..0bdbf26aee9 --- /dev/null +++ b/modules/elasticsearch/src/test/java/org/apache/lucene/analysis/miscellaneous/UniqueTokenFilterTests.java @@ -0,0 +1,66 @@ +/* + * Licensed to Elastic Search and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. Elastic Search licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.lucene.analysis.miscellaneous; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.ReusableAnalyzerBase; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.elasticsearch.common.lucene.Lucene; +import org.testng.annotations.Test; + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; + +import static org.hamcrest.MatcherAssert.*; +import static org.hamcrest.Matchers.*; + +/** Checks that {@link UniqueTokenFilter} emits a repeated term only once. + */ +@Test +public class UniqueTokenFilterTests { + + @Test public void simpleTest() throws IOException { + Analyzer analyzer = new ReusableAnalyzerBase() { + @Override + protected TokenStreamComponents createComponents(String fieldName, + Reader reader) { + Tokenizer t = new WhitespaceTokenizer(Lucene.VERSION, reader); + return new TokenStreamComponents(t, new UniqueTokenFilter(t)); + } + }; + + TokenStream test = analyzer.reusableTokenStream("test", new StringReader("this test with test")); + CharTermAttribute termAttribute = test.addAttribute(CharTermAttribute.class); + assertThat(test.incrementToken(), equalTo(true)); + assertThat(termAttribute.toString(), equalTo("this")); + + assertThat(test.incrementToken(), equalTo(true)); + assertThat(termAttribute.toString(), equalTo("test")); + + assertThat(test.incrementToken(), equalTo(true)); + assertThat(termAttribute.toString(), equalTo("with")); + + 
assertThat(test.incrementToken(), equalTo(false)); /* the second "test" is dropped as a duplicate, so the stream ends after "with" */ + } +}