diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt
index ec3914b7866..a7df800f265 100644
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@@ -163,6 +163,8 @@ New features
     constructs.
   - o.a.l.analysis.miscellaneous.WordDelimiterFilter: TokenFilter that splits words
     into subwords and performs optional transformations on subword groups.
+  - o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which
+    filters out Tokens at the same position and Term text as the previous token.
   (... in progress)
 
 Build
diff --git a/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java b/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.java
similarity index 98%
rename from solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java
rename to lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.java
index 2978115867d..200ae0a1956 100644
--- a/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java
+++ b/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/RemoveDuplicatesTokenFilter.java
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenFilter;
diff --git a/solr/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java b/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java
similarity index 93%
rename from solr/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java
rename to lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java
index 5a51117346a..75b8b88cb69 100644
--- a/solr/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilter.java
+++ b/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/miscellaneous/TestRemoveDuplicatesTokenFilter.java
@@ -15,8 +15,9 @@
  * limitations under the License.
  */
 
-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.miscellaneous;
 
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -26,7 +27,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import java.util.Iterator;
 import java.util.Arrays;
 
-public class TestRemoveDuplicatesTokenFilter extends BaseTokenTestCase {
+public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {
 
   public static Token tok(int pos, String t, int start, int end) {
     Token tok = new Token(t,start,end);
@@ -41,8 +42,7 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenTestCase {
     throws Exception {
 
     final Iterator<Token> toks = Arrays.asList(tokens).iterator();
-    RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory();
-    final TokenStream ts = factory.create
+    final TokenStream ts = new RemoveDuplicatesTokenFilter(
       (new TokenStream() {
           CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
           OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -59,7 +59,7 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenTestCase {
             return false;
           }
         }
-      });
+      }));
 
     assertTokenStreamContents(ts, expected.split("\\s"));
   }
diff --git a/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java
index 202eb0e6b38..a2017552817 100644
--- a/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java
@@ -18,6 +18,7 @@
 package org.apache.solr.analysis;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
 
 /**
  * @version $Id:$
diff --git a/solr/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilterFactory.java b/solr/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilterFactory.java
new file mode 100644
index 00000000000..4a629171011
--- /dev/null
+++ b/solr/src/test/org/apache/solr/analysis/TestRemoveDuplicatesTokenFilterFactory.java
@@ -0,0 +1,78 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+import java.util.Iterator;
+import java.util.Arrays;
+
+/** Simple tests to ensure this factory is working */
+public class TestRemoveDuplicatesTokenFilterFactory extends BaseTokenTestCase {
+
+  public static Token tok(int pos, String t, int start, int end) {
+    Token tok = new Token(t,start,end);
+    tok.setPositionIncrement(pos);
+    return tok;
+  }
+  public static Token tok(int pos, String t) {
+    return tok(pos, t, 0,0);
+  }
+
+  public void testDups(final String expected, final Token... tokens)
+    throws Exception {
+
+    final Iterator<Token> toks = Arrays.asList(tokens).iterator();
+    RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory();
+    final TokenStream ts = factory.create
+      (new TokenStream() {
+          CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+          OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+          PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+          public boolean incrementToken() {
+            if (toks.hasNext()) {
+              clearAttributes();
+              Token tok = toks.next();
+              termAtt.setEmpty().append(tok.term());
+              offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
+              posIncAtt.setPositionIncrement(tok.getPositionIncrement());
+              return true;
+            } else {
+              return false;
+            }
+          }
+        });
+
+    assertTokenStreamContents(ts, expected.split("\\s"));
+  }
+
+  public void testSimpleDups() throws Exception {
+    testDups("A B C D E"
+             ,tok(1,"A", 0, 4)
+             ,tok(1,"B", 5, 10)
+             ,tok(0,"B",11, 15)
+             ,tok(1,"C",16, 20)
+             ,tok(0,"D",16, 20)
+             ,tok(1,"E",21, 25)
+             );
+  }
+}
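
Usage note (not part of the patch): after this move the filter is constructed directly from org.apache.lucene.analysis.miscellaneous, and the Solr RemoveDuplicatesTokenFilterFactory only needs the added import to keep producing it. The sketch below is a minimal, hypothetical example of wrapping a hand-built TokenStream that stacks a duplicate term at the same position; it assumes only the API already visible above (CharTermAttribute, PositionIncrementAttribute, incrementToken()), and the class and variable names are illustrative.

// Hypothetical usage sketch, not part of the patch; class and variable names
// are illustrative only.
import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class RemoveDuplicatesSketch {

  public static void main(String[] args) throws IOException {
    // "wifi" is stacked twice at the same position (position increment 0),
    // as a synonym-style filter might do. The second copy has the same term
    // text at the same position, so RemoveDuplicatesTokenFilter drops it.
    final String[] terms = { "wi-fi", "wifi", "wifi" };
    final int[] increments = { 1, 0, 0 };

    TokenStream source = new TokenStream() {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final PositionIncrementAttribute posIncAtt =
          addAttribute(PositionIncrementAttribute.class);
      private int i = 0;

      @Override
      public boolean incrementToken() {
        if (i >= terms.length) {
          return false;
        }
        clearAttributes();
        termAtt.setEmpty().append(terms[i]);
        posIncAtt.setPositionIncrement(increments[i]);
        i++;
        return true;
      }
    };

    TokenStream filtered = new RemoveDuplicatesTokenFilter(source);
    CharTermAttribute termAtt = filtered.addAttribute(CharTermAttribute.class);
    while (filtered.incrementToken()) {
      System.out.println(termAtt.toString()); // prints "wi-fi" then "wifi"
    }
    filtered.close();
  }
}

The hand-built source mirrors the anonymous TokenStream used in the tests above; in a real analysis chain the duplicates would typically come from a tokenizer plus token-stacking filters rather than a fixed array.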