LUCENE-2413: consolidate RemoveDuplicatesTokenFilter to contrib/analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940788 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-05-04 09:55:43 +00:00
parent 9238c5f5ca
commit 6fa480a5ed
5 changed files with 87 additions and 6 deletions

View File

@ -163,6 +163,8 @@ New features
constructs.
- o.a.l.analysis.miscellaneous.WordDelimiterFilter: TokenFilter that splits words
into subwords and performs optional transformations on subword groups.
- o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which
filters out Tokens that have the same position and term text as the previous token.
(... in progress)
Build

View File

@ -15,7 +15,7 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;

View File

@ -15,8 +15,9 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.miscellaneous;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@ -26,7 +27,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.util.Iterator;
import java.util.Arrays;
public class TestRemoveDuplicatesTokenFilter extends BaseTokenTestCase {
public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {
public static Token tok(int pos, String t, int start, int end) {
Token tok = new Token(t,start,end);
@ -41,8 +42,7 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenTestCase {
throws Exception {
final Iterator<Token> toks = Arrays.asList(tokens).iterator();
RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory();
final TokenStream ts = factory.create
final TokenStream ts = new RemoveDuplicatesTokenFilter(
(new TokenStream() {
CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@ -59,7 +59,7 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenTestCase {
return false;
}
}
});
}));
assertTokenStreamContents(ts, expected.split("\\s"));
}

View File

@ -18,6 +18,7 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
/**
* @version $Id:$

View File

@ -0,0 +1,78 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.util.Iterator;
import java.util.Arrays;
/**
 * Smoke test for {@link RemoveDuplicatesTokenFilterFactory}: feeds a canned
 * sequence of tokens through the stream the factory creates and checks the
 * terms that come out.
 */
public class TestRemoveDuplicatesTokenFilterFactory extends BaseTokenTestCase {

  /**
   * Creates a token carrying the given term text, offsets and position increment.
   *
   * @param pos position increment to set on the token
   * @param t term text
   * @param start start offset
   * @param end end offset
   * @return the newly built token
   */
  public static Token tok(int pos, String t, int start, int end) {
    Token token = new Token(t, start, end);
    token.setPositionIncrement(pos);
    return token;
  }

  /**
   * Creates a token with the given term text and position increment, using
   * zero for both offsets.
   *
   * @param pos position increment to set on the token
   * @param t term text
   * @return the newly built token
   */
  public static Token tok(int pos, String t) {
    return tok(pos, t, 0, 0);
  }

  /**
   * Runs the supplied tokens through a stream created by the factory and
   * asserts that the resulting terms match {@code expected}.
   *
   * @param expected whitespace-separated list of the terms expected in the output
   * @param tokens input tokens, replayed in the order given
   * @throws Exception if the token stream assertion helper throws
   */
  public void testDups(final String expected, final Token... tokens)
      throws Exception {

    final Iterator<Token> pending = Arrays.asList(tokens).iterator();
    RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory();

    // Minimal source stream that copies each canned Token into the attributes.
    final TokenStream ts = factory.create(new TokenStream() {
      CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
      PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

      public boolean incrementToken() {
        if (!pending.hasNext()) {
          return false;
        }
        clearAttributes();
        Token current = pending.next();
        termAtt.setEmpty().append(current.term());
        offsetAtt.setOffset(current.startOffset(), current.endOffset());
        posIncAtt.setPositionIncrement(current.getPositionIncrement());
        return true;
      }
    });

    assertTokenStreamContents(ts, expected.split("\\s"));
  }

  /**
   * The second "B" arrives with position increment 0 and is expected to be
   * dropped; "D" also has increment 0 but different text from "C", so it is
   * expected to stay.
   */
  public void testSimpleDups() throws Exception {
    testDups("A B C D E",
        tok(1, "A", 0, 4),
        tok(1, "B", 5, 10),
        tok(0, "B", 11, 15),
        tok(1, "C", 16, 20),
        tok(0, "D", 16, 20),
        tok(1, "E", 21, 25));
  }
}