mirror of https://github.com/apache/lucene.git
LUCENE-1466: add test case for MappingCharFilter
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@787875 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
629c7ed4c7
commit
7792aebe92
|
@ -0,0 +1,185 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public abstract class BaseTokenTestCase extends LuceneTestCase {
|
||||
public static String tsToString(TokenStream in) throws IOException {
|
||||
StringBuilder out = new StringBuilder();
|
||||
Token t = in.next();
|
||||
if (null != t)
|
||||
out.append(new String(t.termBuffer(), 0, t.termLength()));
|
||||
|
||||
for (t = in.next(); null != t; t = in.next()) {
|
||||
out.append(" ").append(new String(t.termBuffer(), 0, t.termLength()));
|
||||
}
|
||||
in.close();
|
||||
return out.toString();
|
||||
}
|
||||
/*
|
||||
public List<String> tok2str(Iterable<Token> tokLst) {
|
||||
ArrayList<String> lst = new ArrayList<String>();
|
||||
for ( Token t : tokLst ) {
|
||||
lst.add( new String(t.termBuffer(), 0, t.termLength()));
|
||||
}
|
||||
return lst;
|
||||
}
|
||||
*/
|
||||
|
||||
public void assertTokEqual(List/*<Token>*/ a, List/*<Token>*/ b) {
|
||||
assertTokEq(a,b,false);
|
||||
assertTokEq(b,a,false);
|
||||
}
|
||||
|
||||
public void assertTokEqualOff(List/*<Token>*/ a, List/*<Token>*/ b) {
|
||||
assertTokEq(a,b,true);
|
||||
assertTokEq(b,a,true);
|
||||
}
|
||||
|
||||
private void assertTokEq(List/*<Token>*/ a, List/*<Token>*/ b, boolean checkOff) {
|
||||
int pos=0;
|
||||
for (Iterator iter = a.iterator(); iter.hasNext();) {
|
||||
Token tok = (Token)iter.next();
|
||||
pos += tok.getPositionIncrement();
|
||||
if (!tokAt(b, new String(tok.termBuffer(), 0, tok.termLength()), pos
|
||||
, checkOff ? tok.startOffset() : -1
|
||||
, checkOff ? tok.endOffset() : -1
|
||||
))
|
||||
{
|
||||
fail(a + "!=" + b);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public boolean tokAt(List/*<Token>*/ lst, String val, int tokPos, int startOff, int endOff) {
|
||||
int pos=0;
|
||||
for (Iterator iter = lst.iterator(); iter.hasNext();) {
|
||||
Token tok = (Token)iter.next();
|
||||
pos += tok.getPositionIncrement();
|
||||
if (pos==tokPos && new String(tok.termBuffer(), 0, tok.termLength()).equals(val)
|
||||
&& (startOff==-1 || tok.startOffset()==startOff)
|
||||
&& (endOff ==-1 || tok.endOffset() ==endOff )
|
||||
)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
/***
|
||||
* Return a list of tokens according to a test string format:
|
||||
* a b c => returns List<Token> [a,b,c]
|
||||
* a/b => tokens a and b share the same spot (b.positionIncrement=0)
|
||||
* a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
|
||||
* a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
|
||||
*/
|
||||
public List/*<Token>*/ tokens(String str) {
|
||||
String[] arr = str.split(" ");
|
||||
List/*<Token>*/ result = new ArrayList/*<Token>*/();
|
||||
for (int i=0; i<arr.length; i++) {
|
||||
String[] toks = arr[i].split("/");
|
||||
String[] params = toks[0].split(",");
|
||||
|
||||
int posInc;
|
||||
int start;
|
||||
int end;
|
||||
|
||||
if (params.length > 1) {
|
||||
posInc = Integer.parseInt(params[1]);
|
||||
} else {
|
||||
posInc = 1;
|
||||
}
|
||||
|
||||
if (params.length > 2) {
|
||||
start = Integer.parseInt(params[2]);
|
||||
} else {
|
||||
start = 0;
|
||||
}
|
||||
|
||||
if (params.length > 3) {
|
||||
end = Integer.parseInt(params[3]);
|
||||
} else {
|
||||
end = start + params[0].length();
|
||||
}
|
||||
|
||||
Token t = new Token(params[0],start,end,"TEST");
|
||||
t.setPositionIncrement(posInc);
|
||||
|
||||
result.add(t);
|
||||
for (int j=1; j<toks.length; j++) {
|
||||
t = new Token(toks[j],0,0,"TEST");
|
||||
t.setPositionIncrement(0);
|
||||
result.add(t);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// These may be useful beyond test cases...
|
||||
//------------------------------------------------------------------------
|
||||
|
||||
static List/*<Token>*/ getTokens(TokenStream tstream) throws IOException {
|
||||
List/*<Token>*/ tokens = new ArrayList/*<Token>*/();
|
||||
while (true) {
|
||||
Token t = tstream.next();
|
||||
if (t==null) break;
|
||||
tokens.add(t);
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
/*
|
||||
public static class IterTokenStream extends TokenStream {
|
||||
Iterator<Token> toks;
|
||||
public IterTokenStream(Token... toks) {
|
||||
this.toks = Arrays.asList(toks).iterator();
|
||||
}
|
||||
public IterTokenStream(Iterable<Token> toks) {
|
||||
this.toks = toks.iterator();
|
||||
}
|
||||
public IterTokenStream(Iterator<Token> toks) {
|
||||
this.toks = toks;
|
||||
}
|
||||
public IterTokenStream(String ... text) {
|
||||
int off = 0;
|
||||
ArrayList<Token> t = new ArrayList<Token>( text.length );
|
||||
for( String txt : text ) {
|
||||
t.add( new Token( txt, off, off+txt.length() ) );
|
||||
off += txt.length() + 2;
|
||||
}
|
||||
this.toks = t.iterator();
|
||||
}
|
||||
@Override
|
||||
public Token next() {
|
||||
if (toks.hasNext()) {
|
||||
return toks.next();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
*/
|
||||
}
|
|
@ -0,0 +1,158 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.List;
|
||||
|
||||
public class TestMappingCharFilter extends BaseTokenTestCase {
|
||||
|
||||
NormalizeCharMap normMap;
|
||||
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
normMap = new NormalizeCharMap();
|
||||
|
||||
normMap.add( "aa", "a" );
|
||||
normMap.add( "bbb", "b" );
|
||||
normMap.add( "cccc", "cc" );
|
||||
|
||||
normMap.add( "h", "i" );
|
||||
normMap.add( "j", "jj" );
|
||||
normMap.add( "k", "kkk" );
|
||||
normMap.add( "ll", "llll" );
|
||||
|
||||
normMap.add( "empty", "" );
|
||||
}
|
||||
|
||||
public void testNothingChange() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "x" ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
List real = getTokens( ts );
|
||||
List expect = tokens( "x" );
|
||||
assertTokEqualOff( expect, real );
|
||||
}
|
||||
|
||||
public void test1to1() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h" ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
List real = getTokens( ts );
|
||||
List expect = tokens( "i" );
|
||||
assertTokEqualOff( expect, real );
|
||||
}
|
||||
|
||||
public void test1to2() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "j" ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
List real = getTokens( ts );
|
||||
List expect = tokens( "jj,1,0,1" );
|
||||
assertTokEqualOff( expect, real );
|
||||
}
|
||||
|
||||
public void test1to3() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "k" ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
List real = getTokens( ts );
|
||||
List expect = tokens( "kkk,1,0,1" );
|
||||
assertTokEqualOff( expect, real );
|
||||
}
|
||||
|
||||
public void test2to4() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "ll" ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
List real = getTokens( ts );
|
||||
List expect = tokens( "llll,1,0,2" );
|
||||
assertTokEqualOff( expect, real );
|
||||
}
|
||||
|
||||
public void test2to1() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "aa" ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
List real = getTokens( ts );
|
||||
List expect = tokens( "a,1,0,2" );
|
||||
assertTokEqualOff( expect, real );
|
||||
}
|
||||
|
||||
public void test3to1() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "bbb" ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
List real = getTokens( ts );
|
||||
List expect = tokens( "b,1,0,3" );
|
||||
assertTokEqualOff( expect, real );
|
||||
}
|
||||
|
||||
public void test4to2() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "cccc" ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
List real = getTokens( ts );
|
||||
List expect = tokens( "cc,1,0,4" );
|
||||
assertTokEqualOff( expect, real );
|
||||
}
|
||||
|
||||
public void test5to0() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "empty" ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
List real = getTokens( ts );
|
||||
assertEquals( 0, real.size() );
|
||||
}
|
||||
|
||||
//
|
||||
// 1111111111222
|
||||
// 01234567890123456789012
|
||||
//(in) h i j k ll cccc bbb aa
|
||||
//
|
||||
// 1111111111222
|
||||
// 01234567890123456789012
|
||||
//(out) i i jj kkk llll cc b a
|
||||
//
|
||||
// h, 0, 1 => i, 0, 1
|
||||
// i, 2, 3 => i, 2, 3
|
||||
// j, 4, 5 => jj, 4, 5
|
||||
// k, 6, 7 => kkk, 6, 7
|
||||
// ll, 8,10 => llll, 8,10
|
||||
// cccc,11,15 => cc,11,15
|
||||
// bbb,16,19 => b,16,19
|
||||
// aa,20,22 => a,20,22
|
||||
//
|
||||
public void testTokenStream() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap, CharReader.get( new StringReader( "h i j k ll cccc bbb aa" ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
List real = getTokens( ts );
|
||||
List expect = tokens( "i,1,0,1 i,1,2,3 jj,1,4,5 kkk,1,6,7 llll,1,8,10 cc,1,11,15 b,1,16,19 a,1,20,22" );
|
||||
assertTokEqualOff( expect, real );
|
||||
}
|
||||
|
||||
//
|
||||
//
|
||||
// 0123456789
|
||||
//(in) aaaa ll h
|
||||
//(out-1) aa llll i
|
||||
//(out-2) a llllllll i
|
||||
//
|
||||
// aaaa,0,4 => a,0,4
|
||||
// ll,5,7 => llllllll,5,7
|
||||
// h,8,9 => i,8,9
|
||||
public void testChained() throws Exception {
|
||||
CharStream cs = new MappingCharFilter( normMap,
|
||||
new MappingCharFilter( normMap, CharReader.get( new StringReader( "aaaa ll h" ) ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
List real = getTokens( ts );
|
||||
List expect = tokens( "a,1,0,4 llllllll,1,5,7 i,1,8,9" );
|
||||
assertTokEqualOff( expect, real );
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue