SOLR-1400: handle zero length term buffer in TrimFilter

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@812494 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2009-09-08 13:07:08 +00:00
parent 18a105e35c
commit fe06ba88ff
3 changed files with 48 additions and 27 deletions

View File

@@ -532,6 +532,8 @@ Bug Fixes
63. SOLR-1398: Add offset corrections in PatternTokenizerFactory.
(Anders Melchiorsen, koji)
64. SOLR-1400: Properly handle zero-length tokens in TrimFilter (Peter Wolanin, gsingers)
Other Changes
----------------------
1. Upgraded to Lucene 2.4.0 (yonik)

View File

@@ -51,6 +51,11 @@ public final class TrimFilter extends TokenFilter {
char[] termBuffer = termAtt.termBuffer();
int len = termAtt.termLength();
//TODO: Is this the right behavior or should we return false? Currently, " ", returns true, so I think this should
//also return true
if (len == 0){
return true;
}
int start = 0;
int end = 0;
int endOff = 0;

View File

@@ -17,50 +17,64 @@
package org.apache.solr.analysis;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.util.List;
/**
* @version $Id:$
*/
public class TestTrimFilter extends BaseTokenTestCase {
public void testTrim() throws Exception {
TokenStream ts = new TrimFilter
(new IterTokenStream(new Token(" a ", 1, 5),
new Token("b ",6,10),
new Token("cCc",11,15),
new Token(" ",16,20)), false );
Token token = ts.next();
public void testTrim() throws Exception {
char[] a = " a ".toCharArray();
char[] b = "b ".toCharArray();
char[] ccc = "cCc".toCharArray();
char[] whitespace = " ".toCharArray();
char[] empty = "".toCharArray();
TokenStream ts = new TrimFilter
(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
new Token(b, 0, b.length, 6, 10),
new Token(ccc, 0, ccc.length, 11, 15),
new Token(whitespace, 0, whitespace.length, 16, 20),
new Token(empty, 0, empty.length, 21, 21)), false);
TermAttribute token;
assertTrue(ts.incrementToken());
token = (TermAttribute) ts.getAttribute(TermAttribute.class);
assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertTrue(ts.incrementToken());
assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertTrue(ts.incrementToken());
assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertTrue(ts.incrementToken());
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
assertTrue(ts.incrementToken());
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
assertFalse(ts.incrementToken());
a = " a".toCharArray();
b = "b ".toCharArray();
ccc = " c ".toCharArray();
whitespace = " ".toCharArray();
ts = new TrimFilter(new IterTokenStream(
new Token(a, 0, a.length, 0, 2),
new Token(b, 0, b.length, 0, 2),
new Token(ccc, 0, ccc.length, 0, 3),
new Token(whitespace, 0, whitespace.length, 0, 3)), true);
ts = new TrimFilter( new IterTokenStream(
new Token(" a", 0,2),
new Token("b ", 0,2),
new Token(" c ",0,3),
new Token(" ",0,3)), true );
List<Token> expect = tokens( "a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3" );
List<Token> expect = tokens("a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3");
List<Token> real = getTokens(ts);
for( Token t : expect ) {
System.out.println( "TEST:" + t );
for (Token t : expect) {
System.out.println("TEST:" + t);
}
for( Token t : real ) {
System.out.println( "REAL:" + t );
for (Token t : real) {
System.out.println("REAL:" + t);
}
assertTokEqualOff( expect, real );
assertTokEqualOff(expect, real);
}
}