LUCENE-3043: GermanStemmer threw IOOBE on zero-length tokens

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1096194 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-04-23 17:48:17 +00:00
parent c0c695053c
commit 7db98455e7
3 changed files with 11 additions and 3 deletions

View File

@ -55,6 +55,9 @@ Bug Fixes
when using CachingTokenStream. This can be a significant performance bug for
large documents. (Mark Miller)
* LUCENE-3043: GermanStemmer threw IndexOutOfBoundsException if it encountered
a zero-length token. (Robert Muir)
New Features
* LUCENE-3016: Add analyzer for Latvian. (Robert Muir)

View File

@ -132,7 +132,8 @@ public class GermanStemmer
strip( buffer );
}
// Additional step for irregular plural nouns like "Matrizen -> Matrix".
if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
// NOTE: this length constraint is probably not a great value; it's just to prevent an IOOBE on empty terms
if ( buffer.length() > 0 && buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
buffer.setCharAt( buffer.length() - 1, 'x' );
}
}

View File

@ -26,7 +26,6 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
import org.junit.Ignore;
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
@ -53,8 +52,13 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
vocOut.close();
}
/**
 * LUCENE-3043: the analyzer in this test uses a keyword tokenizer, so
 * make sure the stemmer does not crash on a zero-length string. Analyzing
 * the empty string is expected to produce a single empty token.
 */
public void testEmpty() throws Exception {
final String[] expectedTokens = { "" };
assertAnalyzesTo(analyzer, "", expectedTokens);
}
/**
 * Blasts some random strings through the analyzer.
 * <p>
 * Passes {@code 10000 * RANDOM_MULTIPLIER} to {@code checkRandomData};
 * presumably this is the number of iterations (confirm against the helper).
 * The {@code @Ignore} annotation below carries the reason "bugs!".
 */
@Ignore("bugs!")
public void testRandomStrings() throws Exception {
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
}