mirror of https://github.com/apache/lucene.git
LUCENE-3043: GermanStemmer threw IOOBE on zero-length tokens
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1096194 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
c0c695053c
commit
7db98455e7
|
@ -55,6 +55,9 @@ Bug Fixes
|
||||||
when using CachingTokenStream. This can be a significant performance bug for
|
when using CachingTokenStream. This can be a significant performance bug for
|
||||||
large documents. (Mark Miller)
|
large documents. (Mark Miller)
|
||||||
|
|
||||||
|
* LUCENE-3043: GermanStemmer threw IndexOutOfBoundsException if it encountered
|
||||||
|
a zero-length token. (Robert Muir)
|
||||||
|
|
||||||
New Features
|
New Features
|
||||||
|
|
||||||
* LUCENE-3016: Add analyzer for Latvian. (Robert Muir)
|
* LUCENE-3016: Add analyzer for Latvian. (Robert Muir)
|
||||||
|
|
|
@ -132,7 +132,8 @@ public class GermanStemmer
|
||||||
strip( buffer );
|
strip( buffer );
|
||||||
}
|
}
|
||||||
// Additional step for irregular plural nouns like "Matrizen -> Matrix".
|
// Additional step for irregular plural nouns like "Matrizen -> Matrix".
|
||||||
if ( buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
|
// NOTE: this length constraint is probably not a great value; it's just to prevent IOOBE on empty terms
|
||||||
|
if ( buffer.length() > 0 && buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
|
||||||
buffer.setCharAt( buffer.length() - 1, 'x' );
|
buffer.setCharAt( buffer.length() - 1, 'x' );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,7 +26,6 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
import org.apache.lucene.analysis.util.ReusableAnalyzerBase;
|
||||||
import org.junit.Ignore;
|
|
||||||
|
|
||||||
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
import static org.apache.lucene.analysis.util.VocabularyAssert.*;
|
||||||
|
|
||||||
|
@ -53,8 +52,13 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
|
||||||
vocOut.close();
|
vocOut.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LUCENE-3043: we use KeywordTokenizer in this test,
|
||||||
|
// so ensure the stemmer does not crash on zero-length strings.
|
||||||
|
public void testEmpty() throws Exception {
|
||||||
|
assertAnalyzesTo(analyzer, "", new String[] { "" });
|
||||||
|
}
|
||||||
|
|
||||||
/** blast some random strings through the analyzer */
|
/** blast some random strings through the analyzer */
|
||||||
@Ignore("bugs!")
|
|
||||||
public void testRandomStrings() throws Exception {
|
public void testRandomStrings() throws Exception {
|
||||||
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
|
checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue