mirror of https://github.com/apache/lucene.git

LUCENE-10541: Test-framework: limit the default length of MockTokenizer tokens to 255.

commit 6e6c61eb13
parent c28f575b6d
CHANGES.txt
@@ -167,6 +167,9 @@ Other
 
 * LUCENE-10525: Test-framework: Add detection of illegal windows filenames to WindowsFS. (Gautam Worah)
 
+* LUCENE-10541: Test-framework: limit the default length of MockTokenizer tokens to 255.
+  (Robert Muir, Uwe Schindler, Tomoko Uchida, Dawid Weiss)
+
 ======================= Lucene 9.1.0 =======================
 
 API Changes
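In practice the new default means MockTokenizer behaves like the CharTokenizer family: a run of token characters longer than 255 is emitted in bounded chunks instead of as one giant token. A minimal sketch of the observable effect (the demo class, the 300-char input, and the printed chunk sizes are illustrative assumptions, not part of this commit):

import java.io.StringReader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.tests.analysis.MockTokenizer;

public class MockTokenizerDefaultLengthDemo {
  public static void main(String[] args) throws Exception {
    // Two-arg constructor -> DEFAULT_MAX_TOKEN_LENGTH, which this commit changes
    // from Integer.MAX_VALUE to 255.
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(new StringReader("x".repeat(300)));
    CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // Before this commit: one 300-char token. After: no token exceeds 255 chars.
      System.out.println(term.length());
    }
    tokenizer.end();
    tokenizer.close();
  }
}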
TestHTMLStripCharFilter.java
@@ -27,6 +27,7 @@ import java.util.HashSet;
 import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.apache.lucene.tests.util.TestUtil;
@@ -37,7 +38,8 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
     return new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        Tokenizer tokenizer =
+            new MockTokenizer(MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2);
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
 
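The same mechanical change repeats across the test files above and below: tests that feed long random strings through an analyzer opt out of the new 255-char default by passing an explicit third constructor argument. A sketch of the two forms side by side (the rationale comment is an inference; the commit itself does not explain the division by two):

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.tests.analysis.MockTokenizer;

class TokenLengthOptOutSketch {
  // New default: tokens are capped at MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH (255).
  static Tokenizer bounded() {
    return new MockTokenizer(MockTokenizer.WHITESPACE, false);
  }

  // Explicit cap, as the updated tests use. IndexWriter.MAX_TERM_LENGTH is the
  // hard per-term limit in UTF-8 bytes; halving it presumably keeps typical
  // random test tokens under that limit while still exercising very long tokens.
  static Tokenizer roomy() {
    return new MockTokenizer(MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2);
  }
}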
TestWordDelimiterFilter.java
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.CannedTokenStream;
 import org.apache.lucene.tests.analysis.MockTokenizer;
@@ -579,7 +580,9 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
 
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            Tokenizer tokenizer =
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2);
             return new TokenStreamComponents(
                 tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
           }
TestShingleFilter.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.CannedTokenStream;
 import org.apache.lucene.tests.analysis.MockTokenizer;
@@ -1240,7 +1241,8 @@ public class TestShingleFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            Tokenizer tokenizer =
+                new MockTokenizer(MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2);
             return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
           }
         };
TestSynonymGraphFilter.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.store.ByteArrayDataInput;
@@ -1276,7 +1277,8 @@ public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
     return new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        Tokenizer tokenizer =
+            new MockTokenizer(MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2);
         // Make a local variable so testRandomHuge doesn't share it across threads!
         SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase);
         TestSynonymGraphFilter.this.flattenFilter = null;
@@ -1292,7 +1294,8 @@ public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
     return new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+        Tokenizer tokenizer =
+            new MockTokenizer(MockTokenizer.WHITESPACE, true, IndexWriter.MAX_TERM_LENGTH / 2);
         // Make a local variable so testRandomHuge doesn't share it across threads!
         SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase);
         FlattenGraphFilter flattenFilter = new FlattenGraphFilter(synFilter);
TestSynonymMapFilter.java
@@ -31,6 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.MockAnalyzer;
 import org.apache.lucene.tests.analysis.MockGraphTokenFilter;
@@ -629,7 +630,8 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
+            Tokenizer tokenizer =
+                new MockTokenizer(MockTokenizer.SIMPLE, true, IndexWriter.MAX_TERM_LENGTH / 2);
             return new TokenStreamComponents(
                 tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
           }
TestICUNormalizer2CharFilter.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.apache.lucene.tests.util.TestUtil;
@@ -218,7 +219,8 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            return new TokenStreamComponents(new MockTokenizer(MockTokenizer.KEYWORD, false));
+            return new TokenStreamComponents(
+                new MockTokenizer(MockTokenizer.KEYWORD, false, IndexWriter.MAX_TERM_LENGTH / 2));
           }
 
           @Override
@@ -326,7 +328,9 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+            return new TokenStreamComponents(
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2));
           }
 
           @Override
@@ -346,7 +350,9 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+            return new TokenStreamComponents(
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2));
           }
 
           @Override
@@ -366,7 +372,9 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+            return new TokenStreamComponents(
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2));
          }
 
           @Override
@@ -386,7 +394,9 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+            return new TokenStreamComponents(
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2));
           }
 
           @Override
TestExceedMaxTermLength.java
@@ -35,7 +35,7 @@ import org.junit.Before;
 public class TestExceedMaxTermLength extends LuceneTestCase {
 
   private static final int minTestTermLength = IndexWriter.MAX_TERM_LENGTH + 1;
-  private static final int maxTestTermLegnth = IndexWriter.MAX_TERM_LENGTH * 2;
+  private static final int maxTestTermLength = IndexWriter.MAX_TERM_LENGTH * 2;
 
   Directory dir = null;
 
@@ -52,8 +52,9 @@ public class TestExceedMaxTermLength extends LuceneTestCase {
 
   public void test() throws Exception {
 
-    IndexWriter w =
-        new IndexWriter(dir, newIndexWriterConfig(random(), new MockAnalyzer(random())));
+    MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
+    mockAnalyzer.setMaxTokenLength(Integer.MAX_VALUE);
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(random(), mockAnalyzer));
     try {
       final FieldType ft = new FieldType();
       ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
@@ -72,7 +73,7 @@ public class TestExceedMaxTermLength extends LuceneTestCase {
       // problematic field
       final String name = TestUtil.randomSimpleString(random(), 1, 50);
       final String value =
-          TestUtil.randomSimpleString(random(), minTestTermLength, maxTestTermLegnth);
+          TestUtil.randomSimpleString(random(), minTestTermLength, maxTestTermLength);
       final Field f = new Field(name, value, ft);
       if (random().nextBoolean()) {
         // totally ok short field value
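The TestExceedMaxTermLength change deserves a note: this test deliberately builds terms longer than IndexWriter.MAX_TERM_LENGTH to verify how IndexWriter reacts, so under the new 255-char default the mock analyzer would chop the oversized term into small tokens before IndexWriter ever saw it. Restoring an effectively unbounded limit keeps the test meaningful. A condensed sketch of the setup (the helper class and method names are invented; the reasoning is an inference, not from the commit message):

import java.util.Random;
import org.apache.lucene.tests.analysis.MockAnalyzer;

class OversizedTermAnalyzerSketch {
  // Returns a MockAnalyzer whose tokens are never split by length, so a
  // deliberately oversized term reaches IndexWriter intact.
  static MockAnalyzer unboundedTokens(Random random) {
    MockAnalyzer analyzer = new MockAnalyzer(random);
    analyzer.setMaxTokenLength(Integer.MAX_VALUE);
    return analyzer;
  }
}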
MockTokenizer.java
@@ -62,10 +62,19 @@ public class MockTokenizer extends Tokenizer {
           new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁ一-鿌]+").toAutomaton(),
           Operations.DEFAULT_DETERMINIZE_WORK_LIMIT));
 
+  /**
+   * Limit the default token length to a size that doesn't cause random analyzer failures on
+   * unpredictable data like the enwiki data set.
+   *
+   * <p>This value defaults to {@code CharTokenizer.DEFAULT_MAX_WORD_LEN} (255).
+   *
+   * @see "https://issues.apache.org/jira/browse/LUCENE-10541"
+   */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
   private final CharacterRunAutomaton runAutomaton;
   private final boolean lowerCase;
   private final int maxTokenLength;
-  public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;
   private int state;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
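For tests outside this commit that relied on the old unbounded default, the escape hatch is the existing three-argument constructor; nothing else about MockTokenizer's contract changes. A one-line sketch (class and method names invented):

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.tests.analysis.MockTokenizer;

class LegacyUnboundedTokenizerSketch {
  // Passing Integer.MAX_VALUE explicitly reproduces the pre-LUCENE-10541 default.
  static Tokenizer unbounded() {
    return new MockTokenizer(MockTokenizer.WHITESPACE, true, Integer.MAX_VALUE);
  }
}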
TestLookaheadTokenFilter.java
@@ -21,6 +21,7 @@ import java.util.Random;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.index.IndexWriter;
 
 public class TestLookaheadTokenFilter extends BaseTokenStreamTestCase {
 
@@ -30,7 +31,11 @@ public class TestLookaheadTokenFilter extends BaseTokenStreamTestCase {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Random random = random();
-          Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, random.nextBoolean());
+          Tokenizer tokenizer =
+              new MockTokenizer(
+                  MockTokenizer.WHITESPACE,
+                  random.nextBoolean(),
+                  IndexWriter.MAX_TERM_LENGTH / 2);
           TokenStream output = new MockRandomLookaheadTokenFilter(random, tokenizer);
           return new TokenStreamComponents(tokenizer, output);
         }
@@ -62,7 +67,10 @@ public class TestLookaheadTokenFilter extends BaseTokenStreamTestCase {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Tokenizer tokenizer =
-              new MockTokenizer(MockTokenizer.WHITESPACE, random().nextBoolean());
+              new MockTokenizer(
+                  MockTokenizer.WHITESPACE,
+                  random().nextBoolean(),
+                  IndexWriter.MAX_TERM_LENGTH / 2);
           TokenStream output = new NeverPeeksLookaheadTokenFilter(tokenizer);
           return new TokenStreamComponents(tokenizer, output);
         }