diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 4dfee8b9e04..956200420b3 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -167,6 +167,9 @@ Other
 * LUCENE-10525: Test-framework: Add detection of illegal windows filenames to WindowsFS.
   (Gautam Worah)
 
+* LUCENE-10541: Test-framework: limit the default length of MockTokenizer tokens to 255.
+  (Robert Muir, Uwe Schindler, Tomoko Uchida, Dawid Weiss)
+
 ======================= Lucene 9.1.0 =======================
 
 API Changes
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java
index de07c9d9878..9c02b4c9a5c 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/TestHTMLStripCharFilter.java
@@ -27,6 +27,7 @@ import java.util.HashSet;
 import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.apache.lucene.tests.util.TestUtil;
@@ -37,7 +38,8 @@ public class TestHTMLStripCharFilter extends BaseTokenStreamTestCase {
     return new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        Tokenizer tokenizer =
+            new MockTokenizer(MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2);
         return new TokenStreamComponents(tokenizer, tokenizer);
       }
 
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
index 9eccfd53f9f..71d35452db1 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/miscellaneous/TestWordDelimiterFilter.java
@@ -30,6 +30,7 @@ import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.en.EnglishAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.CannedTokenStream;
 import org.apache.lucene.tests.analysis.MockTokenizer;
@@ -579,7 +580,9 @@ public class TestWordDelimiterFilter extends BaseTokenStreamTestCase {
 
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            Tokenizer tokenizer =
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2);
             return new TokenStreamComponents(
                 tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
           }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleFilter.java
index 34076678f5f..c2c8a7382ba 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/shingle/TestShingleFilter.java
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.CannedTokenStream;
 import org.apache.lucene.tests.analysis.MockTokenizer;
@@ -1240,7 +1241,8 @@ public class TestShingleFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+            Tokenizer tokenizer =
+                new MockTokenizer(MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2);
             return new TokenStreamComponents(tokenizer, new ShingleFilter(tokenizer));
           }
         };
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java
index fca26b50910..b0a3e35348a 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymGraphFilter.java
@@ -33,6 +33,7 @@ import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.store.ByteArrayDataInput;
@@ -1276,7 +1277,8 @@ public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
     return new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
+        Tokenizer tokenizer =
+            new MockTokenizer(MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2);
         // Make a local variable so testRandomHuge doesn't share it across threads!
         SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase);
         TestSynonymGraphFilter.this.flattenFilter = null;
@@ -1292,7 +1294,8 @@ public class TestSynonymGraphFilter extends BaseTokenStreamTestCase {
     return new Analyzer() {
       @Override
       protected TokenStreamComponents createComponents(String fieldName) {
-        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
+        Tokenizer tokenizer =
+            new MockTokenizer(MockTokenizer.WHITESPACE, true, IndexWriter.MAX_TERM_LENGTH / 2);
         // Make a local variable so testRandomHuge doesn't share it across threads!
         SynonymGraphFilter synFilter = new SynonymGraphFilter(tokenizer, map, ignoreCase);
         FlattenGraphFilter flattenFilter = new FlattenGraphFilter(synFilter);
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
index 0415fae77b5..d02565effff 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/synonym/TestSynonymMapFilter.java
@@ -31,6 +31,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.MockAnalyzer;
 import org.apache.lucene.tests.analysis.MockGraphTokenFilter;
@@ -629,7 +630,8 @@ public class TestSynonymMapFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
+            Tokenizer tokenizer =
+                new MockTokenizer(MockTokenizer.SIMPLE, true, IndexWriter.MAX_TERM_LENGTH / 2);
             return new TokenStreamComponents(
                 tokenizer, new SynonymFilter(tokenizer, map, ignoreCase));
           }
diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
index 42891c2a59a..c84b8c6887e 100644
--- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
+++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/TestICUNormalizer2CharFilter.java
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.CharFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.core.KeywordTokenizer;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.tests.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.tests.analysis.MockTokenizer;
 import org.apache.lucene.tests.util.TestUtil;
@@ -218,7 +219,8 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            return new TokenStreamComponents(new MockTokenizer(MockTokenizer.KEYWORD, false));
+            return new TokenStreamComponents(
+                new MockTokenizer(MockTokenizer.KEYWORD, false, IndexWriter.MAX_TERM_LENGTH / 2));
           }
 
           @Override
@@ -326,7 +328,9 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+            return new TokenStreamComponents(
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2));
           }
 
           @Override
@@ -346,7 +350,9 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+            return new TokenStreamComponents(
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2));
           }
 
           @Override
@@ -366,7 +372,9 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+            return new TokenStreamComponents(
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2));
          }
 
           @Override
@@ -386,7 +394,9 @@ public class TestICUNormalizer2CharFilter extends BaseTokenStreamTestCase {
         new Analyzer() {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
-            return new TokenStreamComponents(new MockTokenizer(MockTokenizer.WHITESPACE, false));
+            return new TokenStreamComponents(
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE, false, IndexWriter.MAX_TERM_LENGTH / 2));
           }
 
           @Override
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java b/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
index e992bb8c638..11f58b88348 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestExceedMaxTermLength.java
@@ -35,7 +35,7 @@ import org.junit.Before;
 public class TestExceedMaxTermLength extends LuceneTestCase {
 
   private static final int minTestTermLength = IndexWriter.MAX_TERM_LENGTH + 1;
-  private static final int maxTestTermLegnth = IndexWriter.MAX_TERM_LENGTH * 2;
+  private static final int maxTestTermLength = IndexWriter.MAX_TERM_LENGTH * 2;
 
   Directory dir = null;
 
@@ -52,8 +52,9 @@ public class TestExceedMaxTermLength extends LuceneTestCase {
 
   public void test() throws Exception {
 
-    IndexWriter w =
-        new IndexWriter(dir, newIndexWriterConfig(random(), new MockAnalyzer(random())));
+    MockAnalyzer mockAnalyzer = new MockAnalyzer(random());
+    mockAnalyzer.setMaxTokenLength(Integer.MAX_VALUE);
+    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(random(), mockAnalyzer));
     try {
       final FieldType ft = new FieldType();
       ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
@@ -72,7 +73,7 @@ public class TestExceedMaxTermLength extends LuceneTestCase {
       // problematic field
       final String name = TestUtil.randomSimpleString(random(), 1, 50);
       final String value =
-          TestUtil.randomSimpleString(random(), minTestTermLength, maxTestTermLegnth);
+          TestUtil.randomSimpleString(random(), minTestTermLength, maxTestTermLength);
       final Field f = new Field(name, value, ft);
       if (random().nextBoolean()) {
         // totally ok short field value
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/MockTokenizer.java b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/MockTokenizer.java
index a3e0343b896..9546e639373 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/MockTokenizer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/analysis/MockTokenizer.java
@@ -62,10 +62,19 @@ public class MockTokenizer extends Tokenizer {
           new RegExp("[A-Za-zªµºÀ-ÖØ-öø-ˁ一-鿌]+").toAutomaton(),
           Operations.DEFAULT_DETERMINIZE_WORK_LIMIT));
 
+  /**
+   * Limit the default token length to a size that doesn't cause random analyzer failures on
+   * unpredictable data like the enwiki data set.
+   *
+   * <p>This value defaults to {@code CharTokenizer.DEFAULT_MAX_WORD_LEN} (255).
+   *
+   * @see "https://issues.apache.org/jira/browse/LUCENE-10541"
+   */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
   private final CharacterRunAutomaton runAutomaton;
   private final boolean lowerCase;
   private final int maxTokenLength;
-  public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;
   private int state;
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
diff --git a/lucene/test-framework/src/test/org/apache/lucene/tests/analysis/TestLookaheadTokenFilter.java b/lucene/test-framework/src/test/org/apache/lucene/tests/analysis/TestLookaheadTokenFilter.java
index 5003ac8a0bb..faa213bc5fe 100644
--- a/lucene/test-framework/src/test/org/apache/lucene/tests/analysis/TestLookaheadTokenFilter.java
+++ b/lucene/test-framework/src/test/org/apache/lucene/tests/analysis/TestLookaheadTokenFilter.java
@@ -21,6 +21,7 @@ import java.util.Random;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.index.IndexWriter;
 
 public class TestLookaheadTokenFilter extends BaseTokenStreamTestCase {
 
@@ -30,7 +31,11 @@ public class TestLookaheadTokenFilter extends BaseTokenStreamTestCase {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
             Random random = random();
-            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, random.nextBoolean());
+            Tokenizer tokenizer =
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE,
+                    random.nextBoolean(),
+                    IndexWriter.MAX_TERM_LENGTH / 2);
             TokenStream output = new MockRandomLookaheadTokenFilter(random, tokenizer);
             return new TokenStreamComponents(tokenizer, output);
           }
@@ -62,7 +67,10 @@ public class TestLookaheadTokenFilter extends BaseTokenStreamTestCase {
           @Override
           protected TokenStreamComponents createComponents(String fieldName) {
             Tokenizer tokenizer =
-                new MockTokenizer(MockTokenizer.WHITESPACE, random().nextBoolean());
+                new MockTokenizer(
+                    MockTokenizer.WHITESPACE,
+                    random().nextBoolean(),
+                    IndexWriter.MAX_TERM_LENGTH / 2);
             TokenStream output = new NeverPeeksLookaheadTokenFilter(tokenizer);
             return new TokenStreamComponents(tokenizer, output);
           }